Repository: Illumina/Nirvana Branch: main Commit: 62d30326985a Files: 1266 Total size: 5.3 MB Directory structure: gitextract_hj0kftnn/ ├── .gitattributes ├── .gitignore ├── AnnotationLambda/ │ ├── AnnotationLambda.cs │ ├── AnnotationLambda.csproj │ ├── AssemblyInfo.cs │ └── S3Utilities.cs ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CacheUtils/ │ ├── AssemblyInfo.cs │ ├── BuildCache.sh │ ├── CacheUtils.cs │ ├── CacheUtils.csproj │ ├── CacheUtils.dll.gene.json │ ├── Commands/ │ │ ├── CombineCacheDirectories/ │ │ │ └── CombineCacheDirectoriesMain.cs │ │ ├── CreateCache/ │ │ │ └── CreateNirvanaDatabaseMain.cs │ │ ├── Download/ │ │ │ ├── DownloadMain.cs │ │ │ ├── ExternalFiles.cs │ │ │ └── GenbankFile.cs │ │ ├── ExtractTranscripts/ │ │ │ └── ExtractTranscriptsMain.cs │ │ ├── GFF/ │ │ │ ├── CreateGffMain.cs │ │ │ └── InternalGenes.cs │ │ ├── Header/ │ │ │ └── HeaderMain.cs │ │ ├── ParseVepCacheDirectory/ │ │ │ ├── ParseVepCacheDirectoryMain.cs │ │ │ ├── RegulatoryRegionMerger.cs │ │ │ ├── TranscriptFilter.cs │ │ │ ├── TranscriptIdFilter.cs │ │ │ ├── TranscriptMerger.cs │ │ │ ├── TranscriptMergerLogger.cs │ │ │ ├── VepCacheParser.cs │ │ │ └── VepRootDirectory.cs │ │ ├── RegulatoryGFF/ │ │ │ └── CreateRegulatoryGffMain.cs │ │ └── UniversalGeneArchive/ │ │ ├── FilePaths.cs │ │ └── UniversalGeneArchiveMain.cs │ ├── DataDumperImport/ │ │ ├── DataStructures/ │ │ │ ├── GenomeSymbolSource.cs │ │ │ ├── Import/ │ │ │ │ ├── IImportNode.cs │ │ │ │ ├── ImportNodeExtensions.cs │ │ │ │ ├── ListObjectKeyValueNode.cs │ │ │ │ ├── ObjectKeyValueNode.cs │ │ │ │ ├── ObjectValueNode.cs │ │ │ │ ├── StringKeyValueNode.cs │ │ │ │ └── StringValueNode.cs │ │ │ └── Mutable/ │ │ │ ├── MutableExon.cs │ │ │ ├── MutableGene.cs │ │ │ ├── MutableTranscript.cs │ │ │ └── MutableTranscriptRegion.cs │ │ ├── FauxRegex/ │ │ │ └── RegexDecisionTree.cs │ │ ├── IO/ │ │ │ ├── DataDumperReader.cs │ │ │ └── EntryType.cs │ │ ├── Import/ │ │ │ ├── Attribute.cs │ │ │ ├── ImportExon.cs │ │ │ ├── ImportGene.cs │ │ │ ├── ImportIntron.cs │ │ │ ├── ImportKeys.cs │ │ │ ├── ImportMapper.cs │ │ │ ├── ImportMapperPair.cs │ │ │ ├── ImportMapperUnit.cs │ │ │ ├── ImportPairGenomic.cs │ │ │ ├── ImportPrediction.cs │ │ │ ├── ImportProteinFunctionPredictions.cs │ │ │ ├── ImportRegulatoryFeature.cs │ │ │ ├── ImportSeqEdits.cs │ │ │ ├── ImportTranscript.cs │ │ │ ├── ImportTranscriptMapper.cs │ │ │ ├── ImportTranslation.cs │ │ │ └── ImportVariantEffectFeatureCache.cs │ │ └── Utilities/ │ │ ├── ImportUtilities.cs │ │ ├── MutableTranscriptComparer.cs │ │ └── TranscriptUtilities.cs │ ├── GFF/ │ │ ├── GeneralAttributes.cs │ │ ├── GffCreator.cs │ │ ├── GffGene.cs │ │ ├── GffUtilities.cs │ │ ├── GffWriter.cs │ │ ├── IGeneralAttributes.cs │ │ ├── IGffGene.cs │ │ ├── IRequiredFields.cs │ │ └── RequiredFields.cs │ ├── Genbank/ │ │ ├── GenbankEntry.cs │ │ ├── GenbankReader.cs │ │ └── GenbankState.cs │ ├── Genes/ │ │ ├── Combiners/ │ │ │ ├── CombinerUtils.cs │ │ │ ├── HgncIdCombiner.cs │ │ │ ├── ICombiner.cs │ │ │ └── PartitionCombiner.cs │ │ ├── DataStores/ │ │ │ ├── AssemblyDataStore.cs │ │ │ ├── EnsemblGtf.cs │ │ │ ├── GeneInfoData.cs │ │ │ ├── GlobalCache.cs │ │ │ ├── Hgnc.cs │ │ │ ├── IUpdateHgncData.cs │ │ │ ├── RefSeqGff.cs │ │ │ └── UpdateHgncData.cs │ │ ├── DataStructures/ │ │ │ ├── EnsemblGene.cs │ │ │ ├── GeneInfo.cs │ │ │ ├── HgncGene.cs │ │ │ ├── IFlatGene.cs │ │ │ ├── RefSeqGene.cs │ │ │ └── UgaGene.cs │ │ ├── GeneFlattener.cs │ │ ├── GeneMerger.cs │ │ ├── GeneSymbolUpdater.cs │ │ ├── HgncIdConsolidator.cs │ │ ├── HgncIdUpdater.cs │ │ ├── IO/ │ │ │ ├── EnsemblGtfReader.cs │ │ │ ├── GeneInfoReader.cs │ │ │ ├── HgncReader.cs │ │ │ ├── RefSeqGffReader.cs │ │ │ ├── UgaGeneReader.cs │ │ │ └── UgaGeneWriter.cs │ │ ├── UgaAssemblyCombiner.cs │ │ └── Utilities/ │ │ └── DictionaryUtilities.cs │ ├── Helpers/ │ │ ├── BioTypeHelper.cs │ │ ├── GeneSymbolSourceHelper.cs │ │ ├── RegulatoryRegionTypeHelper.cs │ │ └── TranscriptCacheHelper.cs │ ├── IntermediateIO/ │ │ ├── CcdsReader.cs │ │ ├── GenbankReader.cs │ │ ├── GenbankWriter.cs │ │ ├── IntermediateIoCommon.cs │ │ ├── IntermediateIoHeader.cs │ │ ├── LrgReader.cs │ │ ├── MutableTranscriptReader.cs │ │ ├── MutableTranscriptWriter.cs │ │ ├── PredictionReader.cs │ │ ├── PredictionWriter.cs │ │ ├── RegulatoryRegionReader.cs │ │ └── RegulatoryRegionWriter.cs │ ├── MiniCache/ │ │ ├── DataBundle.cs │ │ └── IStaging.cs │ ├── PredictionCache/ │ │ ├── PredictionCacheBuilder.cs │ │ ├── PredictionCacheStaging.cs │ │ ├── PredictionCacheWriter.cs │ │ ├── PredictionExtensions.cs │ │ ├── PredictionUtilities.cs │ │ ├── RoundedEntry.cs │ │ └── RoundedEntryPrediction.cs │ ├── TranscriptCache/ │ │ ├── CanonicalTranscriptMarker.cs │ │ ├── Comparers/ │ │ │ ├── GeneComparer.cs │ │ │ ├── IntervalComparer.cs │ │ │ ├── RegulatoryRegionComparer.cs │ │ │ ├── TranscriptRegionComparer.cs │ │ │ └── UgaGeneComparer.cs │ │ ├── NSequence.cs │ │ ├── SortExtensions.cs │ │ ├── TranscriptCacheBuilder.cs │ │ ├── TranscriptCacheStaging.cs │ │ ├── TranscriptCacheUtilities.cs │ │ ├── TranscriptCacheWriter.cs │ │ ├── TranscriptConversionExtensions.cs │ │ ├── TranscriptRegionMerger.cs │ │ └── TranscriptRegionValidater.cs │ └── Utilities/ │ ├── AccessionUtilities.cs │ ├── HeaderUtilities.cs │ ├── RemoteFile.cs │ └── TaskExtensions.cs ├── Cloud/ │ ├── AmazonS3ClientWrapper.cs │ ├── AssemblyInfo.cs │ ├── Cloud.appsettings.json │ ├── Cloud.csproj │ ├── Configuration.cs │ ├── LambdaUrlHelper.cs │ ├── Messages/ │ │ ├── Annotation/ │ │ │ ├── AnnotationConfig.cs │ │ │ ├── AnnotationPosition.cs │ │ │ ├── AnnotationRange.cs │ │ │ └── AnnotationResult.cs │ │ ├── Custom/ │ │ │ ├── CustomConfig.cs │ │ │ └── CustomResult.cs │ │ ├── FileList.cs │ │ ├── Gene/ │ │ │ └── GeneConfig.cs │ │ ├── JwtFields.cs │ │ ├── Nirvana/ │ │ │ ├── NirvanaConfig.cs │ │ │ └── NirvanaResult.cs │ │ ├── S3Path.cs │ │ ├── SaUrls.cs │ │ ├── Single/ │ │ │ ├── SingleConfig.cs │ │ │ └── SingleVariant.cs │ │ └── StrValidation/ │ │ ├── ValidationConfig.cs │ │ └── ValidationResult.cs │ ├── Notifications/ │ │ └── SNS.cs │ ├── RedactionUtilities.cs │ └── Utilities/ │ ├── AwsExceptionUtilities.cs │ ├── JsonUtilities.cs │ ├── LambdaUtilities.cs │ ├── LogUtilities.cs │ └── UploadUtilities.cs ├── CommandLine/ │ ├── AssemblyInfo.cs │ ├── Builders/ │ │ ├── ConsoleAppBuilder.cs │ │ ├── IConsoleAppBuilder.cs │ │ ├── ITopLevelAppBuilder.cs │ │ ├── TopLevelAppBuilder.cs │ │ ├── TopLevelOption.cs │ │ └── ValidationExtensions.cs │ ├── CommandLine.csproj │ ├── NDesk.Options/ │ │ └── Options.cs │ ├── Utilities/ │ │ ├── Benchmark.cs │ │ ├── CommandLineUtilities.cs │ │ ├── Help.cs │ │ ├── MemoryUtilities.cs │ │ └── OutputHelper.cs │ └── VersionProviders/ │ ├── DefaultVersionProvider.cs │ └── IVersionProvider.cs ├── CommonAssemblyInfo.props ├── Compression/ │ ├── Algorithms/ │ │ ├── ICompressionAlgorithm.cs │ │ ├── Zlib.cs │ │ └── Zstandard.cs │ ├── AssemblyInfo.cs │ ├── Compression.csproj │ ├── DataStructures/ │ │ └── Block.cs │ ├── FileHandling/ │ │ ├── BgzBlockReader.cs │ │ ├── BgzfBlock.cs │ │ ├── BgzipTextReader.cs │ │ ├── BgzipTextWriter.cs │ │ ├── BlockGZipStream.cs │ │ ├── BlockHeader.cs │ │ └── BlockStream.cs │ └── Utilities/ │ ├── BlockExtensions.cs │ ├── GZipUtilities.cs │ └── LibraryUtilities.cs ├── CreateLambdaZips.sh ├── CustomAnnotationLambda/ │ ├── CustomAnnotationLambda.cs │ ├── CustomAnnotationLambda.csproj │ ├── CustomConfigExtensions.cs │ ├── GeneAnnotationCreator.cs │ └── VariantAnnotationCreator.cs ├── CustomStrValidationLambda/ │ ├── CustomStrValidationLambda.cs │ └── CustomStrValidationLambda.csproj ├── Downloader/ │ ├── AnnotationRepository.cs │ ├── AssemblyInfo.cs │ ├── Client.cs │ ├── Configuration.cs │ ├── Downloader.appsettings.json │ ├── Downloader.csproj │ ├── DownloaderMain.cs │ ├── FileExtensions/ │ │ ├── CacheFileExtensions.cs │ │ ├── ReferencesFileExtensions.cs │ │ └── SupplementaryAnnotationFileExtensions.cs │ ├── IClient.cs │ ├── Manifest.cs │ ├── OutputDirectory.cs │ ├── RemoteFile.cs │ └── Utilities/ │ ├── ConsoleEmbellishments.cs │ ├── DiskSpaceUtilities.cs │ ├── GenomeAssemblyHelper.cs │ ├── ParallelUtilities.cs │ └── SyncUtilities.cs ├── ErrorHandling/ │ ├── AssemblyInfo.cs │ ├── ErrorCategory.cs │ ├── ErrorHandling.csproj │ ├── ExceptionUtilities.cs │ ├── Exceptions/ │ │ ├── CompressionException.cs │ │ ├── DeploymentErrorException.cs │ │ ├── FileNotSortedException.cs │ │ ├── InvalidFileFormatException.cs │ │ ├── MissingCompressionLibraryException.cs │ │ ├── ProcessLockedFileException.cs │ │ └── UserErrorException.cs │ ├── ExitCodeUtilities.cs │ └── ExitCodes.cs ├── GeneAnnotationLambda/ │ ├── GeneAnnotationLambda.cs │ ├── GeneAnnotationLambda.csproj │ └── GeneResult.cs ├── Genome/ │ ├── Band.cs │ ├── Chromosome.cs │ ├── ChromosomeInterval.cs │ ├── ContigInfo.cs │ ├── CytogeneticBands.cs │ ├── Genome.csproj │ ├── GenomeAssembly.cs │ ├── GenomeAssemblyHelper.cs │ ├── GenomicPosition.cs │ ├── GenomicRange.cs │ ├── GenomicRangeChecker.cs │ ├── IChromosomeInterval.cs │ ├── ISequence.cs │ ├── ReferenceNameUtilities.cs │ └── SequenceUtilities.cs ├── IO/ │ ├── BufferedBinaryReader.cs │ ├── CacheConstants.cs │ ├── ExtendedBinaryReader.cs │ ├── ExtendedBinaryWriter.cs │ ├── FilePathUtilities.cs │ ├── FileUtilities.cs │ ├── HttpUtilities.cs │ ├── IBufferedBinaryReader.cs │ ├── IConnect.cs │ ├── IExtendedBinaryWriter.cs │ ├── IO.csproj │ ├── IS3Client.cs │ ├── ISerializable.cs │ ├── LengthStream.cs │ ├── Logger.cs │ ├── MD5Stream.cs │ ├── PersistentConnect.cs │ ├── PersistentStream.cs │ ├── PersistentStreamUtils.cs │ ├── SpanBufferBinaryReader.cs │ ├── UrlUtilities.cs │ └── v2/ │ ├── FileType.cs │ └── Header.cs ├── Intervals/ │ ├── Extensions.cs │ ├── IInterval.cs │ ├── IIntervalForest.cs │ ├── IIntervalSearch.cs │ ├── Interval.cs │ ├── IntervalArray.cs │ ├── IntervalForest.cs │ ├── Intervals.csproj │ ├── NullIntervalSearch.cs │ ├── OverlapType.cs │ └── Utilities.cs ├── Jasix/ │ ├── AssemblyInfo.cs │ ├── DataStructures/ │ │ ├── JasixChrIndex.cs │ │ ├── JasixCommons.cs │ │ ├── JasixIndex.cs │ │ ├── JasixNode.cs │ │ ├── JsonSchema.cs │ │ └── Utilities.cs │ ├── IndexCreator.cs │ ├── Jasix.csproj │ ├── JasixMain.cs │ ├── OnTheFlyIndexCreator.cs │ └── QueryProcessor.cs ├── Jist/ │ ├── Jist.csproj │ ├── JistMain.cs │ ├── JistUtilities.cs │ └── JsonStitcher.cs ├── LICENSE ├── MitoHeteroplasmy/ │ ├── MitoHeteroplasmy.csproj │ ├── MitoHeteroplasmyProvider.cs │ └── MitoHeteroplasmyReader.cs ├── Nirvana/ │ ├── AnnotationFiles.cs │ ├── AnnotationResources.cs │ ├── Nirvana.cs │ ├── Nirvana.csproj │ ├── PreLoadUtilities.cs │ ├── ProviderUtilities.cs │ └── StreamAnnotation.cs ├── Nirvana.sln ├── Nirvana.sln.DotSettings ├── NirvanaLambda/ │ ├── AnnotationJob.cs │ ├── AnnotationResultSummary.cs │ ├── AssemblyInfo.cs │ ├── NirvanaLambda.cs │ ├── NirvanaLambda.csproj │ └── PartitionUtilities.cs ├── OptimizedCore/ │ ├── ExpandableArray.cs │ ├── NullSequenceEqual.cs │ ├── OptimizedCore.csproj │ ├── StringBuilderPool.cs │ └── StringExtensions.cs ├── Phantom/ │ ├── AssemblyInfo.cs │ └── Phantom.csproj ├── README.md ├── ReferenceSequence/ │ ├── AssemblyInfo.cs │ ├── Commands/ │ │ ├── CreateReferenceMain.cs │ │ ├── CreateSubstringMain.cs │ │ └── CreateTestSeqMain.cs │ ├── Common/ │ │ ├── IndexEntry.cs │ │ ├── MaskedEntry.cs │ │ └── Sequence.cs │ ├── Compression/ │ │ └── TwoBitCompressor.cs │ ├── Creation/ │ │ ├── FastaSequence.cs │ │ ├── ReferenceBuffer.cs │ │ ├── ReferenceDictionaryUtils.cs │ │ └── ReferenceSequence.cs │ ├── IO/ │ │ ├── AssemblyReader.cs │ │ ├── CytogeneticBandsReader.cs │ │ ├── FastaReader.cs │ │ ├── ReferenceNamesReader.cs │ │ ├── ReferenceSequenceReader.cs │ │ └── ReferenceSequenceWriter.cs │ ├── ReferenceSequence.csproj │ ├── ReferenceSequenceCommon.cs │ ├── ReferenceUtilsMain.cs │ └── Utilities/ │ └── SequenceHelper.cs ├── RepeatExpansions/ │ ├── AssemblyInfo.cs │ ├── IO/ │ │ └── RepeatExpansionReader.cs │ ├── IRepeatExpansionProvider.cs │ ├── Matcher.cs │ ├── PercentileUtilities.cs │ ├── RepeatExpansionPhenotype.cs │ ├── RepeatExpansionProvider.cs │ ├── RepeatExpansionSupplementaryAnnotation.cs │ ├── RepeatExpansions.csproj │ └── Resources/ │ ├── RepeatExpansions.GRCh37.tsv │ └── RepeatExpansions.GRCh38.tsv ├── SAUtils/ │ ├── AAConservation/ │ │ ├── AaConservationMain.cs │ │ ├── AaConservationUtilities.cs │ │ ├── ProteinConservationParser.cs │ │ └── ProteinConservationWriter.cs │ ├── AssemblyInfo.cs │ ├── ClinGen/ │ │ ├── DosageMapRegionItem.cs │ │ ├── DosageMapRegionParser.cs │ │ ├── DosageMapRegions.cs │ │ ├── DosageSensitivity.cs │ │ ├── DosageSensitivityItem.cs │ │ ├── DosageSensitivityParser.cs │ │ ├── GeneDiseaseValidity.cs │ │ ├── GeneDiseaseValidityItem.cs │ │ ├── GeneDiseaseValidityParser.cs │ │ └── ScoreToDescription.cs │ ├── CosmicGeneFusions/ │ │ ├── Cache/ │ │ │ ├── ReferenceLoader.cs │ │ │ └── TranscriptCache.cs │ │ ├── Conversion/ │ │ │ ├── CosmicConverter.cs │ │ │ ├── CosmicGeneFusion.cs │ │ │ ├── HgvsRnaFixer.cs │ │ │ ├── HgvsRnaParser.cs │ │ │ ├── Histology.cs │ │ │ ├── RawCosmicGeneFusion.cs │ │ │ └── Site.cs │ │ ├── CreateCosmicGeneFusions.cs │ │ ├── IO/ │ │ │ ├── CosmicGeneFusionParser.cs │ │ │ └── GeneFusionJsonWriter.cs │ │ └── Utilities/ │ │ └── CosmicCountUtilities.cs │ ├── CreateClinvarDb/ │ │ ├── ClinVarMain.cs │ │ └── ClinVarStats.cs │ ├── CreateCosmicDb/ │ │ └── Main.cs │ ├── CreateDbsnpDb/ │ │ └── Main.cs │ ├── CreateDecipherDb/ │ │ └── Main.cs │ ├── CreateGlobalAllelesDb/ │ │ └── Main.cs │ ├── CreateGmeDb/ │ │ └── Main.cs │ ├── CreateOneKgDb/ │ │ └── Main.cs │ ├── CreateTopMedDb/ │ │ └── Main.cs │ ├── Custom/ │ │ ├── AllowedValues.cs │ │ ├── CaUtilities.cs │ │ ├── CustomGene.cs │ │ ├── GeneAnnotationsParser.cs │ │ ├── GeneMain.cs │ │ ├── ParserUtilities.cs │ │ ├── VariantAnnotationsParser.cs │ │ └── VariantMain.cs │ ├── Dann/ │ │ └── Create.cs │ ├── DataStructures/ │ │ ├── AlleleFrequencyItem.cs │ │ ├── AncestralAlleleItem.cs │ │ ├── ClinGenItem.cs │ │ ├── ClinVarItem.cs │ │ ├── ComputingUtilities.cs │ │ ├── ConservationItem.cs │ │ ├── CosmicItem.cs │ │ ├── CounterDictionary.cs │ │ ├── CustomInterval.cs │ │ ├── CustomItem.cs │ │ ├── DbSnpItem.cs │ │ ├── DecipherItem.cs │ │ ├── DgvItem.cs │ │ ├── GlobalMinorItem.cs │ │ ├── GmeItem.cs │ │ ├── GnomadItem.cs │ │ ├── GnomadSvItem.cs │ │ ├── KeyCounts.cs │ │ ├── MinHeap.cs │ │ ├── OmimItem.cs │ │ ├── OneKGenItem.cs │ │ ├── OnekGenSvItem.cs │ │ ├── RefMinorItem.cs │ │ ├── SuppDataUtilities.cs │ │ └── TopMedItem.cs │ ├── DbSnpRemapper/ │ │ ├── ChromMapper.cs │ │ ├── DbSnpRemapperMain.cs │ │ ├── GenomicLocation.cs │ │ ├── LeftoverMapper.cs │ │ └── Utilities.cs │ ├── DegenerateBaseUtilities.cs │ ├── ExtractCosmicSvs/ │ │ ├── CosmicCnvItem.cs │ │ ├── CosmicCnvReader.cs │ │ ├── CosmicSvReader.cs │ │ └── ExtractCosmicSvsMain.cs │ ├── ExtractMiniSa/ │ │ ├── ExtractMiniSaMain.cs │ │ └── MiniSaExtractor.cs │ ├── ExtractMiniXml/ │ │ ├── ExtractMiniXmlMain.cs │ │ ├── RcvXmlExtractor.cs │ │ └── VcvXmlExtractor.cs │ ├── FusionCatcher/ │ │ ├── CollectionType.cs │ │ ├── CreateFusionCatcher.cs │ │ ├── FusionCatcherDataSource.cs │ │ ├── FusionCatcherOncogenes.cs │ │ ├── GeneFusionSourceBuilder.cs │ │ ├── GeneFusionSourceWriter.cs │ │ └── IndexBuilder.cs │ ├── GERP/ │ │ └── GerpMain.cs │ ├── GeneIdentifiers/ │ │ ├── GeneSymbolUpdater.cs │ │ └── GeneUtilities.cs │ ├── GenericScore/ │ │ ├── GenericScoreParser/ │ │ │ ├── GenericScoreItem.cs │ │ │ ├── GenericScoreParser.cs │ │ │ └── SaItemValidator.cs │ │ ├── ParserSettings.cs │ │ ├── ScoreFileWriter.cs │ │ └── WriterSettings.cs │ ├── GnomadGeneScores/ │ │ ├── GnomadGeneItem.cs │ │ ├── GnomadGeneParser.cs │ │ └── GnomadGenesMain.cs │ ├── InputFileParsers/ │ │ ├── AncestralAlleleReader.cs │ │ ├── ClinGen/ │ │ │ └── ClinGenReader.cs │ │ ├── ClinVar/ │ │ │ ├── ClinVarCommon.cs │ │ │ ├── ClinVarParser.cs │ │ │ ├── ClinVarSchema.cs │ │ │ ├── ClinVarVariationReader.cs │ │ │ ├── ClinvarVariant.cs │ │ │ ├── IClinVarSaItem.cs │ │ │ ├── VariantAligner.cs │ │ │ ├── VcvItem.cs │ │ │ └── VcvSaItem.cs │ │ ├── Cosmic/ │ │ │ └── MergedCosmicReader.cs │ │ ├── DGV/ │ │ │ └── DgvReader.cs │ │ ├── DataSourceVersionReader.cs │ │ ├── DbSnp/ │ │ │ ├── DbSnpReader.cs │ │ │ └── GlobalMinorReader.cs │ │ ├── Decipher/ │ │ │ └── DecipherParser.cs │ │ ├── Gme/ │ │ │ └── GmeParser.cs │ │ ├── OneKGen/ │ │ │ ├── OneKGenReader.cs │ │ │ ├── RefMinorReader.cs │ │ │ └── oneKGenSvReader.cs │ │ ├── SequenceExtensions.cs │ │ └── TOPMed/ │ │ └── TopMedReader.cs │ ├── MakeAaDb/ │ │ └── Main.cs │ ├── MakeClinGenDb/ │ │ └── Main.cs │ ├── MitoHeteroplasmy/ │ │ ├── MitoHeteroplasmyDb.cs │ │ ├── MitoHeteroplasmyParser.cs │ │ └── StatClasses.cs │ ├── MitoMap/ │ │ ├── CircularGenomeModel.cs │ │ ├── MitoMapDatabaseUtilities.cs │ │ ├── MitoMapInputDb.cs │ │ ├── MitoMapItem.cs │ │ ├── MitoMapSvItem.cs │ │ ├── MitoMapSvReader.cs │ │ ├── MitoMapVariantReader.cs │ │ ├── ParsingUtilities.cs │ │ ├── SmallVarDb.cs │ │ └── StructVarDb.cs │ ├── NgaWriter.cs │ ├── NsaConcatenator/ │ │ ├── ConcatUtilities.cs │ │ └── NsaConcatenator.cs │ ├── NsaIndexUpdater/ │ │ └── UpdateIndex.cs │ ├── NsaWriter.cs │ ├── Omim/ │ │ ├── Downloader.cs │ │ ├── EntryApiResponse/ │ │ │ ├── EntryResponse.cs │ │ │ └── GeneMap.cs │ │ ├── Main.cs │ │ ├── OmimParser.cs │ │ ├── OmimPhenotypeSchema.cs │ │ ├── OmimQuery.cs │ │ ├── OmimSchema.cs │ │ ├── OmimStatistics.cs │ │ ├── OmimUtilities.cs │ │ └── OmimVersion.cs │ ├── OneKGenSvDb/ │ │ ├── Create.cs │ │ └── VcfToBed.cs │ ├── ParseUtils/ │ │ ├── SAParseUtilities.cs │ │ ├── SplitLine.cs │ │ └── TsvIndices.cs │ ├── PhyloP/ │ │ ├── Main.cs │ │ ├── NpdWriter.cs │ │ └── PhylopParser.cs │ ├── PrimateAi/ │ │ ├── PrimateAiDb.cs │ │ ├── PrimateAiItem.cs │ │ ├── PrimateAiParser.cs │ │ └── PrimateAiUtilities.cs │ ├── ProcessSpliceNetTsv/ │ │ ├── PredictionFilter.cs │ │ └── SpliceNetPredictionFilterMain.cs │ ├── RefMinorDb/ │ │ ├── Main.cs │ │ └── RefMinorDbWriter.cs │ ├── RegionUtilities.cs │ ├── Revel/ │ │ └── Create.cs │ ├── SAUtils.cs │ ├── SAUtils.csproj │ ├── SaUtilsCommon.cs │ ├── Schema/ │ │ ├── SaJsonKeyAnnotation.cs │ │ ├── SaJsonKeyProperties.cs │ │ ├── SaJsonSchema.cs │ │ └── SaJsonValueType.cs │ ├── SpliceAi/ │ │ ├── SpliceAiDb.cs │ │ ├── SpliceAiItem.cs │ │ ├── SpliceAiParser.cs │ │ └── SpliceUtilities.cs │ ├── gnomAD/ │ │ ├── GnomadSnvMain.cs │ │ ├── GnomadSnvReader.cs │ │ ├── GnomadSvBedParser.cs │ │ ├── GnomadSvMain.cs │ │ ├── GnomadSvParser.cs │ │ ├── GnomadSvTsvParser.cs │ │ ├── GnomadUtilities.cs │ │ ├── LcrInterval.cs │ │ ├── LcrRegionParser.cs │ │ └── LcrRegionsMain.cs │ └── makeDgvDb/ │ └── Main.cs ├── Sandbox/ │ ├── AminoAcidAligner/ │ │ ├── AlignmentBuilder.cs │ │ ├── AminoAcidAligner.csproj │ │ ├── ExonToTranscript.cs │ │ └── Utilities.cs │ ├── GenerateMustGenotypeVcf/ │ │ ├── ConfigurationSettings.cs │ │ ├── GenerateMustGenotypeVcf.csproj │ │ ├── GenerateMustGenotypeVcfsMain.cs │ │ └── MustGenotypeExtractor.cs │ ├── Piano/ │ │ ├── ConfigurationSettings.cs │ │ ├── Piano.cs │ │ ├── Piano.csproj │ │ ├── PianoAnnotatedTranscript.cs │ │ ├── PianoAnnotationProvider.cs │ │ ├── PianoAnnotationUtils.cs │ │ ├── PianoAnnotator.cs │ │ ├── PianoTranscriptAnnotator.cs │ │ └── ProviderUtilities.cs │ ├── Sandbox.sln │ ├── Sandbox.sln.DotSettings │ ├── Scripts/ │ │ ├── ConvertCacheMatrix.pl │ │ ├── StressTestUnitTests.ps1 │ │ ├── UpdateCacheFiles.ps1 │ │ ├── UpdateMiniSaFiles.ps1 │ │ └── updateSA.ps1 │ └── UnitTests/ │ ├── Piano/ │ │ ├── PianoAnnotatedTranscriptTests.cs │ │ ├── PianoTests.cs │ │ └── SimpleSequence.cs │ ├── Resources/ │ │ ├── ConflicitingEntries1000G.vcf │ │ ├── RefMinorAllele.vcf │ │ ├── Test1000GFile.vcf │ │ ├── TestCosmicParser.Coding.vcf │ │ ├── TestCosmicParser.NonCoding.vcf │ │ ├── TestCosmicParser.tsv │ │ ├── TestWigParser.wig │ │ ├── chr1.npd │ │ ├── chr1_10918_150000.npd │ │ ├── mini.WigFix │ │ ├── missingLastVariantHgmd.vcf │ │ ├── testClinGenUnifier.txt │ │ ├── tmpPopInfo.txt │ │ └── tmpSampleInfo.txt │ ├── UnitTests.csproj │ └── Utilities/ │ ├── ResourceUtilities.cs │ └── Resources.cs ├── SingleAnnotationLambda/ │ ├── CacheConfiguration.cs │ ├── CacheUtilities.cs │ ├── ExceptionHandler.cs │ ├── SingleAnnotationLambda.cs │ ├── SingleAnnotationLambda.csproj │ ├── SingleConfigExtensions.cs │ ├── SingleResult.cs │ └── SupplementaryAnnotationUtilities.cs ├── Tabix/ │ ├── AssemblyInfo.cs │ ├── BgzfBlockVcfReader.cs │ ├── BinUtilities.cs │ ├── Constants.cs │ ├── Index.cs │ ├── Interval.cs │ ├── Reader.cs │ ├── ReferenceIndex.cs │ ├── Search.cs │ ├── SearchUtilities.cs │ ├── Tabix.csproj │ └── VirtualPosition.cs ├── UnitTests/ │ ├── AnnotationLambda/ │ │ ├── AnnotationLambdaTests.cs │ │ └── S3UtilitiesTests.cs │ ├── CacheUtils/ │ │ ├── DataDumperImport/ │ │ │ ├── DataStructures/ │ │ │ │ └── Import/ │ │ │ │ └── ImportNodeExtensionsTests.cs │ │ │ ├── FauxRegex/ │ │ │ │ └── RegexDecisionTreeTests.cs │ │ │ ├── FileHandling/ │ │ │ │ └── DataDumperReaderTests.cs │ │ │ └── Import/ │ │ │ ├── ImportRegulatoryFeatureTests.cs │ │ │ └── ImportTranscriptTests.cs │ │ ├── Genes/ │ │ │ ├── Combiners/ │ │ │ │ ├── CombinerUtilsTests.cs │ │ │ │ ├── HgncIdCombinerTests.cs │ │ │ │ └── PartitionCombinerTests.cs │ │ │ ├── GeneFlattenerTests.cs │ │ │ └── Utilities/ │ │ │ └── DictionaryUtilitiesTests.cs │ │ ├── IO/ │ │ │ └── Caches/ │ │ │ └── TranscriptCacheWriterTests.cs │ │ ├── TranscriptCache/ │ │ │ └── TranscriptRegionMergerTests.cs │ │ └── Utilities/ │ │ ├── AccessionUtilitiesTests.cs │ │ └── RemoteFileTests.cs │ ├── Cloud/ │ │ ├── ConsistencyTests.cs │ │ ├── JsonUtilitiesTests.cs │ │ ├── LambdaUrlHelperTests.cs │ │ ├── RedactionUtilitiesTests.cs │ │ ├── S3PathTests.cs │ │ ├── SaUrlsTests.cs │ │ └── UploadUtilitiesTests.cs │ ├── CommandLine/ │ │ ├── Builders/ │ │ │ ├── ConsoleAppBuilderDataTests.cs │ │ │ ├── ConsoleAppBuilderTests.cs │ │ │ ├── TopLevelAppBuilderTests.cs │ │ │ └── ValidationExtensionsTests.cs │ │ ├── NDesk.Options/ │ │ │ ├── OptionContextTests.cs │ │ │ ├── OptionSetTests.cs │ │ │ └── OptionsTests.cs │ │ ├── Utilities/ │ │ │ ├── BenchmarkTests.cs │ │ │ └── MemoryUtilitiesTests.cs │ │ └── VersionProviders/ │ │ └── DefaultVersionProviderTests.cs │ ├── Compression/ │ │ ├── CompressionAlgorithmTests.cs │ │ ├── DataStructures/ │ │ │ └── BlockTests.cs │ │ ├── FileHandling/ │ │ │ ├── BgzipTextWriterTests.cs │ │ │ ├── BlockGZipStreamTests.cs │ │ │ ├── BlockHeaderTests.cs │ │ │ └── BlockStreamTests.cs │ │ └── Utilities/ │ │ ├── GZipUtilitiesTests.cs │ │ └── LibraryUtilitiesTests.cs │ ├── CustomAnnotationLambda/ │ │ └── CustomAnnotationConfigTests.cs │ ├── Downloader/ │ │ ├── AnnotationRepositoryTests.cs │ │ ├── ConfigurationTests.cs │ │ ├── FileExtensions/ │ │ │ ├── CacheFileExtensionsTests.cs │ │ │ ├── ReferencesFileExtensionTests.cs │ │ │ └── SupplementaryAnnotationFileExtensionsTests.cs │ │ ├── GenomeAssemblyHelperTests.cs │ │ ├── ManifestTests.cs │ │ └── RemoteFileComparer.cs │ ├── EndToEndTests.cs │ ├── ErrorHandling/ │ │ ├── ExceptionUtilitiesTests.cs │ │ ├── Exceptions/ │ │ │ └── ExceptionsTests.cs │ │ ├── ExitCodeUtilitiesTests.cs │ │ └── ExitCodesTests.cs │ ├── GeneAnnotationLambda/ │ │ ├── GeneAnnotationLambdaTests.cs │ │ └── GeneConfigTests.cs │ ├── Genome/ │ │ ├── ChromosomeIntervalTests.cs │ │ ├── CytogeneticBandTests.cs │ │ ├── EmptyChromosomeTests.cs │ │ ├── GenomeAssemblyHelperTests.cs │ │ ├── ReferenceNameUtilitiesTests.cs │ │ └── SequenceUtilitiesTests.cs │ ├── IO/ │ │ ├── BufferedBinaryReaderTests.cs │ │ ├── ExtendedBinaryReaderTests.cs │ │ ├── FilePathUtilitiesTests.cs │ │ ├── FileUtilitiesTests.cs │ │ ├── HttpUtilitiesTests.cs │ │ ├── LengthStreamTests.cs │ │ ├── MD5StreamTests.cs │ │ ├── PersistentStreamTests.cs │ │ └── UrlUtilitiesTests.cs │ ├── Intervals/ │ │ ├── IntervalArrayTests.cs │ │ ├── IntervalExtensionsTests.cs │ │ ├── IntervalForestTests.cs │ │ ├── IntervalOperationsTests.cs │ │ ├── NullIntervalSearchTests.cs │ │ └── OverlapTypeTests.cs │ ├── Jasix/ │ │ ├── IndexTests.cs │ │ ├── JasixFunctionalityTests.cs │ │ ├── JasixQueryProcessingTests.cs │ │ └── OtfIndexCreatorTests.cs │ ├── Jist/ │ │ └── JiSTtests.cs │ ├── MitoHeteroplasmy/ │ │ └── MitoHeteroplasmyProviderTests.cs │ ├── MockedData/ │ │ ├── Genes.cs │ │ ├── TranscriptRegions.cs │ │ ├── Transcripts.cs │ │ └── Translations.cs │ ├── Nirvana/ │ │ ├── AnnotationFilesTests.cs │ │ ├── PreLoadUtilitiesTests.cs │ │ └── ProviderUtilitiesTests.cs │ ├── NirvanaLambda/ │ │ ├── AnnotationJobTests.cs │ │ ├── NirvanaConfigTests.cs │ │ ├── NirvanaLambdaTests.cs │ │ └── PartitionUtilitiesTests.cs │ ├── OptimizedCore/ │ │ ├── StringBuilderCacheTests.cs │ │ └── StringExtensionsTests.cs │ ├── RepeatExpansions/ │ │ ├── MatcherTests.cs │ │ ├── PercentileUtilitiesTests.cs │ │ └── RepeatExpansionProviderTests.cs │ ├── Resources/ │ │ ├── COSM18152.tsv │ │ ├── COSM18152.vcf │ │ ├── COSM983708.tsv │ │ ├── COSM983708.vcf │ │ ├── ClinGen_Dosage_Sensitivity_Map_20190507.nga │ │ ├── ClinVarXmlFiles/ │ │ │ ├── RCV000000101.xml │ │ │ ├── RCV000000734.xml │ │ │ ├── RCV000001054.xml │ │ │ ├── RCV000001373.xml │ │ │ ├── RCV000001752.xml │ │ │ ├── RCV000003254.xml │ │ │ ├── RCV000005426.xml │ │ │ ├── RCV000007484.xml │ │ │ ├── RCV000010551.xml │ │ │ ├── RCV000016673.xml │ │ │ ├── RCV000017510.xml │ │ │ ├── RCV000021819.xml │ │ │ ├── RCV000030349.xml │ │ │ ├── RCV000032548.xml │ │ │ ├── RCV000032707.xml │ │ │ ├── RCV000038438.xml │ │ │ ├── RCV000050055.xml │ │ │ ├── RCV000073701.xml │ │ │ ├── RCV000077146.xml │ │ │ ├── RCV000080071.xml │ │ │ ├── RCV000083638.xml │ │ │ ├── RCV000087262.xml │ │ │ ├── RCV000112977.xml │ │ │ ├── RCV000113363.xml │ │ │ ├── RCV000120902.xml │ │ │ ├── RCV000124712.xml │ │ │ ├── RCV000144179.xml │ │ │ ├── RCV000152657.xml │ │ │ ├── RCV000153339.xml │ │ │ ├── RCV000167792.xml │ │ │ ├── RCV000169296.xml │ │ │ ├── RCV000170338.xml │ │ │ ├── RCV000171474.xml │ │ │ ├── RCV000179026.xml │ │ │ ├── RCV000194003.xml │ │ │ ├── RCV000203290.xml │ │ │ ├── RCV000205418.xml │ │ │ ├── RCV000207071.xml │ │ │ ├── RCV000207504.xml │ │ │ ├── RCV000235027.xml │ │ │ ├── RCV000267121.xml │ │ │ ├── RCV000342164.xml │ │ │ ├── RCV000373191.xml │ │ │ ├── RCV000401212.xml │ │ │ ├── RCV000406351.xml │ │ │ ├── RCV000435546.xml │ │ │ ├── RCV000485802.xml │ │ │ ├── RCV000537563.xml │ │ │ ├── RCV000724338.xml │ │ │ ├── Two_RCVs.xml │ │ │ └── VCVs/ │ │ │ ├── TwoRecords.xml │ │ │ ├── VCV000000081.xml │ │ │ ├── VCV000137106.xml │ │ │ ├── VCV000431749.xml │ │ │ ├── VCV000476472.xml │ │ │ └── VCV000618791.xml │ │ ├── Clinvar20150901.json.gz.jsi │ │ ├── EndToEnd/ │ │ │ └── GRCh37/ │ │ │ ├── chr12_7018490_7086889_Both.bases │ │ │ ├── chr12_7018490_7086889_Both.polyphen.ndb │ │ │ ├── chr12_7018490_7086889_Both.sift.ndb │ │ │ └── chr12_7018490_7086889_Both.transcripts.ndb │ │ ├── JasixTest.json.gz.jsi │ │ ├── Mother_chr22.genome.vcf.gz.tbi │ │ ├── OMIM_20190812.nga │ │ ├── SA/ │ │ │ ├── CosmicCNV.tsv │ │ │ └── MockSaFiles/ │ │ │ ├── not_sa.txt │ │ │ ├── sa1.nsa │ │ │ ├── sa1.nsa.idx │ │ │ ├── sa2.nsa │ │ │ ├── sa2.nsa.idx │ │ │ ├── sa3.nsi │ │ │ ├── sa4.nsi │ │ │ ├── sa5.npd │ │ │ ├── sa5.npd.idx │ │ │ ├── sa6.nga │ │ │ ├── sa7.nga │ │ │ ├── sa8.rma │ │ │ └── sa8.rma.idx │ │ ├── TinyAnnotated.json │ │ ├── clinvar.dict │ │ ├── cosm5428243.tsv │ │ ├── cosm5428243.vcf │ │ ├── cosmicv72.indels.json.gz.jsi │ │ ├── custom_gene.nga │ │ ├── dbSNP.version │ │ ├── gnomAD_gene_scores_2.1.nga │ │ ├── manifest.txt │ │ ├── mini.WigFix │ │ ├── miniHEXA_minimal.vcf.gz.tbi │ │ └── testDgvParser.txt │ ├── SAUtils/ │ │ ├── AnnotationItems/ │ │ │ └── CosmicCnvItemTests.cs │ │ ├── ClinGen/ │ │ │ └── GeneDiseaseValidityTests.cs │ │ ├── CosmicGeneFusions/ │ │ │ ├── Cache/ │ │ │ │ └── TranscriptCacheTests.cs │ │ │ ├── Conversion/ │ │ │ │ ├── CosmicConverterTests.cs │ │ │ │ ├── HgvsRnaFixerTests.cs │ │ │ │ ├── HgvsRnaParserTests.cs │ │ │ │ ├── HistologyTests.cs │ │ │ │ └── SiteTests.cs │ │ │ ├── CreateCosmicGeneFusionsTests.cs │ │ │ └── IO/ │ │ │ ├── CosmicGeneFusionParserTests.cs │ │ │ └── GeneFusionJsonWriterTests.cs │ │ ├── CustomAnnotations/ │ │ │ ├── AllowedValuesTests.cs │ │ │ ├── GeneAnnotationParserTests.cs │ │ │ ├── ParserUtilitiesTests.cs │ │ │ └── VariantAnnotationsParserTests.cs │ │ ├── Dann/ │ │ │ └── DannParserTests.cs │ │ ├── DataStructures/ │ │ │ └── CounterDictionaryTests.cs │ │ ├── DbVar/ │ │ │ ├── DosageMapRegionParserTests.cs │ │ │ └── DosageSensitivityParserTests.cs │ │ ├── FusionCatcher/ │ │ │ ├── FusionCatcherDataSourceTests.cs │ │ │ ├── GeneFusionSourceWriterTests.cs │ │ │ └── IndexBuilderTests.cs │ │ ├── GERP/ │ │ │ ├── GerpParserTests.cs │ │ │ └── GerpReaderTests.cs │ │ ├── GeneAnnotationsTest.cs │ │ ├── GenericScoreParserTests/ │ │ │ └── GenericScoreParserTests.cs │ │ ├── InputFileParsers/ │ │ │ ├── AlleleReaderTests.cs │ │ │ ├── ClinGenTests.cs │ │ │ ├── ClinVarXmlReaderTests.cs │ │ │ ├── ClinvarVariationParserTests.cs │ │ │ ├── CosmicCnvReaderTests.cs │ │ │ ├── CosmicItemTests.cs │ │ │ ├── DataSourceVersionTests.cs │ │ │ ├── DbSnpReaderTests.cs │ │ │ ├── DecipherReaderTest.cs │ │ │ ├── DgvReaderTests.cs │ │ │ ├── DgvTests.cs │ │ │ ├── GlobalMinorReaderTests.cs │ │ │ ├── GmeReaderTests.cs │ │ │ ├── MergedCosmicReaderTests.cs │ │ │ ├── OneKGenTests.cs │ │ │ ├── ParserTestUtils.cs │ │ │ ├── RefMinorTests.cs │ │ │ └── TopMedReaderTests.cs │ │ ├── MitoHeteroplasmy/ │ │ │ └── MitoHeteroplasmyTests.cs │ │ ├── MitoMap/ │ │ │ ├── MitoMapSvReaderTests.cs │ │ │ ├── MitoMapVariantReaderTests.cs │ │ │ └── ParsingUtilitiesTests.cs │ │ ├── NsaWriters/ │ │ │ ├── IntervalWriterReaderTests.cs │ │ │ ├── NsaUtilitiesTests.cs │ │ │ └── WriterReaderTests.cs │ │ ├── Omim/ │ │ │ └── OmimUtilitiesTests.cs │ │ ├── ParseUtils/ │ │ │ ├── SplitLineTests.cs │ │ │ └── TsvIndicesTests.cs │ │ ├── PhylopTests.cs │ │ ├── PrimateAi/ │ │ │ └── PrimateAiTests.cs │ │ ├── ProteinConservation/ │ │ │ └── ParserTests.cs │ │ ├── Revel/ │ │ │ └── RevelParserTests.cs │ │ ├── SaJsonSchemaTests.cs │ │ ├── SpliceAi/ │ │ │ └── SpliceAiTests.cs │ │ └── gnomAD/ │ │ ├── GnomadGeneScoreTests.cs │ │ ├── GnomadReaderTests.cs │ │ ├── GnomadSvBedParserTests.cs │ │ ├── GnomadSvItemTests.cs │ │ ├── GnomadSvTsvParserTests.cs │ │ └── LcrParserTests.cs │ ├── SingleAnnotationLambda/ │ │ ├── SingleConfigTests.cs │ │ └── SingleVariantTests.cs │ ├── Tabix/ │ │ ├── BgzfBlockVcfReaderTests.cs │ │ ├── BinUtilitiesTests.cs │ │ ├── ReaderTests.cs │ │ ├── SearchTests.cs │ │ ├── SearchTestsLocalMother.cs │ │ ├── SearchTestsRemoteMother.cs │ │ ├── SearchUtilitiesTests.cs │ │ └── VirtualPositionTests.cs │ ├── TestDataStructures/ │ │ └── SimpleSequence.cs │ ├── TestUtilities/ │ │ ├── AnnotationUtilities.cs │ │ ├── ByteUtilities.cs │ │ ├── ChromosomeUtilities.cs │ │ ├── RandomPath.cs │ │ ├── ResourceUtilities.cs │ │ ├── Resources.cs │ │ └── TestDataGenerator.cs │ ├── UnitTests.csproj │ ├── VariantAnnotation/ │ │ ├── Algorithms/ │ │ │ └── SwapTests.cs │ │ ├── AnnotatedPositions/ │ │ │ ├── AnnotatedPositionTests.cs │ │ │ ├── AnnotatedVariantTests.cs │ │ │ ├── ConsequenceTests.cs │ │ │ ├── HgvsCodingNomenclatureTests.cs │ │ │ ├── HgvsProteinNomenclatureTests.cs │ │ │ ├── HgvsUtilitiesTests.cs │ │ │ ├── HgvscNotationTests.cs │ │ │ ├── HgvsgNotationTests.cs │ │ │ ├── HgvspNotationTests.cs │ │ │ ├── ProteinChangeTests.cs │ │ │ ├── RegulatoryRegionAnnotatorTests.cs │ │ │ └── Transcript/ │ │ │ ├── AminoAcidTests.cs │ │ │ ├── AnnotatedTranscriptTests.cs │ │ │ ├── CdnaSequenceTests.cs │ │ │ ├── CodingSequenceTests.cs │ │ │ ├── CodonsTests.cs │ │ │ ├── CompactIdTests.cs │ │ │ ├── FeatureVariantEffectsTests.cs │ │ │ ├── MappedPositionUtilitiesTests.cs │ │ │ ├── StringExtensionsTests.cs │ │ │ ├── TranscriptPositionalEffectTests.cs │ │ │ ├── TranscriptUtilitiesTests.cs │ │ │ └── VariantEffectTests.cs │ │ ├── AnnotatorTests.cs │ │ ├── Caches/ │ │ │ ├── DataStructures/ │ │ │ │ ├── EncodedTranscriptDataTests.cs │ │ │ │ ├── GeneTests.cs │ │ │ │ ├── RegulatoryRegionTests.cs │ │ │ │ ├── TranscriptRegionExtensionsTests.cs │ │ │ │ ├── TranscriptRegionTests.cs │ │ │ │ ├── TranscriptTests.cs │ │ │ │ └── TranslationTests.cs │ │ │ ├── TranscriptCacheTests.cs │ │ │ └── Utilities/ │ │ │ ├── RnaEditUtilitiesTests.cs │ │ │ └── TranscriptUtilitiesTests.cs │ │ ├── GeneFusions/ │ │ │ ├── Calling/ │ │ │ │ └── GeneFusionCallerTests.cs │ │ │ ├── IO/ │ │ │ │ ├── GeneFusionIndexEntryTests.cs │ │ │ │ └── GeneFusionSourceReaderTests.cs │ │ │ ├── SA/ │ │ │ │ ├── GeneFusionPairTests.cs │ │ │ │ ├── GeneFusionSourceCollectionTests.cs │ │ │ │ └── GeneFusionSourceUtilitiesTests.cs │ │ │ └── Utilities/ │ │ │ ├── GeneFusionKeyTests.cs │ │ │ └── IndexEntryExtensionsTests.cs │ │ ├── IO/ │ │ │ ├── Caches/ │ │ │ │ ├── CacheConstantsTests.cs │ │ │ │ ├── CacheHeaderTests.cs │ │ │ │ └── TranscriptCacheReaderTests.cs │ │ │ ├── JsonObjectTests.cs │ │ │ ├── JsonWriterTests.cs │ │ │ └── SampleExtensionsTests.cs │ │ ├── NSA/ │ │ │ ├── NsaIndexTests.cs │ │ │ ├── RefMinorIndexTests.cs │ │ │ ├── SuppAnnotationsOutputTests.cs │ │ │ └── SuppIntervalUtilitiesTests.cs │ │ ├── ProviderTests/ │ │ │ ├── GsaProviderTests.cs │ │ │ ├── LcrProviderTests.cs │ │ │ └── NsaProviderTests.cs │ │ ├── ScoreFile/ │ │ │ ├── GenericScoreEncoderTests.cs │ │ │ ├── GenericScoreEndToEndTests.cs │ │ │ ├── GenericScoreTests.cs │ │ │ ├── HeaderTests.cs │ │ │ ├── ReaderSettingsTests.cs │ │ │ ├── SaItemValidatorTests.cs │ │ │ ├── ScoreEncoderTests.cs │ │ │ ├── ScoreIndexTests.cs │ │ │ └── ScoreJsonEncoderTests.cs │ │ ├── Sequence/ │ │ │ ├── CompressedSequenceReaderTests.cs │ │ │ └── CompressedSequenceTests.cs │ │ ├── TranscriptAnnotation/ │ │ │ ├── BreakEndUtilitiesTests.cs │ │ │ ├── FullTranscriptAnnotatorTests.cs │ │ │ └── TranscriptAnnotationFactoryTests.cs │ │ └── Utilities/ │ │ ├── DateTests.cs │ │ ├── FormatUtilitiesTests.cs │ │ └── GeneComparerTests.cs │ ├── Variants/ │ │ ├── BiDirectionalTrimmerTests.cs │ │ ├── SimpleVariantTests.cs │ │ ├── VariantRotatorTests.cs │ │ └── VariantTests.cs │ └── Vcf/ │ ├── Samples/ │ │ ├── BooleanExtensionsTests.cs │ │ ├── FormatIndicesTests.cs │ │ ├── Legacy/ │ │ │ └── LegacySampleFieldExtractorTests.cs │ │ ├── SampleFieldExtractorTests.cs │ │ ├── SampleParsingExtensionsTests.cs │ │ ├── SampleTests.cs │ │ ├── TestUtilities.cs │ │ └── VariantFrequencyTests.cs │ ├── StringExtensionsTests.cs │ ├── VariantCreator/ │ │ ├── CnvCreatorTests.cs │ │ ├── LegacyVariantIdTests.cs │ │ ├── ReferenceVariantCreatorTests.cs │ │ ├── SmallVariantCreatorTests.cs │ │ ├── VariantFactoryTests.cs │ │ ├── VariantFactoryTestsWithLegacyVids.cs │ │ └── VariantIdTests.cs │ ├── VcfFilterTests.cs │ ├── VcfInfoParserTests.cs │ ├── VcfReaderTests.cs │ └── VcfReaderUtilsTests.cs ├── VariantAnnotation/ │ ├── Algorithms/ │ │ └── Swap.cs │ ├── AnnotatedPositions/ │ │ ├── AnnotatedPosition.cs │ │ ├── AnnotatedRegulatoryRegion.cs │ │ ├── AnnotatedVariant.cs │ │ ├── Consequence/ │ │ │ └── Consequences.cs │ │ ├── HgvsCodingNomenclature.cs │ │ ├── HgvsProteinNomenclature.cs │ │ ├── HgvsUtilities.cs │ │ ├── HgvscNotation.cs │ │ ├── HgvsgNotation.cs │ │ ├── HgvspNotation.cs │ │ ├── PositionOffset.cs │ │ ├── RegulatoryRegionAnnotator.cs │ │ └── Transcript/ │ │ ├── AminoAcids.cs │ │ ├── AnnotatedConservationScore.cs │ │ ├── AnnotatedGeneFusion.cs │ │ ├── AnnotatedTranscript.cs │ │ ├── CdnaSequence.cs │ │ ├── CodingSequence.cs │ │ ├── Codons.cs │ │ ├── CompactId.cs │ │ ├── FeatureVariantEffects.cs │ │ ├── MappedPosition.cs │ │ ├── MappedPositionUtilities.cs │ │ ├── StringExtensions.cs │ │ ├── TranscriptPositionalEffect.cs │ │ ├── TranscriptUtilities.cs │ │ ├── VariantEffect.cs │ │ └── VariantEffectCache.cs │ ├── Annotator.cs │ ├── AssemblyInfo.cs │ ├── Caches/ │ │ ├── DataStructures/ │ │ │ ├── CodingRegion.cs │ │ │ ├── EncodedTranscriptData.cs │ │ │ ├── Gene.cs │ │ │ ├── IndexEntry.cs │ │ │ ├── Prediction.cs │ │ │ ├── RegulatoryRegion.cs │ │ │ ├── RnaEdit.cs │ │ │ ├── Transcript.cs │ │ │ ├── TranscriptRegion.cs │ │ │ ├── TranscriptRegionExtensions.cs │ │ │ └── Translation.cs │ │ ├── PredictionCache.cs │ │ ├── TranscriptCache.cs │ │ ├── TranscriptCacheData.cs │ │ ├── TranscriptIntervalForestExtensions.cs │ │ └── Utilities/ │ │ ├── ExonUtilities.cs │ │ ├── GeneForestGenerator.cs │ │ └── RnaEditUtilities.cs │ ├── GeneAnnotation/ │ │ └── GeneAnnotationProvider.cs │ ├── GeneFusions/ │ │ ├── Calling/ │ │ │ ├── BreakEndAdjacency.cs │ │ │ ├── BreakEndAdjacencyFactory.cs │ │ │ ├── BreakPoint.cs │ │ │ ├── BreakPointTranscript.cs │ │ │ └── GeneFusionCaller.cs │ │ ├── HGVS/ │ │ │ └── HgvsRnaNomenclature.cs │ │ ├── IO/ │ │ │ ├── GeneFusionIndexEntry.cs │ │ │ ├── GeneFusionJsonReader.cs │ │ │ ├── GeneFusionSourceReader.cs │ │ │ └── IGeneFusionSaReader.cs │ │ ├── SA/ │ │ │ ├── GeneFusionPair.cs │ │ │ ├── GeneFusionSource.cs │ │ │ ├── GeneFusionSourceCollection.cs │ │ │ └── GeneFusionSourceUtilities.cs │ │ └── Utilities/ │ │ ├── GeneFusionKey.cs │ │ └── IndexEntryExtensions.cs │ ├── GenericScore/ │ │ ├── ChromosomeBlock.cs │ │ ├── EncoderType.cs │ │ ├── GenericScoreEncoder.cs │ │ ├── IScoreEncoder.cs │ │ ├── MetaData.cs │ │ ├── ReaderSettings.cs │ │ ├── ScoreBlock.cs │ │ ├── ScoreIndex.cs │ │ ├── ScoreIndexBlock.cs │ │ ├── ScoreJsonEncoder.cs │ │ ├── ScoreReader.cs │ │ └── ZeroToOneScoreEncoder.cs │ ├── IO/ │ │ ├── Caches/ │ │ │ ├── CacheHeader.cs │ │ │ ├── Header.cs │ │ │ ├── PredictionCacheCustomHeader.cs │ │ │ ├── PredictionCacheReader.cs │ │ │ ├── PredictionHeader.cs │ │ │ ├── TranscriptCacheCustomHeader.cs │ │ │ └── TranscriptCacheReader.cs │ │ ├── IntervalExtensions.cs │ │ ├── JsonCommon.cs │ │ ├── JsonObject.cs │ │ ├── JsonWriter.cs │ │ └── SampleExtensions.cs │ ├── NSA/ │ │ ├── NgaReader.cs │ │ ├── NsaBlock.cs │ │ ├── NsaIndex.cs │ │ ├── NsaIndexBlock.cs │ │ ├── NsaReader.cs │ │ ├── NsiReader.cs │ │ ├── NsiWriter.cs │ │ ├── RefMinorDbReader.cs │ │ ├── RefMinorIndex.cs │ │ ├── SuppInterval.cs │ │ ├── SuppIntervalUtilities.cs │ │ └── SupplementaryAnnotation.cs │ ├── PerformanceMetrics.cs │ ├── PhyloP/ │ │ ├── NpdIndex.cs │ │ └── NpdReader.cs │ ├── Pools/ │ │ ├── AnnotatedPositionPool.cs │ │ ├── AnnotatedTranscriptPool.cs │ │ ├── AnnotatedVariantPool.cs │ │ └── VariantPool.cs │ ├── ProteinConservation/ │ │ ├── ProteinConservationCommon.cs │ │ ├── ProteinConservationItem.cs │ │ ├── ProteinConservationReader.cs │ │ └── TranscriptConservationScores.cs │ ├── Providers/ │ │ ├── ConservationScoreProvider.cs │ │ ├── DataSourceVersion.cs │ │ ├── LcrProvider.cs │ │ ├── NsaProvider.cs │ │ ├── ProteinConservationProvider.cs │ │ ├── RefMinorProvider.cs │ │ ├── ReferenceSequenceProvider.cs │ │ ├── ScoreProvider.cs │ │ ├── TranscriptAnnotationProvider.cs │ │ └── VersionProvider.cs │ ├── SA/ │ │ ├── CustomAnnotationCategories.cs │ │ ├── JsonDataType.cs │ │ └── SaCommon.cs │ ├── TranscriptAnnotation/ │ │ ├── FlankingTranscriptAnnotator.cs │ │ ├── FullTranscriptAnnotator.cs │ │ ├── ReducedTranscriptAnnotator.cs │ │ ├── RohTranscriptAnnotator.cs │ │ ├── SequenceChange.cs │ │ └── TranscriptAnnotationFactory.cs │ ├── Utilities/ │ │ ├── BaseFormatting.cs │ │ ├── Date.cs │ │ ├── FormatUtilities.cs │ │ └── GeneComparer.cs │ └── VariantAnnotation.csproj ├── VariantAnnotation.Interface/ │ ├── AnnotatedPositions/ │ │ ├── BioType.cs │ │ ├── ConsequenceTag.cs │ │ ├── IAnnotatedGeneFusion.cs │ │ ├── IAnnotatedPosition.cs │ │ ├── IAnnotatedRegulatoryRegion.cs │ │ ├── IAnnotatedTranscript.cs │ │ ├── IAnnotatedVariant.cs │ │ ├── ICodingRegion.cs │ │ ├── ICompactId.cs │ │ ├── IFeatureVariantEffects.cs │ │ ├── IGene.cs │ │ ├── IGeneFusion.cs │ │ ├── IGeneFusionPair.cs │ │ ├── IMappedPosition.cs │ │ ├── IRegulatoryRegion.cs │ │ ├── IRnaEdit.cs │ │ ├── ITranscript.cs │ │ ├── ITranscriptRegion.cs │ │ ├── ITranslation.cs │ │ ├── IVariantEffect.cs │ │ └── PredictionScore.cs │ ├── Caches/ │ │ ├── IPredictionCache.cs │ │ ├── ITranscriptCache.cs │ │ └── RegulatoryRegionType.cs │ ├── Constants.cs │ ├── GeneAnnotation/ │ │ └── IGeneAnnotationProvider.cs │ ├── IAnnotationResources.cs │ ├── IAnnotator.cs │ ├── IO/ │ │ ├── IJsonSerializer.cs │ │ ├── IJsonWriter.cs │ │ ├── IVcfReader.cs │ │ └── VcfCommon.cs │ ├── IVariantIdCreator.cs │ ├── Positions/ │ │ ├── ICustomFields.cs │ │ ├── IInfoData.cs │ │ ├── IPosition.cs │ │ ├── ISample.cs │ │ └── ISimplePosition.cs │ ├── Providers/ │ │ ├── IAnnotationProvider.cs │ │ ├── IDataSourceVersion.cs │ │ ├── IMitoHeteroplasmyProvider.cs │ │ ├── IProvider.cs │ │ ├── IRefMinorProvider.cs │ │ ├── ISequenceProvider.cs │ │ └── ITranscriptAnnotationProvider.cs │ ├── SA/ │ │ ├── INsaReader.cs │ │ ├── INsiReader.cs │ │ ├── ISaMetadata.cs │ │ ├── ISupplementaryAnnotation.cs │ │ ├── ISupplementaryDataItem.cs │ │ ├── ISupplementaryInterval.cs │ │ ├── IsuppGeneItem.cs │ │ └── IsuppIntervalItem.cs │ ├── VariantAnnotation.Interface.csproj │ └── VariantCategory.cs ├── Variants/ │ ├── AnnotationBehavior.cs │ ├── BiDirectionalTrimmer.cs │ ├── ISimpleVariant.cs │ ├── IVariant.cs │ ├── RepeatExpansion.cs │ ├── SimpleVariant.cs │ ├── Variant.cs │ ├── VariantRotator.cs │ ├── VariantType.cs │ ├── VariantUtils.cs │ └── Variants.csproj └── Vcf/ ├── AssemblyInfo.cs ├── IVcfFilter.cs ├── Info/ │ ├── CustomFields.cs │ ├── InfoData.cs │ └── VcfInfoParser.cs ├── NullVcfFilter.cs ├── Position.cs ├── PositionPool.cs ├── Sample/ │ ├── BooleanExtensions.cs │ ├── FormatIndices.cs │ ├── Legacy/ │ │ ├── AlleleDepths.cs │ │ ├── FailedFilter.cs │ │ ├── Genotype.cs │ │ ├── GenotypeQuality.cs │ │ ├── IntermediateSampleFields.cs │ │ ├── LegacySampleFieldExtractor.cs │ │ ├── LegacyVariantFrequency.cs │ │ ├── ReadCounts.cs │ │ └── TotalDepth.cs │ ├── Sample.cs │ ├── SampleFieldExtractor.cs │ ├── SampleParsingExtensions.cs │ └── VariantFrequency.cs ├── SimplePosition.cs ├── StringExtensions.cs ├── VariantCreator/ │ ├── CnvCreator.cs │ ├── LegacyVariantId.cs │ ├── ReferenceVariantCreator.cs │ ├── RepeatExpansionCreator.cs │ ├── RohVariantCreator.cs │ ├── SmallVariantCreator.cs │ ├── StructuralVariantCreator.cs │ ├── VariantFactory.cs │ └── VariantId.cs ├── Vcf.csproj ├── VcfFilter.cs └── VcfReader.cs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ *.dat binary ================================================ FILE: .gitignore ================================================ ## Ignore Visual Studio temporary files, build results, and ## files generated by popular Visual Studio add-ons. # User-specific files *.suo *.user *.userosscache *.sln.docstates aws-lambda*.json # User-specific files (MonoDevelop/Xamarin Studio) *.userprefs # Build results [Dd]ebug/ [Dd]ebugPublic/ [Rr]elease/ [Rr]eleases/ x64/ x86/ bld/ [Bb]in/ [Oo]bj/ [Ll]og/ # Visual Studio 2015 cache/options directory .vs/ # Uncomment if you have tasks that create the project's static files in wwwroot #wwwroot/ # MSTest test Results [Tt]est[Rr]esult*/ [Bb]uild[Ll]og.* # NUNIT *.VisualState.xml TestResult.xml # Build Results of an ATL Project [Dd]ebugPS/ [Rr]eleasePS/ dlldata.c # DNX project.lock.json project.fragment.lock.json artifacts/ *_i.c *_p.c *_i.h *.ilk *.meta *.obj *.pch *.pdb *.pgc *.pgd *.rsp *.sbr *.tlb *.tli *.tlh *.tmp *.tmp_proj *.log *.vspscc *.vssscc .builds *.pidb *.svclog *.scc # Chutzpah Test files _Chutzpah* # Visual C++ cache files ipch/ *.aps *.ncb *.opendb *.opensdf *.sdf *.cachefile *.VC.db *.VC.VC.opendb # Visual Studio profiler *.psess *.vsp *.vspx *.sap # TFS 2012 Local Workspace $tf/ # Guidance Automation Toolkit *.gpState # ReSharper is a .NET coding add-in _ReSharper*/ *.[Rr]e[Ss]harper *.DotSettings.user # JustCode is a .NET coding add-in .JustCode # TeamCity is a build add-in _TeamCity* # DotCover is a Code Coverage Tool *.dotCover # NCrunch _NCrunch_* .*crunch*.local.xml nCrunchTemp_* # MightyMoose *.mm.* AutoTest.Net/ # Web workbench (sass) .sass-cache/ # Installshield output folder [Ee]xpress/ # DocProject is a documentation generator add-in DocProject/buildhelp/ DocProject/Help/*.HxT DocProject/Help/*.HxC DocProject/Help/*.hhc DocProject/Help/*.hhk DocProject/Help/*.hhp DocProject/Help/Html2 DocProject/Help/html # Click-Once directory publish/ # Publish Web Output *.[Pp]ublish.xml *.azurePubxml # TODO: Comment the next line if you want to checkin your web deploy settings # but database connection strings (with potential passwords) will be unencrypted #*.pubxml *.publishproj # Microsoft Azure Web App publish settings. Comment the next line if you want to # checkin your Azure Web App publish settings, but sensitive information contained # in these scripts will be unencrypted PublishScripts/ # NuGet Packages *.nupkg # The packages folder can be ignored because of Package Restore **/packages/* # except build/, which is used as an MSBuild target. !**/packages/build/ # Uncomment if necessary however generally it will be regenerated when needed #!**/packages/repositories.config # NuGet v3's project.json files produces more ignoreable files *.nuget.props *.nuget.targets # Microsoft Azure Build Output csx/ *.build.csdef # Microsoft Azure Emulator ecf/ rcf/ # Windows Store app package directories and files AppPackages/ BundleArtifacts/ Package.StoreAssociation.xml _pkginfo.txt # Visual Studio cache files # files ending in .cache can be ignored *.[Cc]ache # but keep track of directories ending in .cache !*.[Cc]ache/ # Others ClientBin/ ~$* *~ *.dbmdl *.dbproj.schemaview *.jfm *.pfx *.publishsettings node_modules/ orleans.codegen.cs # Since there are multiple workflows, uncomment next line to ignore bower_components # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) #bower_components/ # RIA/Silverlight projects Generated_Code/ # Backup & report files from converting an old project file # to a newer Visual Studio version. Backup files are not needed, # because we have git ;-) _UpgradeReport_Files/ Backup*/ UpgradeLog*.XML UpgradeLog*.htm # SQL Server files *.mdf *.ldf # Business Intelligence projects *.rdl.data *.bim.layout *.bim_*.settings # Microsoft Fakes FakesAssemblies/ # GhostDoc plugin setting file *.GhostDoc.xml # Node.js Tools for Visual Studio .ntvs_analysis.dat # Visual Studio 6 build log *.plg # Visual Studio 6 workspace options file *.opt # Visual Studio LightSwitch build output **/*.HTMLClient/GeneratedArtifacts **/*.DesktopClient/GeneratedArtifacts **/*.DesktopClient/ModelManifest.xml **/*.Server/GeneratedArtifacts **/*.Server/ModelManifest.xml _Pvt_Extensions # Paket dependency manager .paket/paket.exe paket-files/ # FAKE - F# Make .fake/ # JetBrains Rider .idea/ *.sln.iml # CodeRush .cr/ # Python Tools for Visual Studio (PTVS) __pycache__/ *.pyc ================================================ FILE: AnnotationLambda/AnnotationLambda.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Security.Cryptography; using System.Text; using Amazon.Lambda.Core; using Cloud; using Cloud.Messages.Annotation; using Cloud.Notifications; using Cloud.Utilities; using CommandLine.Utilities; using Compression.FileHandling; using ErrorHandling; using Genome; using IO; using Nirvana; using Vcf; using Tabix; using VariantAnnotation; using VariantAnnotation.SA; [assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.Json.JsonSerializer))] namespace AnnotationLambda { // ReSharper disable once UnusedMember.Global // ReSharper disable once ClassNeverInstantiated.Global public sealed class AnnotationLambda { // ReSharper disable once UnusedMember.Global public AnnotationResult Run(AnnotationConfig config, ILambdaContext context) { var result = new AnnotationResult { id = config.id }; string snsTopicArn = null; var runLog = new StringBuilder(); try { LogUtilities.UpdateLogger(context.Logger, runLog); LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion); LogUtilities.LogObject("Config", config); LogUtilities.Log(new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey }); LambdaUtilities.GarbageCollect(); LambdaUtilities.DeleteTempOutput(); snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey); string vcfUrl = config.vcfUrl; int variantCount = 0; using (var annotationResources = GetAnnotationResources(config)) { if (annotationResources.InputStartVirtualPosition == -1) return GetSuccessOutput(result); long fileOffset = VirtualPosition.From(annotationResources.InputStartVirtualPosition).FileOffset; using (var preloadVcfStream = PersistentStreamUtils.GetReadStream(vcfUrl, fileOffset)) { var annotationRange = config.annotationRange?.ToGenomicRange(annotationResources.SequenceProvider.RefNameToChromosome); annotationResources.GetVariantPositions(new BlockGZipStream(preloadVcfStream, CompressionMode.Decompress), annotationRange); } Logger.WriteLine("Scan for positions to preload complete."); using (var aes = new AesCryptoServiceProvider()) { FileMetadata jsonMetadata, jasixMetadata; string jsonPath = Path.GetTempPath() + LambdaUrlHelper.JsonSuffix; string jasixPath = jsonPath + LambdaUrlHelper.JsonIndexSuffix; using (var inputVcfStream = new BlockGZipStream(PersistentStreamUtils.GetReadStream(vcfUrl, fileOffset), CompressionMode.Decompress)) using (var headerStream = config.annotationRange == null ? null : new BlockGZipStream(PersistentStreamUtils.GetReadStream(vcfUrl), CompressionMode.Decompress)) // using (var jsonFileStream = FileUtilities.GetCreateStream(jsonPath)) using (var jsonCryptoStream = new CryptoStream(jsonFileStream, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var jsonMd5Stream = new MD5Stream(jsonCryptoStream)) // using (var jasixFileStream = FileUtilities.GetCreateStream(jasixPath)) using (var jasixCryptoStream = new CryptoStream(jasixFileStream, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var jasixMd5Stream = new MD5Stream(jasixCryptoStream)) { IVcfFilter vcfFilter = config.annotationRange == null ? new NullVcfFilter() : new VcfFilter(config.annotationRange.ToGenomicRange(annotationResources.SequenceProvider.RefNameToChromosome)); using (var jsonCompressStream = new BlockGZipStream(jsonMd5Stream, CompressionMode.Compress)) { variantCount = StreamAnnotation.Annotate(headerStream, inputVcfStream, jsonCompressStream, jasixMd5Stream, annotationResources, vcfFilter, true, false, config.desiredVcfInfo == null? null: new HashSet(config.desiredVcfInfo), config.desiredVcfSampleInfo == null? null: new HashSet(config.desiredVcfSampleInfo)).variantCount; } Logger.WriteLine("Annotation done."); jsonMetadata = jsonMd5Stream.GetFileMetadata(); jasixMetadata = jasixMd5Stream.GetFileMetadata(); } result.filePath = S3Utilities.GetKey(config.outputDir.path, config.outputPrefix + LambdaUrlHelper.JsonSuffix); string jasixKey = result.filePath + LambdaUrlHelper.JsonIndexSuffix; var s3Client = config.outputDir.GetS3Client(context.RemainingTime); s3Client.DecryptUpload(config.outputDir.bucketName, jasixKey, jasixPath, aes, jasixMetadata); s3Client.DecryptUpload(config.outputDir.bucketName, result.filePath, jsonPath, aes, jsonMetadata); Logger.WriteLine("Nirvana result files uploaded."); } } LambdaUtilities.DeleteTempOutput(); if (string.IsNullOrEmpty(result.filePath)) throw new FileNotFoundException(); result.variantCount = variantCount; return GetSuccessOutput(result); } catch (Exception exception) { LambdaUtilities.DeleteTempOutput(); return HandleException(runLog, result, exception, snsTopicArn); } } private static AnnotationResult GetSuccessOutput(AnnotationResult result) { result.status = LambdaUtilities.SuccessMessage; LogUtilities.LogObject("Result", result); return result; } private static AnnotationResult HandleException(StringBuilder runLog, AnnotationResult result, Exception e, string snsTopicArn) { Logger.Log(e); result.status = e.Message; result.errorCategory = ExceptionUtilities.ExceptionToErrorCategory(e); Logger.WriteLine($"Error Category: {result.errorCategory}"); if (result.errorCategory != ErrorCategory.UserError) { string snsMessage = SNS.CreateMessage(runLog.ToString(), result.status, e.StackTrace); SNS.SendMessage(snsTopicArn, snsMessage); } LogUtilities.LogObject("Result", result); return result; } internal static long GetTabixVirtualPosition(AnnotationRange annotationRange, Stream stream, Dictionary refNameToChromosome) { // process the entire file if no range specified if (annotationRange == null) return 0; var tabixIndex = Reader.GetTabixIndex(stream, refNameToChromosome); return tabixIndex.GetOffset(annotationRange.Start.Chromosome, annotationRange.Start.Position); } private static AnnotationResources GetAnnotationResources(AnnotationConfig annotationConfig) { var genomeAssembly = GenomeAssemblyHelper.Convert(annotationConfig.genomeAssembly); string cachePathPrefix = LambdaUrlHelper.GetCacheFolder().UrlCombine(genomeAssembly.ToString()).UrlCombine(LambdaUrlHelper.DefaultCacheSource); string nirvanaS3Ref = LambdaUrlHelper.GetRefUrl(genomeAssembly); // SaVersion will be provided as an environment variable. Defaults to "latest" string saVersion = Environment.GetEnvironmentVariable("SaVersion"); string saManifestUrl = LambdaUtilities.GetManifestUrl(string.IsNullOrEmpty(saVersion)? "latest": saVersion, genomeAssembly, SaCommon.SchemaVersion); var metrics = new PerformanceMetrics(); var annotationResources = new AnnotationResources(nirvanaS3Ref, cachePathPrefix, saManifestUrl == null? null: new List {saManifestUrl}, annotationConfig.customAnnotations, annotationConfig.customStrUrl, false, false, metrics); using (var tabixStream = PersistentStreamUtils.GetReadStream(annotationConfig.tabixUrl)) { annotationResources.InputStartVirtualPosition = GetTabixVirtualPosition(annotationConfig.annotationRange, tabixStream, annotationResources.SequenceProvider.RefNameToChromosome); } Logger.WriteLine($"Tabix position :{annotationResources.InputStartVirtualPosition}"); return annotationResources; } } } ================================================ FILE: AnnotationLambda/AnnotationLambda.csproj ================================================  net6.0 true Lambda bin\$(Configuration) ================================================ FILE: AnnotationLambda/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: AnnotationLambda/S3Utilities.cs ================================================ namespace AnnotationLambda { public static class S3Utilities { public static string GetKey(string outputDir, string filename) { outputDir = outputDir?.Trim('/'); if (string.IsNullOrEmpty(outputDir)) return filename; return outputDir + '/' + filename; } } } ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at mstromberg@illumina.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] [homepage]: http://contributor-covenant.org [version]: http://contributor-covenant.org/version/1/4/ ================================================ FILE: CONTRIBUTING.md ================================================ This guide provides: * protocols for contributing new features or bug fixes * high-level information about our development process Information is added as pertinent questions/discussions come up in the contributor community, so this guide is not intended to provide complete coverage of the above topics. # Table of Contents * [Scrum (agile development) ](#scrum-agile-development) * [Developer environment](#developer-environment) * [Coding conventions](#coding-conventions) * [Branching model](#branching-model) * [Unit testing](#unit-testing) * [Continuous integration](#continuous-integration) * [Portability](#portability) # Scrum (agile development) The development team uses Scrum agile development methodology. Our sprints are two weeks long and consistent of the four key ceremonies: * sprint planning * daily stand-ups * sprint retrospective * sprint review For external developers interested in contributing to the project, we would be happy to invite you to these ceremonies. Please contact any of the team members and we'll make the necessary arrangements. # Developer environment ## IDE The development team is using Microsoft Visual Studio 2015 to develop Nirvana. Developers could in theory choose to use other C# IDEs such as [MonoDevelop](http://www.monodevelop.com/), [SharpDevelop](https://sourceforge.net/projects/sharpdevelop/), or [Project Rider](https://www.jetbrains.com/rider/). However, we have not evaluated those IDEs at the moment. ## Extensions

JetBrains makes an incredible Visual Studio extension called [ReSharper](https://www.jetbrains.com/resharper/). No other tool comes as close to helping developers produce clean C# code while offering powerful functionality to make refactoring a breeze. For our internal development team, we require the use of ReSharper. # Coding conventions We use the same coding conventions (naming, layout, and commenting conventions) as is used in Microsoft's [C# Coding Conventions Guide](https://msdn.microsoft.com/en-us/library/ff926074.aspx). The only exception to this is the variable naming scheme that ReSharper suggests (i.e. private class variables should begin with an underscore). Here's a small example class that demonstrates most of these conventions: ```C# using System; using System.Collections.Generic; namespace Demo { public class Fibonacci { #region members private readonly List _fibonacciSeries; public readonly string Description; #endregion /// /// constructor /// public Fibonacci(string description, int numValues) { Description = description; _fibonacciSeries = new List(numValues); Calculate(numValues); } /// /// iteratively calculates the first n values of the Fibonacci series /// private void Calculate(int numValues) { int a = 1, b = 1; _fibonacciSeries.Add(a); _fibonacciSeries.Add(b); for (int i = 2; i < numValues; i++) { int sum = a + b; _fibonacciSeries.Add(sum); a = b; b = sum; } } /// /// displays all the calculated values of our fibonacci series /// public void Display() { Console.WriteLine($"{Description}:"); foreach(var value in _fibonacciSeries) Console.Write($"{value} "); Console.WriteLine(); } /// /// displays the nth calculated value of our fibonacci series /// public void Display(int index) { if ((index < 1) || (index > _fibonacciSeries.Count)) { throw new ArgumentOutOfRangeException(nameof(index)); } Console.WriteLine($"{Description}: {_fibonacciSeries[index - 1]}"); } } } ``` # Branching model The development team uses [GitFlow](http://nvie.com/posts/a-successful-git-branching-model/) to organize all of our branches. ## Feature branches In essence, all of our day-to-day work is on the **develop branch**. When work begins on a new **story** or **bug fix**, we will create a feature branch from the develop branch. When work on the feature branch has been completed, a **pull request is required** before it can be merged back to the develop branch. When the feature has finished development, we typically go through the following steps: 1. pull the latest develop branch 1. merge the develop branch to the feature branch 1. ensure that all unit tests pass 1. ensure that all regression and integration tests pass (internal developers only) 1. create a pull request 1. once approved, merge the feature branch to the develop branch Internal developers will also check the status of the Jenkins integration and regression tests before merging a feature branch back. ### Naming Our feature branch names obey the following convention: ``` features/short_description_1234 bugfixes/short_description_1234 ``` All feature branches are prefixed by either **features/** or **bugfixes/**. This naming scheme is exploited by our continuous integration framework. The number 1234 is used as a convenience to hold our JIRA ID (external developers are not required to add a numerical identifier). ## Builds and releases When we're ready to issue a new build, the develop branch is merged to the **master branch** and an **annotated tag** is added to the master branch. ``` git tag -a v1.4.3 -m "Nirvana 1.4.3" git push origin v1.4.3 ``` ## Release and hotfix branches Our team typically creates releases and hotfix branches for internal projects. As such, they will only be visible on our internal GitHub Enterprise server. # Unit testing Our team strives to have high unit test code coverage of all Nirvana code. Currently, the code coverage of the Illumina.VariantAnnotation library is around **82%** and we aspire to increase that to 90% or greater within the next few months. We prefer using a [TDD methodology](https://en.wikipedia.org/wiki/Test-driven_development), but we are not forcing developers to use it at this time. TDD has had a measured effect on improving our code quality. Any time our continuous integration pipeline shows an annotation that deviates from the baseline, we create a unit test to demonstrate the correct behavior and to ensure that future regressions do not occur.
# Continuous integration At Illumina, we have developed an extensive testing framework on top of the [Jenkins continuous integration framework](https://jenkins.io/). During our daily stand-ups, we check the status of every field in every variant for a few dozen data sets against the baseline. This translates to 100's of millions of variants (or billions of annotation fields) being checked on a daily basis. Unfortunately, our Jenkins servers sits behind our corporate firewall at the moment; but here's a snapshot of the information provided by our CI framework. We run a full set of smoke tests on every git commit on the develop branch. Developers can trigger both smoke and regression tests on any of the branches:
For each smoke or regression test, our testing framework provides a wealth of information for each input VCF file:
In some cases, deviations from our baseline are found. When this happens, we add it as a bug in our JIRA project and prioritize it accordingly in our backlog until it's ready to committed for a sprint:

# Portability While development is mainly performed in a Windows environment, Nirvana is expected to run on multiple platforms (Windows and Linux) reliably. We test Nirvana on a daily basis on both platforms. ================================================ FILE: CacheUtils/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: CacheUtils/BuildCache.sh ================================================ #!/bin/sh # ============= # configuration # ============= DOTNET=dotnet RELEASE_DIR=/d/Projects/NirvanaCacheUtils/bin/Release/netcoreapp2.0 CACHE_UTILS=$RELEASE_DIR/CacheUtils.dll VEP_VERSION=90 CACHE_VERSION=25 DATA_ROOT=/e/Data/Nirvana INTERMEDIATE_CACHE_DIR=$DATA_ROOT/IntermediateCache/$VEP_VERSION CACHE_DIR=$DATA_ROOT/Cache/$CACHE_VERSION REFERENCE_DIR=$DATA_ROOT/References/5 ENSEMBL37_TRANSCRIPT_PATH=$INTERMEDIATE_CACHE_DIR/Ensembl${VEP_VERSION}_GRCh37.transcripts.gz ENSEMBL38_TRANSCRIPT_PATH=$INTERMEDIATE_CACHE_DIR/Ensembl${VEP_VERSION}_GRCh38.transcripts.gz REFSEQ37_TRANSCRIPT_PATH=$INTERMEDIATE_CACHE_DIR/RefSeq${VEP_VERSION}_GRCh37.transcripts.gz REFSEQ38_TRANSCRIPT_PATH=$INTERMEDIATE_CACHE_DIR/RefSeq${VEP_VERSION}_GRCh38.transcripts.gz ENSEMBL37_CACHE_PATH=$CACHE_DIR/GRCh37/Ensembl${VEP_VERSION}.transcripts.ndb ENSEMBL38_CACHE_PATH=$CACHE_DIR/GRCh38/Ensembl${VEP_VERSION}.transcripts.ndb REFSEQ37_CACHE_PATH=$CACHE_DIR/GRCh37/RefSeq${VEP_VERSION}.transcripts.ndb REFSEQ38_CACHE_PATH=$CACHE_DIR/GRCh38/RefSeq${VEP_VERSION}.transcripts.ndb ENSEMBL38_TRANSCRIPT_PATH=$INTERMEDIATE_CACHE_DIR/Ensembl${VEP_VERSION}_GRCh38.transcripts.gz REFSEQ37_TRANSCRIPT_PATH=$INTERMEDIATE_CACHE_DIR/RefSeq${VEP_VERSION}_GRCh37.transcripts.gz REFSEQ38_TRANSCRIPT_PATH=$INTERMEDIATE_CACHE_DIR/RefSeq${VEP_VERSION}_GRCh38.transcripts.gz ENSEMBL37_URL="ftp://ftp.ensembl.org/pub/release-${VEP_VERSION}/variation/VEP/homo_sapiens_vep_${VEP_VERSION}_GRCh37.tar.gz" ENSEMBL38_URL="ftp://ftp.ensembl.org/pub/release-${VEP_VERSION}/variation/VEP/homo_sapiens_vep_${VEP_VERSION}_GRCh38.tar.gz" REFSEQ37_URL="ftp://ftp.ensembl.org/pub/release-${VEP_VERSION}/variation/VEP/homo_sapiens_refseq_vep_${VEP_VERSION}_GRCh37.tar.gz" REFSEQ38_URL="ftp://ftp.ensembl.org/pub/release-${VEP_VERSION}/variation/VEP/homo_sapiens_refseq_vep_${VEP_VERSION}_GRCh38.tar.gz" # ========= # functions # ========= CreateCache() { GA=$1 TS=$2 $DOTNET $CACHE_UTILS create -i $INTERMEDIATE_CACHE_DIR/${TS}${VEP_VERSION}_${GA} -r $REFERENCE_DIR/Homo_sapiens.${GA}.Nirvana.dat -o $CACHE_DIR/${GA}/${TS}${VEP_VERSION} if [ ! $? -eq 0 ]; then echo "ERROR: Unable to generate the cache successfully (Genome assembly: ${GA}, transcript source: ${TS})" exit 1 fi } export -f CreateCache # ============= # main workflow # ============= # download all the required files for building the cache $DOTNET $CACHE_UTILS download # create the intermediate cache files for each configuration # if [ ! -f ENSEMBL37_TRANSCRIPT_PATH ] # then # echo "Not implemented yet." # exit 1 # fi # if [ ! -f ENSEMBL38_TRANSCRIPT_PATH ] # then # echo "Not implemented yet." # exit 1 # fi # if [ ! -f REFSEQ37_TRANSCRIPT_PATH ] # then # echo "Not implemented yet." # exit 1 # fi # if [ ! -f REFSEQ38_TRANSCRIPT_PATH ] # then # echo "Not implemented yet." # exit 1 # fi # create the universal gene archive $DOTNET $CACHE_UTILS gene -r $REFERENCE_DIR -i $INTERMEDIATE_CACHE_DIR # create the actual cache files CACHE_LIST="" if [ ! -f ENSEMBL37_CACHE_PATH ] then CACHE_LIST="$CACHE_LIST GRCh37 Ensembl" fi if [ ! -f ENSEMBL38_CACHE_PATH ] then CACHE_LIST="$CACHE_LIST GRCh38 Ensembl" fi if [ ! -f REFSEQ37_CACHE_PATH ] then CACHE_LIST="$CACHE_LIST GRCh37 RefSeq" fi if [ ! -f REFSEQ38_CACHE_PATH ] then CACHE_LIST="$CACHE_LIST GRCh38 RefSeq" fi if [ ! -z "$CACHE_LIST" ] then echo "- creating cache files in parallel... " echo $CACHE_LIST | xargs -n 2 -P 8 bash -c 'CreateCache "$@"' -- echo "finished." fi ================================================ FILE: CacheUtils/CacheUtils.cs ================================================ using System.Collections.Generic; using CacheUtils.Commands.CombineCacheDirectories; using CacheUtils.Commands.CreateCache; using CacheUtils.Commands.Download; using CacheUtils.Commands.ExtractTranscripts; using CacheUtils.Commands.GFF; using CacheUtils.Commands.Header; using CacheUtils.Commands.ParseVepCacheDirectory; using CacheUtils.Commands.RegulatoryGFF; using CacheUtils.Commands.UniversalGeneArchive; using CommandLine.Builders; using VariantAnnotation.Interface; namespace CacheUtils { internal static class CacheUtilsMain { private static int Main(string[] args) { var ops = new Dictionary { ["combine"] = new TopLevelOption("combine cache directories", CombineCacheDirectoriesMain.Run), ["create"] = new TopLevelOption("create Nirvana cache files", CreateNirvanaDatabaseMain.Run), ["download"] = new TopLevelOption("downloads required files", DownloadMain.Run), ["extract"] = new TopLevelOption("extracts transcripts", ExtractTranscriptsMain.Run), ["gene"] = new TopLevelOption("updates the universal gene archive", UniversalGeneArchiveMain.Run), ["gff"] = new TopLevelOption("export transcripts to GFF", CreateGffMain.Run), ["header"] = new TopLevelOption("displays the header information", HeaderMain.Run), ["parse"] = new TopLevelOption("parses the VEP cache files", ParseVepCacheDirectoryMain.Run), ["rgff"] = new TopLevelOption("export regulatory regions to GFF", CreateRegulatoryGffMain.Run) }; var exitCode = new TopLevelAppBuilder(args, ops) .Parse() .ShowBanner(Constants.Authors) .ShowHelpMenu("Utilities focused on querying the cache directory") .ShowErrors() .Execute(); return (int)exitCode; } } } ================================================ FILE: CacheUtils/CacheUtils.csproj ================================================  Exe net6.0 ..\bin\$(Configuration) PreserveNewest ================================================ FILE: CacheUtils/CacheUtils.dll.gene.json ================================================ { "GRCh37":{ "ReferencePath":"Homo_sapiens.GRCh37.Nirvana.dat", "EnsemblCachePath":"Ensembl_GRCh37.transcripts.gz", "RefSeqCachePath":"RefSeq_GRCh37.transcripts.gz" }, "GRCh38": { "ReferencePath": "Homo_sapiens.GRCh38.Nirvana.dat", "EnsemblCachePath": "Ensembl_GRCh38.transcripts.gz", "RefSeqCachePath": "RefSeq_GRCh38.transcripts.gz" } } ================================================ FILE: CacheUtils/Commands/CombineCacheDirectories/CombineCacheDirectoriesMain.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using CacheUtils.PredictionCache; using CacheUtils.TranscriptCache; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Algorithms; using Compression.FileHandling; using ErrorHandling; using Genome; using Intervals; using IO; using ReferenceSequence.Utilities; using VariantAnnotation.Caches; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; using VariantAnnotation.Providers; namespace CacheUtils.Commands.CombineCacheDirectories { public static class CombineCacheDirectoriesMain { private static string _inputPrefix; private static string _inputPrefix2; private static string _outputPrefix; private static string _refSequencePath; private static ExitCodes ProgramExecution() { var sequenceData = SequenceHelper.GetDictionaries(_refSequencePath); var caches = LoadTranscriptCaches(CacheConstants.TranscriptPath(_inputPrefix), CacheConstants.TranscriptPath(_inputPrefix2), sequenceData.refIndexToChromosome); if (caches.Cache.TranscriptIntervalArrays.Length != caches.Cache2.TranscriptIntervalArrays.Length) throw new InvalidDataException($"Expected the number of reference sequences in cache 1 ({caches.Cache.TranscriptIntervalArrays.Length}) and cache 2 ({caches.Cache2.TranscriptIntervalArrays.Length}) to be the same."); int numRefSeqs = caches.Cache.TranscriptIntervalArrays.Length; var combinedIntervalArrays = new IntervalArray[numRefSeqs]; var siftPredictionsPerRef = new Prediction[numRefSeqs][]; var polyphenPredictionsPerRef = new Prediction[numRefSeqs][]; PredictionHeader siftHeader; PredictionHeader polyphenHeader; using (var siftReader = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(_inputPrefix)), PredictionCacheReader.SiftDescriptions)) using (var siftReader2 = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(_inputPrefix2)), PredictionCacheReader.SiftDescriptions)) using (var polyphenReader = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(_inputPrefix)), PredictionCacheReader.PolyphenDescriptions)) using (var polyphenReader2 = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(_inputPrefix2)), PredictionCacheReader.PolyphenDescriptions)) { siftHeader = siftReader.Header; polyphenHeader = polyphenReader.Header; for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++) { var chromosome = sequenceData.refIndexToChromosome[refIndex]; Console.ForegroundColor = ConsoleColor.Yellow; Logger.WriteLine($"\n{chromosome.UcscName}:"); Console.ResetColor(); var sift = CombinePredictions(chromosome, "SIFT", siftReader, siftReader2); siftPredictionsPerRef[refIndex] = sift.Predictions; var polyphen = CombinePredictions(chromosome, "PolyPhen", polyphenReader, polyphenReader2); polyphenPredictionsPerRef[refIndex] = polyphen.Predictions; var transcriptIntervalArray = caches.Cache.TranscriptIntervalArrays[refIndex]; var transcriptIntervalArray2 = caches.Cache2.TranscriptIntervalArrays[refIndex]; combinedIntervalArrays[refIndex] = CombineTranscripts(transcriptIntervalArray, transcriptIntervalArray2, sift.Offset, polyphen.Offset); } } Logger.WriteLine(""); WritePredictions("SIFT", CacheConstants.SiftPath(_outputPrefix), siftHeader, siftPredictionsPerRef); WritePredictions("PolyPhen", CacheConstants.PolyPhenPath(_outputPrefix), polyphenHeader, polyphenPredictionsPerRef); WriteTranscripts(CloneHeader(caches.Cache.Header), combinedIntervalArrays, caches.Cache.RegulatoryRegionIntervalArrays); return ExitCodes.Success; } private static void WriteTranscripts(CacheHeader header, IntervalArray[] transcriptIntervalArrays, IntervalArray[] regulatoryRegionIntervalArrays) { var staging = TranscriptCacheStaging.GetStaging(header, transcriptIntervalArrays, regulatoryRegionIntervalArrays); Logger.Write("- writing transcripts... "); staging.Write(FileUtilities.GetCreateStream(CacheConstants.TranscriptPath(_outputPrefix))); Logger.WriteLine("finished."); } private static void WritePredictions(string description, string filePath, PredictionHeader header, Prediction[][] predictionsPerRef) { Logger.Write($"- writing {description} predictions... "); using (var stream = new BlockStream(new Zstandard(), FileUtilities.GetCreateStream(filePath), CompressionMode.Compress)) using (var writer = new PredictionCacheWriter(stream, CloneHeader(header))) { writer.Write(header.LookupTable, predictionsPerRef); } Logger.WriteLine("finished."); } private static IntervalArray CombineTranscripts(IntervalArray intervalArray, IntervalArray intervalArray2, int siftOffset, int polyphenOffset) { Logger.Write("- combine transcripts... "); int numCombinedTranscripts = GetNumCombinedTranscripts(intervalArray, intervalArray2); var combinedIntervals = new Interval[numCombinedTranscripts]; var combinedIndex = 0; CopyItems(intervalArray?.Array, combinedIntervals, ref combinedIndex, interval => interval); CopyItems(intervalArray2?.Array, combinedIntervals, ref combinedIndex, interval => GetUpdatedTranscript(interval, siftOffset, polyphenOffset)); Logger.WriteLine("finished."); return new IntervalArray(combinedIntervals.OrderBy(x => x.Begin).ThenBy(x => x.End).ToArray()); } private static int GetNumCombinedTranscripts(IntervalArray intervalArray, IntervalArray intervalArray2) { int numIntervals = intervalArray?.Array.Length ?? 0; int numIntervals2 = intervalArray2?.Array.Length ?? 0; return numIntervals + numIntervals2; } // ReSharper disable SuggestBaseTypeForParameter private static void CopyItems(T[] src, T[] dest, ref int destIndex, Func updateFunc) // ReSharper restore SuggestBaseTypeForParameter { if (src == null) return; foreach (var item in src) dest[destIndex++] = updateFunc(item); } private static Interval GetUpdatedTranscript(Interval interval, int siftOffset, int polyphenOffset) { var transcript = interval.Value; if (transcript.SiftIndex == -1 && transcript.PolyPhenIndex == -1) return interval; int newSiftIndex = transcript.SiftIndex == -1 ? -1 : transcript.SiftIndex + siftOffset; int newPolyphenIndex = transcript.PolyPhenIndex == -1 ? -1 : transcript.PolyPhenIndex + polyphenOffset; var updatedTranscript = transcript.UpdatePredictions(newSiftIndex, newPolyphenIndex); return new Interval(transcript.Start, transcript.End, updatedTranscript); } private static VariantAnnotation.IO.Caches.Header CloneBaseHeader(VariantAnnotation.IO.Caches.Header header) => new VariantAnnotation.IO.Caches.Header(CacheConstants.Identifier, header.SchemaVersion, header.DataVersion, Source.BothRefSeqAndEnsembl, DateTime.Now.Ticks, header.Assembly); private static PredictionHeader CloneHeader(PredictionHeader header) => new PredictionHeader(CloneBaseHeader(header), header.Custom, header.LookupTable); private static CacheHeader CloneHeader(CacheHeader header) => new CacheHeader(CloneBaseHeader(header), header.Custom); private static (Prediction[] Predictions, int Offset) CombinePredictions(Chromosome chromosome, string description, PredictionCacheReader reader, PredictionCacheReader reader2) { Logger.Write($"- load {description} predictions... "); var predictions = reader.GetPredictions(chromosome.Index); var predictions2 = reader2.GetPredictions(chromosome.Index); Logger.WriteLine("finished."); var combinedPredictions = CombinePredictions(description, predictions, predictions2); return (combinedPredictions, predictions.Length); } private static Prediction[] CombinePredictions(string description, Prediction[] predictions, Prediction[] predictions2) { Logger.Write($"- combine {description} predictions... "); int numCombinedPredictions = predictions.Length + predictions2.Length; var combinedPredictions = new Prediction[numCombinedPredictions]; var combinedIndex = 0; CopyItems(predictions, combinedPredictions, ref combinedIndex, x => x); CopyItems(predictions2, combinedPredictions, ref combinedIndex, x => x); Logger.WriteLine("finished."); return combinedPredictions; } private static (TranscriptCacheData Cache, TranscriptCacheData Cache2) LoadTranscriptCaches( string transcriptPath, string transcriptPath2, Dictionary refIndexToChromosome) { TranscriptCacheData cache; TranscriptCacheData cache2; Logger.Write("- loading transcript caches... "); using (var transcriptReader = new TranscriptCacheReader(FileUtilities.GetReadStream(transcriptPath))) using (var transcriptReader2 = new TranscriptCacheReader(FileUtilities.GetReadStream(transcriptPath2))) { cache = transcriptReader.Read(refIndexToChromosome); cache2 = transcriptReader2.Read(refIndexToChromosome); } Logger.WriteLine("finished."); return (cache, cache2); } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "in|1=", "input cache {prefix}", v => _inputPrefix = v }, { "in2|2=", "input cache 2 {prefix}", v => _inputPrefix2 = v }, { "out|o=", "output cache {prefix}", v => _outputPrefix = v }, { "ref|r=", "input reference {path}", v => _refSequencePath = v } }; string commandLineExample = $"{command} --in --in2 --out --ref "; return new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .CheckInputFilenameExists(_refSequencePath, "reference sequence", "--ref") .HasRequiredParameter(_inputPrefix, "input cache", "--in") .HasRequiredParameter(_inputPrefix2, "input cache 2", "--in2") .HasRequiredParameter(_outputPrefix, "output cache", "--out") .SkipBanner() .ShowHelpMenu("Combines two cache sets into one cache.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: CacheUtils/Commands/CreateCache/CreateNirvanaDatabaseMain.cs ================================================ using System.Collections.Generic; using System.Linq; using CacheUtils.Commands.Download; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.IO; using CacheUtils.Genes.Utilities; using CacheUtils.IntermediateIO; using CacheUtils.PredictionCache; using CacheUtils.TranscriptCache; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using Genome; using Intervals; using IO; using ReferenceSequence.Utilities; using VariantAnnotation.Providers; namespace CacheUtils.Commands.CreateCache { public static class CreateNirvanaDatabaseMain { private static string _inputPrefix; private static string _inputReferencePath; private static string _outputCacheFilePrefix; private static ExitCodes ProgramExecution() { string transcriptPath = _inputPrefix + ".transcripts.gz"; string siftPath = _inputPrefix + ".sift.gz"; string polyphenPath = _inputPrefix + ".polyphen.gz"; string regulatoryPath = _inputPrefix + ".regulatory.gz"; (var refIndexToChromosome, var refNameToChromosome, int numRefSeqs) = SequenceHelper.GetDictionaries(_inputReferencePath); using (var transcriptReader = new MutableTranscriptReader(GZipUtilities.GetAppropriateReadStream(transcriptPath), refIndexToChromosome)) using (var regulatoryReader = new RegulatoryRegionReader(GZipUtilities.GetAppropriateReadStream(regulatoryPath), refIndexToChromosome)) using (var siftReader = new PredictionReader(GZipUtilities.GetAppropriateReadStream(siftPath), refIndexToChromosome, IntermediateIoCommon.FileType.Sift)) using (var polyphenReader = new PredictionReader(GZipUtilities.GetAppropriateReadStream(polyphenPath), refIndexToChromosome, IntermediateIoCommon.FileType.Polyphen)) using (var geneReader = new UgaGeneReader(GZipUtilities.GetAppropriateReadStream(ExternalFiles.UniversalGeneFilePath), refNameToChromosome)) { var genomeAssembly = transcriptReader.Header.Assembly; var source = transcriptReader.Header.Source; long vepReleaseTicks = transcriptReader.Header.VepReleaseTicks; ushort vepVersion = transcriptReader.Header.VepVersion; Logger.Write("- loading universal gene archive file... "); var genes = geneReader.GetGenes(); var geneForest = CreateGeneForest(genes, numRefSeqs, genomeAssembly); Logger.WriteLine($"{genes.Length:N0} loaded."); Logger.Write("- loading regulatory region file... "); var regulatoryRegions = regulatoryReader.GetRegulatoryRegions(); Logger.WriteLine($"{regulatoryRegions.Length:N0} loaded."); Logger.Write("- loading transcript file... "); var transcripts = transcriptReader.GetTranscripts(); var transcriptsByRefIndex = transcripts.GetMultiValueDict(x => x.Chromosome.Index); Logger.WriteLine($"{transcripts.Length:N0} loaded."); MarkCanonicalTranscripts(transcripts); var predictionBuilder = new PredictionCacheBuilder(genomeAssembly); var predictionCaches = predictionBuilder.CreatePredictionCaches(transcriptsByRefIndex, siftReader, polyphenReader, numRefSeqs); Logger.Write("- writing SIFT prediction cache... "); predictionCaches.Sift.Write(FileUtilities.GetCreateStream(CacheConstants.SiftPath(_outputCacheFilePrefix))); Logger.WriteLine("finished."); Logger.Write("- writing PolyPhen prediction cache... "); predictionCaches.PolyPhen.Write(FileUtilities.GetCreateStream(CacheConstants.PolyPhenPath(_outputCacheFilePrefix))); Logger.WriteLine("finished."); var transcriptBuilder = new TranscriptCacheBuilder(genomeAssembly, source, vepReleaseTicks, vepVersion); var transcriptStaging = transcriptBuilder.CreateTranscriptCache(transcripts, regulatoryRegions, geneForest, numRefSeqs); Logger.Write("- writing transcript cache... "); transcriptStaging.Write(FileUtilities.GetCreateStream(CacheConstants.TranscriptPath(_outputCacheFilePrefix))); Logger.WriteLine("finished."); } return ExitCodes.Success; } private static IIntervalForest CreateGeneForest(IEnumerable genes, int numRefSeqs, GenomeAssembly genomeAssembly) { bool useGrch37 = genomeAssembly == GenomeAssembly.GRCh37; var intervalLists = new List>[numRefSeqs]; for (var i = 0; i < numRefSeqs; i++) intervalLists[i] = new List>(); foreach (var gene in genes) { var coords = useGrch37 ? gene.GRCh37 : gene.GRCh38; if (coords.Start == -1 && coords.End == -1) continue; intervalLists[gene.Chromosome.Index].Add(new Interval(coords.Start, coords.End, gene)); } var refIntervalArrays = new IntervalArray[numRefSeqs]; for (var i = 0; i < numRefSeqs; i++) { refIntervalArrays[i] = new IntervalArray(intervalLists[i].OrderBy(x => x.Begin).ThenBy(x => x.End).ToArray()); } return new IntervalForest(refIntervalArrays); } private static void MarkCanonicalTranscripts(MutableTranscript[] transcripts) { var ccdsIdToEnsemblId = CcdsReader.GetCcdsIdToEnsemblId(ExternalFiles.CcdsFile.FilePath); var lrgTranscriptIds = LrgReader.GetTranscriptIds(ExternalFiles.LrgFile.FilePath, ccdsIdToEnsemblId); Logger.Write("- marking canonical transcripts... "); var canonical = new CanonicalTranscriptMarker(lrgTranscriptIds); int numCanonicalTranscripts = canonical.MarkTranscripts(transcripts); Logger.WriteLine($"{numCanonicalTranscripts:N0} marked."); } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "in|i=", "input filename {prefix}", v => _inputPrefix = v }, { "out|o=", "output cache file {prefix}", v => _outputCacheFilePrefix = v }, { "ref|r=", "input reference {filename}", v => _inputReferencePath = v } }; string commandLineExample = $"{command} --in --out --ref "; return new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .HasRequiredParameter(_inputPrefix, "intermediate cache", "--in") .CheckInputFilenameExists(_inputReferencePath, "compressed reference", "--ref") .HasRequiredParameter(_outputCacheFilePrefix, "Nirvana", "--out") .SkipBanner() .ShowHelpMenu("Converts *deserialized* VEP cache files to Nirvana cache format.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: CacheUtils/Commands/Download/DownloadMain.cs ================================================ using ErrorHandling; namespace CacheUtils.Commands.Download { public static class DownloadMain { private static ExitCodes ProgramExecution() { ExternalFiles.Download(); return ExitCodes.Success; } public static ExitCodes Run(string command, string[] args) { return ProgramExecution(); } } } ================================================ FILE: CacheUtils/Commands/Download/ExternalFiles.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.Genbank; using CacheUtils.IntermediateIO; using CacheUtils.Utilities; using Compression.Utilities; using Genome; using IO; using OptimizedCore; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.Commands.Download { public static class ExternalFiles { public static readonly RemoteFile CcdsFile = new RemoteFile("CCDS file (2016-09-08)", "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS2Sequence.20160908.txt", false); public static readonly RemoteFile LrgFile = new RemoteFile("latest LRG file", "http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt"); public static readonly RemoteFile HgncFile = new RemoteFile("latest HGNC gene symbols", "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt"); public static readonly RemoteFile GeneInfoFile = new RemoteFile("latest gene_info", "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz"); private static readonly RemoteFile AssemblyFile37 = new RemoteFile("assembly report (GRCh37.p13)", "ftp://ftp.ncbi.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt", false); public static readonly RemoteFile EnsemblGtfFile37 = new RemoteFile("Ensembl 75 GTF (GRCh37)", "ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz", false); public static readonly RemoteFile RefSeqGenomeGffFile37 = new RemoteFile("RefSeq genomic GFF (GRCh37.p13)", "ftp://ftp.ncbi.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz", false); public static readonly RemoteFile RefSeqGffFile37 = new RemoteFile("RefSeq GFF3 (GRCh37.p13)", "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz", false); private static readonly RemoteFile AssemblyFile38 = new RemoteFile("assembly report (GRCh38.p11)", "ftp://ftp.ncbi.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.37_GRCh38.p11/GCF_000001405.37_GRCh38.p11_assembly_report.txt", false); public static readonly RemoteFile EnsemblGtfFile38 = new RemoteFile("Ensembl 90 GTF (GRCh38)", "ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz", false); public static readonly RemoteFile RefSeqGenomeGffFile38 = new RemoteFile("RefSeq genomic GFF (GRCh38.p11)", "ftp://ftp.ncbi.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.37_GRCh38.p11/GCF_000001405.37_GRCh38.p11_genomic.gff.gz", false); public static readonly RemoteFile RefSeqGffFile38 = new RemoteFile("RefSeq GFF3 (GRCh38.p7)", "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/GFF/ref_GRCh38.p7_top_level.gff3.gz", false); public static readonly string GenbankFilePath = Path.Combine(Path.GetTempPath(), RemoteFile.GetFilename("Genbank.tsv.gz", false)); public static readonly string UniversalGeneFilePath = Path.Combine(Path.GetTempPath(), RemoteFile.GetFilename("UGA.tsv.gz", false)); public static void Download() { var fileList = new List { CcdsFile, LrgFile, HgncFile, GeneInfoFile, AssemblyFile37, AssemblyFile38, EnsemblGtfFile37, EnsemblGtfFile38, RefSeqGenomeGffFile37, RefSeqGenomeGffFile38, RefSeqGffFile37, RefSeqGffFile38 }; var genbankFiles = GetGenbankFiles(fileList); fileList.Execute("downloads", file => file.Download()); if (genbankFiles == null) return; genbankFiles.Execute("file parsing", file => file.Parse()); var genbankEntries = GetIdToGenbankEntryDict(genbankFiles); WriteDictionary(genbankEntries); } private static IEnumerable GetIdToGenbankEntryDict(IEnumerable files) => files.SelectMany(file => file.GenbankDict.Values).OrderBy(x => x.TranscriptId).ToList(); private static List GetGenbankFiles(ICollection fileList) { var genbankFileInfo = new FileInfo(GenbankFilePath); if (genbankFileInfo.Exists && GetElapsedDays(genbankFileInfo.CreationTime) < 30.0) return null; int numGenbankFiles = GetNumGenbankFiles(); var genbankFiles = new List(numGenbankFiles); for (var i = 0; i < numGenbankFiles; i++) { var genbankFile = new GenbankFile(i + 1); fileList.Add(genbankFile.RemoteFile); genbankFiles.Add(genbankFile); } return genbankFiles; } public static double GetElapsedDays(DateTime creationTime) => DateTime.Now.Subtract(creationTime).TotalDays; private static int GetNumGenbankFiles() { var fileList = new RemoteFile("RefSeq filelist", "ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed"); fileList.Download(); var maxNum = 0; using (var reader = FileUtilities.GetStreamReader(FileUtilities.GetReadStream(fileList.FilePath))) { while (true) { string line = reader.ReadLine(); if (line == null) break; string filename = line.OptimizedSplit('\t')[1]; if (!filename.EndsWith(".rna.gbff.gz")) continue; int num = int.Parse(filename.Substring(6, filename.Length - 18)); if (num > maxNum) maxNum = num; } } return maxNum; } private static void WriteDictionary(IEnumerable entries) { var header = new IntermediateIoHeader(0, 0, Source.None, GenomeAssembly.Unknown, 0); Logger.Write($"- writing Genbank file ({Path.GetFileName(GenbankFilePath)})... "); using (var writer = new GenbankWriter(GZipUtilities.GetStreamWriter(GenbankFilePath), header)) { foreach (var entry in entries) writer.Write(entry); } Logger.WriteLine("finished."); } } } ================================================ FILE: CacheUtils/Commands/Download/GenbankFile.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.Genbank; using CacheUtils.Utilities; using Compression.Utilities; using IO; namespace CacheUtils.Commands.Download { public sealed class GenbankFile { public readonly RemoteFile RemoteFile; public readonly Dictionary GenbankDict; public GenbankFile(int num) { RemoteFile = new RemoteFile($"RefSeq Genbank {num} gbff", $"ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.{num}.rna.gbff.gz", false); GenbankDict = new Dictionary(); } public void Parse() { Logger.WriteLine($"- parsing {Path.GetFileName(RemoteFile.FilePath)}"); using (var reader = new GenbankReader(GZipUtilities.GetAppropriateStreamReader(RemoteFile.FilePath))) { while (true) { var entry = reader.GetGenbankEntry(); if (entry == null) break; GenbankDict[entry.TranscriptId] = entry; } } } } } ================================================ FILE: CacheUtils/Commands/ExtractTranscripts/ExtractTranscriptsMain.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.MiniCache; using CacheUtils.PredictionCache; using CacheUtils.TranscriptCache; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using Genome; using Intervals; using IO; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.IO.Caches; using VariantAnnotation.Providers; namespace CacheUtils.Commands.ExtractTranscripts { public static class ExtractTranscriptsMain { private static string _inputPrefix; private static string _inputReferencePath; private static string _outputDirectory; private static string _referenceName; private static int _referencePosition = -1; private static int _referenceEndPosition = -1; private static ExitCodes ProgramExecution() { var bundle = DataBundle.GetDataBundle(_inputReferencePath, _inputPrefix); int numRefSeqs = bundle.SequenceReader.NumRefSeqs; var chromosome = ReferenceNameUtilities.GetChromosome(bundle.SequenceReader.RefNameToChromosome, _referenceName); bundle.Load(chromosome); string outputStub = GetOutputStub(chromosome, bundle.Source); var interval = new ChromosomeInterval(chromosome, _referencePosition, _referenceEndPosition); var transcripts = GetTranscripts(bundle, interval); var sift = GetPredictionStaging("SIFT", transcripts, chromosome, bundle.SiftPredictions, bundle.SiftReader, x => x.SiftIndex, numRefSeqs); var polyphen = GetPredictionStaging("PolyPhen", transcripts, chromosome, bundle.PolyPhenPredictions, bundle.PolyPhenReader, x => x.PolyPhenIndex, numRefSeqs); var regulatoryRegionIntervalArrays = GetRegulatoryRegionIntervalArrays(bundle.TranscriptCache, interval, numRefSeqs); var transcriptIntervalArrays = PredictionUtilities.UpdateTranscripts(transcripts, bundle.SiftPredictions, sift.Predictions, bundle.PolyPhenPredictions, polyphen.Predictions, numRefSeqs); var transcriptStaging = GetTranscriptStaging(bundle.TranscriptCacheData.Header, transcriptIntervalArrays, regulatoryRegionIntervalArrays); WriteCache(FileUtilities.GetCreateStream(CacheConstants.TranscriptPath(outputStub)), transcriptStaging, "transcript"); WriteCache(FileUtilities.GetCreateStream(CacheConstants.SiftPath(outputStub)), sift.Staging, "SIFT"); WriteCache(FileUtilities.GetCreateStream(CacheConstants.PolyPhenPath(outputStub)), polyphen.Staging, "PolyPhen"); return ExitCodes.Success; } private static TranscriptCacheStaging GetTranscriptStaging(CacheHeader header, IntervalArray[] transcriptIntervalArrays, IntervalArray[] regulatoryRegionIntervalArrays) => TranscriptCacheStaging.GetStaging(header, transcriptIntervalArrays, regulatoryRegionIntervalArrays); private static void WriteCache(Stream stream, IStaging staging, string description) { Logger.Write($"- writing {description} cache... "); staging.Write(stream); Logger.WriteLine("finished."); } private static string GetOutputStub(Chromosome chromosome, Source source) => Path.Combine(_outputDirectory, $"{chromosome.UcscName}_{_referencePosition}_{_referenceEndPosition}_{GetSource(source)}"); private static string GetSource(Source source) => source != Source.BothRefSeqAndEnsembl ? source.ToString() : "Both"; private static (PredictionCacheStaging Staging, Prediction[] Predictions) GetPredictionStaging( string description, IEnumerable transcripts, Chromosome chromosome, IReadOnlyList oldPredictions, PredictionCacheReader reader, Func indexFunc, int numRefSeqs) { Logger.Write($"- retrieving {description} predictions... "); var indexSet = GetUniqueIndices(transcripts, indexFunc); var predictionsPerRef = GetPredictions(indexSet, chromosome, numRefSeqs, oldPredictions); var staging = new PredictionCacheStaging(reader.Header, predictionsPerRef); Logger.WriteLine($"found {indexSet.Count} predictions."); return (staging, predictionsPerRef[chromosome.Index]); } private static Prediction[][] GetPredictions(ICollection indexSet, Chromosome chromosome, int numRefSeqs, IReadOnlyList oldPredictions) { var refPredictions = new Prediction[indexSet.Count]; var predIdx = 0; foreach (int index in indexSet) refPredictions[predIdx++] = oldPredictions[index]; var predictions = new Prediction[numRefSeqs][]; predictions[chromosome.Index] = refPredictions; return predictions; } private static HashSet GetUniqueIndices(IEnumerable transcripts, Func indexFunc) { var indexSet = new HashSet(); foreach (var transcript in transcripts) { int index = indexFunc(transcript); if (index == -1) continue; indexSet.Add(index); } return indexSet; } private static IntervalArray[] GetRegulatoryRegionIntervalArrays( ITranscriptCache cache, ChromosomeInterval interval, int numRefSeqs) { Logger.Write("- retrieving regulatory regions... "); var regulatoryIntervalForest = cache.RegulatoryIntervalForest; var regulatoryRegions = regulatoryIntervalForest.GetAllOverlappingValues(interval.Chromosome.Index, interval.Start, interval.End); Logger.WriteLine($"found {regulatoryRegions.Length} regulatory regions."); return regulatoryRegions.ToIntervalArrays(numRefSeqs); } private static List GetTranscripts(DataBundle bundle, ChromosomeInterval interval) { Logger.Write("- retrieving transcripts... "); var transcripts = TranscriptCacheUtilities.GetTranscripts(bundle, interval); Logger.WriteLine($"found {transcripts.Count} transcripts."); if (transcripts.Count == 0) throw new InvalidDataException("Expected at least one transcript, but found none."); return transcripts; } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "in|i=", "input cache {prefix}", v => _inputPrefix = v }, { "name|n=", "reference {name}", v => _referenceName = v }, { "out|o=", "output {directory}", v => _outputDirectory = v }, { "pos|p=", "reference {position}", (int v) => _referencePosition = v }, { "endpos=", "reference end {position}", (int v) => _referenceEndPosition = v }, { "ref|r=", "input reference {filename}", v => _inputReferencePath = v } }; string commandLineExample = $"{command} --in --out -r --chr -p --endpos \n"; return new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .HasRequiredParameter(_inputPrefix, "Nirvana cache", "--in") .CheckInputFilenameExists(_inputReferencePath, "compressed reference sequence", "--ref") .CheckDirectoryExists(_outputDirectory, "output cache", "--out") .SkipBanner() .ShowHelpMenu("Extracts transcripts from Nirvana cache files.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: CacheUtils/Commands/GFF/CreateGffMain.cs ================================================ using System.Collections.Generic; using CacheUtils.Commands.ParseVepCacheDirectory; using CacheUtils.GFF; using CacheUtils.Helpers; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using Genome; using IO; using ReferenceSequence.Utilities; using VariantAnnotation.Caches; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Providers; namespace CacheUtils.Commands.GFF { public static class CreateGffMain { private static string _compressedReferencePath; private static string _inputPrefix; private static string _outputFileName; private static string _transcriptSource; private static ExitCodes ProgramExecution() { Source transcriptSource = ParseVepCacheDirectoryMain.GetSource(_transcriptSource); string cachePath = CacheConstants.TranscriptPath(_inputPrefix); Dictionary refIndexToChromosome = SequenceHelper.GetDictionaries(_compressedReferencePath).refIndexToChromosome; TranscriptCacheData cache = TranscriptCacheHelper.GetCache(cachePath, refIndexToChromosome); Dictionary geneToInternalId = InternalGenes.CreateDictionary(cache.Genes); using (var writer = new GffWriter(GZipUtilities.GetStreamWriter(_outputFileName))) { var creator = new GffCreator(writer, geneToInternalId, transcriptSource); creator.Create(cache.TranscriptIntervalArrays); } return ExitCodes.Success; } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "in|i=", "input cache {prefix}", v => _inputPrefix = v }, { "out|o=", "output {file name}", v => _outputFileName = v }, { "source|s=", "transcript {source}", v => _transcriptSource = v }, { "ref|r=", "reference {file}", v => _compressedReferencePath = v } }; string commandLineExample = $"{command} --in --out "; return new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .HasRequiredParameter(_inputPrefix, "input cache prefix", "--in") .CheckOutputFilenameSuffix(_outputFileName, ".gz", "GFF") .SkipBanner() .ShowHelpMenu("Outputs exon coordinates for all transcripts in a database.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: CacheUtils/Commands/GFF/InternalGenes.cs ================================================ using System.Collections.Generic; using CacheUtils.TranscriptCache.Comparers; using ErrorHandling.Exceptions; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.Commands.GFF { public static class InternalGenes { public static Dictionary CreateDictionary(IGene[] genes) { var geneComparer = new GeneComparer(); var geneToInternalId = new Dictionary(geneComparer); for (var geneIndex = 0; geneIndex < genes.Length; geneIndex++) { var gene = genes[geneIndex]; if (geneToInternalId.TryGetValue(gene, out int oldGeneIndex)) { throw new UserErrorException($"Found a duplicate gene in the dictionary: {genes[geneIndex]} ({geneIndex} vs {oldGeneIndex})"); } geneToInternalId[gene] = geneIndex; } return geneToInternalId; } } } ================================================ FILE: CacheUtils/Commands/Header/HeaderMain.cs ================================================ using System; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using ErrorHandling.Exceptions; using IO; using VariantAnnotation.IO.Caches; using VariantAnnotation.Providers; namespace CacheUtils.Commands.Header { public static class HeaderMain { private static string _inputPrefix; private static ExitCodes ProgramExecution() { string cachePath = CacheConstants.TranscriptPath(_inputPrefix); var header = GetHeaderInformation(cachePath); Console.WriteLine($"Versions: Schema: {header.Schema}, Data: {header.Data}, VEP: {header.Vep}"); return ExitCodes.Success; } private static (ushort Schema, ushort Data, ushort Vep) GetHeaderInformation(string cachePath) { CacheHeader header; using (var stream = FileUtilities.GetReadStream(cachePath)) { header = CacheHeader.Read(stream); } if (header == null) throw new InvalidFileFormatException($"Could not parse the header information correctly for {cachePath}"); return (header.SchemaVersion, header.DataVersion, header.Custom.VepVersion); } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "in|i=", "input cache {prefix}", v => _inputPrefix = v } }; return new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .HasRequiredParameter(_inputPrefix, "input cache prefix", "--in") .SkipBanner() .ShowHelpMenu("Displays the cache header information.", $"{command} --in ") .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: CacheUtils/Commands/ParseVepCacheDirectory/ParseVepCacheDirectoryMain.cs ================================================ using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.IntermediateIO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using System; using System.Collections.Generic; using CacheUtils.Commands.Download; using CacheUtils.Genbank; using Genome; using IO; using ReferenceSequence.IO; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Providers; namespace CacheUtils.Commands.ParseVepCacheDirectory { public static class ParseVepCacheDirectoryMain { private static string _inputVepDirectory; private static string _inputReferencePath; private static string _outputStub; private static string _vepReleaseDate; private static string _genomeAssembly; private static string _transcriptSource; private static ushort _vepVersion; private static ExitCodes ProgramExecution() { var transcriptSource = GetSource(_transcriptSource); var sequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(_inputReferencePath)); var vepRootDirectory = new VepRootDirectory(sequenceReader.RefNameToChromosome); var refIndexToVepDir = vepRootDirectory.GetRefIndexToVepDir(_inputVepDirectory); var genomeAssembly = GenomeAssemblyHelper.Convert(_genomeAssembly); long vepReleaseTicks = DateTime.Parse(_vepReleaseDate).Ticks; var idToGenbank = GetIdToGenbank(genomeAssembly, transcriptSource); // ========================= // create the pre-cache file // ========================= // process each VEP directory int numRefSeqs = sequenceReader.NumRefSeqs; var header = new IntermediateIoHeader(_vepVersion, vepReleaseTicks, transcriptSource, genomeAssembly, numRefSeqs); string siftPath = _outputStub + ".sift.gz"; string polyphenPath = _outputStub + ".polyphen.gz"; string transcriptPath = _outputStub + ".transcripts.gz"; string regulatoryPath = _outputStub + ".regulatory.gz"; using (var mergeLogger = new TranscriptMergerLogger(FileUtilities.GetCreateStream(_outputStub + ".merge_transcripts.log"))) using (var siftWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(siftPath), header, IntermediateIoCommon.FileType.Sift)) using (var polyphenWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(polyphenPath), header, IntermediateIoCommon.FileType.Polyphen)) using (var transcriptWriter = new MutableTranscriptWriter(GZipUtilities.GetStreamWriter(transcriptPath), header)) using (var regulatoryRegionWriter = new RegulatoryRegionWriter(GZipUtilities.GetStreamWriter(regulatoryPath), header)) { var converter = new VepCacheParser(transcriptSource); var emptyPredictionDict = new Dictionary>(); for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++) { var chromosome = sequenceReader.RefIndexToChromosome[refIndex]; if (!refIndexToVepDir.TryGetValue(refIndex, out string vepSubDir)) { siftWriter.Write(chromosome, emptyPredictionDict); polyphenWriter.Write(chromosome, emptyPredictionDict); continue; } Console.WriteLine("Parsing reference sequence [{0}]:", chromosome.UcscName); var rawData = converter.ParseDumpDirectory(chromosome, vepSubDir); var mergedTranscripts = TranscriptMerger.Merge(mergeLogger, rawData.Transcripts, idToGenbank); var mergedRegulatoryRegions = RegulatoryRegionMerger.Merge(rawData.RegulatoryRegions); int numRawTranscripts = rawData.Transcripts.Count; int numMergedTranscripts = mergedTranscripts.Count; Console.WriteLine($"- # merged transcripts: {numMergedTranscripts}, # total transcripts: {numRawTranscripts}"); WriteTranscripts(transcriptWriter, mergedTranscripts); WriteRegulatoryRegions(regulatoryRegionWriter, mergedRegulatoryRegions); WritePredictions(siftWriter, mergedTranscripts, x => x.SiftData, chromosome); WritePredictions(polyphenWriter, mergedTranscripts, x => x.PolyphenData, chromosome); } } Console.WriteLine("\n{0} directories processed.", refIndexToVepDir.Count); return ExitCodes.Success; } private static Dictionary GetIdToGenbank(GenomeAssembly assembly, Source source) { if (assembly != GenomeAssembly.GRCh37 || source != Source.RefSeq) return null; Logger.Write("- loading the intermediate Genbank file... "); Dictionary genbankDict; using (var reader = new IntermediateIO.GenbankReader(GZipUtilities.GetAppropriateReadStream(ExternalFiles.GenbankFilePath))) { genbankDict = reader.GetIdToGenbank(); } Logger.WriteLine($"{genbankDict.Count} entries loaded."); return genbankDict; } private static void WriteRegulatoryRegions(RegulatoryRegionWriter writer, IEnumerable regulatoryRegions) { foreach (var regulatoryRegion in regulatoryRegions) writer.Write(regulatoryRegion); } private static void WriteTranscripts(MutableTranscriptWriter writer, IEnumerable transcripts) { foreach (var transcript in transcripts) writer.Write(transcript); } private static void WritePredictions(PredictionWriter writer, IReadOnlyList transcripts, Func predictionFunc, Chromosome chromosome) { var predictionDict = new Dictionary>(StringComparer.Ordinal); for (var transcriptIndex = 0; transcriptIndex < transcripts.Count; transcriptIndex++) { var transcript = transcripts[transcriptIndex]; string predictionData = predictionFunc(transcript); if (predictionData == null) continue; if (predictionDict.TryGetValue(predictionData, out var transcriptIdList)) transcriptIdList.Add(transcriptIndex); else predictionDict[predictionData] = new List { transcriptIndex }; } writer.Write(chromosome, predictionDict); } public static Source GetSource(string source) { source = source.ToLower(); if (source.StartsWith("ensembl")) return Source.Ensembl; if (source.StartsWith("refseq")) return Source.RefSeq; return source.StartsWith("both") ? Source.BothRefSeqAndEnsembl : Source.None; } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "date=", "VEP release {date}", v => _vepReleaseDate = v }, { "source|s=", "transcript {source}", v => _transcriptSource = v }, { "ga=", "genome assembly {version}", v => _genomeAssembly = v }, { "in|i=", "input VEP {directory}", v => _inputVepDirectory = v }, { "out|o=", "output filename {stub}", v => _outputStub = v }, { "ref|r=", "input reference {filename}", v => _inputReferencePath = v }, { "vep=", "VEP {version}", (ushort v) => _vepVersion = v } }; string commandLineExample = $"{command} --in --out --vep "; return new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .CheckDirectoryExists(_inputVepDirectory, "VEP", "--in") .CheckInputFilenameExists(_inputReferencePath, "compressed reference sequence", "--ref") .HasRequiredParameter(_outputStub, "output stub", "--out") .HasRequiredParameter(_vepVersion, "VEP version", "--vep") .HasRequiredParameter(_genomeAssembly, "genome assembly", "--ga") .HasRequiredDate(_vepReleaseDate, "VEP release date", "--date") .HasRequiredParameter(_transcriptSource, "transcript source", "--source") .SkipBanner() .ShowHelpMenu("Converts *deserialized* VEP cache files to a Nirvana pre-cache file.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: CacheUtils/Commands/ParseVepCacheDirectory/RegulatoryRegionMerger.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.TranscriptCache.Comparers; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.Commands.ParseVepCacheDirectory { public static class RegulatoryRegionMerger { public static IEnumerable Merge(IEnumerable regulatoryRegions) { var regulatoryDict = new Dictionary(); var comparer = new RegulatoryRegionComparer(); foreach (var currentRegion in regulatoryRegions) { if (currentRegion.Id.IsEmpty()) throw new InvalidOperationException("Found a regulatory region without an ID."); string regulatoryKey = $"{currentRegion.Id}.{currentRegion.Start}.{currentRegion.End}"; if (regulatoryDict.TryGetValue(regulatoryKey, out var previousRegion)) { MergeRegulatoryRegion(previousRegion, currentRegion, comparer); } else { regulatoryDict[regulatoryKey] = currentRegion; } } return regulatoryDict.Values.OrderBy(x => x.Chromosome.Index).ThenBy(x => x.Start).ThenBy(x => x.End) .ToList(); } private static void MergeRegulatoryRegion(IRegulatoryRegion previous, IRegulatoryRegion current, RegulatoryRegionComparer comparer) { if (comparer.Equals(previous, current)) return; throw new InvalidDataException("Found different regulatory regions"); } } } ================================================ FILE: CacheUtils/Commands/ParseVepCacheDirectory/TranscriptFilter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.DataDumperImport.DataStructures; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.DataDumperImport.Utilities; using CacheUtils.Genbank; using CacheUtils.Genes.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.Commands.ParseVepCacheDirectory { public static class TranscriptFilter { private static readonly MutableTranscriptComparer Comparer = new MutableTranscriptComparer(); private static void Log(this TranscriptMergerLogger logger, string transcriptId, string description) => logger.WriteLine($"{transcriptId}\t{description}"); public static List PickSpecificTranscript( this List transcripts, TranscriptMergerLogger logger, string transcriptId) { if (transcripts.Count == 1) return transcripts; List filteredTranscripts; string logMessage; switch (transcriptId) { case "NM_001005786": filteredTranscripts = transcripts.Where(transcript => transcript.CdnaMaps[9].Start == 25419007).ToList(); logMessage = $"Filtered on exon 9 start: {transcriptId}"; break; case "NM_001278597": case "NM_001278596": filteredTranscripts = transcripts.Where(transcript => transcript.CdnaMaps.Length == 26).ToList(); logMessage = $"Filtered on exon count (26): {transcriptId}"; break; case "NM_016152": filteredTranscripts = transcripts.Where(transcript => transcript.Exons[0].Phase == 0).ToList(); logMessage = $"Filtered on exon phase (0): {transcriptId}"; break; default: return transcripts; } if (filteredTranscripts.Count == 0) return transcripts; logger.Log(transcriptId, logMessage); return filteredTranscripts.Unique(); } public static List InvestigateInconsistentCdnaMaps(this List transcripts, TranscriptMergerLogger logger, string transcriptId) { var index = 0; foreach (var transcript in transcripts) { string onReverseStrand = transcript.Gene.OnReverseStrand ? "R" : "F"; if (transcript.Exons.Length != transcript.CdnaMaps.Length) { logger.Log(transcriptId, $"Found different exon & cDNA maps counts ({transcript.Exons.Length} vs {transcript.CdnaMaps.Length}) (index: {index}, {onReverseStrand})"); } if (transcript.Exons.Length == transcript.CdnaMaps.Length && DiffExonsAndCdnaMaps(transcript.Exons, transcript.CdnaMaps)) { logger.Log(transcriptId, $"Found different start/end coordinates between exons & cDNA maps. (index: {index}, {onReverseStrand})"); } index++; } return transcripts; } private static bool DiffExonsAndCdnaMaps(IReadOnlyList exons, IReadOnlyList cdnaMaps) { int numExons = exons.Count; for (var i = 0; i < numExons; i++) { var exon = exons[i]; var cdnaMap = cdnaMaps[i]; if (exon.Start != cdnaMap.Start || exon.End != cdnaMap.End) return false; } return false; } public static List ChooseEditedTranscripts( this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var filteredTranscripts = transcripts.Where(transcript => transcript.RnaEdits != null || transcript.BamEditStatus == "ok").ToList(); if (filteredTranscripts.Count == 0) return transcripts; logger.Log(transcripts[0].Id, "Filtered transcripts without RNA edits or BAM edit status"); return filteredTranscripts.Unique(); } public static List RemoveFailedTranscripts( this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var filteredTranscripts = transcripts.Where(transcript => transcript.BamEditStatus != "failed").ToList(); if (filteredTranscripts.Count == 0) return transcripts; logger.Log(transcripts[0].Id, "Filtered transcripts with failed BAM status."); return filteredTranscripts.Unique(); } public static List RemoveTranscriptsWithLowestVersion( this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var versionToTranscript = transcripts.GetMultiValueDict(x => x.Version); if (versionToTranscript.Count == 1) return transcripts; byte maxVersion = versionToTranscript.Keys.Max(); transcripts.RemoveAll(x => x.Version != maxVersion); logger.Log(transcripts[0].Id, "Filtered transcripts with lower versions"); return transcripts.Unique(); } public static List Unique(this IEnumerable transcripts) { var set = new HashSet(Comparer); foreach (var transcript in transcripts) set.Add(transcript); return set.ToList(); } public static List FixCodingRegionCdnaStart(this List transcripts, TranscriptMergerLogger logger, IReadOnlyDictionary idToGenbankEntry, string transcriptId) { if (transcripts.Count == 1 || idToGenbankEntry == null || !idToGenbankEntry.TryGetValue(transcriptId, out var genbankEntry)) return transcripts; var cdnaStartToTranscript = transcripts.GetMultiValueDict(x => x.CodingRegion.CdnaStart); if (cdnaStartToTranscript.Count == 1) return transcripts; if (!cdnaStartToTranscript.TryGetValue(genbankEntry.CodingRegion.Start, out var filteredTranscripts)) return transcripts; logger.Log(transcripts[0].Id, "Filtered transcripts by coding region cDNA start"); return filteredTranscripts.Unique(); } public static List FixCodingRegionCdnaEnd(this List transcripts, TranscriptMergerLogger logger, IReadOnlyDictionary idToGenbankEntry, string transcriptId) { if (transcripts.Count == 1 || idToGenbankEntry == null || !idToGenbankEntry.TryGetValue(transcriptId, out var genbankEntry)) return transcripts; var cdnaEndToTranscript = transcripts.GetMultiValueDict(x => x.CodingRegion.CdnaEnd); if (cdnaEndToTranscript.Count == 1) return transcripts; if (!cdnaEndToTranscript.TryGetValue(genbankEntry.CodingRegion.End, out var filteredTranscripts)) return transcripts; logger.Log(transcripts[0].Id, "Filtered transcripts by coding region cDNA end"); return filteredTranscripts.Unique(); } public static List FixGeneSymbolSource(this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var symbolSources = transcripts.GetSet(x => x.Gene.SymbolSource); if (symbolSources.Count == 1) return transcripts; if (symbolSources.Contains(GeneSymbolSource.Unknown)) symbolSources.Remove(GeneSymbolSource.Unknown); if (symbolSources.Count != 1) throw new NotImplementedException("Cannot handle multiple gene symbol sources at this time"); var targetSymbolSource = symbolSources.First(); foreach (var transcript in transcripts) transcript.Gene.SymbolSource = targetSymbolSource; logger.Log(transcripts[0].Id, "Normalized gene symbol source"); return transcripts.Unique(); } public static List FixBioType(this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var biotypes = transcripts.GetSet(x => x.BioType); if (biotypes.Count != 2) return transcripts; var biotype = GetDesiredBioType(biotypes); if (biotype == BioType.other) return transcripts; foreach (var transcript in transcripts) transcript.BioType = biotype; logger.Log(transcripts[0].Id, "Normalized biotype"); return transcripts.Unique(); } private static readonly BioType[] MiscRnaBioTypes = { BioType.antisense_RNA, BioType.miRNA, BioType.pseudogene, BioType.lncRNA, BioType.protein_coding, BioType.rRNA, BioType.SRP_RNA, BioType.vaultRNA, BioType.Y_RNA }; private static readonly BioType[] LncRnaBioTypes = { BioType.antisense_RNA, BioType.pseudogene }; private static BioType GetDesiredBioType(ICollection biotypes) { if (biotypes.Contains(BioType.misc_RNA)) { foreach (var biotype in MiscRnaBioTypes) if (biotypes.Contains(biotype)) return biotype; } if (biotypes.Contains(BioType.lncRNA) && LncRnaBioTypes.Any(biotypes.Contains)) return BioType.lncRNA; if (biotypes.Contains(BioType.mRNA) && biotypes.Contains(BioType.protein_coding)) return BioType.protein_coding; return BioType.other; } public static List FixGeneId(this List transcripts, TranscriptMergerLogger logger, Dictionary idToGenbankEntry, string transcriptId) { if (transcripts.Count == 1 || idToGenbankEntry == null || !idToGenbankEntry.TryGetValue(transcriptId, out var genbankEntry)) return transcripts; var geneIds = transcripts.GetSet(x => x.Gene.GeneId); if (geneIds.Count == 1) return transcripts; if (!geneIds.Contains(genbankEntry.GeneId)) throw new InvalidDataException($"Could not find the Genbank gene ID ({genbankEntry.GeneId}) within the transcripts."); foreach (var transcript in transcripts) transcript.Gene.GeneId = genbankEntry.GeneId; logger.Log(transcripts[0].Id, "Normalized gene ID"); return transcripts.Unique(); } public static List UnsupervisedFixGeneId(this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var geneIds = transcripts.GetSet(x => x.Gene.GeneId).ToList(); if (geneIds.Count == 1) return transcripts; string geneId = geneIds[0]; foreach (var transcript in transcripts) transcript.Gene.GeneId = geneId; logger.Log(transcripts[0].Id, "Normalized gene ID (unsupervised)"); return transcripts.Unique(); } public static List FixGeneSymbols(this List transcripts, TranscriptMergerLogger logger, Dictionary idToGenbankEntry, string transcriptId) { if (transcripts.Count == 1) return transcripts; var symbols = transcripts.GetSet(x => x.Gene.Symbol); if (symbols.Count == 1) return transcripts; if (symbols.Contains(null)) symbols.Remove(null); if (idToGenbankEntry == null || !idToGenbankEntry.TryGetValue(transcriptId, out var genbankEntry)) return transcripts.UnsupervisedFixGeneSymbols(logger, symbols.ToList()); if (!symbols.Contains(genbankEntry.Symbol)) return transcripts.UnsupervisedFixGeneSymbols(logger, symbols.ToList()); foreach (var transcript in transcripts) transcript.Gene.Symbol = genbankEntry.Symbol; logger.Log(transcripts[0].Id, "Normalized gene symbol"); return transcripts.Unique(); } public static List FixCanonical(this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var canonicals = transcripts.GetSet(x => x.IsCanonical); if (canonicals.Count == 1) return transcripts; foreach (var transcript in transcripts) transcript.IsCanonical = false; logger.Log(transcripts[0].Id, "Normalized canonical flag"); return transcripts.Unique(); } public static List FixHgncId(this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var hgncIds = transcripts.GetSet(x => x.Gene.HgncId); if (hgncIds.Count == 1) return transcripts; if (hgncIds.Contains(-1)) hgncIds.Remove(-1); int hgncId = hgncIds.First(); foreach (var transcript in transcripts) transcript.Gene.HgncId = hgncId; logger.Log(transcripts[0].Id, "Normalized HGNC ID"); return transcripts.Unique(); } public static List FixGeneStart(this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var geneStarts = transcripts.GetSet(x => x.Gene.Start); if (geneStarts.Count == 1) return transcripts; var transcriptStarts = transcripts.GetSet(x => x.Start).ToArray(); if (transcriptStarts.Length > 1) return transcripts; int closestStart = GetClosest(geneStarts, transcriptStarts[0]); foreach (var transcript in transcripts) transcript.Gene.Start = closestStart; logger.Log(transcripts[0].Id, "Normalized gene start"); return transcripts.Unique(); } public static List FixGeneEnd(this List transcripts, TranscriptMergerLogger logger) { if (transcripts.Count == 1) return transcripts; var geneEnds = transcripts.GetSet(x => x.Gene.End); if (geneEnds.Count == 1) return transcripts; var transcriptEnds = transcripts.GetSet(x => x.End).ToArray(); if (transcriptEnds.Length > 1) return transcripts; int closestEnd = GetClosest(geneEnds, transcriptEnds[0]); foreach (var transcript in transcripts) transcript.Gene.End = closestEnd; logger.Log(transcripts[0].Id, "Normalized gene end"); return transcripts.Unique(); } private static List UnsupervisedFixGeneSymbols(this IReadOnlyList transcripts, TranscriptMergerLogger logger, List symbols) { var nonLocGeneSymbols = symbols.FindAll(x => !string.IsNullOrEmpty(x) && !x.StartsWith("LOC")); string symbol = nonLocGeneSymbols.Count > 0 ? nonLocGeneSymbols[0] : symbols[0]; foreach (var transcript in transcripts) transcript.Gene.Symbol = symbol; logger.Log(transcripts[0].Id, "Normalized gene symbol (unsupervised)"); return transcripts.Unique(); } private static int GetClosest(IEnumerable values, int targetValue) { int bestDelta = int.MaxValue; int bestValue = -1; foreach (int value in values) { int delta = Math.Abs(value - targetValue); if (delta >= bestDelta) continue; bestDelta = delta; bestValue = value; } return bestValue; } } } ================================================ FILE: CacheUtils/Commands/ParseVepCacheDirectory/TranscriptIdFilter.cs ================================================ using System.IO; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Mutable; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.Commands.ParseVepCacheDirectory { public sealed class TranscriptIdFilter { private readonly string[] _whitelist; public TranscriptIdFilter(Source source) { // ReSharper disable once SwitchStatementMissingSomeCases switch (source) { case Source.Ensembl: _whitelist = new[] { "ENSE0", "ENSG0", "ENSP0", "ENST0" }; break; case Source.RefSeq: _whitelist = new[] { "NG_", "NM_", "NP_", "NR_", "XM_", "XP_", "XR_", "YP_" }; break; default: throw new InvalidDataException($"Unhandled import mode found: {source}"); } } public bool Pass(MutableTranscript transcript) => _whitelist.Any(prefix => transcript.Id.StartsWith(prefix)) && !transcript.Id.Contains("dupl"); } } ================================================ FILE: CacheUtils/Commands/ParseVepCacheDirectory/TranscriptMerger.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.Genbank; using CacheUtils.Genes.Utilities; namespace CacheUtils.Commands.ParseVepCacheDirectory { public static class TranscriptMerger { /// /// separates the transcripts by ID and clusters the transcripts into overlapping /// islands. From there we can resolve differences and return a unique transcript /// for each cluster. /// public static List Merge(TranscriptMergerLogger logger, IEnumerable transcripts, Dictionary idToGenbankEntry) { var idToTranscripts = transcripts.GetMultiValueDict(x => x.Id + "|" + x.Start + "|" + x.End); var mergedTranscripts = idToTranscripts.Select(kvp => Merge(logger, kvp.Value, idToGenbankEntry)).ToList(); return mergedTranscripts.OrderBy(x => x.Start).ThenBy(x => x.End).ToList(); } private static MutableTranscript Merge(TranscriptMergerLogger logger, IReadOnlyList transcripts, Dictionary idToGenbankEntry) { string transcriptId = transcripts[0].Id; if (transcripts.Count == 1) { transcripts.Unique().InvestigateInconsistentCdnaMaps(logger, transcriptId); return transcripts[0]; } var filteredTranscripts = transcripts .Unique() .InvestigateInconsistentCdnaMaps(logger, transcriptId) .RemoveFailedTranscripts(logger) .ChooseEditedTranscripts(logger) .RemoveTranscriptsWithLowestVersion(logger) .FixCodingRegionCdnaStart(logger, idToGenbankEntry, transcriptId) .FixCodingRegionCdnaEnd(logger, idToGenbankEntry, transcriptId) .FixGeneSymbolSource(logger) .FixBioType(logger) .FixGeneId(logger, idToGenbankEntry, transcriptId) .FixCanonical(logger) .FixHgncId(logger) .FixGeneStart(logger) .FixGeneEnd(logger) .FixGeneSymbols(logger, idToGenbankEntry, transcriptId) .UnsupervisedFixGeneId(logger) .PickSpecificTranscript(logger, transcriptId); if (filteredTranscripts.Count == 1) return filteredTranscripts[0]; throw new NotImplementedException($"Could not merge down to one transcript: {filteredTranscripts.Count} transcripts ({transcriptId})"); } } } ================================================ FILE: CacheUtils/Commands/ParseVepCacheDirectory/TranscriptMergerLogger.cs ================================================ using System; using System.IO; namespace CacheUtils.Commands.ParseVepCacheDirectory { public sealed class TranscriptMergerLogger : IDisposable { private readonly StreamWriter _writer; public TranscriptMergerLogger(Stream stream) => _writer = new StreamWriter(stream); public void WriteLine() => _writer.WriteLine(); public void WriteLine(string s) => _writer.WriteLine(s); public void Write(string s) => _writer.Write(s); public void SetBold() { // not used } public void ResetColor() { // not used } public void Dispose() => _writer.Dispose(); } } ================================================ FILE: CacheUtils/Commands/ParseVepCacheDirectory/VepCacheParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.DataDumperImport.Import; using CacheUtils.DataDumperImport.IO; using Compression.Utilities; using Genome; using IO; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.Commands.ParseVepCacheDirectory { public sealed class VepCacheParser { private readonly Source _source; private readonly TranscriptIdFilter _filter; public VepCacheParser(Source source) { _source = source; _filter = new TranscriptIdFilter(source); } public (List RegulatoryRegions, List Transcripts) ParseDumpDirectory( Chromosome chromosome, string dirPath) { var regulatoryRegions = ParseRegulatoryFiles(chromosome, dirPath); var transcripts = ParseTranscriptFiles(chromosome, dirPath); return (regulatoryRegions, transcripts); } private static List ParseRegulatoryFiles(Chromosome chromosome, string dirPath) { var regulatoryRegions = new List(); var files = FileUtilities.GetFileNamesInDir(dirPath, "*_reg_regulatory_regions_data_dumper.txt.gz") .ToArray(); foreach (string dumpPath in VepRootDirectory.GetSortedFiles(files)) { ParseRegulatoryDumpFile(chromosome, dumpPath, regulatoryRegions); } return regulatoryRegions; } private List ParseTranscriptFiles(Chromosome chromosome, string dirPath) { var transcripts = new List(); var files = FileUtilities.GetFileNamesInDir(dirPath, "*_transcripts_data_dumper.txt.gz").ToArray(); foreach (string dumpPath in VepRootDirectory.GetSortedFiles(files)) { ParseTranscriptDumpFile(chromosome, dumpPath, transcripts); } return transcripts; } private static void ParseRegulatoryDumpFile(Chromosome chromosome, string filePath, ICollection regulatoryRegions) { Console.WriteLine("- processing {0}", Path.GetFileName(filePath)); using (var reader = new DataDumperReader(GZipUtilities.GetAppropriateReadStream(filePath))) { foreach (var ad in reader.GetRootNode().Value.Values) { if (!(ad is ObjectKeyValueNode objectKeyValue)) continue; foreach (var featureGroup in objectKeyValue.Value.Values) { switch (featureGroup.Key) { case "MotifFeature": // not used break; case "RegulatoryFeature": ParseRegulatoryRegions(chromosome, featureGroup, regulatoryRegions); break; default: throw new InvalidDataException("Found an unexpected feature group (" + featureGroup.Key + ") in the regulatory regions file."); } } } } } private void ParseTranscriptDumpFile(Chromosome chromosome, string filePath, ICollection transcripts) { Console.WriteLine("- processing {0}", Path.GetFileName(filePath)); using (var reader = new DataDumperReader(GZipUtilities.GetAppropriateReadStream(filePath))) { foreach (var node in reader.GetRootNode().Value.Values) { if (!(node is ListObjectKeyValueNode transcriptNodes)) continue; foreach (var tNode in transcriptNodes.Values) { if (!(tNode is ObjectValueNode transcriptNode)) throw new InvalidOperationException("Expected a transcript object value node, but the current node is not an object value."); if (transcriptNode.Type != "Bio::EnsEMBL::Transcript") throw new InvalidOperationException($"Expected a transcript node, but the current data type is: [{transcriptNode.Type}]"); var transcript = ImportTranscript.Parse(transcriptNode, chromosome, _source); if (_filter.Pass(transcript)) transcripts.Add(transcript); } } } } private static void ParseRegulatoryRegions(Chromosome chromosome, IImportNode featureGroupNode, ICollection regulatoryRegions) { if (!(featureGroupNode is ListObjectKeyValueNode regulatoryFeatureNodes)) return; foreach (var node in regulatoryFeatureNodes.Values) { if (!(node is ObjectValueNode regulatoryFeatureNode)) throw new InvalidOperationException("Expected a regulatory region object value node, but the current node is not an object value."); if (regulatoryFeatureNode.Type != "Bio::EnsEMBL::Funcgen::RegulatoryFeature") throw new InvalidOperationException($"Expected a regulatory region node, but the current data type is: [{regulatoryFeatureNode.Type}]"); var regulatoryRegion = ImportRegulatoryFeature.Parse(regulatoryFeatureNode, chromosome); regulatoryRegions.Add(regulatoryRegion); } } } } ================================================ FILE: CacheUtils/Commands/ParseVepCacheDirectory/VepRootDirectory.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; namespace CacheUtils.Commands.ParseVepCacheDirectory { public sealed class VepRootDirectory { private readonly Dictionary _refNameToChromosome; public VepRootDirectory(Dictionary refNameToChromosome) { _refNameToChromosome = refNameToChromosome; } public Dictionary GetRefIndexToVepDir(string dirPath) { var vepDirectories = Directory.GetDirectories(dirPath); var referenceDict = new Dictionary(); foreach (string dir in vepDirectories) { string referenceName = Path.GetFileName(dir); var chromosome = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, referenceName); if (chromosome.IsEmpty()) continue; referenceDict[chromosome.Index] = dir; } return referenceDict; } public static IEnumerable GetSortedFiles(IEnumerable filePaths) { var sortedPaths = new SortedDictionary(); foreach (string filePath in filePaths) { string fileName = Path.GetFileName(filePath); if (fileName == null) continue; int hyphenPos = fileName.IndexOf("-", StringComparison.Ordinal); if (hyphenPos == -1) throw new InvalidDataException($"Could not find the hyphen in: [{fileName}]"); int position = int.Parse(fileName.Substring(0, hyphenPos)); sortedPaths[position] = filePath; } return sortedPaths.Values.ToArray(); } } } ================================================ FILE: CacheUtils/Commands/RegulatoryGFF/CreateRegulatoryGffMain.cs ================================================ using System; using System.IO; using CacheUtils.Helpers; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using ReferenceSequence.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Providers; namespace CacheUtils.Commands.RegulatoryGFF { public static class CreateRegulatoryGffMain { private static string _referencePath; private static string _inputPrefix; private static string _outputFileName; private static ExitCodes ProgramExecution() { using (var writer = GZipUtilities.GetStreamWriter(_outputFileName)) { string cachePath = CacheConstants.TranscriptPath(_inputPrefix); var sequenceData = SequenceHelper.GetDictionaries(_referencePath); // load the cache Console.Write("- reading {0}... ", Path.GetFileName(cachePath)); var cache = TranscriptCacheHelper.GetCache(cachePath, sequenceData.refIndexToChromosome); Console.WriteLine("found {0:N0} reference sequences. ", cache.RegulatoryRegionIntervalArrays.Length); Console.Write("- writing GFF entries... "); foreach (var intervalArray in cache.RegulatoryRegionIntervalArrays) { if (intervalArray == null) continue; foreach (var interval in intervalArray.Array) WriteRegulatoryFeature(writer, interval.Value); } Console.WriteLine("finished."); } return ExitCodes.Success; } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "in|i=", "input cache {prefix}", v => _inputPrefix = v }, { "out|o=", "output {file name}", v => _outputFileName = v }, { "ref|r=", "reference {file}", v => _referencePath = v } }; string commandLineExample = $"{command} --in --out "; return new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .HasRequiredParameter(_inputPrefix, "input cache prefix", "--in") .CheckOutputFilenameSuffix(_outputFileName, ".gz", "GFF") .SkipBanner() .ShowHelpMenu("Outputs regulatory regions in a database.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } private static void WriteRegulatoryFeature(TextWriter writer, IRegulatoryRegion regulatoryRegion) { writer.Write($"{regulatoryRegion.Chromosome.UcscName}\t.\tregulatory feature\t{regulatoryRegion.Start}\t{regulatoryRegion.End}\t.\t.\t.\t"); WriteGeneralAttributes(writer, regulatoryRegion); writer.WriteLine(); } private static void WriteGeneralAttributes(TextWriter writer, IRegulatoryRegion regulatoryRegion) { if (!regulatoryRegion.Id.IsEmpty()) writer.Write($"regulatory_feature_id \"{regulatoryRegion.Id}\"; "); writer.Write($"regulatory_feature_type \"{regulatoryRegion.Type}\"; "); } } } ================================================ FILE: CacheUtils/Commands/UniversalGeneArchive/FilePaths.cs ================================================ namespace CacheUtils.Commands.UniversalGeneArchive { // ReSharper disable UnusedAutoPropertyAccessor.Global public sealed class FilePaths { public AssemblySpecificPaths GRCh37 { get; set; } public AssemblySpecificPaths GRCh38 { get; set; } // ReSharper disable once ClassNeverInstantiated.Global public class AssemblySpecificPaths { public string ReferencePath { get; set; } public string EnsemblCachePath { get; set; } public string RefSeqCachePath { get; set; } } } // ReSharper restore UnusedAutoPropertyAccessor.Global } ================================================ FILE: CacheUtils/Commands/UniversalGeneArchive/UniversalGeneArchiveMain.cs ================================================ using System.Collections.Generic; using System.IO; using System.IO.Compression; using CacheUtils.Commands.Download; using CacheUtils.Genes; using CacheUtils.Genes.DataStores; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using CommandLine.Utilities; using Compression.FileHandling; using ErrorHandling; using IO; using Microsoft.Extensions.Configuration; using ReferenceSequence.Utilities; using VariantAnnotation.Providers; namespace CacheUtils.Commands.UniversalGeneArchive { public static class UniversalGeneArchiveMain { private static string _referencesPath; private static string _intermediateCachePath; private static ExitCodes ProgramExecution() { if (UniversalGeneArchiveCurrent()) { Logger.WriteLine("- universal gene archive is already up-to-date."); return ExitCodes.Success; } const string jsonPath = "CacheUtils.dll.gene.json"; var filePaths = GetFilePaths(jsonPath); var ds = LoadDataStores(filePaths); var grch37GenesByRef = ds.Assembly37.UpdateHgncIds(ds.Hgnc).MergeByHgnc(true); var grch38GenesByRef = ds.Assembly38.UpdateHgncIds(ds.Hgnc).MergeByHgnc(false); var universalGenes = CombineGenomeAssemblies(grch37GenesByRef, grch38GenesByRef).UpdateGeneSymbols( ds.Hgnc.HgncIdToSymbol, ds.GeneInfoData.EntrezGeneIdToSymbol, ds.Assembly38.EnsemblGtf.EnsemblIdToSymbol, ds.Assembly37.RefSeqGff.EntrezGeneIdToSymbol); WriteGenes(universalGenes); return ExitCodes.Success; } private static bool UniversalGeneArchiveCurrent() { var fileInfo = new FileInfo(ExternalFiles.UniversalGeneFilePath); return fileInfo.Exists && ExternalFiles.GetElapsedDays(fileInfo.CreationTime) < 1.0; } private static (GeneInfoData GeneInfoData, AssemblyDataStore Assembly37, AssemblyDataStore Assembly38, Hgnc Hgnc) LoadDataStores(FilePaths filePaths) { Logger.Write("- loading datastores... "); var loadBenchmark = new Benchmark(); var (_, refNameToChromosome, _) = SequenceHelper.GetDictionaries(filePaths.GRCh38.ReferencePath); var geneInfoData = GeneInfoData.Create(ExternalFiles.GeneInfoFile.FilePath); var dataStore37 = AssemblyDataStore.Create("GRCh37", filePaths.GRCh37, refNameToChromosome, true); var dataStore38 = AssemblyDataStore.Create("GRCh38", filePaths.GRCh38, refNameToChromosome, false); var hgnc = Hgnc.Create(ExternalFiles.HgncFile.FilePath, refNameToChromosome); Logger.WriteLine($"{Benchmark.ToHumanReadable(loadBenchmark.GetElapsedTime())}"); return (geneInfoData, dataStore37, dataStore38, hgnc); } private static UgaGene[] CombineGenomeAssemblies(Dictionary> genesByRef37, Dictionary> genesByRef38) { Logger.WriteLine("\n*** Global ***"); Logger.Write("- combining genes from GRCh37 and GRCh38... "); var combinedGenes = UgaAssemblyCombiner.Combine(genesByRef37, genesByRef38); Logger.WriteLine($"{combinedGenes.Length} genes."); return combinedGenes; } private static UgaGene[] UpdateGeneSymbols(this UgaGene[] genes, Dictionary hgncIdToSymbol, Dictionary entrezGeneIdToSymbol, Dictionary ensemblIdToSymbol, Dictionary refseqGeneIdToSymbol) { var updater = new GeneSymbolUpdater(hgncIdToSymbol, entrezGeneIdToSymbol, ensemblIdToSymbol, refseqGeneIdToSymbol); updater.Update(genes); return genes; } private static void WriteGenes(UgaGene[] genes) { Logger.Write($"- writing genes to {Path.GetFileName(ExternalFiles.UniversalGeneFilePath)}... "); using (var stream = new BlockGZipStream(FileUtilities.GetCreateStream(ExternalFiles.UniversalGeneFilePath), CompressionMode.Compress)) using (var writer = new UgaGeneWriter(stream)) { writer.Write(genes); } Logger.WriteLine("finished"); } private static FilePaths GetFilePaths(string jsonPath) { var builder = new ConfigurationBuilder(); builder.AddJsonFile(jsonPath); var configuration = builder.Build(); var filePaths = new FilePaths(); configuration.Bind(filePaths); UpdatePaths(filePaths.GRCh37); UpdatePaths(filePaths.GRCh38); CheckPaths(filePaths.GRCh37); CheckPaths(filePaths.GRCh38); return filePaths; } private static void UpdatePaths(FilePaths.AssemblySpecificPaths paths) { paths.EnsemblCachePath = Path.Combine(_intermediateCachePath, paths.EnsemblCachePath); paths.RefSeqCachePath = Path.Combine(_intermediateCachePath, paths.RefSeqCachePath); paths.ReferencePath = Path.Combine(_referencesPath, paths.ReferencePath); } private static void CheckPath(string filePath, string description) { if (string.IsNullOrEmpty(filePath)) throw new InvalidDataException($"No value was found for the {description} key."); if (!File.Exists(filePath)) throw new FileNotFoundException($"Unable to find the following file: {filePath}"); } private static void CheckPaths(FilePaths.AssemblySpecificPaths paths) { CheckPath(paths.EnsemblCachePath, "Ensembl intermediate cache"); CheckPath(paths.RefSeqCachePath, "RefSeq intermediate cache"); CheckPath(paths.ReferencePath, "reference"); } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "icache|i=", "intermediate cache {dir}", v => _intermediateCachePath = v }, { "ref|r=", "reference {dir}", v => _referencesPath = v } }; string commandLineExample = $"{command} -i -r "; return new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .CheckDirectoryExists(_intermediateCachePath, "intermediate cache", "--icache") .CheckDirectoryExists(_referencesPath, "reference", "--ref") .SkipBanner() .ShowHelpMenu("Creates the universal gene archive", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/GenomeSymbolSource.cs ================================================ namespace CacheUtils.DataDumperImport.DataStructures { public enum GeneSymbolSource : byte { // ReSharper disable InconsistentNaming Unknown, CloneBasedEnsemblGene, CloneBasedVegaGene, EntrezGene, HGNC, LRG, NCBI, miRBase, RFAM, UniProtGeneName // ReSharper restore InconsistentNaming } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Import/IImportNode.cs ================================================ namespace CacheUtils.DataDumperImport.DataStructures.Import { public interface IImportNode { string Key { get; } } public interface IListMember : IImportNode { } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Import/ImportNodeExtensions.cs ================================================ using System.IO; namespace CacheUtils.DataDumperImport.DataStructures.Import { public static class ImportNodeExtensions { public static int GetInt32(this IImportNode node) { string s = GetString(node); if (s == null) return -1; if (!int.TryParse(s, out int ret)) { throw new InvalidDataException($"Unable to convert the string ({s}) to an integer."); } return ret; } public static bool GetBool(this IImportNode node) { int num = GetInt32(node); return num == 1; } public static string GetString(this IImportNode node) { if (!(node is StringKeyValueNode stringKeyValue)) { throw new InvalidDataException($"Unable to convert the AbstractData type to a StringKeyValue type: [{node.Key}]"); } string s = stringKeyValue.Value; if (s == "" || s == "-") s = null; return s; } public static bool IsUndefined(this IImportNode node) { if (!(node is StringKeyValueNode stringKeyValue)) return false; return stringKeyValue.Value == null; } } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Import/ListObjectKeyValueNode.cs ================================================ using System.Collections.Generic; namespace CacheUtils.DataDumperImport.DataStructures.Import { public sealed class ListObjectKeyValueNode : IImportNode { public string Key { get; } public List Values { get; } = new List(); public ListObjectKeyValueNode(string key) => Key = key; public void Add(IListMember node) => Values.Add(node); } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Import/ObjectKeyValueNode.cs ================================================ namespace CacheUtils.DataDumperImport.DataStructures.Import { public sealed class ObjectKeyValueNode : IImportNode { public string Key { get; } public ObjectValueNode Value { get; } public ObjectKeyValueNode(string key, ObjectValueNode value) { Key = key; Value = value; } } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Import/ObjectValueNode.cs ================================================ using System.Collections.Generic; namespace CacheUtils.DataDumperImport.DataStructures.Import { public sealed class ObjectValueNode : IListMember { public string Type { get; } public string Key { get; } public List Values { get; } internal ObjectValueNode(string type, List values) { Key = null; Type = type; Values = values; } } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Import/StringKeyValueNode.cs ================================================ namespace CacheUtils.DataDumperImport.DataStructures.Import { public sealed class StringKeyValueNode : IImportNode { public string Key { get; } public string Value { get; } public StringKeyValueNode(string key, string value) { Key = key; Value = value; } } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Import/StringValueNode.cs ================================================ namespace CacheUtils.DataDumperImport.DataStructures.Import { public sealed class StringValueNode : IListMember { public string Key { get; } public StringValueNode(string key) => Key = key; } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Mutable/MutableExon.cs ================================================ using System; using Genome; using Intervals; namespace CacheUtils.DataDumperImport.DataStructures.Mutable { public sealed class MutableExon : IEquatable, IInterval { private readonly Chromosome _chromosome; public readonly int Phase; public int Start { get; } public int End { get; } public MutableExon(Chromosome chromosome, int start, int end, int phase) { _chromosome = chromosome; Start = start; End = end; Phase = phase; } public bool Equals(MutableExon other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; return _chromosome.Index == other._chromosome.Index && Start == other.Start && End == other.End && Phase == other.Phase; } public override int GetHashCode() { unchecked { int hashCode = _chromosome.Index.GetHashCode(); hashCode = (hashCode * 397) ^ Start; hashCode = (hashCode * 397) ^ End; hashCode = (hashCode * 397) ^ Phase.GetHashCode(); return hashCode; } } } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Mutable/MutableGene.cs ================================================ using System; using CacheUtils.Genes.DataStructures; using Genome; using Intervals; namespace CacheUtils.DataDumperImport.DataStructures.Mutable { public sealed class MutableGene : IEquatable, IFlatGene { public Chromosome Chromosome { get; set; } public int Start { get; set; } public int End { get; set; } public bool OnReverseStrand { get; } public string GeneId { get; set; } public string Symbol { get; set; } public int HgncId { get; set; } public GeneSymbolSource SymbolSource { get; set; } public MutableGene(Chromosome chromosome, int start, int end, bool onReverseStrand, string symbol, GeneSymbolSource symbolSource, string geneId, int hgncId) { Chromosome = chromosome; Start = start; End = end; OnReverseStrand = onReverseStrand; Symbol = symbol; SymbolSource = symbolSource; GeneId = geneId; HgncId = hgncId; } public override string ToString() { string strand = OnReverseStrand ? "R" : "F"; return $"{GeneId}: {Chromosome.UcscName} {Start}-{End} {strand} symbol: {Symbol} ({SymbolSource}), HGNC ID: {HgncId}"; } public bool Equals(MutableGene other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; return Chromosome.Index == other.Chromosome.Index && Start == other.Start && End == other.End && OnReverseStrand == other.OnReverseStrand && Symbol == other.Symbol && GeneId == other.GeneId; } public override int GetHashCode() { unchecked { // ReSharper disable NonReadonlyMemberInGetHashCode int hashCode = Chromosome.Index.GetHashCode(); hashCode = (hashCode * 397) ^ Start; hashCode = (hashCode * 397) ^ End; hashCode = (hashCode * 397) ^ OnReverseStrand.GetHashCode(); hashCode = (hashCode * 397) ^ Symbol.GetHashCode(); hashCode = (hashCode * 397) ^ GeneId.GetHashCode(); // ReSharper restore NonReadonlyMemberInGetHashCode return hashCode; } } public MutableGene Clone() => new MutableGene(Chromosome, Start, End, OnReverseStrand, Symbol, SymbolSource, GeneId, HgncId); public UgaGene ToUgaGene(bool isGrch37) { (string ensemblGeneId, string entrezGeneId) = GeneId.StartsWith("ENSG") ? (GeneId, null as string) : (null as string, GeneId); IInterval interval = new Interval(Start, End); (IInterval grch37, IInterval grch38) = isGrch37 ? (interval, null as IInterval) : (null as IInterval, interval); return new UgaGene(Chromosome, grch37, grch38, OnReverseStrand, entrezGeneId, ensemblGeneId, Symbol, HgncId); } } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Mutable/MutableTranscript.cs ================================================ using System; using Genome; using Intervals; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.DataDumperImport.DataStructures.Mutable { public sealed class MutableTranscript : IEquatable { public readonly Chromosome Chromosome; public readonly int Start; public readonly int End; public readonly string Id; public readonly byte Version; public readonly string CcdsId; public readonly string RefSeqId; public readonly Source Source; public readonly MutableGene Gene; public readonly IInterval[] MicroRnas; public readonly bool CdsStartNotFound; public readonly bool CdsEndNotFound; public readonly int[] SelenocysteinePositions; public readonly int StartExonPhase; public readonly IRnaEdit[] RnaEdits; public readonly string ProteinId; public readonly byte ProteinVersion; public readonly string PeptideSequence; public readonly MutableExon[] Exons; public readonly int TotalExonLength; public readonly IInterval[] Introns; public readonly string TranslateableSequence; public readonly MutableTranscriptRegion[] CdnaMaps; public readonly string BamEditStatus; // mutable public BioType BioType; public bool IsCanonical; public Gene UpdatedGene; public int CdsLength; public ITranscriptRegion[] TranscriptRegions; public byte NewStartExonPhase; public ICodingRegion CodingRegion; public readonly string SiftData; public readonly string PolyphenData; public int SiftIndex = -1; public int PolyPhenIndex = -1; public MutableTranscript(Chromosome chromosome, int start, int end, string id, byte version, string ccdsId, string refSeqId, BioType bioType, bool isCanonical, ICodingRegion codingRegion, string proteinId, byte proteinVersion, string peptideSequence, Source source, MutableGene gene, MutableExon[] exons, int startExonPhase, int totalExonLength, IInterval[] introns, MutableTranscriptRegion[] cdnaMaps, string siftData, string polyphenData, string translateableSequence, IInterval[] microRnas, bool cdsStartNotFound, bool cdsEndNotFound, int[] selenocysteinePositions, IRnaEdit[] rnaEdits, string bamEditStatus) { Chromosome = chromosome; Start = start; End = end; Id = id; Version = version; CcdsId = ccdsId; RefSeqId = refSeqId; BioType = bioType; IsCanonical = isCanonical; CodingRegion = codingRegion; ProteinId = proteinId; ProteinVersion = proteinVersion; PeptideSequence = peptideSequence; Source = source; Gene = gene; Exons = exons; StartExonPhase = startExonPhase; TotalExonLength = totalExonLength; Introns = introns; CdnaMaps = cdnaMaps; SiftData = siftData; PolyphenData = polyphenData; TranslateableSequence = translateableSequence; MicroRnas = microRnas; CdsStartNotFound = cdsStartNotFound; CdsEndNotFound = cdsEndNotFound; SelenocysteinePositions = selenocysteinePositions; RnaEdits = rnaEdits; BamEditStatus = bamEditStatus; } public bool Equals(MutableTranscript other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; return Chromosome.Index == other.Chromosome.Index && Start == other.Start && End == other.End && Id == other.Id && Version == other.Version && BioType == other.BioType && Source == other.Source; } public override int GetHashCode() { unchecked { // ReSharper disable NonReadonlyMemberInGetHashCode int hashCode = Chromosome.Index.GetHashCode(); hashCode = (hashCode * 397) ^ Start; hashCode = (hashCode * 397) ^ End; hashCode = (hashCode * 397) ^ Id.GetHashCode(); hashCode = (hashCode * 397) ^ Version.GetHashCode(); hashCode = (hashCode * 397) ^ (int) BioType; hashCode = (hashCode * 397) ^ (int) Source; return hashCode; // ReSharper restore NonReadonlyMemberInGetHashCode } } } } ================================================ FILE: CacheUtils/DataDumperImport/DataStructures/Mutable/MutableTranscriptRegion.cs ================================================ using IO; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.DataDumperImport.DataStructures.Mutable { public sealed class MutableTranscriptRegion : ITranscriptRegion { public int Start { get; } public int End { get; } public TranscriptRegionType Type { get; set; } public ushort Id { get; set; } public int CdnaStart { get; set; } public int CdnaEnd { get; set; } public MutableTranscriptRegion(TranscriptRegionType type, ushort id, int start, int end, int cdnaStart = -1, int cdnaEnd = -1) { Type = type; Id = id; Start = start; End = end; CdnaStart = cdnaStart; CdnaEnd = cdnaEnd; } public void Write(IExtendedBinaryWriter writer) => throw new System.NotImplementedException(); } } ================================================ FILE: CacheUtils/DataDumperImport/FauxRegex/RegexDecisionTree.cs ================================================ using System; using System.Linq; using CacheUtils.DataDumperImport.IO; namespace CacheUtils.DataDumperImport.FauxRegex { internal static class RegexDecisionTree { internal static (EntryType Type, string Key, string Value) GetEntryType(string s) { s = s.Trim().TrimEnd(','); int fatArrowPos = s.IndexOf("=>", StringComparison.Ordinal); return fatArrowPos != -1 ? GetEntryTypeFatArrow(s, fatArrowPos) : GetEntryTypeNoArrow(s); } private static (EntryType Type, string Key, string Value) GetEntryTypeNoArrow(string s) { int varPos = s.IndexOf("$VAR", StringComparison.Ordinal); return varPos != -1 ? GetEntryTypeVar(s) : GetEntryTypeNoVar(s); } private static (EntryType Type, string Key, string Value) GetEntryTypeNoVar(string s) { s = s.TrimEnd(';'); // ReSharper disable once ConvertIfStatementToSwitchStatement if (s == "}") return (EntryType.EndBraces, null, null); if (s == "bless( {") return (EntryType.OpenBraces, null, null); int endBracePos = s.IndexOf("}, 'Bio::", StringComparison.Ordinal); if (endBracePos != -1) return GetEntryTypeDataPos(s, endBracePos + 4); s = s.Trim('\''); if (OnlyDigits(s)) return (EntryType.DigitKey, s, null); throw new NotImplementedException($"Unable to match the non-$VAR regexes: [{s}]"); } private static (EntryType Type, string Key, string Value) GetEntryTypeDataPos(string s, int afterFirstQuote) { return (EntryType.EndBracesWithDataType, GetForwardString(s, afterFirstQuote), null); } private static (EntryType Type, string Key, string Value) GetEntryTypeVar(string s) { if (!s.EndsWith(" = {")) throw new NotImplementedException("Unable to match the $VAR regexes: [{s}]"); int spacePos = s.IndexOf(' '); return (EntryType.RootObjectKeyValue, s.Substring(0, spacePos), null); } private static (EntryType, string Key, string Value) GetEntryTypeFatArrow(string s, int fatArrowPos) { string key = GetKey(s, fatArrowPos - 2); int firstPosAfterFatArrow = fatArrowPos + 3; if (s[firstPosAfterFatArrow] == '\'') return GetEntryTypeStringKeyValue(s, firstPosAfterFatArrow + 1, key); if (s[s.Length - 1] == '{') return (EntryType.ObjectKeyValue, key, null); string afterFatArrow = s.Substring(firstPosAfterFatArrow); // ReSharper disable once ConvertIfStatementToSwitchStatement if (afterFatArrow == "undef") return (EntryType.UndefKeyValue, key, null); if (afterFatArrow == "{}") return (EntryType.EmptyValueKeyValue, key, null); if (afterFatArrow == "[]") return (EntryType.EmptyListKeyValue, key, null); if (afterFatArrow.StartsWith("$VAR")) return (EntryType.ReferenceStringKeyValue, key, afterFatArrow); if (s[firstPosAfterFatArrow] == '[') return (EntryType.ListObjectKeyValue, key, null); if (OnlyDigits(afterFatArrow)) return (EntryType.DigitKeyValue, key, afterFatArrow); throw new NotImplementedException(); } private static (EntryType, string Key, string Value) GetEntryTypeStringKeyValue(string s, int afterFirstQuote, string key) { int secondQuotePos = s.IndexOf('\'', afterFirstQuote); return secondQuotePos == -1 ? (EntryType.MultiLineKeyValue, key, s.Substring(afterFirstQuote)) : (EntryType.StringKeyValue, key, s.Substring(afterFirstQuote, secondQuotePos - afterFirstQuote)); } private static string GetKey(string s, int secondQuotePos) { int afterFirstQuote = s.LastIndexOf('\'', secondQuotePos - 1) + 1; return s.Substring(afterFirstQuote, secondQuotePos - afterFirstQuote); } private static string GetForwardString(string s, int afterFirstQuote) { int secondQuotePos = s.IndexOf('\'', afterFirstQuote); string result = s.Substring(afterFirstQuote, secondQuotePos - afterFirstQuote); return result; } internal static bool OnlyDigits(string s) => s.All(c => char.IsDigit(c) || c == '-'); } } ================================================ FILE: CacheUtils/DataDumperImport/IO/DataDumperReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Text; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.FauxRegex; using IO; using OptimizedCore; namespace CacheUtils.DataDumperImport.IO { public sealed class DataDumperReader : IDisposable { private readonly StreamReader _reader; private readonly StringBuilder _sb = new StringBuilder(); public DataDumperReader(Stream stream) => _reader = FileUtilities.GetStreamReader(stream); private string GetNextLine() => _reader.ReadLine(); public ObjectKeyValueNode GetRootNode() { string line = GetNextLine(); if (line == null) throw new InvalidDataException("Expected a root object node, but no data was found."); var results = RegexDecisionTree.GetEntryType(line); if (results.Type != EntryType.RootObjectKeyValue) throw new InvalidDataException($"Expected a root object node, but found a {results.Type} node."); return new ObjectKeyValueNode(results.Key, GetObjectValue()); } private static StringValueNode GetDigitKey(string key) => new StringValueNode(key); private ListObjectKeyValueNode GetListObjectKeyValue(string key) { var listObjectKeyValue = new ListObjectKeyValueNode(key); while (true) { string line = GetNextLine().Trim().TrimEnd(','); if (line == "]") break; var results = RegexDecisionTree.GetEntryType(line); // ReSharper disable once SwitchStatementMissingSomeCases switch (results.Type) { case EntryType.OpenBraces: listObjectKeyValue.Add(GetObjectValue()); break; case EntryType.DigitKey: listObjectKeyValue.Add(GetDigitKey(line)); break; default: throw new InvalidDataException($"Unhandled entry type encountered: {results.Type}"); } } return listObjectKeyValue; } private StringKeyValueNode GetMultiLineKeyValue(string key, string value) { _sb.Clear(); _sb.Append(value); while (true) { string line = GetNextLine().Trim(); if (line.OptimizedStartsWith('\'')) break; _sb.Append(' '); _sb.Append(line); } return new StringKeyValueNode(key, _sb.ToString()); } private ObjectValueNode GetObjectValue() { var type = "(unknown)"; var nodes = new List(); while (true) { string line = GetNextLine(); var results = RegexDecisionTree.GetEntryType(line); if (results.Type == EntryType.EndBraces || results.Type == EntryType.EndBracesWithDataType) { if (results.Type == EntryType.EndBracesWithDataType) type = results.Key; break; } // ReSharper disable once SwitchStatementMissingSomeCases switch (results.Type) { case EntryType.ObjectKeyValue: nodes.Add(new ObjectKeyValueNode(results.Key, GetObjectValue())); break; case EntryType.ListObjectKeyValue: nodes.Add(GetListObjectKeyValue(results.Key)); break; case EntryType.DigitKeyValue: case EntryType.StringKeyValue: case EntryType.ReferenceStringKeyValue: nodes.Add(new StringKeyValueNode(results.Key, results.Value)); break; case EntryType.UndefKeyValue: case EntryType.EmptyListKeyValue: case EntryType.EmptyValueKeyValue: nodes.Add(new StringKeyValueNode(results.Key, null)); break; case EntryType.MultiLineKeyValue: nodes.Add(GetMultiLineKeyValue(results.Key, results.Value)); break; default: throw new InvalidDataException($"Unhandled entry type encountered in GetObjectValue: {results.Type}: [{line}]"); } } return new ObjectValueNode(type, nodes); } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: CacheUtils/DataDumperImport/IO/EntryType.cs ================================================ namespace CacheUtils.DataDumperImport.IO { internal enum EntryType { DigitKeyValue, DigitKey, EmptyListKeyValue, EmptyValueKeyValue, EndBraces, EndBracesWithDataType, ListObjectKeyValue, MultiLineKeyValue, ObjectKeyValue, OpenBraces, ReferenceStringKeyValue, RootObjectKeyValue, StringKeyValue, UndefKeyValue } } ================================================ FILE: CacheUtils/DataDumperImport/Import/Attribute.cs ================================================ using System.Collections.Generic; using System.IO; using System.Text.RegularExpressions; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.Utilities; using Intervals; using OptimizedCore; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.DataDumperImport.Import { internal static class Attribute { private static readonly HashSet KnownKeys; private static readonly Regex RangeRegex; static Attribute() { KnownKeys = new HashSet { ImportKeys.Name, ImportKeys.Description, ImportKeys.Code, ImportKeys.Value }; RangeRegex = new Regex("(\\d+)-(\\d+)", RegexOptions.Compiled); } /// /// returns an array of miRNAs given a list of ObjectValues (AbstractData) /// public static (IInterval[] MicroRnas, IRnaEdit[] RnaEdits, bool CdsStartNotFound, bool CdsEndNotFound) ParseList( IImportNode importNode) { var listMembers = importNode.GetListMembers(); if (listMembers == null) throw new InvalidDataException("Encountered an attribute node that could not be converted to a member list."); var microRnaList = new List(); var rnaEditList = new List(); var cdsStartNotFound = false; var cdsEndNotFound = false; foreach (var node in listMembers) { if (!(node is ObjectValueNode objectValue)) throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectValue: [{node.GetType()}]"); (string key, string value) = ParseKeyValue(objectValue); if (key == null) continue; // ReSharper disable once SwitchStatementMissingSomeCases switch (key) { case "miRNA": microRnaList.Add(GetInterval(value)); break; case "_rna_edit": rnaEditList.Add(GetRnaEdit(value)); break; case "cds_start_NF": cdsStartNotFound = true; break; case "cds_end_NF": cdsEndNotFound = true; break; } } var microRnas = microRnaList.Count == 0 ? null : microRnaList.ToArray(); var rnaEdits = rnaEditList.Count == 0 ? null : rnaEditList.ToArray(); return (microRnas, rnaEdits, cdsStartNotFound, cdsEndNotFound); } private static IInterval GetInterval(string s) { var rangeMatch = RangeRegex.Match(s); if (!rangeMatch.Success) throw new InvalidDataException($"Unable to convert the Attribute to a miRNA object. The value string failed the regex: {s}"); int start = int.Parse(rangeMatch.Groups[1].Value); int end = int.Parse(rangeMatch.Groups[2].Value); return new Interval(start, end); } private static RnaEdit GetRnaEdit(string s) { var cols = s.OptimizedSplit(' '); if (cols.Length != 3) throw new InvalidDataException($"Expected 3 columns but found {cols.Length} when parsing RNA edit"); int start = int.Parse(cols[0]); int end = int.Parse(cols[1]); string bases = cols[2]; return new RnaEdit(start, end, bases); } private static (string Key, string Value) ParseKeyValue(ObjectValueNode objectValue) { string key = null; string value = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper attribute object: {node.Key}"); } switch (node.Key) { case ImportKeys.Name: case ImportKeys.Description: // not used break; case ImportKeys.Code: key = node.GetString(); break; case ImportKeys.Value: value = node.GetString(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return (key, value); } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportExon.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.DataDumperImport.Utilities; using Genome; namespace CacheUtils.DataDumperImport.Import { internal static class ImportExon { private static readonly HashSet KnownKeys; static ImportExon() { KnownKeys = new HashSet { ImportKeys.End, ImportKeys.EndPhase, ImportKeys.Phase, ImportKeys.StableId, ImportKeys.Start, ImportKeys.Strand }; } /// /// returns a new exon given an ObjectValue /// public static MutableExon Parse(ObjectValueNode objectValue, Chromosome currentChromosome) { int start = -1; int end = -1; int phase = int.MinValue; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper mapper object: {node.Key}"); } switch (node.Key) { case ImportKeys.Strand: case ImportKeys.StableId: case ImportKeys.EndPhase: // not used break; case ImportKeys.End: end = node.GetInt32(); break; case ImportKeys.Phase: phase = node.GetInt32(); break; case ImportKeys.Start: start = node.GetInt32(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return new MutableExon(currentChromosome, start, end, phase); } /// /// returns an array of exons given a list of ObjectValues (AbstractData) /// public static MutableExon[] ParseList(IImportNode importNode, Chromosome chromosome) { var listMembers = importNode.GetListMembers(); if (listMembers == null) throw new InvalidDataException("Encountered an exon node that could not be converted to a member list."); var exons = new MutableExon[listMembers.Count]; for (var exonIndex = 0; exonIndex < listMembers.Count; exonIndex++) { if (listMembers[exonIndex] is ObjectValueNode objectValue) { exons[exonIndex] = Parse(objectValue, chromosome); } else { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectValue: [{listMembers[exonIndex].GetType()}]"); } } return exons; } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportGene.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.Utilities; namespace CacheUtils.DataDumperImport.Import { internal static class ImportGene { private static readonly HashSet KnownKeys; static ImportGene() { KnownKeys = new HashSet { ImportKeys.End, ImportKeys.StableId, ImportKeys.Start, ImportKeys.Strand }; } public static (int Start, int End, string Id, bool OnReverseStrand) Parse(IImportNode importNode) { var objectValue = importNode.GetObjectValueNode(); if (objectValue == null) throw new InvalidDataException("Encountered a gene import node that could not be converted to an object value node."); int start = -1; int end = -1; string stableId = null; var onReverseStrand = false; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper gene object: {node.Key}"); } // handle each key switch (node.Key) { case ImportKeys.End: end = node.GetInt32(); break; case ImportKeys.StableId: stableId = node.GetString(); break; case ImportKeys.Start: start = node.GetInt32(); break; case ImportKeys.Strand: onReverseStrand = TranscriptUtilities.GetStrand(node); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return (start, end, stableId, onReverseStrand); } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportIntron.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using Intervals; namespace CacheUtils.DataDumperImport.Import { internal static class ImportIntron { private static readonly HashSet KnownKeys; static ImportIntron() { KnownKeys = new HashSet { ImportKeys.Analysis, ImportKeys.Adaptor, ImportKeys.DbId, ImportKeys.End, ImportKeys.Next, ImportKeys.Prev, ImportKeys.SeqName, ImportKeys.Slice, ImportKeys.Start, ImportKeys.Strand }; } /// /// returns a new exon given an ObjectValue /// private static IInterval Parse(ObjectValueNode objectValue) { int start = -1; int end = -1; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper mapper object: {node.Key}"); } switch (node.Key) { case ImportKeys.Analysis: case ImportKeys.Adaptor: case ImportKeys.DbId: case ImportKeys.Next: case ImportKeys.Prev: case ImportKeys.SeqName: case ImportKeys.Strand: case ImportKeys.Slice: // not used break; case ImportKeys.End: end = node.GetInt32(); break; case ImportKeys.Start: start = node.GetInt32(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return new Interval(start, end); } /// /// parses the relevant data from each intron object /// public static IInterval[] ParseList(List members) { var introns = new IInterval[members.Count]; for (var intronIndex = 0; intronIndex < members.Count; intronIndex++) { if (!(members[intronIndex] is ObjectValueNode objectValue)) { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectValue: [{members[intronIndex].GetType()}]"); } introns[intronIndex] = Parse(objectValue); } return introns; } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportKeys.cs ================================================ namespace CacheUtils.DataDumperImport.Import { internal static class ImportKeys { internal const string Adaptor = "adaptor"; internal const string AltSeq = "alt_seq"; internal const string Analysis = "analysis"; internal const string AnalysisId = "_analysis_id"; internal const string Attributes = "attributes"; internal const string BamEditStatus = "_bam_edit_status"; internal const string Biotype = "biotype"; internal const string BoundLengths = "_bound_lengths"; internal const string Ccds = "_ccds"; internal const string CdnaCodingEnd = "cdna_coding_end"; internal const string CdnaCodingStart = "cdna_coding_start"; internal const string CellTypeCount = "cell_type_count"; internal const string CellTypes = "cell_types"; internal const string Code = "code"; internal const string CodingDnaCodingEnd = "cdna_coding_end"; internal const string CodingDnaCodingStart = "cdna_coding_start"; internal const string CodingRegionEnd = "coding_region_end"; internal const string CodingRegionStart = "coding_region_start"; internal const string CodonTable = "codon_table"; internal const string CreatedDate = "created_date"; internal const string DbId = "dbID"; internal const string Description = "description"; internal const string DisplayLabel = "display_label"; internal const string DisplayXref = "display_xref"; internal const string End = "end"; internal const string EndExon = "end_exon"; internal const string EndPhase = "end_phase"; internal const string EpigenomeCount = "epigenome_count"; internal const string ExonCoordinateMapper = "exon_coord_mapper"; internal const string ExternalDb = "external_db"; internal const string ExternalDisplayName = "external_display_name"; internal const string ExternalName = "external_name"; internal const string ExternalStatus = "external_status"; internal const string FeatureType = "feature_type"; internal const string FivePrimeUtr = "five_prime_utr"; internal const string From = "from"; internal const string FromCoordSystem = "from_cs"; internal const string FromName = "from"; internal const string Gene = "_gene"; internal const string GeneHgnc = "_gene_hgnc"; internal const string GeneHgncId = "_gene_hgnc_id"; internal const string GenePhenotype = "_gene_phenotype"; internal const string GeneStableId = "_gene_stable_id"; internal const string GeneSymbol = "_gene_symbol"; internal const string GeneSymbolSource = "_gene_symbol_source"; internal const string Genomic = "GENOME"; internal const string HasEvidence = "has_evidence"; internal const string Id = "id"; internal const string Introns = "introns"; internal const string IsCanonical = "is_canonical"; internal const string IsMatrixCompressed = "matrix_compressed"; internal const string IsSorted = "_is_sorted"; internal const string Mapper = "mapper"; internal const string Matrix = "matrix"; internal const string ModifiedDate = "modified_date"; internal const string Name = "name"; internal const string Next = "next"; internal const string Ori = "ori"; internal const string PairCodingDna = "_pair_cdna"; internal const string PairCount = "pair_count"; internal const string PairGenomic = "_pair_genomic"; internal const string Peptide = "peptide"; internal const string PeptideLength = "peptide_length"; internal const string Phase = "phase"; internal const string PolyPhen = "polyphen"; internal const string PolyPhenHumDiv = "polyphen_humdiv"; internal const string PolyPhenHumVar = "polyphen_humvar"; internal const string Prev = "prev"; internal const string Projected = "projected"; internal const string Protein = "_protein"; internal const string ProteinFeatures = "protein_features"; internal const string ProteinFunctionPredictions = "protein_function_predictions"; internal const string Refseq = "_refseq"; internal const string RegulatoryBuildId = "regulatory_build_id"; internal const string Selenocysteines = "selenocysteines"; internal const string SeqEdits = "seq_edits"; internal const string SeqName = "seqname"; internal const string Sequence = "seq"; internal const string Set = "set"; internal const string Sift = "sift"; internal const string Slice = "slice"; internal const string SortedExons = "sorted_exons"; internal const string Source = "source"; internal const string SplicedSequence = "spliced_seq"; internal const string StableId = "stable_id"; internal const string Start = "start"; internal const string StartExon = "start_exon"; internal const string StartPhase = "start_phase"; internal const string Strand = "strand"; internal const string SubAnalysis = "sub_analysis"; internal const string SwissProt = "_swissprot"; internal const string ThreePrimeUtr = "three_prime_utr"; internal const string To = "to"; internal const string ToCoordSystem = "to_cs"; internal const string ToName = "to"; internal const string TransExonArray = "_trans_exon_array"; internal const string Transcript = "transcript"; internal const string TranslateableSeq = "translateable_seq"; internal const string Translation = "translation"; internal const string TranslationMd5 = "translation_md5"; internal const string Trembl = "_trembl"; internal const string UniParc = "_uniparc"; internal const string Value = "value"; internal const string VariationEffectFeatureCache = "_variation_effect_feature_cache"; internal const string VepFeatureType = "_vep_feature_type"; internal const string VepLazyLoaded = "_vep_lazy_loaded"; internal const string Version = "version"; } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportMapper.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.DataStructures.Mutable; namespace CacheUtils.DataDumperImport.Import { internal static class ImportMapper { private static readonly HashSet KnownKeys; static ImportMapper() { KnownKeys = new HashSet { ImportKeys.FromCoordSystem, ImportKeys.FromName, ImportKeys.IsSorted, ImportKeys.PairCodingDna, ImportKeys.PairCount, ImportKeys.PairGenomic, ImportKeys.ToCoordSystem, ImportKeys.ToName }; } /// /// parses the relevant data from each exon coordinate mapper object /// public static MutableTranscriptRegion[] Parse(ObjectValueNode objectValue) { MutableTranscriptRegion[] cdnaMaps = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper mapper object: {node.Key}"); } switch (node.Key) { case ImportKeys.ToName: case ImportKeys.PairCount: case ImportKeys.PairCodingDna: case ImportKeys.FromCoordSystem: case ImportKeys.FromName: case ImportKeys.IsSorted: case ImportKeys.ToCoordSystem: // not used break; case ImportKeys.PairGenomic: if (node is ObjectKeyValueNode pairGenomicNode) { cdnaMaps = ImportPairGenomic.Parse(pairGenomicNode.Value); } else if (!node.IsUndefined()) { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectKeyValue: [{node.GetType()}]"); } break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return cdnaMaps; } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportMapperPair.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.DataStructures.Mutable; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.DataDumperImport.Import { internal static class ImportMapperPair { private static readonly HashSet KnownKeys; static ImportMapperPair() { KnownKeys = new HashSet { ImportKeys.From, ImportKeys.Ori, ImportKeys.To }; } /// /// parses the relevant data from each mapper pairs object /// private static MutableTranscriptRegion Parse(ObjectValueNode objectValue) { int fromStart = -1; int fromEnd = -1; var fromType = MapperUnitType.Unknown; int toStart = -1; int toEnd = -1; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the mapper pair object: {node.Key}"); } switch (node.Key) { case ImportKeys.Ori: // not used break; case ImportKeys.From: if (node is ObjectKeyValueNode fromKeyNode) { (fromStart, fromEnd, fromType) = ImportMapperUnit.Parse(fromKeyNode.Value); } else { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectKeyValue: [{node.GetType()}]"); } break; case ImportKeys.To: if (node is ObjectKeyValueNode toKeyNode) { (toStart, toEnd, _) = ImportMapperUnit.Parse(toKeyNode.Value); } else { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectKeyValue: [{node.GetType()}]"); } break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return GetCdnaMap(fromStart, fromEnd, fromType, toStart, toEnd); } private static MutableTranscriptRegion GetCdnaMap(int fromStart, int fromEnd, MapperUnitType fromType, int toStart, int toEnd) { return fromType == MapperUnitType.Genomic ? new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, fromStart, fromEnd, toStart, toEnd) : new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, toStart, toEnd, fromStart, fromEnd); } /// /// parses the relevant data from each mapper pairs object /// public static MutableTranscriptRegion[] ParseList(List listMembers) { var cdnaMaps = new List(listMembers.Count); foreach (var entry in listMembers) { if (!(entry is ObjectValueNode mapperPairNode)) throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectValue: [{entry.GetType()}]"); if (mapperPairNode.Type != "Bio::EnsEMBL::Mapper::Pair") throw new InvalidDataException($"Expected a mapper pair data type, but found the following data type: [{mapperPairNode.Type}]"); cdnaMaps.Add(Parse(mapperPairNode)); } return cdnaMaps.ToArray(); } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportMapperUnit.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.Utilities; namespace CacheUtils.DataDumperImport.Import { internal static class ImportMapperUnit { private static readonly HashSet KnownKeys; static ImportMapperUnit() { KnownKeys = new HashSet { ImportKeys.End, ImportKeys.Id, ImportKeys.Start }; } /// /// parses the relevant data from each mapper unit object /// public static (int Start, int End, MapperUnitType Type) Parse(ObjectValueNode objectValue) { int start = -1; int end = -1; var type = MapperUnitType.Unknown; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the mapper unit object: {node.Key}"); } switch (node.Key) { case ImportKeys.Id: type = TranscriptUtilities.GetMapperUnitType(node); break; case ImportKeys.End: end = node.GetInt32(); break; case ImportKeys.Start: start = node.GetInt32(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return (start, end, type); } } public enum MapperUnitType : byte { Unknown, CodingDna, Genomic } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportPairGenomic.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.DataStructures.Mutable; namespace CacheUtils.DataDumperImport.Import { internal static class ImportPairGenomic { private static readonly HashSet KnownKeys; static ImportPairGenomic() { KnownKeys = new HashSet { ImportKeys.Genomic }; } /// /// parses the relevant data from each pair genomic object /// public static MutableTranscriptRegion[] Parse(ObjectValueNode objectValue) { MutableTranscriptRegion[] cdnaMaps = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the pair genomic object: {node.Key}"); } switch (node.Key) { case ImportKeys.Genomic: if (node is ListObjectKeyValueNode genomicNode) { cdnaMaps = ImportMapperPair.ParseList(genomicNode.Values); } else if (!node.IsUndefined()) { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectKeyValue: [{node.GetType()}]"); } break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return cdnaMaps; } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportPrediction.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; namespace CacheUtils.DataDumperImport.Import { internal static class ImportPrediction { private static readonly HashSet KnownKeys; static ImportPrediction() { KnownKeys = new HashSet { ImportKeys.Analysis, ImportKeys.IsMatrixCompressed, ImportKeys.Matrix, ImportKeys.PeptideLength, ImportKeys.SubAnalysis, ImportKeys.TranslationMd5 }; } /// /// parses the relevant data from each prediction object /// public static string Parse(ObjectValueNode objectValue) { string predictionData = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper prediction object: {node.Key}"); } switch (node.Key) { case ImportKeys.Analysis: case ImportKeys.IsMatrixCompressed: case ImportKeys.PeptideLength: case ImportKeys.SubAnalysis: case ImportKeys.TranslationMd5: break; case ImportKeys.Matrix: predictionData = node.GetString(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return predictionData; } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportProteinFunctionPredictions.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.Utilities; namespace CacheUtils.DataDumperImport.Import { internal static class ImportProteinFunctionPredictions { private static readonly HashSet KnownKeys; static ImportProteinFunctionPredictions() { KnownKeys = new HashSet { ImportKeys.PolyPhenHumVar, ImportKeys.PolyPhenHumDiv, ImportKeys.PolyPhen, ImportKeys.Sift }; } public static (string SiftMatrix, string PolyphenMatrix) Parse(ObjectValueNode objectValue) { string siftData = null; string polyphenData = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper mapper object: {node.Key}"); } switch (node.Key) { case ImportKeys.PolyPhen: case ImportKeys.PolyPhenHumDiv: // not used break; case ImportKeys.PolyPhenHumVar: // used by default polyphenData = node.GetPredictionData(); break; case ImportKeys.Sift: siftData = node.GetPredictionData(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return (siftData, polyphenData); } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportRegulatoryFeature.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.Helpers; using Genome; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.DataDumperImport.Import { public static class ImportRegulatoryFeature { private static readonly HashSet KnownKeys; static ImportRegulatoryFeature() { KnownKeys = new HashSet { ImportKeys.AnalysisId, ImportKeys.BoundLengths, ImportKeys.CellTypeCount, ImportKeys.CellTypes, ImportKeys.DbId, ImportKeys.DisplayLabel, ImportKeys.End, ImportKeys.EpigenomeCount, ImportKeys.FeatureType, ImportKeys.HasEvidence, ImportKeys.Projected, ImportKeys.RegulatoryBuildId, ImportKeys.Set, ImportKeys.StableId, ImportKeys.Start, ImportKeys.Strand, ImportKeys.Slice, ImportKeys.VepFeatureType }; } /// /// parses the relevant data from each regulatory element /// public static IRegulatoryRegion Parse(ObjectValueNode objectValue, Chromosome chromosome) { int start = -1; int end = -1; string stableId = null; string type = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper regulatory element object: {node.Key}"); } switch (node.Key) { case ImportKeys.AnalysisId: case ImportKeys.BoundLengths: case ImportKeys.CellTypeCount: case ImportKeys.CellTypes: case ImportKeys.DbId: case ImportKeys.DisplayLabel: case ImportKeys.EpigenomeCount: case ImportKeys.HasEvidence: case ImportKeys.Projected: case ImportKeys.RegulatoryBuildId: case ImportKeys.Set: case ImportKeys.Strand: case ImportKeys.Slice: case ImportKeys.VepFeatureType: // not used break; case ImportKeys.FeatureType: type = node.GetString(); break; case ImportKeys.End: end = node.GetInt32(); break; case ImportKeys.StableId: stableId = node.GetString(); break; case ImportKeys.Start: start = node.GetInt32(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return new RegulatoryRegion(chromosome, start, end, CompactId.Convert(stableId), RegulatoryRegionTypeHelper.GetRegulatoryRegionType(type)); } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportSeqEdits.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; namespace CacheUtils.DataDumperImport.Import { internal static class ImportSeqEdits { private static readonly HashSet KnownKeys; static ImportSeqEdits() { KnownKeys = new HashSet { ImportKeys.AltSeq, ImportKeys.Code, ImportKeys.Description, ImportKeys.End, ImportKeys.Name, ImportKeys.Start }; } /// /// parses the relevant data from each seqedits object /// public static int[] Parse(List members) { var selenocysteineList = new List(); foreach (var seqEditNode in members) { if (!(seqEditNode is ObjectValueNode seListNode)) continue; string code = null; int start = -1; foreach (var node in seListNode.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper seq_edits object: {node.Key}"); } switch (node.Key) { case ImportKeys.AltSeq: case ImportKeys.Description: case ImportKeys.End: case ImportKeys.Name: // not used break; case ImportKeys.Code: code = node.GetString(); break; case ImportKeys.Start: start = node.GetInt32(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } if (code != null && code == "_selenocysteine") selenocysteineList.Add(start); } return selenocysteineList.Count == 0 ? null : selenocysteineList.ToArray(); } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportTranscript.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.DataDumperImport.DataStructures; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.DataDumperImport.Utilities; using CacheUtils.Helpers; using CacheUtils.Utilities; using Genome; using Intervals; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using TranscriptUtilities = CacheUtils.DataDumperImport.Utilities.TranscriptUtilities; namespace CacheUtils.DataDumperImport.Import { public static class ImportTranscript { private static readonly HashSet KnownKeys; static ImportTranscript() { KnownKeys = new HashSet { ImportKeys.Attributes, ImportKeys.BamEditStatus, ImportKeys.Biotype, ImportKeys.Ccds, ImportKeys.CdnaCodingEnd, ImportKeys.CdnaCodingStart, ImportKeys.CodingRegionEnd, ImportKeys.CodingRegionStart, ImportKeys.CreatedDate, ImportKeys.DbId, ImportKeys.Description, ImportKeys.DisplayXref, ImportKeys.End, ImportKeys.ExternalDb, ImportKeys.ExternalDisplayName, ImportKeys.ExternalName, ImportKeys.ExternalStatus, ImportKeys.Gene, ImportKeys.GeneHgnc, ImportKeys.GeneHgncId, ImportKeys.GenePhenotype, ImportKeys.GeneStableId, ImportKeys.GeneSymbol, ImportKeys.GeneSymbolSource, ImportKeys.IsCanonical, ImportKeys.ModifiedDate, ImportKeys.Protein, ImportKeys.Refseq, ImportKeys.Slice, ImportKeys.Source, ImportKeys.StableId, ImportKeys.Start, ImportKeys.Strand, ImportKeys.SwissProt, ImportKeys.TransExonArray, ImportKeys.Translation, ImportKeys.Trembl, ImportKeys.UniParc, ImportKeys.VariationEffectFeatureCache, ImportKeys.VepLazyLoaded, ImportKeys.Version }; } /// /// parses the relevant data from each transcript /// public static MutableTranscript Parse(ObjectValueNode objectValue, Chromosome chromosome, Source source) { // IDs string transcriptId = null; byte transcriptVersion = 1; string proteinId = null; byte proteinVersion = 0; string ccdsId = null; string refSeqId = null; string geneId = null; int hgncId = -1; // gene int geneStart = -1; int geneEnd = -1; var geneOnReverseStrand = false; string geneSymbol = null; var geneSymbolSource = GeneSymbolSource.Unknown; // translation int translationStart = -1; int translationEnd = -1; MutableExon translationStartExon = null; MutableExon translationEndExon = null; // predictions string siftData = null; string polyphenData = null; var bioType = BioType.other; IInterval[] microRnas = null; MutableTranscriptRegion[] cdnaMaps = null; IInterval[] introns = null; string peptideSequence = null; string translateableSequence = null; var isCanonical = false; int compDnaCodingStart = -1; int compDnaCodingEnd = -1; int start = -1; int end = -1; MutableExon[] exons = null; var cdsStartNotFound = false; var cdsEndNotFound = false; int[] selenocysteinePositions = null; IRnaEdit[] rnaEdits = null; string bamEditStatus = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper transcript object: {node.Key}"); } // handle each key switch (node.Key) { case ImportKeys.CodingRegionEnd: case ImportKeys.CodingRegionStart: case ImportKeys.CreatedDate: case ImportKeys.DbId: case ImportKeys.Description: case ImportKeys.DisplayXref: case ImportKeys.ExternalDb: case ImportKeys.ExternalDisplayName: case ImportKeys.ExternalName: case ImportKeys.ExternalStatus: case ImportKeys.GenePhenotype: case ImportKeys.GeneStableId: case ImportKeys.ModifiedDate: case ImportKeys.Protein: case ImportKeys.Slice: case ImportKeys.Source: case ImportKeys.Strand: case ImportKeys.SwissProt: case ImportKeys.Trembl: case ImportKeys.UniParc: case ImportKeys.VepLazyLoaded: // not used break; case ImportKeys.BamEditStatus: bamEditStatus = node.GetString(); break; case ImportKeys.Attributes: (microRnas, rnaEdits, cdsStartNotFound, cdsEndNotFound) = Attribute.ParseList(node); break; case ImportKeys.Biotype: bioType = TranscriptUtilities.GetBiotype(node); break; case ImportKeys.Ccds: ccdsId = node.GetString(); break; case ImportKeys.CdnaCodingEnd: compDnaCodingEnd = node.GetInt32(); break; case ImportKeys.CdnaCodingStart: compDnaCodingStart = node.GetInt32(); break; case ImportKeys.End: end = node.GetInt32(); break; case ImportKeys.GeneHgncId: hgncId = node.GetHgncId(); break; case ImportKeys.GeneSymbol: case ImportKeys.GeneHgnc: // older key geneSymbol = node.GetString(); break; case ImportKeys.GeneSymbolSource: geneSymbolSource = GeneSymbolSourceHelper.GetGeneSymbolSource(node.GetString()); break; case ImportKeys.Gene: (geneStart, geneEnd, geneId, geneOnReverseStrand) = ImportGene.Parse(node); break; case ImportKeys.IsCanonical: isCanonical = node.GetBool(); break; case ImportKeys.Refseq: refSeqId = node.GetString(); break; case ImportKeys.StableId: transcriptId = node.GetString(); break; case ImportKeys.Start: start = node.GetInt32(); break; case ImportKeys.TransExonArray: exons = ImportExon.ParseList(node, chromosome); break; case ImportKeys.Translation: (translationStart, translationEnd, proteinId, proteinVersion, translationStartExon, translationEndExon) = ImportTranslation.Parse(node, chromosome); break; case ImportKeys.VariationEffectFeatureCache: (cdnaMaps, introns, peptideSequence, translateableSequence, siftData, polyphenData, selenocysteinePositions) = ImportVariantEffectFeatureCache.Parse(node); break; case ImportKeys.Version: transcriptVersion = (byte)node.GetInt32(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } var fixedTranscript = AccessionUtilities.GetMaxVersion(transcriptId, transcriptVersion); var fixedProtein = AccessionUtilities.GetMaxVersion(proteinId, proteinVersion); var gene = new MutableGene(chromosome, geneStart, geneEnd, geneOnReverseStrand, geneSymbol, geneSymbolSource, geneId, hgncId); var codingRegion = new CodingRegion(GetCodingRegionStart(geneOnReverseStrand, translationStartExon, translationEndExon, translationStart, translationEnd), GetCodingRegionEnd(geneOnReverseStrand, translationStartExon, translationEndExon, translationStart, translationEnd), compDnaCodingStart, compDnaCodingEnd, 0); int totalExonLength = GetTotalExonLength(exons); int startExonPhase = translationStartExon?.Phase ?? int.MinValue; return new MutableTranscript(chromosome, start, end, fixedTranscript.Id, fixedTranscript.Version, ccdsId, refSeqId, bioType, isCanonical, codingRegion, fixedProtein.Id, fixedProtein.Version, peptideSequence, source, gene, exons, startExonPhase, totalExonLength, introns, cdnaMaps, siftData, polyphenData, translateableSequence, microRnas, cdsStartNotFound, cdsEndNotFound, selenocysteinePositions, rnaEdits, bamEditStatus); } /// /// returns the start position of the coding region. Returns -1 if no translation was possible. /// private static int GetCodingRegionStart(bool onReverseStrand, IInterval startExon, IInterval endExon, int translationStart, int translationEnd) { if (startExon == null || endExon == null) return -1; return onReverseStrand ? endExon.End - translationEnd + 1 : startExon.Start + translationStart - 1; } /// /// returns the start position of the coding region. Returns -1 if no translation was possible. /// private static int GetCodingRegionEnd(bool onReverseStrand, IInterval startExon, IInterval endExon, int translationStart, int translationEnd) { if (startExon == null || endExon == null) return -1; return onReverseStrand ? startExon.End - translationStart + 1 : endExon.Start + translationEnd - 1; } /// /// returns the sum of the exon lengths /// private static int GetTotalExonLength(IEnumerable exons) => exons.Sum(exon => exon.End - exon.Start + 1); } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportTranscriptMapper.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.DataStructures.Mutable; namespace CacheUtils.DataDumperImport.Import { internal static class ImportTranscriptMapper { private static readonly HashSet KnownKeys; static ImportTranscriptMapper() { KnownKeys = new HashSet { ImportKeys.CodingDnaCodingEnd, ImportKeys.CodingDnaCodingStart, ImportKeys.ExonCoordinateMapper, ImportKeys.StartPhase }; } /// /// parses the relevant data from each transcript mapper /// public static MutableTranscriptRegion[] Parse(ObjectValueNode objectValue) { MutableTranscriptRegion[] cdnaMaps = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper transcript mapper object: {node.Key}"); } switch (node.Key) { case ImportKeys.CodingDnaCodingEnd: case ImportKeys.CodingDnaCodingStart: case ImportKeys.StartPhase: break; case ImportKeys.ExonCoordinateMapper: if (node is ObjectKeyValueNode exonCoordMapperNode) { cdnaMaps = ImportMapper.Parse(exonCoordMapperNode.Value); } else { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectKeyValue: [{node.GetType()}]"); } break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return cdnaMaps; } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportTranslation.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.DataDumperImport.Utilities; using Genome; namespace CacheUtils.DataDumperImport.Import { internal static class ImportTranslation { private static readonly HashSet KnownKeys; static ImportTranslation() { KnownKeys = new HashSet { ImportKeys.Adaptor, ImportKeys.DbId, ImportKeys.EndExon, ImportKeys.End, ImportKeys.Sequence, ImportKeys.StableId, ImportKeys.StartExon, ImportKeys.Start, ImportKeys.Transcript, ImportKeys.Version }; } /// /// parses the relevant data from each translation object /// public static (int Start, int End, string ProteinId, byte ProteinVersion, MutableExon startExon, MutableExon endExon) Parse(IImportNode importNode, Chromosome currentChromosome) { var objectValue = importNode.GetObjectValueNode(); if (objectValue == null) throw new InvalidDataException("Encountered a translation import node that could not be converted to an object value node."); int start = -1; int end = -1; string proteinId = null; byte proteinVersion = 0; MutableExon startExon = null; MutableExon endExon = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper mapper object: {node.Key}"); } ObjectKeyValueNode exonNode; switch (node.Key) { case ImportKeys.Adaptor: case ImportKeys.Sequence: case ImportKeys.DbId: case ImportKeys.Transcript: // skip this key break; case ImportKeys.StartExon: exonNode = node as ObjectKeyValueNode; if (exonNode != null) startExon = ImportExon.Parse(exonNode.Value, currentChromosome); break; case ImportKeys.EndExon: exonNode = node as ObjectKeyValueNode; if (exonNode != null) endExon = ImportExon.Parse(exonNode.Value, currentChromosome); break; case ImportKeys.StableId: proteinId = node.GetString(); break; case ImportKeys.End: end = node.GetInt32(); break; case ImportKeys.Start: start = node.GetInt32(); break; case ImportKeys.Version: proteinVersion = (byte)node.GetInt32(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return (start, end, proteinId, proteinVersion, startExon, endExon); } } } ================================================ FILE: CacheUtils/DataDumperImport/Import/ImportVariantEffectFeatureCache.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.DataDumperImport.Utilities; using Intervals; namespace CacheUtils.DataDumperImport.Import { internal static class ImportVariantEffectFeatureCache { private static readonly HashSet KnownKeys; static ImportVariantEffectFeatureCache() { KnownKeys = new HashSet { ImportKeys.CodonTable, ImportKeys.FivePrimeUtr, ImportKeys.Introns, ImportKeys.Mapper, ImportKeys.Peptide, ImportKeys.ProteinFeatures, ImportKeys.ProteinFunctionPredictions, ImportKeys.Selenocysteines, ImportKeys.SeqEdits, ImportKeys.SplicedSequence, ImportKeys.SortedExons, ImportKeys.ThreePrimeUtr, ImportKeys.TranslateableSeq }; } /// /// parses the relevant data from each variant effect feature cache /// public static (MutableTranscriptRegion[] CdnaMaps, IInterval[] Introns, string PeptideSequence, string TranslateableSequence, string SiftData, string PolyPhenData, int[] SelenocysteinePositions) Parse(IImportNode importNode) { var objectValue = importNode.GetObjectValueNode(); if (objectValue == null) throw new InvalidDataException("Encountered a variant effect feature cache node that could not be converted to an object value node."); MutableTranscriptRegion[] cdnaMaps = null; IInterval[] introns = null; string peptideSequence = null; string translateableSequence = null; string siftData = null; string polyphenData = null; int[] selenocysteinePositions = null; foreach (var node in objectValue.Values) { // sanity check: make sure we know about the keys are used for if (!KnownKeys.Contains(node.Key)) { throw new InvalidDataException($"Encountered an unknown key in the dumper variant effect feature cache object: {node.Key}"); } switch (node.Key) { case ImportKeys.CodonTable: case ImportKeys.FivePrimeUtr: case ImportKeys.ProteinFeatures: case ImportKeys.Selenocysteines: case ImportKeys.SortedExons: case ImportKeys.SplicedSequence: case ImportKeys.ThreePrimeUtr: // not used break; case ImportKeys.Introns: introns = node.ParseListObjectKeyValueNode(ImportIntron.ParseList); break; case ImportKeys.Mapper: cdnaMaps = node.ParseObjectKeyValueNode(ImportTranscriptMapper.Parse); break; case ImportKeys.Peptide: peptideSequence = node.GetString(); break; case ImportKeys.ProteinFunctionPredictions: if (node is ObjectKeyValueNode predictionsNode) { (siftData, polyphenData) = ImportProteinFunctionPredictions.Parse(predictionsNode.Value); } else { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectKeyValue: [{node.GetType()}]"); } break; case ImportKeys.SeqEdits: selenocysteinePositions = node.ParseListObjectKeyValueNode(ImportSeqEdits.Parse); break; case ImportKeys.TranslateableSeq: translateableSequence = node.GetString(); break; default: throw new InvalidDataException($"Unknown key found: {node.Key}"); } } return (cdnaMaps, introns, peptideSequence, translateableSequence, siftData, polyphenData, selenocysteinePositions); } } } ================================================ FILE: CacheUtils/DataDumperImport/Utilities/ImportUtilities.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.Import; namespace CacheUtils.DataDumperImport.Utilities { public static class ImportUtilities { public static string GetPredictionData(this IImportNode node) { string predictionData = null; if (node is ObjectKeyValueNode predictionNode) { predictionData = ImportPrediction.Parse(predictionNode.Value); } else if (!node.IsUndefined()) { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectKeyValue: [{node.GetType()}]"); } return predictionData; } public static T[] ParseObjectKeyValueNode(this IImportNode node, Func parseFunc) { T[] results; if (node is ObjectKeyValueNode keyValueNode) { results = parseFunc(keyValueNode.Value); } else { throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectKeyValue: [{node.GetType()}]"); } return results; } public static T[] ParseListObjectKeyValueNode(this IImportNode node, Func, T[]> parseFunc) { T[] results = null; if (node is ListObjectKeyValueNode listObjectKeyValueNode) { results = parseFunc(listObjectKeyValueNode.Values); } else if (!node.IsUndefined()) { throw new InvalidDataException($"Could not transform the AbstractData object into a ListObjectKeyValue: [{node.GetType()}]"); } return results; } } } ================================================ FILE: CacheUtils/DataDumperImport/Utilities/MutableTranscriptComparer.cs ================================================ using System; using System.Collections.Generic; using CacheUtils.DataDumperImport.DataStructures.Mutable; using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.DataDumperImport.Utilities { internal sealed class MutableTranscriptComparer : EqualityComparer { private static bool GeneEquals(MutableGene x, MutableGene y) { return x.Chromosome.Index == y.Chromosome.Index && x.Start == y.Start && x.End == y.End && x.OnReverseStrand == y.OnReverseStrand && x.GeneId == y.GeneId && x.Symbol == y.Symbol && x.HgncId == y.HgncId && x.SymbolSource == y.SymbolSource; } private static bool ExonEquals(MutableExon x, MutableExon y) { return x.Start == y.Start && x.End == y.End && x.Phase == y.Phase; } private static bool IntervalEquals(IInterval x, IInterval y) { return x.Start == y.Start && x.End == y.End; } private static bool TranscriptRegionEquals(ITranscriptRegion x, ITranscriptRegion y) { return x.Start == y.Start && x.End == y.End && x.CdnaStart == y.CdnaStart && x.CdnaEnd == y.CdnaEnd; } private static bool CodingRegionEquals(ICodingRegion x, ICodingRegion y) { return x.Start == y.Start && x.End == y.End && x.CdnaStart == y.CdnaStart && x.CdnaEnd == y.CdnaEnd && x.Length == y.Length; } // ReSharper disable SuggestBaseTypeForParameter private static bool ArrayEquals(T[] x, T[] y, Func equals) // ReSharper restore SuggestBaseTypeForParameter { if (x == null && y == null) return true; if (x == null || y == null) return false; if (x.Length != y.Length) return false; // ReSharper disable once LoopCanBeConvertedToQuery for (var i = 0; i < x.Length; i++) if (!equals(x[i], y[i])) return false; return true; } private static bool IntEquals(int x, int y) => x == y; public override bool Equals(MutableTranscript x, MutableTranscript y) { return x.Chromosome.Index == y.Chromosome.Index && x.Start == y.Start && x.End == y.End && x.Id == y.Id && x.Version == y.Version && x.CcdsId == y.CcdsId && x.RefSeqId == y.RefSeqId && x.Source == y.Source && x.TotalExonLength == y.TotalExonLength && x.TranslateableSequence == y.TranslateableSequence && x.CdsStartNotFound == y.CdsStartNotFound && x.CdsEndNotFound == y.CdsEndNotFound && x.StartExonPhase == y.StartExonPhase && x.BioType == y.BioType && x.IsCanonical == y.IsCanonical && x.ProteinId == y.ProteinId && x.ProteinVersion == y.ProteinVersion && x.PeptideSequence == y.PeptideSequence && x.SiftData == y.SiftData && x.PolyphenData == y.PolyphenData && GeneEquals(x.Gene, y.Gene) && ArrayEquals(x.Exons, y.Exons, ExonEquals) && ArrayEquals(x.Introns, y.Introns, IntervalEquals) && ArrayEquals(x.MicroRnas, y.MicroRnas, IntervalEquals) && ArrayEquals(x.SelenocysteinePositions, y.SelenocysteinePositions, IntEquals) && ArrayEquals(x.CdnaMaps, y.CdnaMaps, TranscriptRegionEquals) && CodingRegionEquals(x.CodingRegion, y.CodingRegion); } public override int GetHashCode(MutableTranscript obj) { unchecked { int hashCode = obj.Chromosome.Index.GetHashCode(); hashCode = (hashCode * 397) ^ obj.Start.GetHashCode(); hashCode = (hashCode * 397) ^ obj.End.GetHashCode(); hashCode = (hashCode * 397) ^ obj.Id.GetHashCode(); hashCode = (hashCode * 397) ^ obj.Version.GetHashCode(); hashCode = (hashCode * 397) ^ obj.BioType.GetHashCode(); hashCode = (hashCode * 397) ^ obj.Source.GetHashCode(); return hashCode; } } } } ================================================ FILE: CacheUtils/DataDumperImport/Utilities/TranscriptUtilities.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.Import; using CacheUtils.Helpers; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.DataDumperImport.Utilities { public static class TranscriptUtilities { private const string CodingDnaMapperUnitTypeKey = "cdna"; private const string GenomeMapperUnitTypeKey = "genome"; private static readonly Dictionary MapperUnitTypes; static TranscriptUtilities() { MapperUnitTypes = new Dictionary { [CodingDnaMapperUnitTypeKey] = MapperUnitType.CodingDna, [GenomeMapperUnitTypeKey] = MapperUnitType.Genomic }; } public static BioType GetBiotype(IImportNode node) => BioTypeHelper.GetBioType(node.GetString()); public static MapperUnitType GetMapperUnitType(IImportNode node) { string mapperUnitTypeString = node.GetString(); if (!MapperUnitTypes.TryGetValue(mapperUnitTypeString, out var ret)) { throw new InvalidDataException($"Unable to find the specified mapper unit type ({mapperUnitTypeString}) in the MapperUnitType dictionary."); } return ret; } public static ObjectValueNode GetObjectValueNode(this IImportNode node) { if (node is ObjectKeyValueNode objectKeyValueNode) return objectKeyValueNode.Value; return null; } public static List GetListMembers(this IImportNode node) { if (node is ListObjectKeyValueNode listObjectKeyValueNode) return listObjectKeyValueNode.Values; return null; } public static bool GetStrand(IImportNode node) { int strandNum = node.GetInt32(); // sanity check: make sure the value is either 1 or -1 if (strandNum != -1 && strandNum != 1) { throw new InvalidDataException($"Expected the strand number to be either -1 or 1. Found: {strandNum}."); } return strandNum == -1; } public static int GetHgncId(this IImportNode node) { string hgnc = node.GetString(); if (hgnc != null && hgnc.StartsWith("HGNC:")) hgnc = hgnc.Substring(5); int hgncId = -1; if (hgnc != null) hgncId = int.Parse(hgnc); return hgncId; } } } ================================================ FILE: CacheUtils/GFF/GeneralAttributes.cs ================================================ namespace CacheUtils.GFF { public sealed class GeneralAttributes : IGeneralAttributes { public string GeneId { get; } public string GeneSymbol { get; } public string TranscriptId { get; } public string ProteinId { get; } public string BioType { get; } public bool IsCanonical { get; } public int InternalGeneId { get; } public GeneralAttributes(string geneId, string geneSymbol, string transcriptId, string proteinId, string bioType, bool isCanonical, int internalGeneId) { GeneId = geneId; GeneSymbol = geneSymbol; TranscriptId = transcriptId; ProteinId = proteinId; BioType = bioType; IsCanonical = isCanonical; InternalGeneId = internalGeneId; } } } ================================================ FILE: CacheUtils/GFF/GffCreator.cs ================================================ using System; using System.Collections.Generic; using Intervals; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.GFF { public sealed class GffCreator { private readonly Dictionary _geneToInternalId; private readonly Source _source; private readonly GffWriter _writer; private readonly HashSet _observedGenes; public GffCreator(GffWriter writer, Dictionary geneToInternalId, Source source) { _writer = writer; _geneToInternalId = geneToInternalId; _source = source; _observedGenes = new HashSet(); } public void Create(IEnumerable> transcriptIntervalArrays) { Console.Write("- writing GFF entries... "); foreach (var transcriptArray in transcriptIntervalArrays) { if (transcriptArray == null) continue; foreach (var interval in transcriptArray.Array) Write(interval.Value); } Console.WriteLine("finished."); } private void Write(ITranscript transcript) { if (transcript.Source != _source) return; var requiredFields = GetRequiredFields(transcript); var attribs = GetGeneralAttributes(transcript); WriteGene(transcript.Gene, requiredFields, attribs.GeneId, attribs.InternalGeneId); WriteTranscript(transcript, requiredFields, attribs); var exons = transcript.TranscriptRegions.GetExons(); var codingRegion = transcript.Translation?.CodingRegion; foreach (var exon in exons) WriteExon(exon, requiredFields, attribs, codingRegion); } private void WriteTranscript(IInterval interval, IRequiredFields requiredFields, IGeneralAttributes attribs) => _writer.WriteTranscript(interval, requiredFields, attribs); private void WriteGene(IGene gene, IRequiredFields requiredFields, string geneId, int internalGeneId) { if (_observedGenes.Contains(internalGeneId)) return; _observedGenes.Add(internalGeneId); var gffGene = GetGene(gene, geneId); _writer.WriteGene(gffGene, requiredFields, internalGeneId); } private void WriteExon(ITranscriptRegion exon, IRequiredFields requiredFields, IGeneralAttributes attribs, IInterval codingRegion) { _writer.WriteExonicRegion(exon, requiredFields, attribs, exon.Id, "exon"); WriteCds(codingRegion, exon, requiredFields, attribs); WriteUtr(codingRegion, exon, requiredFields, attribs); } private void WriteUtr(IInterval codingRegion, ITranscriptRegion exon, IRequiredFields requiredFields, IGeneralAttributes attribs) { if (!GffUtilities.HasUtr(codingRegion, exon)) return; if (exon.Start < codingRegion.Start) Write5PrimeUtr(codingRegion, exon, requiredFields, attribs); if (exon.End > codingRegion.End) Write3PrimeUtr(codingRegion, exon, requiredFields, attribs); } private void Write5PrimeUtr(IInterval codingRegion, ITranscriptRegion exon, IRequiredFields requiredFields, IGeneralAttributes attribs) { int utrEnd = codingRegion.Start - 1; if (utrEnd > exon.End) utrEnd = exon.End; _writer.WriteExonicRegion(new Interval(exon.Start, utrEnd), requiredFields, attribs, exon.Id, "UTR"); } private void Write3PrimeUtr(IInterval codingRegion, ITranscriptRegion exon, IRequiredFields requiredFields, IGeneralAttributes attribs) { int utrStart = codingRegion.End + 1; if (utrStart < exon.Start) utrStart = exon.Start; _writer.WriteExonicRegion(new Interval(utrStart, exon.End), requiredFields, attribs, exon.Id, "UTR"); } private void WriteCds(IInterval codingRegion, ITranscriptRegion exon, IRequiredFields requiredFields, IGeneralAttributes attribs) { if (!GffUtilities.HasCds(codingRegion, exon)) return; var cds = GffUtilities.GetCdsCoordinates(codingRegion, exon); _writer.WriteExonicRegion(cds, requiredFields, attribs, exon.Id, "CDS"); } private static IGffGene GetGene(IGene gene, string id) => new GffGene(gene.Start, gene.End, id, gene.EntrezGeneId.WithVersion, gene.EnsemblId.WithVersion, gene.Symbol); private static IRequiredFields GetRequiredFields(ITranscript transcript) { string source = transcript.Source.ToString(); return new RequiredFields(transcript.Chromosome.UcscName, source, transcript.Gene.OnReverseStrand); } private IGeneralAttributes GetGeneralAttributes(ITranscript transcript) { string bioType = AnnotatedTranscript.GetBioType(transcript.BioType); int internalGeneId = _geneToInternalId[transcript.Gene]; string geneId = transcript.Source == Source.Ensembl ? transcript.Gene.EnsemblId.WithVersion : transcript.Gene.EntrezGeneId.WithVersion; return new GeneralAttributes(geneId, transcript.Gene.Symbol, transcript.Id.WithVersion, transcript.Translation?.ProteinId?.WithVersion, bioType, transcript.IsCanonical, internalGeneId); } } } ================================================ FILE: CacheUtils/GFF/GffGene.cs ================================================ namespace CacheUtils.GFF { public sealed class GffGene : IGffGene { public int Start { get; } public int End { get; } public string Id { get; } public string EntrezGeneId { get; } public string EnsemblGeneId { get; } public string Symbol { get; } public GffGene(int start, int end, string id, string entrezGeneId, string ensemblGeneId, string symbol) { Start = start; End = end; Id = id; EntrezGeneId = entrezGeneId; EnsemblGeneId = ensemblGeneId; Symbol = symbol; } } } ================================================ FILE: CacheUtils/GFF/GffUtilities.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using CacheUtils.Genes.Utilities; using Intervals; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.GFF { public static class GffUtilities { public static bool HasCds(IInterval codingRegion, IInterval exon) { if (codingRegion == null || codingRegion.Start == -1 || codingRegion.End == -1) return false; return exon.Overlaps(codingRegion); } public static IInterval GetCdsCoordinates(IInterval codingRegion, ITranscriptRegion exon) { int start = exon.Start; int end = exon.End; if (start < codingRegion.Start) start = codingRegion.Start; if (end > codingRegion.End) end = codingRegion.End; return new Interval(start, end); } public static bool HasUtr(IInterval codingRegion, IInterval exon) { if (codingRegion == null || codingRegion.Start == -1 || codingRegion.End == -1) return false; return exon.Start < codingRegion.Start || exon.End > codingRegion.End; } public static IEnumerable GetExons(this ITranscriptRegion[] regions) => regions.FilterNonExons().Merge().OrderBy(x => x.Start).ThenBy(x => x.End); private static ITranscriptRegion[] FilterNonExons(this IEnumerable regions) => regions.Where(region => region.Type == TranscriptRegionType.Exon).ToArray(); private static IEnumerable Merge(this IReadOnlyCollection exons) { if (exons.Count == 1) return exons; var mergedExons = new List(); var exonsById = exons.GetMultiValueDict(x => x.Id); foreach (var kvp in exonsById) { mergedExons.Add(MergeTranscriptRegions(kvp.Key, kvp.Value)); } return mergedExons; } private static ITranscriptRegion MergeTranscriptRegions(ushort exonId, IReadOnlyList regions) { if (regions.Count == 1) return regions[0]; int lastIndex = regions.Count - 1; int start = regions[0].Start; int end = regions[lastIndex].End; int cdnaStart = Math.Min(regions[0].CdnaStart, regions[lastIndex].CdnaStart); int cdnaEnd = Math.Max(regions[0].CdnaEnd, regions[lastIndex].CdnaEnd); return new TranscriptRegion(TranscriptRegionType.Exon, exonId, start, end, cdnaStart, cdnaEnd); } } } ================================================ FILE: CacheUtils/GFF/GffWriter.cs ================================================ using System; using System.IO; using Intervals; namespace CacheUtils.GFF { public sealed class GffWriter : IDisposable { private readonly StreamWriter _writer; public GffWriter(StreamWriter writer) => _writer = writer; public void Dispose() => _writer.Dispose(); private void WriteRequiredFields(IInterval interval, IRequiredFields fields, string feature) { char strand = fields.OnReverseStrand ? '-' : '+'; _writer.Write($"{fields.UcscName}\t{fields.Source}\t{feature}\t{interval.Start}\t{interval.End}\t.\t{strand}\t.\t"); } private static bool NotEmpty(string s) => !string.IsNullOrEmpty(s); private void WriteGeneralAttributes(IGeneralAttributes attribs) { if (NotEmpty(attribs.GeneId)) _writer.Write($"gene_id \"{attribs.GeneId}\"; "); if (NotEmpty(attribs.GeneSymbol)) _writer.Write($"gene_name \"{attribs.GeneSymbol}\"; "); if (NotEmpty(attribs.TranscriptId)) _writer.Write($"transcript_id \"{attribs.TranscriptId}\"; "); _writer.Write($"transcript_type \"{attribs.BioType}\"; "); if (attribs.IsCanonical) _writer.Write("tag \"canonical\"; "); if (NotEmpty(attribs.ProteinId)) _writer.Write($"protein_id \"{attribs.ProteinId}\"; "); } public void WriteGene(IGffGene gene, IRequiredFields requiredFields, int internalGeneId) { WriteRequiredFields(gene, requiredFields, "gene"); if (!string.IsNullOrEmpty(gene.Id)) _writer.Write($"gene_id \"{gene.Id}\"; "); if (!string.IsNullOrEmpty(gene.EntrezGeneId)) _writer.Write($"entrez_gene_id \"{gene.EntrezGeneId}\"; "); if (!string.IsNullOrEmpty(gene.EnsemblGeneId)) _writer.Write($"ensembl_gene_id \"{gene.EnsemblGeneId}\"; "); if (!string.IsNullOrEmpty(gene.Symbol)) _writer.Write($"gene_name \"{gene.Symbol}\"; "); WriteInternalGeneId(internalGeneId); } private void WriteInternalGeneId(int geneId) => _writer.WriteLine($"internal_gene_id \"{geneId}\"; "); public void WriteTranscript(IInterval interval, IRequiredFields requiredFields, IGeneralAttributes attribs) { WriteRequiredFields(interval, requiredFields, "transcript"); WriteGeneralAttributes(attribs); WriteInternalGeneId(attribs.InternalGeneId); } public void WriteExonicRegion(IInterval interval, IRequiredFields requiredFields, IGeneralAttributes attribs, ushort exonNumber, string feature) { WriteRequiredFields(interval, requiredFields, feature); WriteGeneralAttributes(attribs); _writer.Write($"exon_number {exonNumber}; "); WriteInternalGeneId(attribs.InternalGeneId); } } } ================================================ FILE: CacheUtils/GFF/IGeneralAttributes.cs ================================================ namespace CacheUtils.GFF { public interface IGeneralAttributes { string GeneId { get; } string GeneSymbol { get; } string TranscriptId { get; } string ProteinId { get; } string BioType { get; } bool IsCanonical { get; } int InternalGeneId { get; } } } ================================================ FILE: CacheUtils/GFF/IGffGene.cs ================================================ using Intervals; namespace CacheUtils.GFF { public interface IGffGene : IInterval { string Id { get; } string EntrezGeneId { get; } string EnsemblGeneId { get; } string Symbol { get; } } } ================================================ FILE: CacheUtils/GFF/IRequiredFields.cs ================================================ namespace CacheUtils.GFF { public interface IRequiredFields { string UcscName { get; } string Source { get; } bool OnReverseStrand { get; } } } ================================================ FILE: CacheUtils/GFF/RequiredFields.cs ================================================ namespace CacheUtils.GFF { public sealed class RequiredFields : IRequiredFields { public string UcscName { get; } public string Source { get; } public bool OnReverseStrand { get; } public RequiredFields(string ucscName, string source, bool onReverseStrand) { UcscName = ucscName; Source = source; OnReverseStrand = onReverseStrand; } } } ================================================ FILE: CacheUtils/Genbank/GenbankEntry.cs ================================================ using Intervals; namespace CacheUtils.Genbank { public sealed class GenbankEntry { public readonly string TranscriptId; public readonly byte TranscriptVersion; public readonly string ProteinId; public readonly byte ProteinVersion; public readonly string GeneId; public readonly string Symbol; public readonly IInterval CodingRegion; public readonly IInterval[] Exons; public GenbankEntry(string transcriptId, byte transcriptVersion, string proteinId, byte proteinVersion, string geneId, string symbol, IInterval codingRegion, IInterval[] exons) { TranscriptId = transcriptId; TranscriptVersion = transcriptVersion; ProteinId = proteinId; ProteinVersion = proteinVersion; GeneId = geneId; Symbol = symbol; CodingRegion = codingRegion; Exons = exons; } } } ================================================ FILE: CacheUtils/Genbank/GenbankReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Intervals; using OptimizedCore; using VariantAnnotation.Utilities; namespace CacheUtils.Genbank { public sealed class GenbankReader : IDisposable { private readonly StreamReader _reader; // ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz private const string LocusTag = "LOCUS"; private const string FeaturesTag = "FEATURES"; private const string OriginTag = "ORIGIN"; private const string TerminatorTag = "//"; private const string GeneFeatureTag = "gene"; private const string CdsFeatureTag = "CDS"; private const string ExonFeatureTag = "exon"; private const string ProteinIdTag = "/protein_id="; private const string GeneIdTag = "/db_xref=\"GeneID:"; private const string GeneSymbolTag = "/gene="; private const int FeatureColumnLength = 21; public GenbankReader(StreamReader reader) => _reader = reader; public GenbankEntry GetGenbankEntry() { // assert that the record starts with LOCUS if (!HasLocus()) return null; (string transcriptId, byte transcriptVersion) = ParseHeader(); var featureData = ParseFeatures(); ParseOrigin(); var exons = featureData.Exons.Count == 0 ? null : featureData.Exons.ToArray(); return transcriptId == null ? null : new GenbankEntry(transcriptId, transcriptVersion, featureData.ProteinId, featureData.ProteinVersion, featureData.GeneId, featureData.GeneSymbol, featureData.CodingRegion, exons); } private void ParseOrigin() { string line; do { line = GetNextLine(); } while (line != null); } private string GetNextLine() { string line = _reader.ReadLine(); if (line == null || line.StartsWith(TerminatorTag)) return null; return line; } private FeatureData ParseFeatures() { var featureState = FeaturesState.Unknown; var featureData = new FeatureData(); while (true) { string line = GetNextLine(); if (line == null || line.StartsWith(OriginTag)) break; bool isNewState; (featureState, isNewState) = GetFeatureState(line, featureState); string info = line.Substring(FeatureColumnLength); // ReSharper disable once SwitchStatementMissingSomeCases switch (featureState) { case FeaturesState.Gene: ParseGeneFeature(info, featureData); break; case FeaturesState.Cds: ParseCdsFeature(isNewState, featureData, info); break; case FeaturesState.Exon: ParseExonFeature(isNewState, featureData, info); break; } } return featureData; } private static void ParseExonFeature(bool isNewState, FeatureData featureData, string info) { if (isNewState) featureData.Exons.Add(GetInterval(info)); } private static void ParseCdsFeature(bool isNewState, FeatureData featureData, string info) { if (isNewState) featureData.CodingRegion = GetInterval(info); if (info.StartsWith(ProteinIdTag)) ParseProteinId(featureData, info); } private static void ParseGeneFeature(string info, FeatureData featureData) { if (info.StartsWith(GeneIdTag)) featureData.GeneId = ParseGeneId(info); if (info.StartsWith(GeneSymbolTag)) featureData.GeneSymbol = ParseGeneSymbol(info); } private (string TranscriptId, byte TranscriptVersion) ParseHeader() { const string versionTag = "VERSION"; string transcriptId = null; byte transcriptVersion = 0; while (true) { string line = GetNextLine(); if (line == null || line.StartsWith(FeaturesTag)) break; if (line.StartsWith(versionTag)) (transcriptId, transcriptVersion) = ParseVersion(line); } return (transcriptId, transcriptVersion); } private static string ParseGeneSymbol(string info) => info.Substring(GeneSymbolTag.Length).Trim('"'); private static string ParseGeneId(string info) => info.Substring(GeneIdTag.Length).Trim('"'); private static void ParseProteinId(FeatureData featureData, string info) { string rawId = info.Substring(ProteinIdTag.Length).Trim('"'); (featureData.ProteinId, featureData.ProteinVersion) = FormatUtilities.SplitVersion(rawId); } private static IInterval GetInterval(string info) { if (info.StartsWith("join")) return GetJoinInterval(info); var coordinates = info.Split(".."); if (coordinates.Length != 2) throw new InvalidDataException("Expected two coordinates in the exon feature line."); int start = int.Parse(coordinates[0].TrimStart('<')); int end = int.Parse(coordinates[1].TrimStart('>')); return new Interval(start, end); } private static IInterval GetJoinInterval(string info) { var cols = info.Substring(5, info.Length - 6).OptimizedSplit(','); int start = int.Parse(cols[0].Split("..")[0]); int end = int.Parse(cols[1].Split("..")[1]); return new Interval(start, end); } private static (FeaturesState State, bool IsNewState) GetFeatureState(string line, FeaturesState featureState) { string label = line.Substring(0, FeatureColumnLength).Trim(); if (string.IsNullOrEmpty(label)) return (featureState, false); if (label.StartsWith(GeneFeatureTag)) return (FeaturesState.Gene, true); if (label.StartsWith(ExonFeatureTag)) return (FeaturesState.Exon, true); return label.StartsWith(CdsFeatureTag) ? (FeaturesState.Cds, true) : (FeaturesState.Unknown, true); } private bool HasLocus() { string line = _reader.ReadLine(); return line != null && line.StartsWith(LocusTag); } private static (string TranscriptId, byte TranscriptVersion) ParseVersion(string line) { string accession = line.Substring(12).Trim(); return FormatUtilities.SplitVersion(accession); } public void Dispose() => _reader.Dispose(); private sealed class FeatureData { public string ProteinId; public byte ProteinVersion; public string GeneId; public string GeneSymbol; public IInterval CodingRegion; public readonly List Exons = new List(); } } } ================================================ FILE: CacheUtils/Genbank/GenbankState.cs ================================================ namespace CacheUtils.Genbank { internal enum FeaturesState : byte { Unknown, Cds, Exon, Gene } } ================================================ FILE: CacheUtils/Genes/Combiners/CombinerUtils.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.Genes.DataStructures; namespace CacheUtils.Genes.Combiners { public static class CombinerUtils { public static UgaGene Merge(UgaGene gene37, UgaGene gene38) { string ensemblId = CombineField(gene37.EnsemblId, gene38.EnsemblId); string entrezGeneId = CombineField(gene37.EntrezGeneId, gene38.EntrezGeneId); int hgncId = CombineField(gene37.HgncId, gene38.HgncId); return new UgaGene(gene37.Chromosome, gene37.GRCh37, gene38.GRCh38, gene37.OnReverseStrand, entrezGeneId, ensemblId, gene37.Symbol, hgncId); } private static T CombineField(T grch37, T grch38) { if (grch37 == null) return grch38; if (grch38 == null) return grch37; if (!grch37.Equals(grch38)) throw new InvalidDataException($"Found two different values: {grch37} & {grch38}"); return grch37; } internal static void RemoveGenes(IEnumerable genes, ICollection remainingGenes) { foreach (var gene in genes) remainingGenes.Remove(gene); } internal static void AddOrphans(ICollection combinedGenes, IEnumerable genes) { foreach (var gene in genes) combinedGenes.Add(gene); } } } ================================================ FILE: CacheUtils/Genes/Combiners/HgncIdCombiner.cs ================================================ using System.Collections.Generic; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.Utilities; namespace CacheUtils.Genes.Combiners { public sealed class HgncIdCombiner : ICombiner { public void Combine(List combinedGenes, HashSet remainingGenes37, HashSet remainingGenes38) { var hgncIds = GetHgncIds(remainingGenes37, remainingGenes38); var genesByHgnc37 = remainingGenes37.GetMultiValueDict(x => x.HgncId); var genesByHgnc38 = remainingGenes38.GetMultiValueDict(x => x.HgncId); foreach (int hgncId in hgncIds) { var genes37 = GetGenesByHgncId(genesByHgnc37, hgncId); var genes38 = GetGenesByHgncId(genesByHgnc38, hgncId); CombinerUtils.RemoveGenes(genes37, remainingGenes37); CombinerUtils.RemoveGenes(genes38, remainingGenes38); // merge if we have one gene on each genome assembly and they're on the same strand if (genes37.Count == 1 && genes38.Count == 1) { var gene37 = genes37[0]; var gene38 = genes38[0]; if (gene37.OnReverseStrand == gene38.OnReverseStrand) { var mergedGene = CombinerUtils.Merge(gene37, gene38); combinedGenes.Add(mergedGene); continue; } } // the following situations happen if we have: // - one gene from GRCh37 and none from GRCh38 (or vice versa) // - there is a mixture of genes forward and reverse strands (13 occurrences) CombinerUtils.AddOrphans(combinedGenes, genes37); CombinerUtils.AddOrphans(combinedGenes, genes38); } } private static List GetGenesByHgncId(IReadOnlyDictionary> genesByHgnc, int hgncId) => genesByHgnc.TryGetValue(hgncId, out var genes) ? genes : UgaAssemblyCombiner.EmptyUgaGenes; private static IEnumerable GetHgncIds(IEnumerable remainingUga37, IEnumerable remainingUga38) { var hgncIds = new HashSet(); foreach (var gene in remainingUga37) if (gene.HgncId != -1) hgncIds.Add(gene.HgncId); foreach (var gene in remainingUga38) if (gene.HgncId != -1) hgncIds.Add(gene.HgncId); return hgncIds; } } } ================================================ FILE: CacheUtils/Genes/Combiners/ICombiner.cs ================================================ using System.Collections.Generic; using CacheUtils.Genes.DataStructures; namespace CacheUtils.Genes.Combiners { public interface ICombiner { void Combine(List combinedGenes, HashSet remainingGenes37, HashSet remainingGenes38); } } ================================================ FILE: CacheUtils/Genes/Combiners/PartitionCombiner.cs ================================================ using System.Collections.Generic; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.Utilities; namespace CacheUtils.Genes.Combiners { public sealed class PartitionCombiner : ICombiner { public void Combine(List combinedGenes, HashSet remainingGenes37, HashSet remainingGenes38) { var grch37 = Partition(remainingGenes37); var grch38 = Partition(remainingGenes38); CombineSet(combinedGenes, grch37.Both, grch38.Both, remainingGenes37, remainingGenes38); CombineSet(combinedGenes, grch37.EntrezGeneOnly, grch38.EntrezGeneOnly, remainingGenes37, remainingGenes38); CombineSet(combinedGenes, grch37.EnsemblOnly, grch38.EnsemblOnly, remainingGenes37, remainingGenes38); } private static void CombineSet(ICollection combinedGenes, IEnumerable uga37, IEnumerable uga38, ICollection remainingGenes37, ICollection remainingGenes38) { var keyToGene37 = uga37.GetMultiValueDict(GetKey); var keyToGene38 = uga38.GetMultiValueDict(GetKey); var keys = GetAllKeys(keyToGene37.Keys, keyToGene38.Keys); foreach (string key in keys) { var genes37 = GetGenesByKey(keyToGene37, key); var genes38 = GetGenesByKey(keyToGene38, key); CombinerUtils.RemoveGenes(genes37, remainingGenes37); CombinerUtils.RemoveGenes(genes38, remainingGenes38); // this happens for both Entrez Gene Only & Ensembl Only if (genes37.Count == 1 && genes38.Count == 1) { var gene37 = genes37[0]; var gene38 = genes38[0]; var mergedGene = CombinerUtils.Merge(gene37, gene38); combinedGenes.Add(mergedGene); continue; } // the following situations happen if we have: // - one gene from GRCh37 and none from GRCh38 (or vice versa) // - two or more non-overlapping genes on the same assembly (14 occurrences) CombinerUtils.AddOrphans(combinedGenes, genes37); CombinerUtils.AddOrphans(combinedGenes, genes38); } } private static List GetGenesByKey(IReadOnlyDictionary> genesByKey, string key) => genesByKey.TryGetValue(key, out var genes) ? genes : UgaAssemblyCombiner.EmptyUgaGenes; private static IEnumerable GetAllKeys(IEnumerable keys37, IEnumerable keys38) { var keys = new HashSet(); foreach (string key in keys37) keys.Add(key); foreach (string key in keys38) keys.Add(key); return keys; } private static string GetKey(UgaGene gene) => gene.EnsemblId + '|' + gene.EntrezGeneId + '|' + (gene.OnReverseStrand ? "R" : "F"); private static (List EnsemblOnly, List Both, List EntrezGeneOnly) Partition( IEnumerable remainingGenes) { var ensemblOnly = new List(); var both = new List(); var entrezGeneOnly = new List(); foreach (var gene in remainingGenes) { if (gene.EntrezGeneId != null && gene.EnsemblId != null) both.Add(gene); else if (gene.EntrezGeneId != null) entrezGeneOnly.Add(gene); else ensemblOnly.Add(gene); } return (ensemblOnly, both, entrezGeneOnly); } } } ================================================ FILE: CacheUtils/Genes/DataStores/AssemblyDataStore.cs ================================================ using System.Collections.Generic; using CacheUtils.Commands.Download; using CacheUtils.Commands.UniversalGeneArchive; using Genome; using IO; using ReferenceSequence.Utilities; namespace CacheUtils.Genes.DataStores { public sealed class AssemblyDataStore { private readonly string _description; public readonly EnsemblGtf EnsemblGtf; public readonly RefSeqGff RefSeqGff; private readonly GlobalCache _globalCache; private AssemblyDataStore(string description, EnsemblGtf ensemblGtf, RefSeqGff refSeqGff, GlobalCache globalCache) { _description = description; EnsemblGtf = ensemblGtf; RefSeqGff = refSeqGff; _globalCache = globalCache; } public static AssemblyDataStore Create(string description, FilePaths.AssemblySpecificPaths paths, Dictionary refNameToChromosome, bool useGrch37) { string ensemblGtfPath = useGrch37 ? ExternalFiles.EnsemblGtfFile37.FilePath : ExternalFiles.EnsemblGtfFile38.FilePath; string refseqGffPath = useGrch37 ? ExternalFiles.RefSeqGffFile37.FilePath : ExternalFiles.RefSeqGffFile38.FilePath; string refseqGenomeGffPath = useGrch37 ? ExternalFiles.RefSeqGenomeGffFile37.FilePath : ExternalFiles.RefSeqGenomeGffFile38.FilePath; var ensemblGtf = EnsemblGtf.Create(ensemblGtfPath, refNameToChromosome); var refSeqGff = RefSeqGff.Create(refseqGffPath, refseqGenomeGffPath, refNameToChromosome); var (refIndexToChromosome, _, _) = SequenceHelper.GetDictionaries(paths.ReferencePath); var globalCache = GlobalCache.Create(paths.RefSeqCachePath, paths.EnsemblCachePath, refIndexToChromosome, refNameToChromosome); return new AssemblyDataStore(description, ensemblGtf, refSeqGff, globalCache); } public IUpdateHgncData UpdateHgncIds(Hgnc oldHgnc) { Logger.WriteLine($"\n*** {_description} ***"); var hgnc = oldHgnc.Clone(); Logger.Write("- removing duplicate gene IDs from HGNC... "); (int numEntrezGeneIdsRemoved, int numEnsemblIdsRemoved) = hgnc.RemoveDuplicateEntries(); Logger.WriteLine($"{numEntrezGeneIdsRemoved} Entrez Gene, {numEnsemblIdsRemoved} Ensembl."); Logger.Write("- adding coordinates to the HGNC entries... "); int numEntriesWithCoordinates = hgnc.AddCoordinates(EnsemblGtf, RefSeqGff); Logger.WriteLine($"{numEntriesWithCoordinates} with coordinates."); Logger.Write("- updating HGNC IDs for RefSeq genes... "); int numGenesWithHgncId = hgnc.HgncGenes.Update(_globalCache.RefSeqGenesByRef, x => x.EntrezGeneId).Consolidate(); Logger.WriteLine($"{numGenesWithHgncId} genes have HGNC ID."); Logger.Write("- updating HGNC IDs for Ensembl genes... "); numGenesWithHgncId = hgnc.HgncGenes.Update(_globalCache.EnsemblGenesByRef, x => x.EnsemblId).Consolidate(); Logger.WriteLine($"{numGenesWithHgncId} genes have HGNC ID."); return new UpdateHgncData(_globalCache.EnsemblGenesByRef, _globalCache.RefSeqGenesByRef); } } } ================================================ FILE: CacheUtils/Genes/DataStores/EnsemblGtf.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.IO; using CacheUtils.Genes.Utilities; using Compression.Utilities; using Genome; namespace CacheUtils.Genes.DataStores { public sealed class EnsemblGtf { public readonly Dictionary EnsemblIdToGene; public readonly Dictionary EnsemblIdToSymbol; private EnsemblGtf(Dictionary ensemblIdToGene, Dictionary ensemblIdToSymbol) { EnsemblIdToGene = ensemblIdToGene; EnsemblIdToSymbol = ensemblIdToSymbol; } public static EnsemblGtf Create(string filePath, Dictionary refNameToChromosome) { var ensemblGenes = LoadEnsemblGenes(GZipUtilities.GetAppropriateStreamReader(filePath), refNameToChromosome); var ensemblIdToGene = ensemblGenes.GetSingleValueDict(x => x.GeneId); var ensemblIdToSymbol = ensemblGenes.GetKeyValueDict(x => x.GeneId, x => x.Symbol); return new EnsemblGtf(ensemblIdToGene, ensemblIdToSymbol); } private static EnsemblGene[] LoadEnsemblGenes(StreamReader streamReader, Dictionary refNameToChromosome) { EnsemblGene[] genes; using (var reader = new EnsemblGtfReader(streamReader, refNameToChromosome)) genes = reader.GetGenes(); return genes; } } } ================================================ FILE: CacheUtils/Genes/DataStores/GeneInfoData.cs ================================================ using System.Collections.Generic; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.IO; using CacheUtils.Genes.Utilities; using Compression.Utilities; namespace CacheUtils.Genes.DataStores { public sealed class GeneInfoData { public readonly Dictionary EntrezGeneIdToSymbol; private GeneInfoData(Dictionary entrezGeneIdToSymbol) { EntrezGeneIdToSymbol = entrezGeneIdToSymbol; } public static GeneInfoData Create(string filePath) { var entrezGeneIdToSymbol = LoadGeneInfoGenes(filePath) .GetKeyValueDict(x => x.EntrezGeneId, x => x.Symbol); return new GeneInfoData(entrezGeneIdToSymbol); } private static IEnumerable LoadGeneInfoGenes(string filePath) { GeneInfo[] genes; using (var streamReader = GZipUtilities.GetAppropriateStreamReader(filePath)) using (var reader = new GeneInfoReader(streamReader)) genes = reader.GetGenes(); return genes; } } } ================================================ FILE: CacheUtils/Genes/DataStores/GlobalCache.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.Genes.Utilities; using CacheUtils.IntermediateIO; using Compression.Utilities; using Genome; namespace CacheUtils.Genes.DataStores { public sealed class GlobalCache { public readonly Dictionary> EnsemblGenesByRef; public readonly Dictionary> RefSeqGenesByRef; private GlobalCache(Dictionary> ensemblGenesByRef, Dictionary> refSeqGenesByRef) { EnsemblGenesByRef = ensemblGenesByRef; RefSeqGenesByRef = refSeqGenesByRef; } public static GlobalCache Create(string refSeqCachePath, string ensemblCachePath, Dictionary refIndexToChromosome, Dictionary refNameToChromosome38) { var ensemblGenesByRef = FlattenGenes(LoadGenes(GZipUtilities.GetAppropriateReadStream(ensemblCachePath), refIndexToChromosome, refNameToChromosome38)); var refSeqGenesByRef = FlattenGenes(LoadGenes(GZipUtilities.GetAppropriateReadStream(refSeqCachePath), refIndexToChromosome, refNameToChromosome38)); return new GlobalCache(ensemblGenesByRef, refSeqGenesByRef); } private static Dictionary> FlattenGenes(IEnumerable genes) { var genesByRef = genes.GetMultiValueDict(x => x.Chromosome.Index); var result = new Dictionary>(); foreach (var kvp in genesByRef.OrderBy(x => x.Key)) { result[kvp.Key] = kvp.Value.GetMultiValueDict(x => x.GeneId).FlattenGeneList(); } return result; } private static IEnumerable LoadGenes(Stream stream, Dictionary refIndexToChromosome, Dictionary refNameToChromosome38) { var geneDict = new Dictionary(); using (var reader = new MutableTranscriptReader(stream, refIndexToChromosome)) { var transcripts = reader.GetTranscripts(); foreach (var transcript in transcripts) { var gene = transcript.Gene; string key = GetGeneKey(gene); if (geneDict.ContainsKey(key)) continue; gene.Chromosome = refNameToChromosome38[gene.Chromosome.UcscName]; geneDict[key] = gene; } } return geneDict.Values.OrderBy(x => x.Chromosome.Index).ThenBy(x => x.Start).ThenBy(x => x.End); } private static string GetGeneKey(MutableGene gene) => gene.GeneId + '|' + gene.Chromosome.UcscName + '|' + gene.Start + '|' + gene.End + '|' + (gene.OnReverseStrand ? 'R' : 'F'); } } ================================================ FILE: CacheUtils/Genes/DataStores/Hgnc.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.IO; using CacheUtils.Genes.Utilities; using Genome; using IO; namespace CacheUtils.Genes.DataStores { public sealed class Hgnc { public readonly HgncGene[] HgncGenes; public readonly Dictionary HgncIdToSymbol; private Hgnc(HgncGene[] hgncGenes, Dictionary hgncIdToSymbol) { HgncGenes = hgncGenes; HgncIdToSymbol = hgncIdToSymbol; } public static Hgnc Create(string filePath, Dictionary refNameToChromosome) { var hgncGenes = LoadHgncGenes(FileUtilities.GetReadStream(filePath), refNameToChromosome); var hgncIdToSymbol = hgncGenes.GetKeyValueDict(x => x.HgncId, x => x.Symbol); return new Hgnc(hgncGenes, hgncIdToSymbol); } private static HgncGene[] LoadHgncGenes(Stream stream, Dictionary refNameToChromosome) { HgncGene[] genes; using (var reader = new HgncReader(stream, refNameToChromosome)) genes = reader.GetGenes(); return genes; } public int AddCoordinates(EnsemblGtf ensemblGtf, RefSeqGff refSeqGff) { foreach (var hgncGene in HgncGenes) { (var refSeqGenes, EnsemblGene ensemblGene, int numMatches) = GetGenes(hgncGene.EntrezGeneId, refSeqGff.EntrezGeneIdToGene, hgncGene.EnsemblId, ensemblGtf.EnsemblIdToGene); switch (numMatches) { case 0: break; case 1: if (ensemblGene == null) AddCoordinatesFromGene(hgncGene, refSeqGenes[0]); else AddCoordinatesFromGene(hgncGene, ensemblGene); break; default: AddCoordinatesFromMultipleGenes(hgncGene, ensemblGene, refSeqGenes); break; } } return HgncGenes.Count(hgncGene => hgncGene.Start != 1 && hgncGene.End != -1); } private static void AddCoordinatesFromMultipleGenes(HgncGene hgncGene, EnsemblGene ensemblGene, IEnumerable refSeqGenes) { if (ensemblGene == null) return; AddCoordinatesFromGene(hgncGene, ensemblGene); foreach (var refSeqGene in refSeqGenes) { if (!Intervals.Utilities.Overlaps(hgncGene.Start, hgncGene.End, refSeqGene.Start, refSeqGene.End)) continue; AddCoordinatesFromGene(hgncGene, refSeqGene); } } private static void AddCoordinatesFromGene(HgncGene hgncGene, IFlatGene flatGene) where T : IFlatGene { hgncGene.Start = hgncGene.Start == -1 ? flatGene.Start : Math.Min(hgncGene.Start, flatGene.Start); hgncGene.End = hgncGene.End == -1 ? flatGene.End : Math.Max(hgncGene.End, flatGene.End); } private static (List RefSeqGenes, EnsemblGene EnsemblGene, int NumMatches) GetGenes( string entrezGeneId, IReadOnlyDictionary> entrezGeneIdToGene, string ensemblId, IReadOnlyDictionary ensemblIdToGene) { var refSeqGenes = GetRefSeqGenes(entrezGeneId, entrezGeneIdToGene); var ensemblGene = GetEnsemblGene(ensemblId, ensemblIdToGene); int numMatches = (ensemblGene != null ? 1 : 0) + refSeqGenes.Count; return (refSeqGenes, ensemblGene, numMatches); } public Hgnc Clone() { var newGenes = new HgncGene[HgncGenes.Length]; for (var i = 0; i < HgncGenes.Length; i++) newGenes[i] = HgncGenes[i].Clone(); return new Hgnc(newGenes, HgncIdToSymbol); } private static EnsemblGene GetEnsemblGene(string ensemblId, IReadOnlyDictionary ensemblIdToGene) { if (string.IsNullOrEmpty(ensemblId)) return null; return ensemblIdToGene.TryGetValue(ensemblId, out var ensemblGene) ? ensemblGene : null; } private static readonly List EmptyList = new List(); private static List GetRefSeqGenes(string entrezGeneId, IReadOnlyDictionary> entrezGeneIdToGene) { if (string.IsNullOrEmpty(entrezGeneId)) return EmptyList; return entrezGeneIdToGene.TryGetValue(entrezGeneId, out var geneList) ? geneList : EmptyList; } public (int NumEntrezGeneIdsRemoved, int NumEnsemblIdsRemoved) RemoveDuplicateEntries() { int numEntrezGeneIdsRemoved = RemoveDuplicatesByTranscriptSource(HgncGenes, x => x.EntrezGeneId, x => x.EntrezGeneId = null); int numEnsemblIdsRemoved = RemoveDuplicatesByTranscriptSource(HgncGenes, x => x.EnsemblId, x => x.EnsemblId = null); return (numEntrezGeneIdsRemoved, numEnsemblIdsRemoved); } private static int RemoveDuplicatesByTranscriptSource(IEnumerable newHgncGenes, Func idFunc, Action nullAction) { var hgncByGeneId = newHgncGenes.GetMultiValueDict(idFunc); var numGeneIdsRemoved = 0; foreach (var kvp in hgncByGeneId) { if (kvp.Value.Count == 1) continue; foreach (var hgncGene in kvp.Value) nullAction(hgncGene); numGeneIdsRemoved++; } return numGeneIdsRemoved; } } } ================================================ FILE: CacheUtils/Genes/DataStores/IUpdateHgncData.cs ================================================ using System.Collections.Generic; using CacheUtils.DataDumperImport.DataStructures.Mutable; namespace CacheUtils.Genes.DataStores { public interface IUpdateHgncData { Dictionary> EnsemblGenesByRef { get; } Dictionary> RefSeqGenesByRef { get; } } } ================================================ FILE: CacheUtils/Genes/DataStores/RefSeqGff.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.IO; using CacheUtils.Genes.Utilities; using Compression.Utilities; using Genome; namespace CacheUtils.Genes.DataStores { public sealed class RefSeqGff { public readonly Dictionary> EntrezGeneIdToGene; public readonly Dictionary EntrezGeneIdToSymbol; private RefSeqGff(Dictionary> entrezGeneIdToGene, Dictionary entrezGeneIdToSymbol) { EntrezGeneIdToGene = entrezGeneIdToGene; EntrezGeneIdToSymbol = entrezGeneIdToSymbol; } public static RefSeqGff Create(string gcfGffPath, string refGffPath, Dictionary accessionToChromosome) { var refSeqGenes = LoadRefSeqGffGenes(GZipUtilities.GetAppropriateStreamReader(gcfGffPath), GZipUtilities.GetAppropriateStreamReader(refGffPath), accessionToChromosome); var entrezGeneIdToGene = refSeqGenes .GetMultiValueDict(x => x.GeneId) .FlattenGeneList() .GetMultiValueDict(x => x.GeneId); var entrezGeneIdToSymbol = refSeqGenes.GetKeyValueDict(x => x.GeneId, x => x.Symbol); return new RefSeqGff(entrezGeneIdToGene, entrezGeneIdToSymbol); } private static List LoadRefSeqGffGenes(StreamReader gcfGffReader, StreamReader refGffReader, Dictionary accessionToChromosome) { var refSeqGenes = new List(); LoadRefSeqGff(gcfGffReader, refSeqGenes, accessionToChromosome); LoadRefSeqGff(refGffReader, refSeqGenes, accessionToChromosome); return refSeqGenes.OrderBy(x => x.Chromosome.Index).ThenBy(x => x.Start).ThenBy(x => x.End).ToList(); } private static void LoadRefSeqGff(StreamReader streamReader, List refSeqGenes, Dictionary accessionToChromosome) { using (var reader = new RefSeqGffReader(streamReader, accessionToChromosome)) { reader.AddGenes(refSeqGenes); } } } } ================================================ FILE: CacheUtils/Genes/DataStores/UpdateHgncData.cs ================================================ using System.Collections.Generic; using CacheUtils.DataDumperImport.DataStructures.Mutable; namespace CacheUtils.Genes.DataStores { public sealed class UpdateHgncData : IUpdateHgncData { public Dictionary> EnsemblGenesByRef { get; } public Dictionary> RefSeqGenesByRef { get; } public UpdateHgncData(Dictionary> ensemblGenesByRef, Dictionary> refSeqGenesByRef) { EnsemblGenesByRef = ensemblGenesByRef; RefSeqGenesByRef = refSeqGenesByRef; } } } ================================================ FILE: CacheUtils/Genes/DataStructures/EnsemblGene.cs ================================================ using System; using Genome; namespace CacheUtils.Genes.DataStructures { public sealed class EnsemblGene : IFlatGene { public Chromosome Chromosome { get; } public int Start { get; } public int End { get; set; } public string GeneId { get; } public string Symbol { get; } public EnsemblGene(Chromosome chromosome, int start, int end, string geneId, string symbol) { Chromosome = chromosome; Start = start; End = end; GeneId = geneId; Symbol = symbol; } public EnsemblGene Clone() => throw new NotImplementedException(); } } ================================================ FILE: CacheUtils/Genes/DataStructures/GeneInfo.cs ================================================ namespace CacheUtils.Genes.DataStructures { public sealed class GeneInfo { public string Symbol { get; } public string EntrezGeneId { get; } public GeneInfo(string symbol, string entrezGeneId) { Symbol = symbol; EntrezGeneId = entrezGeneId; } } } ================================================ FILE: CacheUtils/Genes/DataStructures/HgncGene.cs ================================================  using Genome; namespace CacheUtils.Genes.DataStructures { public sealed class HgncGene : IChromosomeInterval { public Chromosome Chromosome { get; } public int Start { get; set; } public int End { get; set; } public string Symbol { get; } public string EntrezGeneId { get; set; } public string EnsemblId { get; set; } public readonly int HgncId; public HgncGene(Chromosome chromosome, int start, int end, string symbol, string entrezGeneId, string ensemblId, int hgncId) { Chromosome = chromosome; Start = start; End = end; Symbol = symbol; EntrezGeneId = entrezGeneId; EnsemblId = ensemblId; HgncId = hgncId; } public HgncGene Clone() => new HgncGene(Chromosome, -1, -1, Symbol, EntrezGeneId, EnsemblId, HgncId); } } ================================================ FILE: CacheUtils/Genes/DataStructures/IFlatGene.cs ================================================  using Genome; namespace CacheUtils.Genes.DataStructures { public interface IFlatGene { Chromosome Chromosome { get; } int Start { get; } int End { get; set; } T Clone(); } } ================================================ FILE: CacheUtils/Genes/DataStructures/RefSeqGene.cs ================================================  using Genome; namespace CacheUtils.Genes.DataStructures { public sealed class RefSeqGene : IFlatGene { public Chromosome Chromosome { get; } public int Start { get; } public int End { get; set; } private bool OnReverseStrand { get; } public string GeneId { get; } public string Symbol { get; } private int HgncId { get; } public RefSeqGene(Chromosome chromosome, int start, int end, bool onReverseStrand, string entrezGeneId, string symbol, int hgncId) { Chromosome = chromosome; Start = start; End = end; OnReverseStrand = onReverseStrand; GeneId = entrezGeneId; Symbol = symbol; HgncId = hgncId; } public RefSeqGene Clone() => new RefSeqGene(Chromosome, Start, End, OnReverseStrand, GeneId, Symbol, HgncId); } } ================================================ FILE: CacheUtils/Genes/DataStructures/UgaGene.cs ================================================ using Genome; using Intervals; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; namespace CacheUtils.Genes.DataStructures { public sealed class UgaGene { public readonly Chromosome Chromosome; public readonly IInterval GRCh37; public readonly IInterval GRCh38; public readonly bool OnReverseStrand; public readonly int HgncId; public string Symbol { get; set; } public string EntrezGeneId { get; } public string EnsemblId { get; } public UgaGene(Chromosome chromosome, IInterval grch37, IInterval grch38, bool onReverseStrand, string entrezGeneId, string ensemblId, string symbol, int hgncId) { Chromosome = chromosome; GRCh37 = grch37; GRCh38 = grch38; EntrezGeneId = entrezGeneId; EnsemblId = ensemblId; Symbol = symbol; OnReverseStrand = onReverseStrand; HgncId = hgncId; } public override string ToString() { string interval37 = GetInterval(GRCh37); string interval38 = GetInterval(GRCh38); string strand = OnReverseStrand ? "R" : "F"; return $"{Chromosome.UcscName}\t{Chromosome.EnsemblName}\t{Symbol}\t{interval37}\t{interval38}\t{strand}\t{HgncId}\t{EnsemblId}\t{EntrezGeneId}"; } private static string GetInterval(IInterval interval) => interval == null ? "-1\t-1" : $"{interval.Start}\t{interval.End}"; public Gene ToGene(GenomeAssembly genomeAssembly) { var interval = genomeAssembly == GenomeAssembly.GRCh37 ? GRCh37 : GRCh38; return new Gene(Chromosome, interval.Start, interval.End, OnReverseStrand, Symbol, HgncId, CompactId.Convert(EntrezGeneId), CompactId.Convert(EnsemblId)); } } } ================================================ FILE: CacheUtils/Genes/GeneFlattener.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using CacheUtils.Genes.DataStructures; namespace CacheUtils.Genes { public static class GeneFlattener { public static List FlattenGeneList(this Dictionary> genesById) where T : IFlatGene { var genesList = new List(); foreach (var genes in genesById.Values) { var flatGenes = FlattenWithSameId(genes); genesList.AddRange(flatGenes); } return genesList.OrderBy(x => x.Chromosome.Index).ThenBy(x => x.Start).ThenBy(x => x.End).ToList(); } internal static List FlattenWithSameId(List genes) where T : IFlatGene { if (genes == null || genes.Count == 1) return genes; var flatGenes = new List(); var seedGene = genes[0].Clone(); foreach (var gene in genes) { if (Intervals.Utilities.Overlaps(seedGene.Start, seedGene.End, gene.Start, gene.End)) { seedGene.End = Math.Max(seedGene.End, gene.End); continue; } flatGenes.Add(seedGene); seedGene = gene.Clone(); } flatGenes.Add(seedGene); return flatGenes; } } } ================================================ FILE: CacheUtils/Genes/GeneMerger.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.Genes.DataStores; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.Utilities; using Intervals; using IO; namespace CacheUtils.Genes { public static class GeneMerger { public static Dictionary> MergeByHgnc(this IUpdateHgncData data, bool isGrch37) { Logger.Write("- merging RefSeq & Ensembl genes... "); var genesByRef = new Dictionary>(); var mergedGenesByRef = new Dictionary>(); AddGenes(data.EnsemblGenesByRef, genesByRef); AddGenes(data.RefSeqGenesByRef, genesByRef); var totalOrphanEntries = 0; var totalMergedEntries = 0; foreach (var kvp in genesByRef) { var hgncIdToGenes = kvp.Value.GetMultiValueDict(x => x.HgncId.ToString() + '|' + (x.OnReverseStrand ? 'R' : 'F')); (var mergedGenes, int numOrphanEntries, int numMergedEntries) = GetMergedGenes(hgncIdToGenes, isGrch37); mergedGenesByRef[kvp.Key] = mergedGenes; totalOrphanEntries += numOrphanEntries; totalMergedEntries += numMergedEntries; } Logger.WriteLine($"orphans: {totalOrphanEntries}, merged: {totalMergedEntries}"); return mergedGenesByRef; } private static void AddGenes(Dictionary> source, Dictionary> target) { foreach (var kvp in source) { if (target.TryGetValue(kvp.Key, out var targetGeneList)) { targetGeneList.AddRange(kvp.Value); } else { var geneList = new List(); geneList.AddRange(kvp.Value); target[kvp.Key] = geneList; } } } private static (List MergedGenes, int NumOrphanEntries, int NumMergedEntries) GetMergedGenes( Dictionary> hgncIdToGenes, bool isGrch37) { var mergedGenes = new List(); var numOrphanEntries = 0; var numMergedEntries = 0; foreach (var kvp in hgncIdToGenes) { if (kvp.Key.StartsWith("-1|") || kvp.Value.Count == 1) { var convertedGenes = ConvertToUgaGenes(kvp.Value, isGrch37); mergedGenes.AddRange(convertedGenes); numOrphanEntries += convertedGenes.Count; continue; } if (kvp.Value.Count > 2) throw new InvalidDataException("Found more than two genes when merging Ensembl and RefSeq genes."); mergedGenes.Add(GetMergedGene(kvp.Value[0], kvp.Value[1], isGrch37)); numMergedEntries++; } return (mergedGenes, numOrphanEntries, numMergedEntries); } private static List ConvertToUgaGenes(IEnumerable genes, bool isGrch37) { var ugaGenes = new List(); // ReSharper disable once LoopCanBeConvertedToQuery foreach (var gene in genes) { if (gene.GeneId == null) continue; ugaGenes.Add(gene.ToUgaGene(isGrch37)); } return ugaGenes; } private static UgaGene GetMergedGene(MutableGene geneA, MutableGene geneB, bool isGrch37) { (MutableGene ensemblGene, MutableGene refSeqGene) = geneA.GeneId.StartsWith("ENSG") ? (geneA, geneB) : (geneB, geneA); if (ensemblGene.Chromosome.Index != refSeqGene.Chromosome.Index) throw new InvalidDataException($"The two genes are on different chromosomes: {geneA.GeneId} & {geneB.GeneId}"); if (ensemblGene.OnReverseStrand != refSeqGene.OnReverseStrand) throw new InvalidDataException($"Both genes do not have the same orientation: {geneA.GeneId} & {geneB.GeneId}"); IInterval interval = GetMergedInterval(ensemblGene, refSeqGene); (IInterval grch37, IInterval grch38) = isGrch37 ? (interval, null as IInterval) : (null as IInterval, interval); return new UgaGene(ensemblGene.Chromosome, grch37, grch38, ensemblGene.OnReverseStrand, refSeqGene.GeneId, ensemblGene.GeneId, ensemblGene.Symbol, ensemblGene.HgncId); } private static IInterval GetMergedInterval(MutableGene geneA, MutableGene geneB) => new Interval(Math.Min(geneA.Start, geneB.Start), Math.Max(geneA.End, geneB.End)); } } ================================================ FILE: CacheUtils/Genes/GeneSymbolUpdater.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.Genes.DataStructures; using IO; namespace CacheUtils.Genes { public sealed class GeneSymbolUpdater { private int _numUpdatedByHgncId; private int _numUpdatedByEntrezGeneId; private int _numUpdatedByEnsemblId; private int _numUpdatedByRefSeqGff; private readonly Dictionary _hgncIdToSymbol; private readonly Dictionary _entrezGeneIdToSymbol; private readonly Dictionary _ensemblIdToSymbol; private readonly Dictionary _refseqGeneIdToSymbol; public GeneSymbolUpdater(Dictionary hgncIdToSymbol, Dictionary entrezGeneIdToSymbol, Dictionary ensemblIdToSymbol, Dictionary refseqGeneIdToSymbol) { _hgncIdToSymbol = hgncIdToSymbol; _entrezGeneIdToSymbol = entrezGeneIdToSymbol; _ensemblIdToSymbol = ensemblIdToSymbol; _refseqGeneIdToSymbol = refseqGeneIdToSymbol; } public void Update(UgaGene[] mergedGenes) { Logger.Write("- updating gene symbols... "); foreach (var gene in mergedGenes) UpdateGeneSymbol(gene); Logger.WriteLine($"{_numUpdatedByHgncId} by HGNC id, {_numUpdatedByEntrezGeneId} by Entrez Gene ID, {_numUpdatedByEnsemblId} by Ensembl ID, {_numUpdatedByRefSeqGff} by RefSeq GFF"); int numGenesMissingSymbol = mergedGenes.Count(gene => string.IsNullOrEmpty(gene.Symbol)); if (numGenesMissingSymbol > 0) throw new InvalidDataException($"{numGenesMissingSymbol} genes are missing symbols."); } private void UpdateGeneSymbol(UgaGene gene) { string originalSymbol = gene.Symbol; bool isUpdated = UpdateBySymbolDict(gene, x => x.HgncId, x => x == -1, _hgncIdToSymbol); if (isUpdated) { if (gene.Symbol != originalSymbol) _numUpdatedByHgncId++; return; } isUpdated = UpdateBySymbolDict(gene, x => x.EntrezGeneId, string.IsNullOrEmpty, _entrezGeneIdToSymbol); if (isUpdated) { if (gene.Symbol != originalSymbol) _numUpdatedByEntrezGeneId++; return; } isUpdated = UpdateBySymbolDict(gene, x => x.EnsemblId, string.IsNullOrEmpty, _ensemblIdToSymbol); if (isUpdated) { if (gene.Symbol != originalSymbol) _numUpdatedByEnsemblId++; return; } isUpdated = UpdateBySymbolDict(gene, x => x.EntrezGeneId, string.IsNullOrEmpty, _refseqGeneIdToSymbol); // ReSharper disable once InvertIf if (isUpdated && gene.Symbol != originalSymbol) _numUpdatedByRefSeqGff++; } private static bool UpdateBySymbolDict(UgaGene gene, Func idFunc, Func isEmpty, IReadOnlyDictionary idToSymbol) { var key = idFunc(gene); if (isEmpty(key)) return false; if (!idToSymbol.TryGetValue(idFunc(gene), out string symbol)) return false; gene.Symbol = symbol; return true; } } } ================================================ FILE: CacheUtils/Genes/HgncIdConsolidator.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.Genes.Utilities; namespace CacheUtils.Genes { public static class HgncIdConsolidator { public static int Consolidate(this Dictionary> genesByRef) { var numHgncIds = 0; foreach (var refKvp in genesByRef.OrderBy(x => x.Key)) { var genesByHgncId = refKvp.Value.Where(gene => gene.HgncId != -1).GetMultiValueDict(x => x.HgncId); foreach (var kvp in genesByHgncId) { if (kvp.Value.Count <= 1) continue; CreateAggregateGene(kvp.Value.OrderBy(x => x.Start).ThenBy(x => x.End).ToList()); } numHgncIds += refKvp.Value.Count(gene => gene.HgncId != -1); } return numHgncIds; } private static void CreateAggregateGene(IReadOnlyList genes) { var seedGene = genes[0]; for (var i = 1; i < genes.Count; i++) { genes[i].GeneId = null; genes[i].HgncId = -1; seedGene.End = Math.Max(seedGene.End, genes[i].End); } } } } ================================================ FILE: CacheUtils/Genes/HgncIdUpdater.cs ================================================ using System; using System.Collections.Generic; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.Utilities; namespace CacheUtils.Genes { public static class HgncIdUpdater { public static Dictionary> Update(this IEnumerable hgncGenes, Dictionary> genesByRef, Func idFunc) { var geneIdToHgncId = hgncGenes.GetSingleValueDict(idFunc); foreach (var kvp in genesByRef) ReplaceHgncIds(kvp.Value, geneIdToHgncId); return genesByRef; } private static void ReplaceHgncIds(IEnumerable genes, IReadOnlyDictionary geneIdToHgncGene) { foreach (var gene in genes) { gene.HgncId = -1; if (!geneIdToHgncGene.TryGetValue(gene.GeneId, out var hgncGene)) continue; if (!Intervals.Utilities.Overlaps(hgncGene.Start, hgncGene.End, gene.Start, gene.End)) continue; gene.HgncId = hgncGene.HgncId; } } } } ================================================ FILE: CacheUtils/Genes/IO/EnsemblGtfReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.Genes.DataStructures; using ErrorHandling.Exceptions; using Genome; using OptimizedCore; namespace CacheUtils.Genes.IO { public sealed class EnsemblGtfReader : IDisposable { private readonly Dictionary _refNameToChromosome; private readonly StreamReader _reader; private const int ChromosomeIndex = 0; private const int FeatureTypeIndex = 2; private const int StartIndex = 3; private const int EndIndex = 4; private const int InfoIndex = 8; public EnsemblGtfReader(StreamReader reader, Dictionary refNameToChromosome) { _refNameToChromosome = refNameToChromosome; _reader = reader; _reader.ReadLine(); } public EnsemblGene[] GetGenes() { var genes = new List(); while (true) { string line = _reader.ReadLine(); if (line == null) break; if (line.OptimizedStartsWith('#')) continue; var cols = line.OptimizedSplit('\t'); if (cols.Length != 9) throw new InvalidDataException($"Expected 9 columns but found {cols.Length} when parsing the GFF entry."); string featureType = cols[FeatureTypeIndex]; if (featureType != "gene") continue; AddGene(cols, genes); } return genes.ToArray(); } private void AddGene(string[] cols, ICollection genes) { var chromosome = RefSeqGffReader.GetChromosome(cols[ChromosomeIndex], _refNameToChromosome); if (chromosome == null) return; try { int start = int.Parse(cols[StartIndex]); int end = int.Parse(cols[EndIndex]); var infoCols = cols[InfoIndex].Split(';', StringSplitOptions.RemoveEmptyEntries); var info = GetGffFields(infoCols); var gene = new EnsemblGene(chromosome, start, end, info.EnsemblGeneId, info.Name); genes.Add(gene); } catch (Exception) { Console.WriteLine(); Console.WriteLine("Offending line: {0}", string.Join('\t', cols)); for (var i = 0; i < cols.Length; i++) Console.WriteLine("- col {0}: [{1}]", i, cols[i]); throw; } } private static (string EnsemblGeneId, string Name) GetGffFields(string[] cols) { string ensemblId = null; string symbol = null; foreach (string col in cols) { var kvp = col.Trim().OptimizedSplit(' '); string key = kvp[0]; string value = kvp[1].Trim('\"'); // ReSharper disable once SwitchStatementMissingSomeCases switch (key) { case "gene_id": ensemblId = value; break; case "gene_name": symbol = value; break; } } if (string.IsNullOrEmpty(ensemblId) || string.IsNullOrEmpty(symbol)) { throw new UserErrorException(string.Join('\t', cols)); } return (ensemblId, symbol); } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: CacheUtils/Genes/IO/GeneInfoReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.Genes.DataStructures; using OptimizedCore; namespace CacheUtils.Genes.IO { public sealed class GeneInfoReader : IDisposable { private readonly StreamReader _reader; private int _entrezGeneIndex = -1; private int _symbolIndex = -1; private int _dbXrefsIndex = -1; public GeneInfoReader(StreamReader reader) { _reader = reader; string headerLine = _reader.ReadLine(); SetColumnIndices(headerLine); } private void SetColumnIndices(string line) { if (line.StartsWith("#Format: ")) line = line.Substring(9); if (line.OptimizedStartsWith('#')) line = line.Substring(1); var cols = line.OptimizedSplit('\t'); if (cols.Length == 1) cols = line.OptimizedSplit(' '); for (var index = 0; index < cols.Length; index++) { string header = cols[index]; // ReSharper disable once SwitchStatementMissingSomeCases switch (header) { case "dbXrefs": _dbXrefsIndex = index; break; case "GeneID": _entrezGeneIndex = index; break; case "Symbol": _symbolIndex = index; break; } } // ReSharper disable once InvertIf if (_entrezGeneIndex == -1 || _symbolIndex == -1) { Console.WriteLine("_dbXrefsIndex: {0}", _dbXrefsIndex); Console.WriteLine("_entrezGeneIndex: {0}", _entrezGeneIndex); Console.WriteLine("_symbolIndex: {0}", _symbolIndex); throw new InvalidDataException("Not all of the indices were set."); } } /// /// retrieves the next gene. Returns false if there are no more genes available /// private GeneInfo Next() { string line = _reader.ReadLine(); if (line == null) return null; if (!line.StartsWith("9606")) return null; var cols = line.OptimizedSplit('\t'); if (cols.Length != 16) throw new InvalidDataException($"Expected 16 columns but found {cols.Length} when parsing the gene entry:\n[{line}]"); try { string entrezGeneId = cols[_entrezGeneIndex]; string symbol = cols[_symbolIndex]; return new GeneInfo(symbol, entrezGeneId); } catch (Exception) { Console.WriteLine("Offending line: {0}", line); for (var i = 0; i < cols.Length; i++) Console.WriteLine("- col {0}: [{1}]", i, cols[i]); throw; } } public GeneInfo[] GetGenes() { var list = new List(); while (true) { var gene = Next(); if (gene == null) break; list.Add(gene); } return list.ToArray(); } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: CacheUtils/Genes/IO/HgncReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.Genes.DataStructures; using Genome; using IO; using OptimizedCore; namespace CacheUtils.Genes.IO { public sealed class HgncReader : IDisposable { private readonly Dictionary _refNameToChromosome; private readonly StreamReader _reader; private const int HgncIdIndex = 0; private const int SymbolIndex = 1; private const int LocationIndex = 6; private const int EntrezIdIndex = 18; private const int EnsemblIdIndex = 19; public HgncReader(Stream stream, Dictionary refNameToChromosome) { _refNameToChromosome = refNameToChromosome; _reader = FileUtilities.GetStreamReader(stream); _reader.ReadLine(); } /// /// retrieves the next gene. Returns false if there are no more genes available /// private HgncGene Next() { string line = _reader.ReadLine(); if (line == null) return null; var cols = line.OptimizedSplit('\t'); if (cols.Length != 49) throw new InvalidDataException($"Expected 48 columns but found {cols.Length} when parsing the gene entry:[{line}]"); try { int hgncId = int.Parse(cols[HgncIdIndex].Substring(5)); string symbol = cols[SymbolIndex]; Chromosome chromosome = GetChromosome(cols[LocationIndex]); string entrezGeneId = GetId(cols[EntrezIdIndex]); string ensemblId = GetId(cols[EnsemblIdIndex]); return new HgncGene(chromosome, -1, -1, symbol, entrezGeneId, ensemblId, hgncId); } catch (Exception) { Console.WriteLine("Offending line: {0}", line); for (var i = 0; i < cols.Length; i++) Console.WriteLine("- col {0}: [{1}]", i, cols[i]); throw; } } public HgncGene[] GetGenes() { var list = new List(); while (true) { var gene = Next(); if (gene == null) break; list.Add(gene); } return list.ToArray(); } private Chromosome GetChromosome(string cytogeneticBand) { int armPos = GetArmPos(cytogeneticBand); if (armPos == -1) return Chromosome.GetEmptyChromosome(cytogeneticBand); string chrName = cytogeneticBand.Substring(0, armPos); return ReferenceNameUtilities.GetChromosome(_refNameToChromosome, chrName); } private static int GetArmPos(string cytogeneticBand) { int pos = cytogeneticBand.IndexOf('p'); if (pos != -1) return pos; pos = cytogeneticBand.IndexOf('q'); return pos; } private static string GetId(string s) => string.IsNullOrEmpty(s) ? null : s; public void Dispose() => _reader.Dispose(); } } ================================================ FILE: CacheUtils/Genes/IO/RefSeqGffReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.Genes.DataStructures; using Genome; using OptimizedCore; namespace CacheUtils.Genes.IO { public sealed class RefSeqGffReader : IDisposable { private readonly Dictionary _accessionIdToChromosome; private readonly StreamReader _reader; private const int AccessionIndex = 0; private const int FeatureTypeIndex = 2; private const int StartIndex = 3; private const int EndIndex = 4; private const int StrandIndex = 6; private const int InfoIndex = 8; public RefSeqGffReader(StreamReader reader, Dictionary accessionIdToChromosome) { _accessionIdToChromosome = accessionIdToChromosome; _reader = reader; _reader.ReadLine(); } public void AddGenes(List refSeqGenes) { while (true) { string line = _reader.ReadLine(); if (line == null) break; if (line.OptimizedStartsWith('#')) continue; var cols = line.OptimizedSplit('\t'); if (cols.Length != 9) throw new InvalidDataException($"Expected 9 columns but found {cols.Length} when parsing the GFF entry."); string featureType = cols[FeatureTypeIndex]; if (featureType == "gene") AddGene(cols, refSeqGenes); } } private void AddGene(string[] cols, ICollection refSeqGenes) { var chromosome = GetChromosome(cols[AccessionIndex], _accessionIdToChromosome); if (chromosome == null) return; try { int start = int.Parse(cols[StartIndex]); int end = int.Parse(cols[EndIndex]); bool onReverseStrand = cols[StrandIndex] == "-"; var infoCols = cols[InfoIndex].OptimizedSplit(';'); var info = GetGffFields(infoCols); var gene = new RefSeqGene(chromosome, start, end, onReverseStrand, info.EntrezGeneId, info.Name, info.HgncId); refSeqGenes.Add(gene); } catch (Exception) { Console.WriteLine(); Console.WriteLine("Offending line: {0}", string.Join('\t', cols)); for (var i = 0; i < cols.Length; i++) Console.WriteLine("- col {0}: [{1}]", i, cols[i]); throw; } } internal static Chromosome GetChromosome(string referenceName, Dictionary refNameToChromosome) { refNameToChromosome.TryGetValue(referenceName, out var chromosome); return chromosome; } private static (string Name, string EntrezGeneId, int HgncId) GetGffFields(IEnumerable cols) { string entrezGeneId = null; string name = null; int hgncId = -1; foreach (string col in cols) { (string key, string value) = col.OptimizedKeyValue(); // ReSharper disable once SwitchStatementMissingSomeCases switch (key) { case "Dbxref": var ids = value.OptimizedSplit(','); (entrezGeneId, hgncId) = GetIds(ids); break; case "Name": name = value; break; } } return (name, entrezGeneId, hgncId); } private static (string EntrezGeneId, int HgncId) GetIds(IEnumerable ids) { string entrezGeneId = null; int hgncId = -1; foreach (string idPair in ids) { var cols = idPair.OptimizedSplit(':'); // ReSharper disable once SwitchStatementMissingSomeCases switch (cols[0]) { case "HGNC": int lastIndex = cols.Length - 1; if (cols[lastIndex] != "HGNC") hgncId = int.Parse(cols[lastIndex]); break; case "GeneID": entrezGeneId = cols[1]; break; } } return (entrezGeneId, hgncId); } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: CacheUtils/Genes/IO/UgaGeneReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.Genes.DataStructures; using Genome; using Intervals; using IO; using OptimizedCore; namespace CacheUtils.Genes.IO { public sealed class UgaGeneReader : IDisposable { private readonly Dictionary _refNameToChromosome; private readonly StreamReader _reader; public UgaGeneReader(Stream stream, Dictionary refNameToChromosome, bool leaveOpen = false) { _refNameToChromosome = refNameToChromosome; _reader = FileUtilities.GetStreamReader(stream, leaveOpen); _reader.ReadLine(); } public void Dispose() => _reader.Dispose(); public UgaGene[] GetGenes() { var genes = new List(); while (true) { var gene = GetNextGene(); if (gene == null) break; genes.Add(gene); } return genes.ToArray(); } private UgaGene GetNextGene() { string line = _reader.ReadLine(); if (line == null) return null; var cols = line.OptimizedSplit('\t'); if (cols.Length != 11) throw new InvalidDataException($"Expected 11 columns, but found {cols.Length} columns."); string ucscRefName = cols[0]; var chromosome = _refNameToChromosome == null ? Chromosome.GetEmptyChromosome(ucscRefName) : ReferenceNameUtilities.GetChromosome(_refNameToChromosome, ucscRefName); string symbol = cols[2]; int start37 = int.Parse(cols[3]); int end37 = int.Parse(cols[4]); int start38 = int.Parse(cols[5]); int end38 = int.Parse(cols[6]); bool onReverseStrand = cols[7] == "R"; int hgncId = int.Parse(cols[8]); string ensemblId = cols[9]; string entrezGeneId = cols[10]; var grch37 = new Interval(start37, end37); var grch38 = new Interval(start38, end38); return new UgaGene(chromosome, grch37, grch38, onReverseStrand, entrezGeneId, ensemblId, symbol, hgncId); } } } ================================================ FILE: CacheUtils/Genes/IO/UgaGeneWriter.cs ================================================ using System; using System.IO; using System.Text; using CacheUtils.Genes.DataStructures; namespace CacheUtils.Genes.IO { public sealed class UgaGeneWriter : IDisposable { private readonly StreamWriter _writer; public UgaGeneWriter(Stream stream, bool leaveOpen = false) { _writer = new StreamWriter(stream, Encoding.ASCII, 1024, leaveOpen); } public void Dispose() => _writer.Dispose(); public void Write(UgaGene[] genes) { _writer.WriteLine(genes.Length); foreach (var gene in genes) _writer.WriteLine(gene.ToString()); } } } ================================================ FILE: CacheUtils/Genes/UgaAssemblyCombiner.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.Genes.Combiners; using CacheUtils.Genes.DataStructures; using CacheUtils.TranscriptCache.Comparers; using Intervals; namespace CacheUtils.Genes { public static class UgaAssemblyCombiner { internal static readonly List EmptyUgaGenes = new List(); public static UgaGene[] Combine(Dictionary> genesByRef37, Dictionary> genesByRef38) { var referenceIndices = GetReferenceIndices(genesByRef37.Keys, genesByRef38.Keys); var combinedGenes = new List(); var combiners = GetCombiners(); foreach (ushort refIndex in referenceIndices.OrderBy(x => x)) { var ugaGenesByRef = CombineByReference(GetUgaGenesByRef(genesByRef37, refIndex), GetUgaGenesByRef(genesByRef38, refIndex), combiners); combinedGenes.AddRange(ugaGenesByRef); } return combinedGenes.OrderBy(x => x.Chromosome.Index).ThenBy(x => MinCoordinate(x, y => y.Start)) .ThenBy(x => MinCoordinate(x, y => y.End)).ToArray(); } private static List GetCombiners() => new List {new HgncIdCombiner(), new PartitionCombiner()}; private static IEnumerable GetReferenceIndices(IEnumerable keysA, IEnumerable keysB) { var referenceIndices = new HashSet(); foreach (ushort key in keysA) referenceIndices.Add(key); foreach (ushort key in keysB) referenceIndices.Add(key); return referenceIndices.OrderBy(x => x); } private static IEnumerable CombineByReference(IEnumerable uga37, IEnumerable uga38, IEnumerable combiners) { var combinedGenes = new List(); var remainingUga37 = GetRemainingGenes(uga37); var remainingUga38 = GetRemainingGenes(uga38); foreach (var combiner in combiners) combiner.Combine(combinedGenes, remainingUga37, remainingUga38); if (remainingUga37.Count > 0 || remainingUga38.Count > 0) throw new InvalidDataException($"Expected the combiners to handle all genes, but some still remain. GRCh37: {remainingUga37.Count}, GRCh38: {remainingUga38.Count}"); return combinedGenes; } private static HashSet GetRemainingGenes(IEnumerable genes) { var comparer = new UgaGeneComparer(); var geneSet = new HashSet(comparer); foreach (var gene in genes) geneSet.Add(gene); return geneSet; } private static IEnumerable GetUgaGenesByRef(IReadOnlyDictionary> refIndexToUgaGenes, ushort refIndex) => refIndexToUgaGenes.TryGetValue(refIndex, out var genes) ? genes : EmptyUgaGenes; private static int MinCoordinate(UgaGene gene, Func coordFunc) => coordFunc(gene.GRCh37 ?? gene.GRCh38); } } ================================================ FILE: CacheUtils/Genes/Utilities/DictionaryUtilities.cs ================================================ using System; using System.Collections.Generic; using System.IO; namespace CacheUtils.Genes.Utilities { public static class DictionaryUtilities { public static Dictionary GetSingleValueDict(this IEnumerable elements, Func idFunc) { var dict = new Dictionary(); foreach (var element in elements) { var key = idFunc(element); if (key == null) continue; if (dict.ContainsKey(key)) throw new InvalidDataException($"Multiple entries for [{key}] already exist in the dictionary."); dict[key] = element; } return dict; } public static Dictionary> GetMultiValueDict(this IEnumerable elements, Func idFunc) { var dict = new Dictionary>(); foreach (var element in elements) { var key = idFunc(element); if (key == null) continue; if (dict.TryGetValue(key, out var geneList)) geneList.Add(element); else dict[key] = new List { element }; } return dict; } public static Dictionary GetKeyValueDict(this IEnumerable elements, Func keyFunc, Func valueFunc) { var dict = new Dictionary(); foreach (var element in elements) { var key = keyFunc(element); var value = valueFunc(element); if (key == null || value == null) continue; dict[key] = value; } return dict; } public static HashSet GetSet(this IEnumerable elements, Func idFunc) { var set = new HashSet(); foreach (var element in elements) { var key = idFunc(element); set.Add(key); } return set; } public static Dictionary CreateIndex(this IEnumerable elements) { var index = new Dictionary(); var currentIndex = 0; foreach (var element in elements) index[element] = currentIndex++; return index; } } } ================================================ FILE: CacheUtils/Helpers/BioTypeHelper.cs ================================================ using System; using System.Collections.Generic; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.Helpers { public static class BioTypeHelper { private static readonly Dictionary StringToBioTypes; static BioTypeHelper() { StringToBioTypes = new Dictionary { ["aligned_transcript"] = BioType.aligned_transcript, ["ambiguous_orf"] = BioType.ambiguous_orf, ["antisense"] = BioType.antisense, ["antisense_RNA"] = BioType.antisense_RNA, ["bidirectional_promoter_lncRNA"] = BioType.bidirectional_promoter_lncRNA, ["guide_RNA"] = BioType.guide_RNA, ["IG_pseudogene"] = BioType.IG_pseudogene, ["IG_C_gene"] = BioType.IG_C_gene, ["IG_C_pseudogene"] = BioType.IG_C_pseudogene, ["IG_D_gene"] = BioType.IG_D_gene, ["IG_J_gene"] = BioType.IG_J_gene, ["IG_J_pseudogene"] = BioType.IG_J_pseudogene, ["IG_V_gene"] = BioType.IG_V_gene, ["IG_V_pseudogene"] = BioType.IG_V_pseudogene, ["lincRNA"] = BioType.lincRNA, ["lncRNA"] = BioType.lncRNA, ["macro_lncRNA"] = BioType.macro_lncRNA, ["mRNA"] = BioType.mRNA, ["miRNA"] = BioType.miRNA, ["misc_RNA"] = BioType.misc_RNA, ["Mt_rRNA"] = BioType.Mt_rRNA, ["Mt_tRNA"] = BioType.Mt_tRNA, ["non_coding"] = BioType.non_coding, ["nonsense_mediated_decay"] = BioType.nonsense_mediated_decay, ["non_stop_decay"] = BioType.non_stop_decay, ["other"] = BioType.other, ["polymorphic_pseudogene"] = BioType.polymorphic_pseudogene, ["processed_pseudogene"] = BioType.processed_pseudogene, ["processed_transcript"] = BioType.processed_transcript, ["protein_coding"] = BioType.protein_coding, ["pseudogene"] = BioType.pseudogene, ["retained_intron"] = BioType.retained_intron, ["retrotransposed"] = BioType.retrotransposed, ["RNase_MRP_RNA"] = BioType.RNase_MRP_RNA, ["RNase_P_RNA"] = BioType.RNase_P_RNA, ["rRNA"] = BioType.rRNA, ["ribozyme"] = BioType.ribozyme, ["sense_intronic"] = BioType.sense_intronic, ["sense_overlapping"] = BioType.sense_overlapping, ["SRP_RNA"] = BioType.SRP_RNA, ["sRNA"] = BioType.sRNA, ["scRNA"] = BioType.scRNA, ["scaRNA"] = BioType.scaRNA, ["snRNA"] = BioType.snRNA, ["snoRNA"] = BioType.snoRNA, ["telomerase_RNA"] = BioType.telomerase_RNA, ["3prime_overlapping_ncrna"] = BioType.three_prime_overlapping_ncRNA, ["3prime_overlapping_ncRNA"] = BioType.three_prime_overlapping_ncRNA, ["transcribed_processed_pseudogene"] = BioType.transcribed_processed_pseudogene, ["translated_unprocessed_pseudogene"] = BioType.translated_unprocessed_pseudogene, ["transcribed_unitary_pseudogene"] = BioType.transcribed_unitary_pseudogene, ["TEC"] = BioType.TEC, ["tRNA"] = BioType.tRNA, ["translated_processed_pseudogene"] = BioType.translated_processed_pseudogene, ["transcribed_unprocessed_pseudogene"] = BioType.transcribed_unprocessed_pseudogene, ["TR_C_gene"] = BioType.TR_C_gene, ["TR_D_gene"] = BioType.TR_D_gene, ["TR_J_gene"] = BioType.TR_J_gene, ["TR_J_pseudogene"] = BioType.TR_J_pseudogene, ["TR_V_gene"] = BioType.TR_V_gene, ["TR_V_pseudogene"] = BioType.TR_V_pseudogene, ["unitary_pseudogene"] = BioType.unitary_pseudogene, ["unprocessed_pseudogene"] = BioType.unprocessed_pseudogene, ["vaultRNA"] = BioType.vaultRNA, ["Y_RNA"] = BioType.Y_RNA }; } public static BioType GetBioType(string s) { if (s == null) throw new ArgumentNullException(nameof(s)); if (!StringToBioTypes.TryGetValue(s, out var ret)) throw new InvalidOperationException($"The specified biotype ({s}) was not found in the BioType enum."); return ret; } } } ================================================ FILE: CacheUtils/Helpers/GeneSymbolSourceHelper.cs ================================================ using System; using System.Collections.Generic; using CacheUtils.DataDumperImport.DataStructures; namespace CacheUtils.Helpers { public static class GeneSymbolSourceHelper { private static readonly Dictionary StringToGeneSymbolSources; static GeneSymbolSourceHelper() { StringToGeneSymbolSources = new Dictionary { ["Clone_based_ensembl_gene"] = GeneSymbolSource.CloneBasedEnsemblGene, ["Clone_based_vega_gene"] = GeneSymbolSource.CloneBasedVegaGene, ["EntrezGene"] = GeneSymbolSource.EntrezGene, ["HGNC"] = GeneSymbolSource.HGNC, ["LRG"] = GeneSymbolSource.LRG, ["miRBase"] = GeneSymbolSource.miRBase, ["NCBI"] = GeneSymbolSource.NCBI, ["RFAM"] = GeneSymbolSource.RFAM, ["Uniprot_gn"] = GeneSymbolSource.UniProtGeneName }; } public static GeneSymbolSource GetGeneSymbolSource(string s) { if (s == null) return GeneSymbolSource.Unknown; if (!StringToGeneSymbolSources.TryGetValue(s, out var ret)) throw new InvalidOperationException($"The specified gene symbol source ({s}) was not found in the GeneSymbolSource enum."); return ret; } } } ================================================ FILE: CacheUtils/Helpers/RegulatoryRegionTypeHelper.cs ================================================ using System; using System.Collections.Generic; using VariantAnnotation.Interface.Caches; namespace CacheUtils.Helpers { public static class RegulatoryRegionTypeHelper { private static readonly Dictionary StringToRegulatoryRegionTypes; static RegulatoryRegionTypeHelper() { StringToRegulatoryRegionTypes = new Dictionary { ["CTCF_binding_site"] = RegulatoryRegionType.CTCF_binding_site, ["TF_binding_site"] = RegulatoryRegionType.TF_binding_site, ["enhancer"] = RegulatoryRegionType.enhancer, ["open_chromatin_region"] = RegulatoryRegionType.open_chromatin_region, ["promoter"] = RegulatoryRegionType.promoter, ["promoter_flanking_region"] = RegulatoryRegionType.promoter_flanking_region, ["mature_protein_region"] = RegulatoryRegionType.mature_protein_region }; } public static RegulatoryRegionType GetRegulatoryRegionType(string s) { if (s == null) throw new ArgumentNullException(nameof(s)); if (!StringToRegulatoryRegionTypes.TryGetValue(s, out var ret)) throw new InvalidOperationException($"The specified regulatory region type ({s}) was not found in the RegulatoryRegionType enum."); return ret; } } } ================================================ FILE: CacheUtils/Helpers/TranscriptCacheHelper.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using IO; using VariantAnnotation.Caches; using VariantAnnotation.IO.Caches; namespace CacheUtils.Helpers { public static class TranscriptCacheHelper { public static TranscriptCacheData GetCache(string cachePath, Dictionary refIndexToChromosome) { if (!File.Exists(cachePath)) throw new FileNotFoundException($"Could not find {cachePath}"); TranscriptCacheData cache; using (var reader = new TranscriptCacheReader(FileUtilities.GetReadStream(cachePath))) cache = reader.Read(refIndexToChromosome); return cache; } } } ================================================ FILE: CacheUtils/IntermediateIO/CcdsReader.cs ================================================ using System.Collections.Generic; using System.IO; using IO; using OptimizedCore; using VariantAnnotation.Utilities; namespace CacheUtils.IntermediateIO { public static class CcdsReader { private const int CcdsIdIndex = 0; private const int NucleotideIdIndex = 4; public static Dictionary> GetCcdsIdToEnsemblId(string ccdsPath) { var ccdsIdToEnsemblId = new Dictionary>(); using (var reader = FileUtilities.GetStreamReader(FileUtilities.GetReadStream(ccdsPath))) { while (true) { string line = reader.ReadLine(); if (line == null) break; if (line.OptimizedStartsWith('#')) continue; var cols = line.OptimizedSplit('\t'); if (cols.Length != 8) throw new InvalidDataException($"Expected 8 columns, but found {cols.Length}: [{line}]"); string nucleotideId = cols[NucleotideIdIndex]; if (!nucleotideId.StartsWith("ENST")) continue; var ccds = FormatUtilities.SplitVersion(cols[CcdsIdIndex]); var ensembl = FormatUtilities.SplitVersion(nucleotideId); if (ccdsIdToEnsemblId.TryGetValue(ccds.Id, out var ensemblList)) ensemblList.Add(ensembl.Id); else ccdsIdToEnsemblId[ccds.Id] = new List { ensembl.Id }; } } return ccdsIdToEnsemblId; } } } ================================================ FILE: CacheUtils/IntermediateIO/GenbankReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.Genbank; using Intervals; using IO; using OptimizedCore; namespace CacheUtils.IntermediateIO { internal sealed class GenbankReader : IDisposable { private readonly StreamReader _reader; internal GenbankReader(Stream stream) { _reader = FileUtilities.GetStreamReader(stream); IntermediateIoCommon.ReadHeader(_reader, IntermediateIoCommon.FileType.Genbank); } public Dictionary GetIdToGenbank() { var genbankDict = new Dictionary(); while (true) { var entry = GetNextEntry(); if (entry == null) break; genbankDict[entry.TranscriptId] = entry; } return genbankDict; } private GenbankEntry GetNextEntry() { string line = _reader.ReadLine(); if (line == null) return null; var info = ReadTranscriptInfo(line); var exons = ReadExons(info.NumExons); return new GenbankEntry(info.TranscriptId, info.TranscriptVersion, info.ProteinId, info.ProteinVersion, info.GeneId, info.GeneSymbol, info.CodingRegion, exons); } private IInterval[] ReadExons(int numExons) { if (numExons == 0) return null; string line = _reader.ReadLine(); if (line == null) throw new InvalidOperationException("Unexpected null line when parsing exons"); var cols = line.OptimizedSplit('\t'); if (cols[0] != "Exons") throw new InvalidDataException($"Expected the first keyword to be Exons, but found something different: {line}"); var exons = new IInterval[numExons]; var colIndex = 1; for (var i = 0; i < numExons; i++) { int start = int.Parse(cols[colIndex++]); int end = int.Parse(cols[colIndex++]); exons[i] = new Interval(start, end); } return exons; } private static (string TranscriptId, byte TranscriptVersion, string ProteinId, byte ProteinVersion, string GeneId, string GeneSymbol, IInterval CodingRegion, int NumExons) ReadTranscriptInfo(string line) { var cols = line.OptimizedSplit('\t'); if (cols.Length != 9) throw new InvalidDataException($"Expected 9 columns, but found {cols.Length} columns instead."); string transcriptId = cols[0]; byte transcriptVersion = byte.Parse(cols[1]); string proteinId = cols[2]; byte proteinVersion = byte.Parse(cols[3]); string geneId = cols[4]; string geneSymbol = cols[5]; int start = int.Parse(cols[6]); int end = int.Parse(cols[7]); int numExons = int.Parse(cols[8]); var codingRegion = new Interval(start, end); return (transcriptId, transcriptVersion, proteinId, proteinVersion, geneId, geneSymbol, codingRegion, numExons); } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: CacheUtils/IntermediateIO/GenbankWriter.cs ================================================ using System; using System.IO; using CacheUtils.Genbank; namespace CacheUtils.IntermediateIO { internal sealed class GenbankWriter : IDisposable { private readonly StreamWriter _writer; internal GenbankWriter(StreamWriter writer, IntermediateIoHeader header) { _writer = writer; _writer.NewLine = "\n"; header.Write(_writer, IntermediateIoCommon.FileType.Genbank); } internal void Write(GenbankEntry entry) { int numExons = entry.Exons?.Length ?? 0; int codingRegionStart = entry.CodingRegion?.Start ?? -1; int codingRegionEnd = entry.CodingRegion?.End ?? -1; string proteinId = entry.ProteinId ?? ""; byte proteinVersion = entry.ProteinVersion; _writer.WriteLine($"{entry.TranscriptId}\t{entry.TranscriptVersion}\t{proteinId}\t{proteinVersion}\t{entry.GeneId}\t{entry.Symbol}\t{codingRegionStart}\t{codingRegionEnd}\t{numExons}"); if (entry.Exons == null) return; _writer.Write("Exons"); foreach (var exon in entry.Exons) _writer.Write($"\t{exon.Start}\t{exon.End}"); _writer.WriteLine(); } public void Dispose() => _writer.Dispose(); } } ================================================ FILE: CacheUtils/IntermediateIO/IntermediateIoCommon.cs ================================================ using System.IO; namespace CacheUtils.IntermediateIO { public static class IntermediateIoCommon { public const string Header = "NirvanaIntermediateIo"; public enum FileType : byte { Genbank, Polyphen, Regulatory, Sift, Transcript } // ReSharper disable once ParameterOnlyUsedForPreconditionCheck.Global public static IntermediateIoHeader ReadHeader(StreamReader reader, FileType expectedType) { (string id, FileType type, IntermediateIoHeader header) = IntermediateIoHeader.Read(reader); if (id != Header || type != expectedType) throw new InvalidDataException("Could not verify the header tag or the file type in the header."); return header; } } } ================================================ FILE: CacheUtils/IntermediateIO/IntermediateIoHeader.cs ================================================ using System.IO; using Genome; using OptimizedCore; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.IntermediateIO { public sealed class IntermediateIoHeader { public readonly ushort VepVersion; public readonly long VepReleaseTicks; public readonly Source Source; public readonly GenomeAssembly Assembly; private readonly int _numRefSeqs; public IntermediateIoHeader(ushort vepVersion, long vepReleaseTicks, Source transcriptSource, GenomeAssembly genomeAssembly, int numRefSeqs) { VepVersion = vepVersion; VepReleaseTicks = vepReleaseTicks; Source = transcriptSource; Assembly = genomeAssembly; _numRefSeqs = numRefSeqs; } internal void Write(StreamWriter writer, IntermediateIoCommon.FileType fileType) { writer.WriteLine($"{IntermediateIoCommon.Header}\t{(byte)fileType}"); writer.WriteLine($"{VepVersion}\t{VepReleaseTicks}\t{(byte)Source}\t{(byte)Assembly}\t{_numRefSeqs}"); } internal static (string Id, IntermediateIoCommon.FileType Type, IntermediateIoHeader Header) Read(StreamReader reader) { var cols = reader.ReadLine()?.OptimizedSplit('\t'); var cols2 = reader.ReadLine()?.OptimizedSplit('\t'); if (cols == null || cols2 == null) throw new InvalidDataException("Found unexpected null lines when parsing the intermediate I/O file header"); string id = cols[0]; var type = (IntermediateIoCommon.FileType)byte.Parse(cols[1]); ushort vepVersion = ushort.Parse(cols2[0]); long vepReleaseTicks = long.Parse(cols2[1]); var source = (Source)byte.Parse(cols2[2]); var genomeAssembly = (GenomeAssembly)byte.Parse(cols2[3]); int numRefSeqs = int.Parse(cols2[4]); var header = new IntermediateIoHeader(vepVersion, vepReleaseTicks, source, genomeAssembly, numRefSeqs); return (id, type, header); } } } ================================================ FILE: CacheUtils/IntermediateIO/LrgReader.cs ================================================ using System.Collections.Generic; using System.IO; using IO; using OptimizedCore; using VariantAnnotation.Utilities; namespace CacheUtils.IntermediateIO { public static class LrgReader { private const int RefSeqTranscriptIndex = 4; private const int EnsemblTranscriptIndex = 5; private const int CccdsIndex = 6; public static HashSet GetTranscriptIds(string lrgPath, Dictionary> ccdsIdToEnsemblId) { var transcriptIds = new HashSet(); using (var reader = FileUtilities.GetStreamReader(FileUtilities.GetReadStream(lrgPath))) { while (true) { string line = reader.ReadLine(); if (line == null) break; if (line.OptimizedStartsWith('#')) continue; var cols = line.OptimizedSplit('\t'); if (cols.Length != 7) throw new InvalidDataException($"Expected 7 columns, but found {cols.Length}: [{line}]"); var refSeqTranscript = FormatUtilities.SplitVersion(Sanitize(cols[RefSeqTranscriptIndex])); var ccds = FormatUtilities.SplitVersion(Sanitize(cols[CccdsIndex])); var ensemblTranscriptIds = GetEnsemblTranscriptIds(ccds.Id, ccdsIdToEnsemblId, Sanitize(cols[EnsemblTranscriptIndex])); if (refSeqTranscript.Id != null) transcriptIds.Add(refSeqTranscript.Id); // ReSharper disable once InvertIf if (ensemblTranscriptIds != null) foreach (string id in ensemblTranscriptIds) transcriptIds.Add(id); } } return transcriptIds; } private static List GetEnsemblTranscriptIds(string ccdsId, IReadOnlyDictionary> ccdsIdToEnsemblId, string ensemblId) { if (!string.IsNullOrEmpty(ensemblId)) return new List { ensemblId }; if (string.IsNullOrEmpty(ccdsId)) return null; return !ccdsIdToEnsemblId.TryGetValue(ccdsId, out var ensemblList) ? null : ensemblList; } private static string Sanitize(string s) => s == "-" ? null : s; } } ================================================ FILE: CacheUtils/IntermediateIO/MutableTranscriptReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.TranscriptCache; using Genome; using Intervals; using IO; using OptimizedCore; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.IntermediateIO { internal sealed class MutableTranscriptReader : IDisposable { private readonly Dictionary _refIndexToChromosome; private readonly StreamReader _reader; public readonly IntermediateIoHeader Header; private readonly ISequence _sequence = new NSequence(); internal MutableTranscriptReader(Stream stream, Dictionary refIndexToChromosome) { _refIndexToChromosome = refIndexToChromosome; _reader = FileUtilities.GetStreamReader(stream); Header = IntermediateIoCommon.ReadHeader(_reader, IntermediateIoCommon.FileType.Transcript); } public MutableTranscript[] GetTranscripts() { var transcripts = new List(); while (true) { var transcript = GetNextTranscript(); if (transcript == null) break; transcripts.Add(transcript); } return transcripts.ToArray(); } private MutableTranscript GetNextTranscript() { string line = _reader.ReadLine(); if (line == null) return null; var transcriptInfo = ReadTranscriptInfo(line); var gene = ReadGene(transcriptInfo.Chromosome); var translation = ReadTranslation(); var exons = ReadExons(transcriptInfo.Chromosome); var introns = ReadIntervals("Introns"); var cdnaMaps = ReadCdnaMaps(); var mirnas = ReadIntervals("miRNAs"); var selenocysteines = ReadSelenocysteines(); var rnaEdits = ReadRnaEdits(); var transcript = new MutableTranscript(transcriptInfo.Chromosome, transcriptInfo.Start, transcriptInfo.End, transcriptInfo.Id, transcriptInfo.Version, transcriptInfo.CcdsId, transcriptInfo.RefSeqId, transcriptInfo.BioType, transcriptInfo.IsCanonical, translation.CodingRegion, translation.Id, translation.Version, translation.PeptideSeq, transcriptInfo.Source, gene, exons, transcriptInfo.StartExonPhase, transcriptInfo.TotalExonLength, introns, cdnaMaps, null, null, transcriptInfo.TranslateableSequence, mirnas, transcriptInfo.CdsStartNotFound, transcriptInfo.CdsEndNotFound, selenocysteines, rnaEdits, transcriptInfo.BamEditStatus); AddMutableContents(transcript); return transcript; } private void AddMutableContents(MutableTranscript mt) { mt.TranscriptRegions = TranscriptRegionMerger.GetTranscriptRegions(mt.CdnaMaps, mt.Exons, mt.Introns, mt.Gene.OnReverseStrand); TranscriptRegionValidater.Validate(mt.Id, mt.CdnaMaps, mt.Exons, mt.Introns, mt.TranscriptRegions); mt.NewStartExonPhase = mt.StartExonPhase < 0 ? (byte)0 : (byte)mt.StartExonPhase; if (mt.CodingRegion == null) return; var codingSequence = new CodingSequence(_sequence, mt.CodingRegion, mt.TranscriptRegions, mt.Gene.OnReverseStrand, mt.NewStartExonPhase, mt.RnaEdits); mt.CdsLength = codingSequence.GetCodingSequence().Length; mt.CodingRegion = new CodingRegion(mt.CodingRegion.Start, mt.CodingRegion.End, mt.CodingRegion.CdnaStart, mt.CodingRegion.CdnaEnd, mt.CdsLength); } private int[] ReadSelenocysteines() { var cols = GetColumns("Sec"); int numPositions = int.Parse(cols[1]); if (numPositions == 0) return null; var positions = new int[numPositions]; var colIndex = 2; for (var i = 0; i < numPositions; i++) positions[i] = int.Parse(cols[colIndex++]); return positions; } private IRnaEdit[] ReadRnaEdits() { var cols = GetColumns("RnaEdits"); int numRnaEdits = int.Parse(cols[1]); if (numRnaEdits == 0) return null; var rnaEdits = new IRnaEdit[numRnaEdits]; var colIndex = 2; for (var i = 0; i < numRnaEdits; i++) { int start = int.Parse(cols[colIndex++]); int end = int.Parse(cols[colIndex++]); string bases = cols[colIndex++]; rnaEdits[i] = new RnaEdit(start, end, bases); } return rnaEdits; } private MutableTranscriptRegion[] ReadCdnaMaps() { var cols = GetColumns("cDNA"); int numCdnaMaps = int.Parse(cols[1]); if (numCdnaMaps == 0) return null; var cdnaMaps = new MutableTranscriptRegion[numCdnaMaps]; var colIndex = 2; for (var i = 0; i < numCdnaMaps; i++) { int start = int.Parse(cols[colIndex++]); int end = int.Parse(cols[colIndex++]); int cdnaStart = int.Parse(cols[colIndex++]); int cdnaEnd = int.Parse(cols[colIndex++]); cdnaMaps[i] = new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, start, end, cdnaStart, cdnaEnd); } return cdnaMaps; } private IInterval[] ReadIntervals(string description) { var cols = GetColumns(description); int numIntervals = int.Parse(cols[1]); if (numIntervals == 0) return null; var intervals = new IInterval[numIntervals]; var colIndex = 2; for (var i = 0; i < numIntervals; i++) { int start = int.Parse(cols[colIndex++]); int end = int.Parse(cols[colIndex++]); intervals[i] = new Interval(start, end); } return intervals; } private MutableExon[] ReadExons(Chromosome chromosome) { var cols = GetColumns("Exons"); int numExons = int.Parse(cols[1]); if (numExons == 0) return null; var exons = new MutableExon[numExons]; var colIndex = 2; for (var i = 0; i < numExons; i++) { int start = int.Parse(cols[colIndex++]); int end = int.Parse(cols[colIndex++]); var phase = (byte)(int.Parse(cols[colIndex++]) + 1); exons[i] = new MutableExon(chromosome, start, end, phase); } return exons; } private (string Id, byte Version, ICodingRegion CodingRegion, string PeptideSeq) ReadTranslation() { var cols = GetColumns("Translation"); string id = cols[1]; byte version = byte.Parse(cols[2]); int start = int.Parse(cols[3]); int end = int.Parse(cols[4]); int cdnaStart = int.Parse(cols[5]); int cdnaEnd = int.Parse(cols[6]); string peptideSeq = cols[7]; var codingRegion = start == -1 && end == -1 ? null : new CodingRegion(start, end, cdnaStart, cdnaEnd, 0); return (id, version, codingRegion, peptideSeq); } private MutableGene ReadGene(Chromosome chromosome) { var cols = GetColumns("Gene"); string id = cols[1]; int start = int.Parse(cols[4]); int end = int.Parse(cols[5]); bool onReverseStrand = cols[6] == "R"; string symbol = cols[7]; var symbolSource = (GeneSymbolSource)int.Parse(cols[8]); int hgncId = int.Parse(cols[9]); return new MutableGene(chromosome, start, end, onReverseStrand, symbol, symbolSource, id, hgncId); } private (string Id, byte Version, Chromosome Chromosome, int Start, int End, BioType BioType, bool IsCanonical, int TotalExonLength, string CcdsId, string RefSeqId, Source Source, bool CdsStartNotFound, bool CdsEndNotFound, string TranslateableSequence, int StartExonPhase, string BamEditStatus) ReadTranscriptInfo( string line) { var cols = GetColumns("Transcript", line); string id = cols[1]; byte version = byte.Parse(cols[2]); ushort referenceIndex = ushort.Parse(cols[4]); int start = int.Parse(cols[5]); int end = int.Parse(cols[6]); var biotype = (BioType)byte.Parse(cols[8]); bool isCanonical = cols[9] == "Y"; int totalExonLength = int.Parse(cols[10]); string ccdsId = cols[11]; string refSeqId = cols[12]; var source = (Source)byte.Parse(cols[13]); bool cdsStartNotFound = cols[14] == "Y"; bool cdsEndNotFound = cols[15] == "Y"; int startExonPhase = int.Parse(cols[16]); string bamEditStatus = cols[17]; string translateableSequence = _reader.ReadLine(); var chromosome = ReferenceNameUtilities.GetChromosome(_refIndexToChromosome, referenceIndex); return (id, version, chromosome, start, end, biotype, isCanonical, totalExonLength, ccdsId, refSeqId, source , cdsStartNotFound, cdsEndNotFound, translateableSequence, startExonPhase, bamEditStatus); } private string[] GetColumns(string keyword, string line = null) { if (line == null) line = _reader.ReadLine(); var cols = line?.OptimizedSplit('\t'); if (cols == null) throw new InvalidDataException("Found an unexpected null when parsing the columns in the transcript reader."); if (cols[0] != keyword) throw new InvalidDataException($"Could not find the {keyword} keyword in the transcripts file."); return cols; } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: CacheUtils/IntermediateIO/MutableTranscriptWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Mutable; using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.IntermediateIO { internal sealed class MutableTranscriptWriter : IDisposable { private readonly StreamWriter _writer; internal MutableTranscriptWriter(StreamWriter writer, IntermediateIoHeader header) { _writer = writer; _writer.NewLine = "\n"; header.Write(_writer, IntermediateIoCommon.FileType.Transcript); } internal void Write(MutableTranscript transcript) { WriteTranscriptInfo(transcript); WriteGene(_writer, transcript.Gene); WriteTranslation(transcript.CodingRegion, transcript.ProteinId, transcript.ProteinVersion, transcript.PeptideSequence); WriteExons(transcript.Exons); WriteIntervals(transcript.Introns, "Introns"); WriteCdnaMaps(transcript.CdnaMaps); WriteIntervals(transcript.MicroRnas, "miRNAs"); WriteSelenocysteines(transcript.SelenocysteinePositions); WriteRnaEdits(transcript.RnaEdits); } private void WriteRnaEdits(IReadOnlyCollection rnaEdits) { if (rnaEdits == null) { _writer.WriteLine("RnaEdits\t0"); return; } _writer.Write($"RnaEdits\t{rnaEdits.Count}"); foreach (var rnaEdit in rnaEdits) _writer.Write($"\t{rnaEdit.Start}\t{rnaEdit.End}\t{rnaEdit.Bases}"); _writer.WriteLine(); } private void WriteSelenocysteines(IReadOnlyCollection positions) { if (positions == null) { _writer.WriteLine("Sec\t0"); return; } _writer.Write($"Sec\t{positions.Count}"); foreach (int pos in positions) _writer.Write($"\t{pos}"); _writer.WriteLine(); } private void WriteCdnaMaps(IReadOnlyCollection cdnaMaps) { _writer.Write($"cDNA\t{cdnaMaps.Count}"); foreach (var cdnaMap in cdnaMaps) _writer.Write($"\t{cdnaMap.Start}\t{cdnaMap.End}\t{cdnaMap.CdnaStart}\t{cdnaMap.CdnaEnd}"); _writer.WriteLine(); } private void WriteIntervals(IReadOnlyCollection intervals, string description) { if (intervals == null) { _writer.WriteLine($"{description}\t0"); return; } _writer.Write($"{description}\t{intervals.Count}"); foreach (var interval in intervals) _writer.Write($"\t{interval.Start}\t{interval.End}"); _writer.WriteLine(); } private void WriteExons(IReadOnlyCollection exons) { _writer.Write($"Exons\t{exons.Count}"); foreach (var exon in exons) _writer.Write($"\t{exon.Start}\t{exon.End}\t{exon.Phase}"); _writer.WriteLine(); } private void WriteTranslation(ICodingRegion codingRegion, string proteinId, byte proteinVersion, string peptideSequence) => _writer.WriteLine($"Translation\t{proteinId}\t{proteinVersion}\t{codingRegion.Start}\t{codingRegion.End}\t{codingRegion.CdnaStart}\t{codingRegion.CdnaEnd}\t{peptideSequence}"); private static void WriteGene(TextWriter writer, MutableGene gene) { char strand = gene.OnReverseStrand ? 'R' : 'F'; writer.WriteLine($"Gene\t{gene.GeneId}\t{gene.Chromosome.UcscName}\t{gene.Chromosome.Index}\t{gene.Start}\t{gene.End}\t{strand}\t{gene.Symbol}\t{(int)gene.SymbolSource}\t{gene.HgncId}"); } private void WriteTranscriptInfo(MutableTranscript transcript) { _writer.WriteLine($"Transcript\t{transcript.Id}\t{transcript.Version}\t{transcript.Chromosome.UcscName}\t{transcript.Chromosome.Index}\t{transcript.Start}\t{transcript.End}\t{transcript.BioType}\t{(byte)transcript.BioType}\t{BoolToChar(transcript.IsCanonical)}\t{transcript.TotalExonLength}\t{transcript.CcdsId}\t{transcript.RefSeqId}\t{(byte)transcript.Source}\t{BoolToChar(transcript.CdsStartNotFound)}\t{BoolToChar(transcript.CdsEndNotFound)}\t{transcript.StartExonPhase}\t{transcript.BamEditStatus}"); _writer.WriteLine(transcript.TranslateableSequence); } private static char BoolToChar(bool b) => b ? 'Y' : 'N'; public void Dispose() => _writer.Dispose(); } } ================================================ FILE: CacheUtils/IntermediateIO/PredictionReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using IO; using OptimizedCore; namespace CacheUtils.IntermediateIO { public sealed class PredictionReader : IDisposable { private readonly Dictionary _refIndexToChromosome; private readonly StreamReader _reader; public PredictionReader(Stream stream, Dictionary refIndexToChromosome, IntermediateIoCommon.FileType expectedFileType) { _refIndexToChromosome = refIndexToChromosome; _reader = FileUtilities.GetStreamReader(stream); IntermediateIoCommon.ReadHeader(_reader, expectedFileType); } public (string[] PredictionData, Dictionary TranscriptToPredictionIndex, Chromosome Chromosome) GetPredictionData() { var chromosomeHeader = GetChromosomeHeader(); var predictionData = new string[chromosomeHeader.NumPredictions]; var transcriptToPredictionIndex = new Dictionary(chromosomeHeader.NumPredictions); for (var predictionIndex = 0; predictionIndex < chromosomeHeader.NumPredictions; predictionIndex++) { var prediction = GetNextPrediction(); predictionData[predictionIndex] = prediction.PredictionData; foreach (int index in prediction.TranscriptIndices) transcriptToPredictionIndex[index] = predictionIndex; } return (predictionData, transcriptToPredictionIndex, chromosomeHeader.Chromosome); } private (Chromosome Chromosome, int NumPredictions) GetChromosomeHeader() { string line = _reader.ReadLine(); var cols = line?.OptimizedSplit('\t'); if (cols == null) throw new InvalidDataException("Found an unexpected null line when parsing the chromosome header in the prediction reader."); if (cols.Length != 3) throw new InvalidDataException($"Expected 3 columns in the chromosome header, but found {cols.Length}"); ushort referenceIndex = ushort.Parse(cols[1]); var chromosome = ReferenceNameUtilities.GetChromosome(_refIndexToChromosome, referenceIndex); int numPredictions = int.Parse(cols[2]); return (chromosome, numPredictions); } private (List TranscriptIndices, string PredictionData) GetNextPrediction() { string line = _reader.ReadLine(); if (line == null) throw new InvalidDataException("Found an unexpected empty line while parsing the prediction file."); var cols = line.OptimizedSplit('\t'); if (cols.Length != 2) throw new InvalidDataException($"Expected 2 columns in the prediction entry, but found {cols.Length}"); var transcriptIndices = GetTranscriptIndices(cols[0]); string predictionData = cols[1]; return (transcriptIndices, predictionData); } private static List GetTranscriptIndices(string s) { var indexStrings = s.OptimizedSplit(','); var indices = new int[indexStrings.Length]; for (var i = 0; i < indexStrings.Length; i++) indices[i] = int.Parse(indexStrings[i]); return indices.ToList(); } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: CacheUtils/IntermediateIO/PredictionWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; namespace CacheUtils.IntermediateIO { internal sealed class PredictionWriter : IDisposable { private readonly StreamWriter _writer; internal PredictionWriter(StreamWriter writer, IntermediateIoHeader header, IntermediateIoCommon.FileType fileType) { _writer = writer; _writer.NewLine = "\n"; header.Write(_writer, fileType); } internal void Write(Chromosome chromosome, Dictionary> predictionDict) { _writer.WriteLine($"{chromosome.UcscName}\t{chromosome.Index}\t{predictionDict.Count}"); foreach (var kvp in predictionDict) WritePrediction(kvp.Value, kvp.Key); } private void WritePrediction(IEnumerable transcriptIds, string predictionData) { string transcriptIdString = string.Join(',', transcriptIds); _writer.WriteLine($"{transcriptIdString}\t{predictionData}"); } public void Dispose() => _writer.Dispose(); } } ================================================ FILE: CacheUtils/IntermediateIO/RegulatoryRegionReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using IO; using OptimizedCore; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; namespace CacheUtils.IntermediateIO { internal sealed class RegulatoryRegionReader : IDisposable { private readonly Dictionary _refIndexToChromosome; private readonly StreamReader _reader; internal RegulatoryRegionReader(Stream stream, Dictionary refIndexToChromosome) { _refIndexToChromosome = refIndexToChromosome; _reader = FileUtilities.GetStreamReader(stream); IntermediateIoCommon.ReadHeader(_reader, IntermediateIoCommon.FileType.Regulatory); } public IRegulatoryRegion[] GetRegulatoryRegions() { var regulatoryRegions = new List(); while (true) { var regulatoryRegion = GetNextRegulatoryRegion(); if (regulatoryRegion == null) break; regulatoryRegions.Add(regulatoryRegion); } return regulatoryRegions.ToArray(); } private IRegulatoryRegion GetNextRegulatoryRegion() { string line = _reader.ReadLine(); if (line == null) return null; var cols = line.OptimizedSplit('\t'); ushort referenceIndex = ushort.Parse(cols[1]); int start = int.Parse(cols[2]); int end = int.Parse(cols[3]); var id = CompactId.Convert(cols[4]); var type = (RegulatoryRegionType)byte.Parse(cols[6]); var chromosome = ReferenceNameUtilities.GetChromosome(_refIndexToChromosome, referenceIndex); return new RegulatoryRegion(chromosome, start, end, id, type); } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: CacheUtils/IntermediateIO/RegulatoryRegionWriter.cs ================================================ using System; using System.IO; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.IntermediateIO { internal sealed class RegulatoryRegionWriter : IDisposable { private readonly StreamWriter _writer; internal RegulatoryRegionWriter(StreamWriter writer, IntermediateIoHeader header) { _writer = writer; _writer.NewLine = "\n"; header.Write(_writer, IntermediateIoCommon.FileType.Regulatory); } internal void Write(IRegulatoryRegion region) => _writer.WriteLine( $"{region.Chromosome.UcscName}\t{region.Chromosome.Index}\t{region.Start}\t{region.End}\t{region.Id}\t{region.Type}\t{(byte) region.Type}"); public void Dispose() => _writer.Dispose(); } } ================================================ FILE: CacheUtils/MiniCache/DataBundle.cs ================================================ using Genome; using IO; using ReferenceSequence.IO; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; using VC = VariantAnnotation.Caches; namespace CacheUtils.MiniCache { /// /// the bundle of cache and reference data objects that correspond to a /// specific genome assembly and transcript data source /// public sealed class DataBundle { public readonly CompressedSequenceReader SequenceReader; public readonly VC.TranscriptCacheData TranscriptCacheData; public readonly VC.TranscriptCache TranscriptCache; public readonly PredictionCacheReader SiftReader; public readonly PredictionCacheReader PolyPhenReader; private Chromosome _currentChromosome = Chromosome.GetEmptyChromosome(string.Empty); public Prediction[] SiftPredictions; public Prediction[] PolyPhenPredictions; public readonly Source Source; private DataBundle(CompressedSequenceReader sequenceReader, PredictionCacheReader siftReader, PredictionCacheReader polyPhenReader, VC.TranscriptCacheData cacheData, VC.TranscriptCache transcriptCache, Source source) { SequenceReader = sequenceReader; TranscriptCacheData = cacheData; TranscriptCache = transcriptCache; Source = source; SiftReader = siftReader; PolyPhenReader = polyPhenReader; } public void Load(Chromosome chromosome) { if (_currentChromosome.Index == chromosome.Index) return; SequenceReader.GetCompressedSequence(chromosome); SiftPredictions = SiftReader.GetPredictions(chromosome.Index); PolyPhenPredictions = PolyPhenReader.GetPredictions(chromosome.Index); _currentChromosome = chromosome; } public static DataBundle GetDataBundle(string referencePath, string cachePrefix) { var sequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(referencePath)); var siftReader = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(cachePrefix)), PredictionCacheReader.SiftDescriptions); var polyPhenReader = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(cachePrefix)), PredictionCacheReader.PolyphenDescriptions); VC.TranscriptCacheData cacheData; VC.TranscriptCache cache; Source source; using (var transcriptReader = new TranscriptCacheReader(FileUtilities.GetReadStream(CacheConstants.TranscriptPath(cachePrefix)))) { cacheData = transcriptReader.Read(sequenceReader.RefIndexToChromosome); cache = cacheData.GetCache(); source = transcriptReader.Header.Source; } return new DataBundle(sequenceReader, siftReader, polyPhenReader, cacheData, cache, source); } } } ================================================ FILE: CacheUtils/MiniCache/IStaging.cs ================================================ using System.IO; namespace CacheUtils.MiniCache { public interface IStaging { void Write(Stream stream); } } ================================================ FILE: CacheUtils/PredictionCache/PredictionCacheBuilder.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.IntermediateIO; using CacheUtils.Utilities; using Genome; using IO; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; namespace CacheUtils.PredictionCache { public sealed class PredictionCacheBuilder { private readonly GenomeAssembly _genomeAssembly; public PredictionCacheBuilder(GenomeAssembly genomeAssembly) => _genomeAssembly = genomeAssembly; public (PredictionCacheStaging Sift, PredictionCacheStaging PolyPhen) CreatePredictionCaches( Dictionary> transcriptsByRefIndex, PredictionReader siftReader, PredictionReader polyphenReader, int numRefSeqs) { Logger.Write("- converting prediction strings... "); var siftRoundedPredictionsPerRef = new RoundedEntryPrediction[numRefSeqs][]; var polyPhenRoundedPredictionsPerRef = new RoundedEntryPrediction[numRefSeqs][]; for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++) { var sift = siftReader.GetPredictionData(); var polyphen = polyphenReader.GetPredictionData(); if (sift.Chromosome.Index != refIndex || polyphen.Chromosome.Index != refIndex) throw new InvalidDataException( $"Found mismatch between transcript chromosome index ({refIndex}) and prediction chromosome indices (SIFT: {sift.Chromosome.Index}, PolyPhen: {polyphen.Chromosome.Index}."); if (!transcriptsByRefIndex.TryGetValue(refIndex, out var refTranscripts)) continue; var (siftPredictions, polyPhenPredictions) = ProcessReference(refTranscripts, sift.TranscriptToPredictionIndex, polyphen.TranscriptToPredictionIndex, sift.PredictionData, polyphen.PredictionData); siftRoundedPredictionsPerRef[refIndex] = siftPredictions; polyPhenRoundedPredictionsPerRef[refIndex] = polyPhenPredictions; } Logger.WriteLine("finished."); var siftStaging = BuildCacheStaging("SIFT", siftRoundedPredictionsPerRef, numRefSeqs); var polyPhenStaging = BuildCacheStaging("PolyPhen", polyPhenRoundedPredictionsPerRef, numRefSeqs); return (siftStaging, polyPhenStaging); } private PredictionCacheStaging BuildCacheStaging(string description, IReadOnlyList roundedPredictionsPerRef, int numReferenceSeqs) { Logger.Write($"- calculating {description} LUT... "); var (lut, roundedEntryToLutIndex) = CreateLut(roundedPredictionsPerRef); Logger.WriteLine($"{lut.Length} entries."); Logger.Write($"- converting {description} rounded entries... "); var predictionsPerRef = ConvertPredictions(roundedPredictionsPerRef, roundedEntryToLutIndex, lut); Logger.WriteLine("finished."); var header = CreateHeader(numReferenceSeqs, lut); return new PredictionCacheStaging(header, predictionsPerRef); } private PredictionHeader CreateHeader(int numReferenceSeqs, Prediction.Entry[] lut) { var customHeader = new PredictionCacheCustomHeader(new IndexEntry[numReferenceSeqs]); return new PredictionHeader(HeaderUtilities.GetHeader(Source.None, _genomeAssembly), customHeader, lut); } private static Prediction[][] ConvertPredictions(IReadOnlyList roundedPredictionsPerRef, Dictionary roundedEntryToLutIndex, Prediction.Entry[] lut) { int numReferenceSeqs = roundedPredictionsPerRef.Count; var predictionsPerRef = new Prediction[numReferenceSeqs][]; for (var i = 0; i < numReferenceSeqs; i++) { predictionsPerRef[i] = ConvertReferencePredictions(roundedPredictionsPerRef[i], roundedEntryToLutIndex, lut); } return predictionsPerRef; } private static Prediction[] ConvertReferencePredictions(IReadOnlyList roundedEntryPredictions, Dictionary roundedEntryToLutIndex, Prediction.Entry[] lut) { if (roundedEntryPredictions == null) return null; int numPredictions = roundedEntryPredictions.Count; var predictions = new Prediction[numPredictions]; for (var i = 0; i < numPredictions; i++) predictions[i] = roundedEntryPredictions[i].Convert(roundedEntryToLutIndex, lut); return predictions; } private static (Prediction.Entry[] Lut, Dictionary RoundedEntryToLutIndex) CreateLut( IEnumerable roundedPredictionsPerRef) { var scores = new HashSet(); foreach (var roundedPredictions in roundedPredictionsPerRef) { if (roundedPredictions == null) continue; foreach (var roundedPrediction in roundedPredictions) { foreach (var roundedEntry in roundedPrediction.Entries) { if (roundedEntry.Score > 1000) continue; scores.Add(roundedEntry); } } } if (scores.Count > 255) throw new InvalidDataException($"Unable to create lookup table, too many LUT entries: {scores.Count} (max 255)."); var lut = new Prediction.Entry[scores.Count]; var roundedEntryToLutIndex = new Dictionary(); var currentIndex = 0; foreach (var entry in scores.OrderBy(x => x.EnumIndex).ThenBy(x => x.Score)) { roundedEntryToLutIndex[entry] = (byte)currentIndex; lut[currentIndex++] = new Prediction.Entry(entry.Score / 1000.0, entry.EnumIndex); } return (lut, roundedEntryToLutIndex); } private static (RoundedEntryPrediction[] Sift, RoundedEntryPrediction[] PolyPhen) ProcessReference( IReadOnlyList transcripts, Dictionary siftTranscriptToPredictionIndex, Dictionary polyphenTranscriptToPredictionIndex, string[] siftPredictionData, string[] polyphenPredictionData) { AssignPredictionIndices(transcripts, siftTranscriptToPredictionIndex, polyphenTranscriptToPredictionIndex); var siftPredictions = siftPredictionData.GetRoundedEntryPredictions(); var polyPhenPredictions = polyphenPredictionData.GetRoundedEntryPredictions(); return (siftPredictions, polyPhenPredictions); } private static void AssignPredictionIndices(IReadOnlyList transcripts, Dictionary siftTranscriptToPredictionIndex, Dictionary polyphenTranscriptToPredictionIndex) { foreach (var kvp in siftTranscriptToPredictionIndex) transcripts[kvp.Key].SiftIndex = kvp.Value; foreach (var kvp in polyphenTranscriptToPredictionIndex) transcripts[kvp.Key].PolyPhenIndex = kvp.Value; } } } ================================================ FILE: CacheUtils/PredictionCache/PredictionCacheStaging.cs ================================================ using System.IO; using System.IO.Compression; using CacheUtils.MiniCache; using Compression.Algorithms; using Compression.FileHandling; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.IO.Caches; namespace CacheUtils.PredictionCache { public sealed class PredictionCacheStaging : IStaging { private readonly Prediction[][] _predictionsPerRef; private readonly PredictionHeader _header; internal PredictionCacheStaging(PredictionHeader header, Prediction[][] predictionsPerRef) { _header = header; _predictionsPerRef = predictionsPerRef; } public void Write(Stream stream) { using (var blockStream = new BlockStream(new Zstandard(), stream, CompressionMode.Compress)) using (var writer = new PredictionCacheWriter(blockStream, _header)) { writer.Write(_header.LookupTable, _predictionsPerRef); } } } } ================================================ FILE: CacheUtils/PredictionCache/PredictionCacheWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Compression.FileHandling; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.IO.Caches; namespace CacheUtils.PredictionCache { public sealed class PredictionCacheWriter : IDisposable { private readonly BinaryWriter _writer; private readonly BlockStream _blockStream; private readonly PredictionHeader _header; private readonly bool _leaveOpen; public PredictionCacheWriter(BlockStream blockStream, PredictionHeader header, bool leaveOpen = false) { _blockStream = blockStream; _writer = new BinaryWriter(blockStream); _header = header; _leaveOpen = leaveOpen; } public void Dispose() { if (!_leaveOpen) _blockStream.Dispose(); _writer.Dispose(); } internal void Write(Prediction.Entry[] lut, Prediction[][] predictionsPerRef) { _blockStream.WriteHeader(_header.Write); WriteLookupTable(_writer, lut); _blockStream.Flush(); WritePredictions(predictionsPerRef); } private void WritePredictions(IReadOnlyList predictionsPerRef) { var indexEntries = _header.Custom.Entries; for (var i = 0; i < predictionsPerRef.Count; i++) { var refPredictions = predictionsPerRef[i]; var position = _blockStream.GetBlockPosition(); indexEntries[i].FileOffset = position.FileOffset; indexEntries[i].Count = refPredictions?.Length ?? 0; if (refPredictions != null) { foreach (var prediction in refPredictions) prediction.Write(_writer); } _blockStream.Flush(); } } private static void WriteLookupTable(BinaryWriter writer, IReadOnlyCollection lut) { writer.Write(lut.Count); foreach (var entry in lut) entry.Write(writer); } } } ================================================ FILE: CacheUtils/PredictionCache/PredictionExtensions.cs ================================================ using System; using System.IO; namespace CacheUtils.PredictionCache { public static class PredictionExtensions { public static RoundedEntryPrediction[] GetRoundedEntryPredictions(this string[] predictionStrings) { var predictions = new RoundedEntryPrediction[predictionStrings.Length]; var currentIndex = 0; foreach (string s in predictionStrings) predictions[currentIndex++] = s.GetRoundedEntryPrediction(); return predictions; } private static RoundedEntryPrediction GetRoundedEntryPrediction(this string predictionString) { // convert the base 64 string representation to our compressed prediction data var uncompressedDataWithHeader = Convert.FromBase64String(predictionString); const int headerLength = 3; // skip the 'VEP' header int newLength = uncompressedDataWithHeader.Length - headerLength; // sanity check: we should have an even number of bytes if ((newLength & 1) != 0) { throw new InvalidDataException($"Expected an even number of bytes when serializing the protein function prediction matrix: {newLength}"); } var data = new ushort[newLength / 2]; Buffer.BlockCopy(uncompressedDataWithHeader, headerLength, data, 0, newLength); var roundedEntries = new RoundedEntry[data.Length]; for (var i = 0; i < data.Length; i++) roundedEntries[i] = new RoundedEntry(data[i]); return new RoundedEntryPrediction(roundedEntries); } } } ================================================ FILE: CacheUtils/PredictionCache/PredictionUtilities.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.Genes.Utilities; using CacheUtils.TranscriptCache; using Intervals; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.PredictionCache { public static class PredictionUtilities { internal static IntervalArray[] UpdateTranscripts(IEnumerable transcripts, Prediction[] oldSiftPredictions, IEnumerable siftPredictions, Prediction[] oldPolyPhenPredictions, IEnumerable polyPhenPredictions, int numRefSeqs) { var siftDict = siftPredictions.CreateIndex(); var polyphenDict = polyPhenPredictions.CreateIndex(); var newTranscripts = new List(); // ReSharper disable once LoopCanBeConvertedToQuery foreach (var transcript in transcripts) { int siftIndex = GetNewIndex(oldSiftPredictions, transcript.SiftIndex, siftDict); int polyphenIndex = GetNewIndex(oldPolyPhenPredictions, transcript.PolyPhenIndex, polyphenDict); newTranscripts.Add(transcript.UpdatePredictions(siftIndex, polyphenIndex)); } return newTranscripts.ToIntervalArrays(numRefSeqs); } internal static ITranscript UpdatePredictions(this ITranscript t, int siftIndex, int polyphenIndex) { return new Transcript(t.Chromosome, t.Start, t.End, t.Id, t.Translation, t.BioType, t.Gene, t.TotalExonLength, t.StartExonPhase, t.IsCanonical, t.TranscriptRegions, t.NumExons, t.MicroRnas, siftIndex, polyphenIndex, t.Source, t.CdsStartNotFound, t.CdsEndNotFound, t.Selenocysteines, t.RnaEdits); } private static int GetNewIndex(IReadOnlyList oldPredictions, int index, IReadOnlyDictionary dict) { if (index == -1) return -1; var prediction = oldPredictions[index]; if (!dict.TryGetValue(prediction, out int newIndex)) throw new InvalidDataException("Unable to find the prediction in the dictionary."); return newIndex; } } } ================================================ FILE: CacheUtils/PredictionCache/RoundedEntry.cs ================================================ using System; namespace CacheUtils.PredictionCache { public struct RoundedEntry : IEquatable { public readonly ushort Score; public readonly byte EnumIndex; public RoundedEntry(ushort data) { Score = Round((ushort)(data & 0x3ff)); EnumIndex = (byte)((data & 0xc000) >> 14); } private static ushort Round(ushort us) => (ushort)((ushort)Math.Round(us / 5.0) * 5); public bool Equals(RoundedEntry other) => Score == other.Score && EnumIndex == other.EnumIndex; public override int GetHashCode() { unchecked { return (Score.GetHashCode() * 397) ^ EnumIndex.GetHashCode(); } } } } ================================================ FILE: CacheUtils/PredictionCache/RoundedEntryPrediction.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Caches.DataStructures; namespace CacheUtils.PredictionCache { public sealed class RoundedEntryPrediction { public readonly RoundedEntry[] Entries; public RoundedEntryPrediction(RoundedEntry[] entries) => Entries = entries; public Prediction Convert(Dictionary lutDict, Prediction.Entry[] lut) { int numEntries = Entries.Length; var lutIndices = new byte[numEntries]; var index = 0; foreach (var entry in Entries) lutIndices[index++] = entry.Score > 1000 ? (byte) 255 : lutDict[entry]; return new Prediction(lutIndices, lut); } } } ================================================ FILE: CacheUtils/TranscriptCache/CanonicalTranscriptMarker.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.Utilities; namespace CacheUtils.TranscriptCache { public sealed class CanonicalTranscriptMarker { private readonly HashSet _lrgTranscriptIds; public CanonicalTranscriptMarker(HashSet lrgTranscriptIds) { _lrgTranscriptIds = lrgTranscriptIds; } public int MarkTranscripts(MutableTranscript[] transcripts) { var transcriptsByGeneId = GetTranscriptsByEntrezGeneId(transcripts); var canonicalTranscriptsByGeneId = GetCanonicalTranscriptsByGeneId(transcriptsByGeneId); return SetCanonicalFlags(canonicalTranscriptsByGeneId, transcripts); } private SortedDictionary> GetTranscriptsByEntrezGeneId(IEnumerable transcripts) { var genes = new SortedDictionary>(); foreach (var transcript in transcripts) { string idWithVersion = transcript.Id + '.' + transcript.Version; int cdsLength = transcript.CodingRegion?.Length ?? 0; int transcriptLength = transcript.End - transcript.Start + 1; bool isLrg = _lrgTranscriptIds.Contains(transcript.Id); int accession = AccessionUtilities.GetAccessionNumber(transcript.Id); var metadata = new TranscriptMetadata(idWithVersion, accession, transcriptLength, cdsLength, isLrg); int geneId = ConvertGeneIdToInt(transcript.Gene.GeneId); if (genes.TryGetValue(geneId, out var observedMetadata)) observedMetadata.Add(metadata); else genes[geneId] = new HashSet { metadata }; } return genes; } private static SortedDictionary GetCanonicalTranscriptsByGeneId(SortedDictionary> genes) { // - Order all of the overlapping transcripts by cds length // - Pick the longest transcript that has an associated Locus Reference Genome (LRG) sequence // - If no LRGs exist for the set of transcripts, pick the longest transcript that is coding // - If there is a tie, pick the transcript with the smaller accession id number var canonicalTranscripts = new SortedDictionary(); foreach (var kvp in genes) { var sortedTranscripts = GetSortedTrustedTranscripts(kvp.Value); // pick the transcript with the smallest accession if (sortedTranscripts.Count > 0) canonicalTranscripts[kvp.Key] = sortedTranscripts[0].TranscriptId; } return canonicalTranscripts; } private static int ConvertGeneIdToInt(string geneId) { if (string.IsNullOrEmpty(geneId)) throw new InvalidDataException("Expected a non-empty Entrez gene ID during canonical aggregation."); if (geneId.StartsWith("ENSG")) geneId = geneId.Substring(4); if (!int.TryParse(geneId, out int geneIdNumber)) throw new InvalidDataException($"Unable to convert Entrez gene ID ({geneId}) to an integer."); return geneIdNumber; } private static int SetCanonicalFlags(IReadOnlyDictionary canonicalTranscriptsByGeneId, IEnumerable transcripts) { var numCanonicalTranscripts = 0; foreach (var transcript in transcripts) { int geneId = ConvertGeneIdToInt(transcript.Gene.GeneId); transcript.IsCanonical = false; // no canonical transcript if (!canonicalTranscriptsByGeneId.TryGetValue(geneId, out string canonicalTranscriptId)) continue; string idWithVersion = transcript.Id + '.' + transcript.Version; if (idWithVersion != canonicalTranscriptId) continue; // mark the transcript canonical transcript.IsCanonical = true; numCanonicalTranscripts++; } return numCanonicalTranscripts; } /// /// returns a sorted list of all the transcripts that have an ENST, NM_, or NR_ prefix /// private static List GetSortedTrustedTranscripts(IEnumerable transcripts) { var selectedTranscripts = transcripts.Where( transcript => transcript.TranscriptId.StartsWith("ENST") || transcript.TranscriptId.StartsWith("NM_") || transcript.TranscriptId.StartsWith("NR_")).ToList(); return selectedTranscripts.OrderByDescending(x => x.IsLrg) .ThenByDescending(x => x.CdsLength) .ThenByDescending(x => x.TranscriptLength) .ThenBy(x => x.Accession) .ToList(); } public sealed class TranscriptMetadata : IEquatable { public readonly string TranscriptId; public readonly int CdsLength; public readonly int TranscriptLength; public readonly bool IsLrg; public readonly int Accession; public TranscriptMetadata(string transcriptId, int accession, int transcriptLength, int cdsLength, bool isLrg) { TranscriptId = transcriptId; TranscriptLength = transcriptLength; CdsLength = cdsLength; IsLrg = isLrg; Accession = accession; } public bool Equals(TranscriptMetadata other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; return string.Equals(TranscriptId, other.TranscriptId) && CdsLength == other.CdsLength && TranscriptLength == other.TranscriptLength && IsLrg == other.IsLrg && Accession == other.Accession; } public override int GetHashCode() { unchecked { int hashCode = TranscriptId != null ? TranscriptId.GetHashCode() : 0; hashCode = (hashCode * 397) ^ CdsLength; hashCode = (hashCode * 397) ^ TranscriptLength; hashCode = (hashCode * 397) ^ IsLrg.GetHashCode(); hashCode = (hashCode * 397) ^ Accession; return hashCode; } } } } } ================================================ FILE: CacheUtils/TranscriptCache/Comparers/GeneComparer.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.TranscriptCache.Comparers { internal sealed class GeneComparer : EqualityComparer { public override bool Equals(IGene x, IGene y) { return x.Start == y.Start && x.End == y.End && x.Chromosome.Index == y.Chromosome.Index && x.OnReverseStrand == y.OnReverseStrand && x.Symbol == y.Symbol && x.EntrezGeneId.WithVersion == y.EntrezGeneId.WithVersion && x.EnsemblId.WithVersion == y.EnsemblId.WithVersion && x.HgncId == y.HgncId; } public override int GetHashCode(IGene obj) { string entrezGeneId = obj.EntrezGeneId.WithVersion; string ensemblId = obj.EnsemblId.WithVersion; unchecked { int hashCode = obj.Start; hashCode = (hashCode * 397) ^ obj.End; hashCode = (hashCode * 397) ^ obj.Chromosome.Index; hashCode = (hashCode * 397) ^ obj.OnReverseStrand.GetHashCode(); hashCode = (hashCode * 397) ^ obj.Symbol.GetHashCode(); if (entrezGeneId != null) hashCode = (hashCode * 397) ^ entrezGeneId.GetHashCode(); if (ensemblId != null) hashCode = (hashCode * 397) ^ ensemblId.GetHashCode(); hashCode = (hashCode * 397) ^ obj.HgncId; return hashCode; } } } } ================================================ FILE: CacheUtils/TranscriptCache/Comparers/IntervalComparer.cs ================================================ using System.Collections.Generic; using Intervals; namespace CacheUtils.TranscriptCache.Comparers { internal sealed class IntervalComparer : EqualityComparer { public override bool Equals(IInterval x, IInterval y) => x.Start == y.Start && x.End == y.End; public override int GetHashCode(IInterval obj) { unchecked { return (obj.Start * 397) ^ obj.End; } } } } ================================================ FILE: CacheUtils/TranscriptCache/Comparers/RegulatoryRegionComparer.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.TranscriptCache.Comparers { internal sealed class RegulatoryRegionComparer : EqualityComparer { public override bool Equals(IRegulatoryRegion x, IRegulatoryRegion y) { return x.Start == y.Start && x.End == y.End && x.Chromosome.Index == y.Chromosome.Index && x.Id.WithoutVersion == y.Id.WithoutVersion && x.Type == y.Type; } public override int GetHashCode(IRegulatoryRegion obj) { unchecked { int hashCode = obj.Start; hashCode = (hashCode * 397) ^ obj.End; hashCode = (hashCode * 397) ^ obj.Chromosome.Index.GetHashCode(); hashCode = (hashCode * 397) ^ obj.Id.WithoutVersion.GetHashCode(); hashCode = (hashCode * 397) ^ (int)obj.Type; return hashCode; } } } } ================================================ FILE: CacheUtils/TranscriptCache/Comparers/TranscriptRegionComparer.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.TranscriptCache.Comparers { internal sealed class TranscriptRegionComparer : EqualityComparer { public override bool Equals(ITranscriptRegion x, ITranscriptRegion y) { if (ReferenceEquals(x, y)) return true; return x.Type == y.Type && x.Id == y.Id && x.Start == y.Start && x.End == y.End && x.CdnaStart == y.CdnaStart && x.CdnaEnd == y.CdnaEnd; } public override int GetHashCode(ITranscriptRegion obj) { unchecked { var hashCode = (int)obj.Type; hashCode = (hashCode * 397) ^ obj.Id.GetHashCode(); hashCode = (hashCode * 397) ^ obj.Start; hashCode = (hashCode * 397) ^ obj.End; hashCode = (hashCode * 397) ^ obj.CdnaStart; hashCode = (hashCode * 397) ^ obj.CdnaEnd; return hashCode; } } } } ================================================ FILE: CacheUtils/TranscriptCache/Comparers/UgaGeneComparer.cs ================================================ using System.Collections.Generic; using CacheUtils.Genes.DataStructures; using Intervals; namespace CacheUtils.TranscriptCache.Comparers { public sealed class UgaGeneComparer : EqualityComparer { public override bool Equals(UgaGene x, UgaGene y) { if (ReferenceEquals(null, y)) return false; if (ReferenceEquals(x, y)) return true; return x.Chromosome.Index == y.Chromosome.Index && Equals(x.GRCh37, y.GRCh37) && Equals(x.GRCh38, y.GRCh38) && x.OnReverseStrand == y.OnReverseStrand && x.HgncId == y.HgncId && x.Symbol == y.Symbol && x.EntrezGeneId == y.EntrezGeneId && x.EnsemblId == y.EnsemblId; } private static bool Equals(IInterval x, IInterval y) { if (x == null && y == null) return true; if (x == null || y == null) return false; return x.Start == y.Start && x.End == y.End; } private static int GetHashCode(IInterval x) { unchecked { return (x.Start * 397) ^ x.End; } } public override int GetHashCode(UgaGene obj) { unchecked { int hashCode = obj.Chromosome.Index.GetHashCode(); if (obj.GRCh37 != null) hashCode = (hashCode * 397) ^ GetHashCode(obj.GRCh37); if (obj.GRCh38 != null) hashCode = (hashCode * 397) ^ GetHashCode(obj.GRCh38); hashCode = (hashCode * 397) ^ obj.OnReverseStrand.GetHashCode(); hashCode = (hashCode * 397) ^ obj.HgncId; if (obj.Symbol != null) hashCode = (hashCode * 397) ^ obj.Symbol.GetHashCode(); if (obj.EntrezGeneId != null) hashCode = (hashCode * 397) ^ obj.EntrezGeneId.GetHashCode(); if (obj.EnsemblId != null) hashCode = (hashCode * 397) ^ obj.EnsemblId.GetHashCode(); return hashCode; } } } } ================================================ FILE: CacheUtils/TranscriptCache/NSequence.cs ================================================  using Genome; namespace CacheUtils.TranscriptCache { public sealed class NSequence : ISequence { public int Length { get; } = 1000; public string Substring(int offset, int length) => new string('N', length); public Band[] CytogeneticBands => null; } } ================================================ FILE: CacheUtils/TranscriptCache/SortExtensions.cs ================================================ using System.Collections.Generic; using System.Linq; using Genome; using Intervals; namespace CacheUtils.TranscriptCache { public static class SortExtensions { public static IOrderedEnumerable Sort(this IEnumerable elements) where T : IChromosomeInterval => elements.OrderBy(x => x.Chromosome.Index).ThenBy(x => x.Start).ThenBy(x => x.End); public static IOrderedEnumerable SortInterval(this IEnumerable elements) where T : IInterval => elements.OrderBy(x => x.Start).ThenBy(x => x.End); } } ================================================ FILE: CacheUtils/TranscriptCache/TranscriptCacheBuilder.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.Utilities; using CacheUtils.Utilities; using Genome; using Intervals; using IO; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; namespace CacheUtils.TranscriptCache { public sealed class TranscriptCacheBuilder { private readonly GenomeAssembly _genomeAssembly; private readonly Source _source; private readonly long _vepReleaseTicks; private readonly ushort _vepVersion; public TranscriptCacheBuilder(GenomeAssembly genomeAssembly, Source source, long vepReleaseTicks, ushort vepVersion) { _genomeAssembly = genomeAssembly; _source = source; _vepReleaseTicks = vepReleaseTicks; _vepVersion = vepVersion; } public TranscriptCacheStaging CreateTranscriptCache(MutableTranscript[] mutableTranscripts, IEnumerable regulatoryRegions, IIntervalForest geneForest, int numRefSeqs) { Logger.Write("- assigning UGA genes to transcripts... "); AssignUgaGenesToTranscripts(mutableTranscripts, geneForest); Logger.WriteLine("finished."); var transcriptIntervalArrays = mutableTranscripts.ToTranscripts().ToIntervalArrays(numRefSeqs); var regulatoryRegionIntervalArrays = regulatoryRegions.ToIntervalArrays(numRefSeqs); var customHeader = new TranscriptCacheCustomHeader(_vepVersion, _vepReleaseTicks); var header = new CacheHeader(HeaderUtilities.GetHeader(_source, _genomeAssembly), customHeader); return TranscriptCacheStaging.GetStaging(header, transcriptIntervalArrays, regulatoryRegionIntervalArrays); } private void AssignUgaGenesToTranscripts(IEnumerable transcripts, IIntervalForest geneForest) { foreach (var transcript in transcripts) { var originalGene = transcript.Gene; var ugaGenes = geneForest.GetAllOverlappingValues(originalGene.Chromosome.Index, originalGene.Start, originalGene.End); if (ugaGenes == null) { string strand = originalGene.OnReverseStrand ? "R" : "F"; throw new InvalidDataException($"Found a transcript ({transcript.Id}) that does not have an overlapping UGA gene: gene ID: {originalGene.GeneId} {originalGene.Chromosome.UcscName} {originalGene.Start} {originalGene.End} {strand}"); } transcript.UpdatedGene = PickGeneById(ugaGenes, originalGene.GeneId).ToGene(_genomeAssembly); } } private UgaGene PickGeneById(IReadOnlyList genes, string geneId) { if (genes.Count == 1) return genes[0]; var genesById = genes.GetMultiValueDict(x => _source == Source.Ensembl ? x.EnsemblId : x.EntrezGeneId); if (!genesById.TryGetValue(geneId, out var idGenes)) throw new InvalidDataException($"Could not find {geneId} in the UGA genes list."); if (idGenes.Count == 1) return idGenes[0]; throw new InvalidDataException($"Found multiple entries for {geneId} in the UGA genes list."); } } } ================================================ FILE: CacheUtils/TranscriptCache/TranscriptCacheStaging.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.MiniCache; using CacheUtils.TranscriptCache.Comparers; using Intervals; using VariantAnnotation.Caches; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; namespace CacheUtils.TranscriptCache { public sealed class TranscriptCacheStaging : IStaging { private readonly TranscriptCacheData _cacheData; private TranscriptCacheStaging(TranscriptCacheData cacheData) { _cacheData = cacheData; } public void Write(Stream stream) { using (var writer = new TranscriptCacheWriter(stream, _cacheData.Header)) writer.Write(_cacheData); } public static TranscriptCacheStaging GetStaging(CacheHeader header, IntervalArray[] transcriptIntervalArrays, IntervalArray[] regulatoryRegionIntervalArrays) { var uniqueData = GetUniqueData(transcriptIntervalArrays); var cacheData = new TranscriptCacheData(header, uniqueData.Genes, uniqueData.TranscriptRegions, uniqueData.Mirnas, uniqueData.PeptideSeqs, transcriptIntervalArrays, regulatoryRegionIntervalArrays); return new TranscriptCacheStaging(cacheData); } private static (IGene[] Genes, ITranscriptRegion[] TranscriptRegions, IInterval[] Mirnas, string[] PeptideSeqs) GetUniqueData( IEnumerable> intervalArrays) { var intervalComparer = new IntervalComparer(); var transcriptRegionComparer = new TranscriptRegionComparer(); var geneComparer = new GeneComparer(); var geneSet = new HashSet(geneComparer); var transcriptRegionSet = new HashSet(transcriptRegionComparer); var mirnaSet = new HashSet(intervalComparer); var peptideSet = new HashSet(); foreach (var intervalArray in intervalArrays) { if (intervalArray == null) continue; foreach (var interval in intervalArray.Array) { var transcript = interval.Value; geneSet.Add(transcript.Gene); AddString(peptideSet, transcript.Translation?.PeptideSeq); AddTranscriptRegions(transcriptRegionSet, transcript.TranscriptRegions); AddIntervals(mirnaSet, transcript.MicroRnas); } } var genes = GetUniqueGenes(geneSet); var transcriptRegions = GetUniqueTranscriptRegions(transcriptRegionSet); var mirnas = GetUniqueIntervals(mirnaSet); var peptideSeqs = GetUniqueStrings(peptideSet); return (genes, transcriptRegions, mirnas, peptideSeqs); } private static void AddIntervals(ISet intervalSet, IInterval[] intervals) { if (intervals == null) return; foreach (var interval in intervals) intervalSet.Add(interval); } private static void AddTranscriptRegions(ISet transcriptRegionSet, ITranscriptRegion[] regions) { if (regions == null) return; foreach (var region in regions) transcriptRegionSet.Add(region); } private static void AddString(ISet stringSet, string s) { if (string.IsNullOrEmpty(s)) return; stringSet.Add(s); } private static string[] GetUniqueStrings(ICollection peptideSet) { return peptideSet.Count > 0 ? peptideSet.OrderBy(x => x).ToArray() : null; } private static IInterval[] GetUniqueIntervals(ICollection mirnaSet) { return mirnaSet.Count > 0 ? mirnaSet.SortInterval().ToArray() : null; } private static ITranscriptRegion[] GetUniqueTranscriptRegions(ICollection transcriptRegionSet) { return transcriptRegionSet.Count > 0 ? transcriptRegionSet.SortInterval().ToArray() : null; } private static IGene[] GetUniqueGenes(ICollection geneSet) { return geneSet.Count > 0 ? geneSet.Sort().ToArray() : null; } } } ================================================ FILE: CacheUtils/TranscriptCache/TranscriptCacheUtilities.cs ================================================ using System.Collections.Generic; using System.Linq; using CacheUtils.Genes.Utilities; using CacheUtils.MiniCache; using Genome; using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.TranscriptCache { public static class TranscriptCacheUtilities { public static List GetTranscripts(DataBundle bundle, ChromosomeInterval interval) { ITranscript[] overlappingTranscripts = bundle.TranscriptCache.TranscriptIntervalForest.GetAllOverlappingValues(interval.Chromosome.Index, interval.Start, interval.End); return overlappingTranscripts?.ToList() ?? new List(); } public static IntervalArray[] ToIntervalArrays(this IEnumerable items, int numRefSeqs) where T : IChromosomeInterval { var intervalArrays = new IntervalArray[numRefSeqs]; Dictionary> itemsByRef = items.GetMultiValueDict(x => x.Chromosome.Index); foreach (ushort refIndex in itemsByRef.Keys.OrderBy(x => x)) { List unsortedItems = itemsByRef[refIndex]; Interval[] intervals = unsortedItems.OrderBy(x => x.Start).ThenBy(x => x.End).ToIntervals(unsortedItems.Count); intervalArrays[refIndex] = new IntervalArray(intervals); } return intervalArrays; } private static Interval[] ToIntervals(this IEnumerable items, int numItems) where T : IChromosomeInterval { var intervals = new Interval[numItems]; var i = 0; foreach (var item in items) { intervals[i++] = new Interval(item.Start, item.End, item); } return intervals; } } } ================================================ FILE: CacheUtils/TranscriptCache/TranscriptCacheWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Text; using CacheUtils.TranscriptCache.Comparers; using Compression.Algorithms; using Compression.FileHandling; using Intervals; using IO; using VariantAnnotation.Caches; using VariantAnnotation.IO.Caches; using VariantAnnotation.IO; namespace CacheUtils.TranscriptCache { public sealed class TranscriptCacheWriter : IDisposable { private readonly BlockStream _blockStream; private readonly ExtendedBinaryWriter _writer; private readonly CacheHeader _header; private readonly bool _leaveOpen; public TranscriptCacheWriter(Stream stream, CacheHeader header, bool leaveOpen = false) { _blockStream = new BlockStream(new Zstandard(), stream, CompressionMode.Compress); _writer = new ExtendedBinaryWriter(_blockStream, Encoding.UTF8, leaveOpen); _header = header; _leaveOpen = leaveOpen; } public void Dispose() { if (!_leaveOpen) _blockStream.Dispose(); _writer.Dispose(); } /// /// writes the annotations to the current database file /// public void Write(TranscriptCacheData cacheData) { _blockStream.WriteHeader(_header.Write); WriteItems(_writer, cacheData.Genes, x => x.Write(_writer)); WriteItems(_writer, cacheData.TranscriptRegions, x => x.Write(_writer)); WriteItems(_writer, cacheData.Mirnas, x => x.Write(_writer)); WriteItems(_writer, cacheData.PeptideSeqs, x => _writer.WriteOptAscii(x)); var geneComparer = new GeneComparer(); var transcriptRegionComparer = new TranscriptRegionComparer(); var intervalComparer = new IntervalComparer(); var geneIndices = CreateIndex(cacheData.Genes, geneComparer); var transcriptRegionIndices = CreateIndex(cacheData.TranscriptRegions, transcriptRegionComparer); var microRnaIndices = CreateIndex(cacheData.Mirnas, intervalComparer); var peptideIndices = CreateIndex(cacheData.PeptideSeqs, EqualityComparer.Default); WriteIntervals(_writer, cacheData.RegulatoryRegionIntervalArrays, x => x.Write(_writer)); WriteIntervals(_writer, cacheData.TranscriptIntervalArrays, x => x.Write(_writer, geneIndices, transcriptRegionIndices, microRnaIndices, peptideIndices)); } private static void WriteIntervals(IExtendedBinaryWriter writer, IReadOnlyCollection> intervalArrays, Action writeMethod) { writer.WriteOpt(intervalArrays.Count); foreach (var intervalArray in intervalArrays) { if (intervalArray == null) { writer.WriteOpt(0); continue; } writer.WriteOpt(intervalArray.Array.Length); foreach (var interval in intervalArray.Array) writeMethod(interval.Value); } writer.Write(CacheConstants.GuardInt); } internal static void WriteItems(IExtendedBinaryWriter writer, IReadOnlyCollection items, Action writeMethod) { if (items == null) { writer.WriteOpt(0); } else { writer.WriteOpt(items.Count); foreach (var item in items) writeMethod(item); } writer.Write(CacheConstants.GuardInt); } /// /// creates an index out of a array /// internal static Dictionary CreateIndex(IReadOnlyList array, IEqualityComparer comparer) { var index = new Dictionary(comparer); if (array == null) return index; for (var currentIndex = 0; currentIndex < array.Count; currentIndex++) index[array[currentIndex]] = currentIndex; return index; } } } ================================================ FILE: CacheUtils/TranscriptCache/TranscriptConversionExtensions.cs ================================================ using System.Collections.Generic; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Mutable; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.TranscriptCache { public static class TranscriptConversionExtensions { public static IEnumerable ToTranscripts(this MutableTranscript[] mutableTranscripts) { var transcripts = new List(mutableTranscripts.Length); transcripts.AddRange(mutableTranscripts.Select(mt => mt.ToTranscript())); return transcripts; } private static ITranscript ToTranscript(this MutableTranscript mt) { var translation = mt.CodingRegion == null ? null : GetTranslation(mt.CodingRegion, mt.CdsLength, CompactId.Convert(mt.ProteinId, mt.ProteinVersion), mt.PeptideSequence); var sortedMicroRnas = mt.MicroRnas?.OrderBy(x => x.Start).ToArray(); return new Transcript(mt.Chromosome, mt.Start, mt.End, CompactId.Convert(mt.Id, mt.Version), translation, mt.BioType, mt.UpdatedGene, mt.TotalExonLength, mt.NewStartExonPhase, mt.IsCanonical, mt.TranscriptRegions, (ushort) mt.Exons.Length, sortedMicroRnas, mt.SiftIndex, mt.PolyPhenIndex, mt.Source, mt.CdsStartNotFound, mt.CdsEndNotFound, mt.SelenocysteinePositions, mt.RnaEdits); } private static ITranslation GetTranslation(ICodingRegion oldCodingRegion, int cdsLength, CompactId proteinId, string peptideSeq) { var codingRegion = new CodingRegion(oldCodingRegion.Start, oldCodingRegion.End, oldCodingRegion.CdnaStart, oldCodingRegion.CdnaEnd, cdsLength); return new Translation(codingRegion, proteinId, peptideSeq); } } } ================================================ FILE: CacheUtils/TranscriptCache/TranscriptRegionMerger.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Mutable; using Intervals; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.TranscriptCache { public static class TranscriptRegionMerger { public static ITranscriptRegion[] GetTranscriptRegions(IEnumerable cdnaMaps, MutableExon[] exons, IInterval[] introns, bool onReverseStrand) { var sortedRegions = cdnaMaps.OrderBy(x => x.Start).ThenBy(x => x.End).ToList(); var intronIntervals = introns == null ? null : CreateIntervals(introns.OrderBy(x => x.Start).ThenBy(x => x.End), introns.Length, onReverseStrand); var exonIntervals = CreateIntervals(exons.OrderBy(x => x.Start).ThenBy(x => x.End), exons.Length, onReverseStrand); return sortedRegions.AddGaps() .AddIds(intronIntervals, TranscriptRegionType.Gap, TranscriptRegionType.Intron) .AddIds(exonIntervals, TranscriptRegionType.Exon, TranscriptRegionType.Exon) .AddIds(exonIntervals, TranscriptRegionType.Gap, TranscriptRegionType.Gap) .AddCoords(TranscriptRegionType.Intron, onReverseStrand) .AddCoords(TranscriptRegionType.Gap, onReverseStrand) .ToInterfaceArray(); } private static List AddCoords(this List regions, TranscriptRegionType targetRegionType, bool onReverseStrand) { for (var regionIndex = 0; regionIndex < regions.Count; regionIndex++) { var region = regions[regionIndex]; if (region.Type != targetRegionType) continue; var coords = regions.GetExonCoords(regionIndex, onReverseStrand); region.CdnaStart = coords.CdnaStart; region.CdnaEnd = coords.CdnaEnd; } return regions; } private static (int CdnaStart, int CdnaEnd) GetExonCoords(this IReadOnlyList regions, int regionIndex, bool onReverseStrand) { int cdnaStart = -1; int cdnaEnd = -1; int testIndex = regionIndex; while (testIndex >= 0) { testIndex--; var region = regions[testIndex]; if (region.Type != TranscriptRegionType.Exon) continue; if (onReverseStrand) cdnaEnd = region.CdnaStart; else cdnaStart = region.CdnaEnd; break; } testIndex = regionIndex; while (testIndex < regions.Count) { testIndex++; var region = regions[testIndex]; if (region.Type != TranscriptRegionType.Exon) continue; if (onReverseStrand) cdnaStart = region.CdnaEnd; else cdnaEnd = region.CdnaStart; break; } return (cdnaStart, cdnaEnd); } private static ITranscriptRegion[] ToInterfaceArray(this IReadOnlyList mutableRegions) { var regions = new ITranscriptRegion[mutableRegions.Count]; for (var i = 0; i < mutableRegions.Count; i++) { var region = mutableRegions[i]; regions[i] = new TranscriptRegion(region.Type, region.Id, region.Start, region.End, region.CdnaStart, region.CdnaEnd); } return regions; } private static IdInterval[] CreateIntervals(IEnumerable intervals, int numIntervals, bool onReverseStrand) { var idIntervals = new IdInterval[numIntervals]; ushort id = onReverseStrand ? (ushort)numIntervals : (ushort)1; var index = 0; foreach (var interval in intervals) { idIntervals[index] = new IdInterval(interval.Start, interval.End, id); if (onReverseStrand) id--; else id++; index++; } return idIntervals.OrderBy(x => x.Start).ThenBy(x => x.End).ToArray(); } private static List AddIds(this List regions, IReadOnlyList intervals, TranscriptRegionType targetRegionType, TranscriptRegionType matchRegionType) { if (intervals == null) return regions; foreach (var region in regions) { if (region.Type != targetRegionType) continue; int regionMidPoint = region.Start + (region.End - region.Start >> 1); int index = intervals.BinarySearch(regionMidPoint); if (index < 0) continue; var intron = intervals[index]; region.Type = matchRegionType; region.Id = intron.Id; } return regions; } private static int BinarySearch(this IReadOnlyList intervals, int position) { var begin = 0; int end = intervals.Count - 1; while (begin <= end) { int index = begin + (end - begin >> 1); var interval = intervals[index]; if (position >= interval.Start && position <= interval.End) return index; if (interval.End < position) begin = index + 1; else if (position < interval.Start) end = index - 1; } return ~begin; } private static List AddGaps(this List sortedRegions) { for (var i = 1; i < sortedRegions.Count; i++) { var prevRegion = sortedRegions[i - 1]; var region = sortedRegions[i]; int gapLength = CalculateGapLength(prevRegion, region); if (gapLength == 0) continue; var gapRegion = new MutableTranscriptRegion(TranscriptRegionType.Gap, 0, prevRegion.End + 1, region.Start - 1); sortedRegions.Insert(i, gapRegion); i++; } return sortedRegions; } private static int CalculateGapLength(IInterval prevRegion, IInterval region) => region.Start - prevRegion.End - 1; private sealed class IdInterval : IInterval, IComparable { public int Start { get; } public int End { get; } public readonly ushort Id; public IdInterval(int start, int end, ushort id) { Start = start; End = end; Id = id; } public int CompareTo(IdInterval other) { if (ReferenceEquals(this, other)) return 0; if (ReferenceEquals(null, other)) return 1; int startComparison = Start.CompareTo(other.Start); return startComparison != 0 ? startComparison : End.CompareTo(other.End); } } } } ================================================ FILE: CacheUtils/TranscriptCache/TranscriptRegionValidater.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.DataDumperImport.DataStructures.Mutable; using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; namespace CacheUtils.TranscriptCache { public static class TranscriptRegionValidater { public static void Validate(string transcriptId, IEnumerable cdnaMaps, IEnumerable exons, IEnumerable introns, ITranscriptRegion[] regions) { try { ValidateRegions(transcriptId, regions); if (regions.Length <= 1) return; CheckGenomicCoordinateContiguity(transcriptId, regions); } catch (Exception) { DumpTranscriptRegions(regions); DumpExons(exons); DumpIntrons(introns); DumpCdnaMaps(cdnaMaps); throw; } } private static void CheckGenomicCoordinateContiguity(string transcriptId, IReadOnlyList regions) { for (var i = 1; i < regions.Count; i++) { var prevRegion = regions[i - 1]; var region = regions[i]; int delta = region.Start - prevRegion.End; if (delta != 1) throw new InvalidDataException($"Found non-contiguous genomic coordinates in transcript regions in transcript ({transcriptId})."); } } private static void ValidateRegions(string transcriptId, IEnumerable regions) { foreach (var region in regions) { if (region.Id == 0) throw new InvalidDataException($"Expected transcript ({transcriptId}) to have regions with non-zero IDs."); if (region.CdnaStart < 1) throw new InvalidDataException($"Expected transcript ({transcriptId}) to have regions with true cDNA start positions."); if (region.CdnaEnd < 1) throw new InvalidDataException($"Expected transcript ({transcriptId}) to have regions with true cDNA end positions."); if (region.Type != TranscriptRegionType.Exon && region.Type != TranscriptRegionType.Intron && region.Type != TranscriptRegionType.Gap) throw new InvalidDataException($"Found unexpected transcript region type ({region.Type}) in transcript ({transcriptId})."); } } private static void DumpTranscriptRegions(IEnumerable regions) { Console.WriteLine("\ntranscript regions:"); foreach (var region in regions) DumpTranscriptRegion(region); } private static void DumpTranscriptRegion(ITranscriptRegion region) => Console.WriteLine($"{region.Type}\t{region.Id}\t{region.Start}\t{region.End}\t{region.CdnaStart}\t{region.CdnaEnd}"); private static void DumpCdnaMaps(IEnumerable cdnaMaps) { Console.WriteLine("\ncDNA maps:"); foreach (var cdnaMap in cdnaMaps.OrderBy(x => x.Start).ThenBy(x => x.End)) DumpCdnaMap(cdnaMap); } private static void DumpCdnaMap(ITranscriptRegion cdnaMap) => Console.WriteLine($"{cdnaMap.Start}\t{cdnaMap.End}\t{cdnaMap.CdnaStart}\t{cdnaMap.CdnaEnd}"); private static void DumpIntrons(IEnumerable introns) { Console.WriteLine("\nIntrons:"); foreach (var intron in introns.OrderBy(x => x.Start).ThenBy(x => x.End)) DumpIntron(intron); } private static void DumpIntron(IInterval intron) => Console.WriteLine($"{intron.Start}\t{intron.End}"); private static void DumpExons(IEnumerable exons) { Console.WriteLine("\nExons:"); foreach (var exon in exons.OrderBy(x => x.Start).ThenBy(x => x.End)) DumpExon(exon); } private static void DumpExon(IInterval exon) => Console.WriteLine($"{exon.Start}\t{exon.End}"); } } ================================================ FILE: CacheUtils/Utilities/AccessionUtilities.cs ================================================ using System; using System.IO; using VariantAnnotation.Utilities; namespace CacheUtils.Utilities { internal static class AccessionUtilities { internal static (string Id, byte Version) GetMaxVersion(string originalId, byte originalVersion) { (string pureId, byte idVersion) = FormatUtilities.SplitVersion(originalId); return (pureId, Math.Max(originalVersion, idVersion)); } public static int GetAccessionNumber(string s) { if (string.IsNullOrEmpty(s)) return -1; return s.StartsWith("ENS") ? GetEnsemblAccessionNumber(s) : GetRefSeqAccessionNumber(s); } private static int GetRefSeqAccessionNumber(string s) { int firstUnderlinePos = s.IndexOf('_'); if (firstUnderlinePos == -1) throw new InvalidDataException("Expected an underline in the transcript ID, but didn't find any."); string id = s.Substring(firstUnderlinePos + 1); return int.Parse(id); } private static int GetEnsemblAccessionNumber(string s) { string id = s.Substring(4); return int.Parse(id); } } } ================================================ FILE: CacheUtils/Utilities/HeaderUtilities.cs ================================================ using System; using Genome; using IO; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; namespace CacheUtils.Utilities { public static class HeaderUtilities { public static Header GetHeader(Source source, GenomeAssembly genomeAssembly) => new Header( CacheConstants.Identifier, CacheConstants.SchemaVersion, CacheConstants.DataVersion, source, DateTime.Now.Ticks, genomeAssembly); } } ================================================ FILE: CacheUtils/Utilities/RemoteFile.cs ================================================ using System; using System.IO; using System.Net; using IO; using VariantAnnotation.Utilities; namespace CacheUtils.Utilities { public sealed class RemoteFile { private readonly string _description; public readonly string FilePath; private readonly string _url; static RemoteFile() => ServicePointManager.DefaultConnectionLimit = int.MaxValue; public RemoteFile(string description, string url, bool addDate = true) { _description = description; _url = url; FilePath = Path.Combine(Path.GetTempPath(), GetFilename(url, addDate)); } internal static string GetFilename(string url, bool addDate) { int lastSlashPos = url.LastIndexOf('/'); string originalFilename = url.Substring(lastSlashPos + 1); if (!addDate) return originalFilename; string extension = Path.GetExtension(originalFilename); string filenameStub = Path.GetFileNameWithoutExtension(originalFilename); return $"{filenameStub}_{Date.GetDate(DateTime.Now.Ticks)}{extension}"; } public void Download() { if (File.Exists(FilePath)) return; Logger.WriteLine($"- downloading the {_description}"); while (!SuccessfulDownload()) { Logger.WriteLine($"- requeueing download of the {_description}"); } } private bool SuccessfulDownload() { try { using (var client = new WebClient()) { client.Proxy = null; client.DownloadFileTaskAsync(_url, FilePath).Wait(); } } catch (Exception) { return false; } return true; } } } ================================================ FILE: CacheUtils/Utilities/TaskExtensions.cs ================================================ using System; using System.Collections.Generic; using System.Threading; using System.Threading.Tasks; using CommandLine.Utilities; using IO; namespace CacheUtils.Utilities { public static class TaskExtensions { public static void Execute(this IReadOnlyList items, string description, Action executeAction, int numThreads = 5) { var bench = new Benchmark(); var tasks = new Task[items.Count]; var maxThread = new SemaphoreSlim(numThreads); for (var i = 0; i < items.Count; i++) { maxThread.Wait(); var item = items[i]; tasks[i] = Task.Factory.StartNew(() => executeAction(item), TaskCreationOptions.LongRunning) .ContinueWith(task => maxThread.Release()); } Task.WaitAll(tasks); Logger.WriteLine($"- all {description} finished ({Benchmark.ToHumanReadable(bench.GetElapsedTime())}).\n"); } } } ================================================ FILE: Cloud/AmazonS3ClientWrapper.cs ================================================ using System.Threading.Tasks; using Amazon.S3; using Amazon.S3.Model; using IO; namespace Cloud { public sealed class AmazonS3ClientWrapper : IS3Client { private readonly AmazonS3Client _s3Client; public AmazonS3ClientWrapper(AmazonS3Client s3Client) => _s3Client = s3Client; public Task GetObjectAsync(GetObjectRequest getRequest) => _s3Client.GetObjectAsync(getRequest); public Task PutObjectAsync(PutObjectRequest putRequest) => _s3Client.PutObjectAsync(putRequest); } } ================================================ FILE: Cloud/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: Cloud/Cloud.appsettings.json ================================================ { "DataSource": { "BaseUrl": "http://nirvana-annotations.s3.us-west-2.amazonaws.com/", "CacheDirectory": "ab0cf104f39708eabd07b8cb67e149ba-Cache", "ReferencesDirectory": "d95867deadfe690e40f42068d6b59df8-References", "ManifestDirectory": "a9f54ea6ac0548696c97a3ee64bc39ec2e71b84b-SaManifest" } } ================================================ FILE: Cloud/Cloud.csproj ================================================  net6.0 ..\bin\$(Configuration) PreserveNewest ================================================ FILE: Cloud/Configuration.cs ================================================ using Cloud.Utilities; namespace Cloud; using Microsoft.Extensions.Configuration; public sealed class Configuration { public readonly IConfigurationRoot Config; public readonly IConfigurationSection DataSources; public string CacheDirectory => DataSources["CacheDirectory"]; public string ReferencesDirectory => DataSources["ReferencesDirectory"]; public string ManifestDirectory => DataSources["ManifestDirectory"]; public string NirvanaBaseUrl => DataSources["BaseUrl"]; public Configuration() { const string appSettingsFilename = "Cloud.appsettings.json"; Config = new ConfigurationBuilder() .AddJsonFile(appSettingsFilename) .Build(); DataSources = Config.GetSection("DataSource"); } } ================================================ FILE: Cloud/LambdaUrlHelper.cs ================================================ using System; using Cloud.Utilities; using Genome; using IO; using ReferenceSequence; namespace Cloud { public static class LambdaUrlHelper { public const ushort SaSchemaVersion = 22; public const string UrlBaseEnvironmentVariableName = "NirvanaDataUrlBase"; private static readonly Configuration Config = new (); public static string S3CacheFolderBase = Config.CacheDirectory; // public const string S3ManifestFolderBase = "a9f54ea6ac0548696c97a3ee64bc39ec2e71b84b-SaManifest"; public static readonly string S3CacheFolder = $"{Config.CacheDirectory}/{CacheConstants.DataVersion}/"; private static readonly string S3RefPrefix = $"{Config.ReferencesDirectory}/{ReferenceSequenceCommon.HeaderVersion}/Homo_sapiens."; private const string UgaFileName = "UGA.tsv.gz"; public const string DefaultCacheSource = "Both"; public const string RefSuffix = ".Nirvana.dat"; public const string JsonSuffix = ".json.gz"; public const string JsonIndexSuffix = ".jsi"; public const string SuccessMessage = "Success"; public static string GetBaseUrl() { var envBaseUrl = Environment.GetEnvironmentVariable(UrlBaseEnvironmentVariableName); return string.IsNullOrEmpty(envBaseUrl) ? Config.NirvanaBaseUrl: envBaseUrl; } public static string GetManifestBaseUrl() => GetBaseUrl() + Config.ManifestDirectory; public static string GetCacheFolder() => GetBaseUrl() + S3CacheFolder; public static string GetUgaUrl() => GetCacheFolder() + UgaFileName; public static string GetRefPrefix() => GetBaseUrl() + S3RefPrefix; public static string GetRefUrl(GenomeAssembly genomeAssembly) => GetRefPrefix() + genomeAssembly + RefSuffix; } } ================================================ FILE: Cloud/Messages/Annotation/AnnotationConfig.cs ================================================ using System.Collections.Generic; namespace Cloud.Messages.Annotation { public sealed class AnnotationConfig { // ReSharper disable InconsistentNaming // ReSharper disable NotAccessedField.Global public string id; public string genomeAssembly; public string vcfUrl; public string tabixUrl; public S3Path outputDir; public string outputPrefix; public List customAnnotations; public List desiredVcfInfo; public List desiredVcfSampleInfo; public string customStrUrl; public AnnotationRange annotationRange; // ReSharper restore NotAccessedField.Global // ReSharper restore InconsistentNaming } } ================================================ FILE: Cloud/Messages/Annotation/AnnotationPosition.cs ================================================ namespace Cloud.Messages.Annotation { public struct AnnotationPosition { public readonly string Chromosome; public readonly int Position; public AnnotationPosition(string chromosome, int position) { Chromosome = chromosome; Position = position; } } } ================================================ FILE: Cloud/Messages/Annotation/AnnotationRange.cs ================================================ using System.Collections.Generic; using Genome; namespace Cloud.Messages.Annotation { public sealed class AnnotationRange { public readonly AnnotationPosition Start; public readonly AnnotationPosition? End; public AnnotationRange(AnnotationPosition start, AnnotationPosition? end) { Start = start; End = end; } public GenomicRange ToGenomicRange(Dictionary refNameToChromosome) { var startGenomicPosition = new GenomicPosition(ReferenceNameUtilities.GetChromosome(refNameToChromosome, Start.Chromosome), Start.Position); GenomicPosition? endGenomicPosition = null; if (End != null) endGenomicPosition = new GenomicPosition(ReferenceNameUtilities.GetChromosome(refNameToChromosome, End.Value.Chromosome), End.Value.Position); return new GenomicRange(startGenomicPosition, endGenomicPosition); } } } ================================================ FILE: Cloud/Messages/Annotation/AnnotationResult.cs ================================================ using ErrorHandling; namespace Cloud.Messages.Annotation { // ReSharper disable once ClassNeverInstantiated.Global public sealed class AnnotationResult { // ReSharper disable InconsistentNaming // ReSharper disable UnassignedField.Global public string id; public string status; public string filePath; public ErrorCategory? errorCategory; public int variantCount; // ReSharper restore UnassignedField.Global // ReSharper restore InconsistentNaming } } ================================================ FILE: Cloud/Messages/Custom/CustomConfig.cs ================================================ // ReSharper disable InconsistentNaming namespace Cloud.Messages.Custom { public sealed class CustomConfig { public string id; public string tsvUrl; public S3Path outputDir; public JwtFields jwtFields; public bool skipGeneIdValidation; public bool skipRefBaseValidation; } } ================================================ FILE: Cloud/Messages/Custom/CustomResult.cs ================================================ namespace Cloud.Messages.Custom { // ReSharper disable NotAccessedField.Global // ReSharper disable InconsistentNaming public sealed class CustomResult { public string id; public string status; public string genomeAssembly; public FileList created; public bool noValidEntries; public JwtFields jwtFields; public int variantCount; } // ReSharper restore InconsistentNaming // ReSharper restore NotAccessedField.Global } ================================================ FILE: Cloud/Messages/FileList.cs ================================================ namespace Cloud.Messages { // ReSharper disable InconsistentNaming // ReSharper disable NotAccessedField.Global public sealed class FileList { public string bucketName; public string outputDir; public string[] files; } // ReSharper restore NotAccessedField.Global // ReSharper restore InconsistentNaming } ================================================ FILE: Cloud/Messages/Gene/GeneConfig.cs ================================================ using ErrorHandling.Exceptions; using IO; namespace Cloud.Messages.Gene { public sealed class GeneConfig { // ReSharper disable InconsistentNaming public string id; public string[] geneSymbols; public string[] ngaUrls; // ReSharper restore InconsistentNaming public void Validate() { if (string.IsNullOrEmpty(id)) throw new UserErrorException("Please provide the id of the job."); if (geneSymbols == null || geneSymbols.Length == 0) throw new UserErrorException("Please provide at lease one gene symbol."); if (ngaUrls == null) return; foreach (string ngaUrl in ngaUrls) HttpUtilities.ValidateUrl(ngaUrl); } } } ================================================ FILE: Cloud/Messages/JwtFields.cs ================================================ namespace Cloud.Messages { public class JwtFields { // ReSharper disable InconsistentNaming public string authorizedParty; public string subject; public string tenantId; // ReSharper restore InconsistentNaming } } ================================================ FILE: Cloud/Messages/Nirvana/NirvanaConfig.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; using IO; namespace Cloud.Messages.Nirvana { public sealed class NirvanaConfig { // ReSharper disable InconsistentNaming public string id; public string genomeAssembly; public string vcfUrl; public string tabixUrl; public S3Path outputDir; // ReSharper disable once UnassignedField.Global public List customAnnotations; public List desiredVcfInfo; public List desiredVcfSampleInfo; public string customStrUrl; public JwtFields jwtFields; // ReSharper restore InconsistentNaming internal void CheckRequiredFieldsNotNull() { static string BuildErrorMessage(string message) => message + " cannot be null."; if (id == null) throw new UserErrorException(BuildErrorMessage("id")); if (genomeAssembly == null) throw new UserErrorException(BuildErrorMessage("genomeAssembly")); if (vcfUrl == null) throw new UserErrorException(BuildErrorMessage("vcfUrl")); if (tabixUrl == null) throw new UserErrorException(BuildErrorMessage("tabixUrl")); if (outputDir == null) throw new UserErrorException(BuildErrorMessage("outputDir")); if (outputDir.bucketName == null) throw new UserErrorException(BuildErrorMessage("bucketName of outputDir")); if (outputDir.region == null) throw new UserErrorException(BuildErrorMessage("region of outputDir")); if (outputDir.path == null) throw new UserErrorException(BuildErrorMessage("path of outputDir")); if (outputDir.accessKey == null) throw new UserErrorException(BuildErrorMessage("accessKey of outputDir")); if (outputDir.secretKey == null) throw new UserErrorException(BuildErrorMessage("secretKey of outputDir")); if (outputDir.sessionToken == null) throw new UserErrorException(BuildErrorMessage("sessionToken of outputDir")); } public void Validate() { CheckRequiredFieldsNotNull(); HttpUtilities.ValidateUrl(vcfUrl); HttpUtilities.ValidateUrl(tabixUrl); outputDir.Validate(true); customAnnotations?.ForEach(x => x.Validate()); if (customStrUrl != null) HttpUtilities.ValidateUrl(customStrUrl); } } } ================================================ FILE: Cloud/Messages/Nirvana/NirvanaResult.cs ================================================ namespace Cloud.Messages.Nirvana { public sealed class NirvanaResult { // ReSharper disable InconsistentNaming // ReSharper disable NotAccessedField.Global public string id; public string status; public FileList created; public JwtFields jwtFields; public int variantCount; // ReSharper restore NotAccessedField.Global // ReSharper restore InconsistentNaming } } ================================================ FILE: Cloud/Messages/S3Path.cs ================================================ // ReSharper disable InconsistentNaming using System; using Amazon; using Amazon.S3; using Amazon.S3.Model; using Cloud.Utilities; using ErrorHandling.Exceptions; using IO; namespace Cloud.Messages { public sealed class S3Path { public string bucketName; public string region; public string path; public string accessKey; public string secretKey; public string sessionToken; public void Validate(bool isDirectory) { ValidatePathFormat(path, isDirectory); path = FormatPath(path); CheckS3Region(); var s3Client = GetS3Client(TimeSpan.FromMinutes(5)); ValidateCredentials(s3Client, isDirectory); } private void CheckS3Region() { if (RegionEndpoint.GetBySystemName(region).DisplayName == "Unknown") throw new UserErrorException($"Unknown S3 Region {region}"); } private const int MaxRetryCount = 4; private void ValidateCredentials(IS3Client s3Client, bool isDirectory) { int maxRetryCount = MaxRetryCount; while (true) { try { if (isDirectory) { var putRequest = new PutObjectRequest { BucketName = bucketName, Key = path }; s3Client.PutObjectAsync(putRequest).Wait(); } else { var getRequest = new GetObjectRequest { BucketName = bucketName, Key = path, ByteRange = new ByteRange(0, 1) }; s3Client.GetObjectAsync(getRequest).Wait(); } // validation successful. Break and return. break; } catch (Exception exception) { var processedException = AwsExceptionUtilities.TryConvertUserException(exception, this); if (processedException is UserErrorException) throw processedException; Logger.WriteLine($"Failed to validate S3 credentials\n{processedException.Message}"); maxRetryCount--; if (maxRetryCount >= 0) continue; Logger.WriteLine("Max retry limit reached for validating S3 credentials."); throw processedException; } } } internal static void ValidatePathFormat(string path, bool isDirectory) { if (isDirectory == path.EndsWith('/')) return; string errorMessage = isDirectory ? $"Expect a directory, but S3 path {path} doesn't end up with a '/'" : $"Expect a file, but S3 path {path} ends up with a '/'"; throw new UserErrorException(errorMessage); } public static string FormatPath(string path) => path.TrimStart('/'); public IS3Client GetS3Client(TimeSpan timeOut) => new AmazonS3ClientWrapper(new AmazonS3Client(accessKey, secretKey, sessionToken, new AmazonS3Config { RegionEndpoint = RegionEndpoint.GetBySystemName(region), Timeout = timeOut })); } } ================================================ FILE: Cloud/Messages/SaUrls.cs ================================================ // ReSharper disable InconsistentNaming using System.IO; using System.Linq; using ErrorHandling.Exceptions; using IO; namespace Cloud.Messages { public sealed class SaUrls { public string nsaUrl; public string idxUrl; public string nsiUrl; public string ngaUrl; public CustomSaType SaType => GetSaType(); private CustomSaType _saType; public void Validate() { switch (SaType) { case CustomSaType.Nsa: HttpUtilities.ValidateUrl(nsaUrl); HttpUtilities.ValidateUrl(idxUrl); break; case CustomSaType.Nsi: HttpUtilities.ValidateUrl(nsiUrl); break; case CustomSaType.Nga: HttpUtilities.ValidateUrl(ngaUrl); break; default: throw new InvalidDataException("Unknown custom SA type."); } } internal CustomSaType GetSaType() { if (_saType != default) return _saType; bool[] checkSaTypes = {nsaUrl != null, nsiUrl != null, ngaUrl != null}; CustomSaType[] providedTypes = checkSaTypes.Select((x, i) => (Provided: x, SaTypeIndex: i + 1)).Where(y => y.Provided) .Select(y => (CustomSaType) y.SaTypeIndex).ToArray(); if (providedTypes.Length == 0) throw new UserErrorException("No custom annotation file provided."); if (providedTypes.Length > 1) throw new UserErrorException( $"Multiple types of annotation files found: {providedTypes.Select(x => x.ToString())}. Please just provide one type of custom annotation file(s)"); if (providedTypes[0] == CustomSaType.Nsa && idxUrl == null) throw new UserErrorException($"Index file is not provided for the NSA file {nsaUrl}."); _saType = providedTypes[0]; return _saType; } public override string ToString() { switch (SaType) { case CustomSaType.Nsa: return $"{{\"nsaUrl\":\"{nsaUrl}\", \"idxUrl\":\"{idxUrl}\"}}"; case CustomSaType.Nsi: return $"{{\"nsiUrl\":\"{nsiUrl}\"}}"; case CustomSaType.Nga: return $"{{\"ngaUrl\":\"{ngaUrl}\"}}"; default: throw new InvalidDataException("Unknown custom SA type."); } } } public enum CustomSaType { Nsa = 1, Nsi, Nga } } ================================================ FILE: Cloud/Messages/Single/SingleConfig.cs ================================================ using System.Collections.Generic; namespace Cloud.Messages.Single { public sealed class SingleConfig { // ReSharper disable InconsistentNaming public string id; public string genomeAssembly; public SingleVariant variant; public int vepVersion; public string supplementaryAnnotations; public List customAnnotations; // ReSharper restore InconsistentNaming } } ================================================ FILE: Cloud/Messages/Single/SingleVariant.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; namespace Cloud.Messages.Single { public sealed class SingleVariant { // ReSharper disable InconsistentNaming public string chromosome; public int? position; public string refAllele; public string[] altAlleles; public double? quality; public string[] filters; public string infoField; public string formatField; public string[] sampleFields; public string[] sampleNames; // ReSharper restore InconsistentNaming private const string VcfMissingValue = "."; public void Validate() { if (string.IsNullOrEmpty(chromosome)) throw new UserErrorException("Please provide the chromosome."); if (position == null) throw new UserErrorException("Please provide the position."); if (string.IsNullOrEmpty(refAllele)) throw new UserErrorException("Please provide the reference allele."); if (altAlleles == null || altAlleles.Length == 0) throw new UserErrorException("Please provide the alternate alleles."); if (!string.IsNullOrEmpty(formatField) || sampleFields != null || sampleNames != null) { if (string.IsNullOrEmpty(formatField)) throw new UserErrorException("Please provide a format field when supplying sample fields or sample names."); int numSampleFields = sampleFields?.Length ?? 0; if (numSampleFields == 0) throw new UserErrorException("Please provide sample fields when supplying sample names and the format field."); int numSampleNames = sampleNames?.Length ?? 0; if (numSampleNames == 0) throw new UserErrorException("Please provide sample names when supplying sample fields and the format field."); if (sampleFields?.Length != sampleNames?.Length) throw new UserErrorException("Please provide the same number of sample fields as sample names."); } } public string[] GetVcfFields() { string altAlleleField = GetStringFromNullableCollection(altAlleles, ','); string filterField = GetStringFromNullableCollection(filters, ';'); var vcfFields = new List { chromosome, position.ToString(), VcfMissingValue, refAllele, altAlleleField, quality?.ToString() ?? VcfMissingValue, filterField, infoField ?? VcfMissingValue }; if (sampleFields != null) { vcfFields.Add(formatField ?? VcfMissingValue); vcfFields.AddRange(sampleFields); } return vcfFields.ToArray(); } private static string GetStringFromNullableCollection(string[] values, char separator) => values == null || values.Length == 0 ? VcfMissingValue : string.Join(separator, values); } } ================================================ FILE: Cloud/Messages/StrValidation/ValidationConfig.cs ================================================ using IO; namespace Cloud.Messages.StrValidation { public sealed class ValidationConfig { // ReSharper disable InconsistentNaming public string id; public string genomeAssembly; public string customStrUrl; // ReSharper restore InconsistentNaming public void Validate() => HttpUtilities.ValidateUrl(customStrUrl); } } ================================================ FILE: Cloud/Messages/StrValidation/ValidationResult.cs ================================================ namespace Cloud.Messages.StrValidation { public class ValidationResult { // ReSharper disable InconsistentNaming // ReSharper disable NotAccessedField.Global public string id; public string status; // ReSharper restore NotAccessedField.Global // ReSharper restore InconsistentNaming } } ================================================ FILE: Cloud/Notifications/SNS.cs ================================================ using System; using IO; namespace Cloud.Notifications { public static class SNS { public static void SendMessage(string snsTopicArn, string snsMessage) { try { using (var snsClient = new Amazon.SimpleNotificationService.AmazonSimpleNotificationServiceClient()) { snsClient.PublishAsync(snsTopicArn, snsMessage).Wait(); } } catch (Exception e) { Logger.WriteLine("Unable to log to SNS!!"); Logger.WriteLine(e.Message); } } public static string CreateMessage(string message, string status, string stackTrace) => $"{message}\n{status}\nStackTrace: {stackTrace}"; } } ================================================ FILE: Cloud/RedactionUtilities.cs ================================================ using System.Text.RegularExpressions; namespace Cloud { public static class RedactionUtilities { private static readonly Regex AwsAccessKeyIdRegex = new Regex("AWSAccessKeyId=([^&]+)"); private static readonly Regex AmzCredentialRegex = new Regex("X-Amz-Credential=([^/]+)"); private static readonly Regex AccessKeyRegex = new Regex("\"accessKey\":\"([^\"]+)"); private static readonly Regex SecretKeyRegex = new Regex("\"secretKey\":\"([^\"]+)"); private static readonly Regex SessionTokenRegex = new Regex("\"sessionToken\":\"([^\"]+)"); public static string Redact(this string s) { var awsAccessKeyIdMatches = AwsAccessKeyIdRegex.Matches(s); var amzCredentialMatches = AmzCredentialRegex.Matches(s); var accessKeyMatches = AccessKeyRegex.Matches(s); var secretKeyMatches = SecretKeyRegex.Matches(s); var sessionTokenMatches = SessionTokenRegex.Matches(s); char[] charArray = s.ToCharArray(); charArray.Mask(awsAccessKeyIdMatches).Mask(amzCredentialMatches).Mask(accessKeyMatches) .Mask(secretKeyMatches).Mask(sessionTokenMatches); return new string(charArray); } private static char[] Mask(this char[] charArray, MatchCollection matches) { foreach (Match match in matches) { var group = match.Groups[1]; for (var i = 0; i < group.Length; i++) { charArray[group.Index + i] = 'X'; } } return charArray; } } } ================================================ FILE: Cloud/Utilities/AwsExceptionUtilities.cs ================================================ using System; using Amazon.S3; using Cloud.Messages; using ErrorHandling.Exceptions; namespace Cloud.Utilities { public static class AwsExceptionUtilities { public static Exception TryConvertUserException(Exception exception, S3Path s3Path) { AmazonS3Exception s3Exception; while ((s3Exception = exception as AmazonS3Exception) == null) { if (exception.InnerException == null) return exception; exception = exception.InnerException; } string extraInfo; switch (s3Exception.ErrorCode) { case "ExpiredToken": case "InvalidToken": extraInfo = s3Path?.sessionToken; break; case "InvalidAccessKeyId": extraInfo = s3Path?.accessKey; break; case "SignatureDoesNotMatch": extraInfo = s3Path?.secretKey; break; case "NoSuchBucket": extraInfo = s3Path?.bucketName; break; case "AccessDenied": case "NoSuchKey": extraInfo = s3Path?.path; break; default: return s3Exception; } string errorMessage = extraInfo == null ? s3Exception.Message : $"{s3Exception.Message} ({extraInfo})"; return new UserErrorException(errorMessage); } } } ================================================ FILE: Cloud/Utilities/JsonUtilities.cs ================================================ using System.IO; using System.Text; using Amazon.Lambda.Serialization.Json; namespace Cloud.Utilities { public static class JsonUtilities { private static readonly JsonSerializer JsonSerializer = new JsonSerializer(); public static string Stringify(object obj) => Encoding.UTF8.GetString(Serialize(obj).ToArray()); public static MemoryStream Serialize(object obj) { var memoryStream = new MemoryStream(); JsonSerializer.Serialize(obj, memoryStream); memoryStream.Position = 0; return memoryStream; } public static T Deserialize(MemoryStream memoryStream) => JsonSerializer.Deserialize(memoryStream); } } ================================================ FILE: Cloud/Utilities/LambdaUtilities.cs ================================================ using System; using System.IO; using Genome; using IO; namespace Cloud.Utilities { public static class LambdaUtilities { public const string SuccessMessage = "Success"; public const string SnsTopicKey = "SnsTopicArn"; public static void GarbageCollect() { GC.Collect(); GC.WaitForPendingFinalizers(); } public static string GetEnvironmentVariable(string key) { string value = Environment.GetEnvironmentVariable(key); if (string.IsNullOrEmpty(value)) throw new InvalidDataException($"Environment variable {key} is not set."); return value; } public static void DeleteTempOutput() { string[] files = Directory.GetFiles(Path.GetTempPath()); if (files.Length == 0) return; foreach (string tempFile in files) File.Delete(tempFile); } public static string GetManifestUrl(string version, GenomeAssembly genomeAssembly, int saSchemaVersion = LambdaUrlHelper.SaSchemaVersion) { if (string.IsNullOrEmpty(version)) version = "latest"; string s3BaseUrl = LambdaUrlHelper.GetManifestBaseUrl() +$"/{saSchemaVersion}/"; switch (version) { case "latest": return $"{s3BaseUrl}latest_SA_{genomeAssembly}.txt"; case "release": return $"{s3BaseUrl}DRAGEN_3.4_{genomeAssembly}.txt"; case "none": return null; default: return $"{s3BaseUrl}{version}_SA_{genomeAssembly}.txt"; } } public static string GetCachePathPrefix(GenomeAssembly genomeAssembly) { return LambdaUrlHelper.GetCacheFolder().UrlCombine(genomeAssembly.ToString()) .UrlCombine(LambdaUrlHelper.DefaultCacheSource); } } } ================================================ FILE: Cloud/Utilities/LogUtilities.cs ================================================ using System; using System.Collections.Generic; using System.Text; using Amazon.Lambda.Core; using IO; using Newtonsoft.Json; namespace Cloud.Utilities { public static class LogUtilities { public static void LogLambdaInfo(ILambdaContext context, string version) => Logger.WriteLine( $"Lambda version: {version} ARN: {context?.InvokedFunctionArn}\nLog group: {context?.LogGroupName}\nLog stream: {context?.LogStreamName}"); public static void LogObject(string title, T config) { string json; switch (config) { case string s: json = s; break; default: json = JsonConvert.SerializeObject(config); break; } Logger.WriteLine($"{title}:\n{json.Redact()}"); } public static void Log(IEnumerable environmentVariables) { var sb = new StringBuilder(); sb.AppendLine("Environment variables:"); foreach (string key in environmentVariables) { string value = Environment.GetEnvironmentVariable(key) ?? "null"; sb.AppendLine($"- {key}: {value}"); } Logger.WriteLine(sb.ToString()); } public static void UpdateLogger(ILambdaLogger logger, StringBuilder sb) { Logger.WriteLine = s => { logger.LogLine(s); sb?.Append(s + "\n"); }; } } } ================================================ FILE: Cloud/Utilities/UploadUtilities.cs ================================================ using System; using System.Security.Cryptography; using System.Threading; using Amazon.S3.Model; using ErrorHandling.Exceptions; using IO; namespace Cloud.Utilities { public static class UploadUtilities { public static void DecryptUpload(this IS3Client s3Client, string bucketName, string key, string filePath, AesCryptoServiceProvider aes, FileMetadata metadata, int retryDelay = 1000) { while (true) { if (s3Client.TryDecryptUpload(bucketName, key, filePath, aes, metadata)) return; Thread.Sleep(retryDelay); } } internal static bool TryDecryptUpload(this IS3Client s3Client, string bucketName, string key, string filePath, AesCryptoServiceProvider aes, FileMetadata metadata) { try { using (var fileStream = FileUtilities.GetReadStream(filePath)) using (var cryptoStream = new CryptoStream(fileStream, aes.CreateDecryptor(), CryptoStreamMode.Read)) using (var lengthStream = new LengthStream(cryptoStream, metadata.Length)) { string md5String = Convert.ToBase64String(metadata.MD5); var request = new PutObjectRequest { BucketName = bucketName, Key = key, InputStream = lengthStream, MD5Digest = md5String }; s3Client.PutObjectAsync(request).Wait(); } return true; } catch (Exception exception) { var processedException = AwsExceptionUtilities.TryConvertUserException(exception, null); if (processedException is UserErrorException) throw processedException; Logger.WriteLine($"Exception: {exception.Message}."); return false; } } } } ================================================ FILE: CommandLine/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: CommandLine/Builders/ConsoleAppBuilder.cs ================================================ using System; using System.Collections.Generic; using CommandLine.NDesk.Options; using CommandLine.Utilities; using CommandLine.VersionProviders; using ErrorHandling; using VariantAnnotation.Interface.Providers; namespace CommandLine.Builders { public sealed class ConsoleAppBuilder : IConsoleAppBuilder { private readonly IConsoleAppBuilderData _data; private readonly string[] _args; public ConsoleAppBuilder(string[] args, OptionSet ops) { _args = args; _data = new ConsoleAppBuilderData { Ops = ops, HasArguments = _args != null && _args.Length > 0 }; AddAdditionalOptions(); } private void AddAdditionalOptions() { _data.Ops.Add("help|h", "displays the help menu", v => _data.ShowHelpMenu = v != null); _data.Ops.Add("version|v", "displays the version", v => _data.ShowVersion = v != null); } public IConsoleAppValidator Parse() { if (!_data.HasArguments) { _data.ExitCode = ExitCodes.MissingCommandLineOption; _data.ShowHelpMenu = true; return new ConsoleAppValidator(_data); } try { _data.UnsupportedOps = _data.Ops.Parse(_args); if (_data.UnsupportedOps.Count > 0) { _data.AddError($"Found unknown command-line option(s): {string.Join(", ", _data.UnsupportedOps)}", ExitCodes.UnknownCommandLineOption); } } catch (OptionException oe) { _data.AddError(oe.Message, ExitCodes.UnknownCommandLineOption); } return new ConsoleAppValidator(_data); } public IConsoleAppBuilder UseVersionProvider(IVersionProvider versionProvider) { _data.VersionProvider = versionProvider; return this; } } public sealed class ConsoleAppValidator : IConsoleAppValidator { public IConsoleAppBuilderData Data { get; } public bool SkipValidation { get; } public ConsoleAppValidator(IConsoleAppBuilderData data) { Data = data; SkipValidation = !data.HasArguments || data.ShowHelpMenu || data.ShowVersion; } public IConsoleAppValidator DisableOutput(bool condition = true) { if (condition) Data.DisableOutput = true; return this; } public IConsoleAppBanner ShowBanner(string authors) { if (Data.ShowVersion) Console.WriteLine($"{CommandLineUtilities.Title} {CommandLineUtilities.InformationalVersion} {Data.VersionProvider.DataVersion}"); else if (!Data.DisableOutput) CommandLineUtilities.DisplayBanner(authors); return new ConsoleAppBanner(Data); } public IConsoleAppBanner SkipBanner() => new ConsoleAppBanner(Data); } public sealed class ConsoleAppBanner : IConsoleAppBanner { private readonly IConsoleAppBuilderData _data; public ConsoleAppBanner(IConsoleAppBuilderData data) => _data = data; public IConsoleAppHelpMenu ShowHelpMenu(string description, string commandLineExample) { // ReSharper disable once InvertIf if (_data.ShowHelpMenu || _data.Errors.Count > 0) { Help.Show(_data.Ops, commandLineExample, description); Console.WriteLine($"\n{_data.VersionProvider.DataVersion}\n"); } return new ConsoleAppHelpMenu(_data); } } public sealed class ConsoleAppHelpMenu : IConsoleAppHelpMenu { private readonly IConsoleAppBuilderData _data; public ConsoleAppHelpMenu(IConsoleAppBuilderData data) => _data = data; public IConsoleAppErrors ShowErrors() { // ReSharper disable once InvertIf if (_data.Errors.Count > 0) { Console.WriteLine("\nSome problems were encountered when parsing the command line options:"); PrintErrors(); Console.WriteLine("\nFor a complete list of command line options, type \"dotnet {0} -h\"", CommandLineUtilities.CommandFileName); } return new ConsoleAppErrors(_data); } private void PrintErrors() { foreach (string error in _data.Errors) { Console.Write("- "); Console.ForegroundColor = ConsoleColor.Red; Console.Write("ERROR: "); Console.ResetColor(); Console.WriteLine(error); } } } public sealed class ConsoleAppErrors : IConsoleAppErrors { private readonly IConsoleAppBuilderData _data; private bool Continue => _data.ExitCode == ExitCodes.Success && _data.HasArguments && !_data.ShowVersion && !_data.ShowHelpMenu; public ConsoleAppErrors(IConsoleAppBuilderData data) => _data = data; public ExitCodes Execute(Func executeMethod) { if (!Continue) return _data.ExitCode; var benchmark = new Benchmark(); ExitCodes exitCode; try { exitCode = executeMethod(); ShowPerformanceData(benchmark); } catch (Exception e) { exitCode = ExitCodeUtilities.ShowException(e); } return exitCode; } private void ShowPerformanceData(Benchmark benchmark) { if (_data.DisableOutput) return; long peakMemoryUsageBytes = MemoryUtilities.GetPeakMemoryUsage(); var wallTimeSpan = benchmark.GetElapsedTime(); Console.WriteLine(); if (peakMemoryUsageBytes > 0) Console.WriteLine("Peak memory usage: {0}", MemoryUtilities.ToHumanReadable(peakMemoryUsageBytes)); Console.WriteLine("Time: {0}", Benchmark.ToHumanReadable(wallTimeSpan)); } } public sealed class ConsoleAppBuilderData : IConsoleAppBuilderData { public OptionSet Ops { get; set; } public List UnsupportedOps { get; set; } public List Errors { get; } = new List(); public ExitCodes ExitCode { get; set; } = ExitCodes.Success; public bool DisableOutput { get; set; } public bool HasArguments { get; set; } public IVersionProvider VersionProvider { get; set; } = new DefaultVersionProvider(); public bool ShowHelpMenu { get; set; } public bool ShowVersion { get; set; } public void AddError(string errorMessage, ExitCodes exitCode) { ExitCode = exitCode; Errors.Add(errorMessage); } } } ================================================ FILE: CommandLine/Builders/IConsoleAppBuilder.cs ================================================ using System; using System.Collections.Generic; using CommandLine.NDesk.Options; using ErrorHandling; using VariantAnnotation.Interface.Providers; namespace CommandLine.Builders { /// We are using separate interfaces to enforce ordering in the console application /// builder. public interface IConsoleAppBuilder { // ReSharper disable once UnusedMemberInSuper.Global IConsoleAppBuilder UseVersionProvider(IVersionProvider versionProvider); IConsoleAppValidator Parse(); } public interface IConsoleAppValidator { IConsoleAppValidator DisableOutput(bool condition = true); IConsoleAppBanner ShowBanner(string authors); IConsoleAppBanner SkipBanner(); IConsoleAppBuilderData Data { get; } bool SkipValidation { get; } } public interface IConsoleAppBanner { IConsoleAppHelpMenu ShowHelpMenu(string description, string commandLineExample); } public interface IConsoleAppHelpMenu { IConsoleAppErrors ShowErrors(); } public interface IConsoleAppErrors { ExitCodes Execute(Func executeMethod); } public interface IConsoleAppBuilderData { OptionSet Ops { get; } List UnsupportedOps { get; set; } List Errors { get; } ExitCodes ExitCode { get; set; } bool DisableOutput { get; set; } bool HasArguments { get; } IVersionProvider VersionProvider { get; set; } bool ShowHelpMenu { get; set; } bool ShowVersion { get; set; } void AddError(string errorMessage, ExitCodes exitCode); } } ================================================ FILE: CommandLine/Builders/ITopLevelAppBuilder.cs ================================================ using System; using System.Collections.Generic; using ErrorHandling; namespace CommandLine.Builders { public interface ITopLevelAppBuilder { // ReSharper disable once UnusedMemberInSuper.Global ITopLevelAppValidator Parse(); } public interface ITopLevelAppValidator { ITopLevelAppBanner ShowBanner(string authors); ITopLevelAppBuilderData Data { get; } } public interface ITopLevelAppBanner { ITopLevelAppHelpMenu ShowHelpMenu(string description); } public interface ITopLevelAppHelpMenu { ITopLevelAppErrors ShowErrors(); } public interface ITopLevelAppErrors { ExitCodes Execute(); } public interface ITopLevelAppBuilderData { string[] Arguments { get; } Dictionary Ops { get; } bool HasArguments { get; } string Command { get; } List Errors { get; } ExitCodes ExitCode { get; set; } bool ShowHelpMenu { get; set; } Func ExecuteMethod { get; set; } void AddError(string errorMessage, ExitCodes exitCode); } } ================================================ FILE: CommandLine/Builders/TopLevelAppBuilder.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using CommandLine.Utilities; using ErrorHandling; namespace CommandLine.Builders { public sealed class TopLevelAppBuilder : ITopLevelAppBuilder { private readonly ITopLevelAppBuilderData _data; public TopLevelAppBuilder(string[] args, Dictionary ops) { _data = new TopLevelAppBuilderData(args, ops); } public ITopLevelAppValidator Parse() { if (!_data.HasArguments) { _data.ExitCode = ExitCodes.MissingCommandLineOption; _data.ShowHelpMenu = true; return new TopLevelAppValidator(_data); } _data.ExecuteMethod = GetExecuteMethod(_data.Command); return new TopLevelAppValidator(_data); } private Func GetExecuteMethod(string command) { var lowerDict = new Dictionary(); foreach (var kvp in _data.Ops) lowerDict[kvp.Key.ToLower()] = kvp.Value; if (lowerDict.TryGetValue(command, out var topLevelOption)) return topLevelOption.CommandMethod; _data.AddError($"An unrecognized command '{_data.Command}' was specified.", ExitCodes.UnknownCommandLineOption); return null; } } public sealed class TopLevelAppValidator : ITopLevelAppValidator { public ITopLevelAppBuilderData Data { get; } public TopLevelAppValidator(ITopLevelAppBuilderData data) => Data = data; public ITopLevelAppBanner ShowBanner(string authors) { CommandLineUtilities.DisplayBanner(authors); return new TopLevelAppBanner(Data); } } public sealed class TopLevelAppBanner : ITopLevelAppBanner { private readonly ITopLevelAppBuilderData _data; public TopLevelAppBanner(ITopLevelAppBuilderData data) => _data = data; public ITopLevelAppHelpMenu ShowHelpMenu(string description) { // ReSharper disable once InvertIf if (_data.ShowHelpMenu || _data.Errors.Count > 0) { Console.WriteLine(description); Console.WriteLine(); OutputHelper.WriteLabel("USAGE: "); Console.WriteLine($"dotnet {CommandLineUtilities.CommandFileName} [options]"); Console.WriteLine(); DisplayCommands(_data.Ops); } return new TopLevelAppHelpMenu(_data); } private static void DisplayCommands(Dictionary ops) { const string label = "COMMAND: "; var filler = new string(' ', label.Length); int commandColumnLen = GetMaxCommandLen(ops.Keys) + 3; var useLabel = true; foreach (var op in ops) { if (useLabel) { OutputHelper.WriteLabel(label); useLabel = false; } else Console.Write(filler); var commandFiller = new string(' ', commandColumnLen - op.Key.Length); Console.WriteLine(op.Key + commandFiller + op.Value.Description); } } private static int GetMaxCommandLen(IEnumerable ops) { return ops.Select(op => op.Length).Concat(new int[1]).Max(); } } public sealed class TopLevelAppHelpMenu : ITopLevelAppHelpMenu { private readonly ITopLevelAppBuilderData _data; public TopLevelAppHelpMenu(ITopLevelAppBuilderData data) => _data = data; public ITopLevelAppErrors ShowErrors() { // ReSharper disable once InvertIf if (_data.Errors.Count > 0) { Console.WriteLine("\nSome problems were encountered when parsing the command line options:"); PrintErrors(); } return new TopLevelAppErrors(_data); } private void PrintErrors() { foreach (string error in _data.Errors) { Console.Write("- "); Console.ForegroundColor = ConsoleColor.Red; Console.Write("ERROR: "); Console.ResetColor(); Console.WriteLine(error); } } } public sealed class TopLevelAppErrors : ITopLevelAppErrors { private readonly ITopLevelAppBuilderData _data; private bool Continue => _data.ExitCode == ExitCodes.Success && _data.HasArguments && !_data.ShowHelpMenu; public TopLevelAppErrors(ITopLevelAppBuilderData data) => _data = data; public ExitCodes Execute() { if (!Continue) return _data.ExitCode; ExitCodes exitCode; try { var arguments = _data.Arguments.Skip(1).ToArray(); exitCode = _data.ExecuteMethod(_data.Command, arguments); } catch (Exception e) { exitCode = ExitCodeUtilities.ShowException(e); } return exitCode; } } public sealed class TopLevelAppBuilderData : ITopLevelAppBuilderData { public string[] Arguments { get; } public Dictionary Ops { get; } public bool HasArguments => Arguments != null && Arguments.Length > 0; public string Command { get; } public List Errors { get; } = new List(); public ExitCodes ExitCode { get; set; } public bool ShowHelpMenu { get; set; } public Func ExecuteMethod { get; set; } public TopLevelAppBuilderData(string[] arguments, Dictionary ops) { Arguments = arguments; Ops = ops; Command = HasArguments ? arguments[0].ToLower() : null; } public void AddError(string errorMessage, ExitCodes exitCode) { ExitCode = exitCode; Errors.Add(errorMessage); } } } ================================================ FILE: CommandLine/Builders/TopLevelOption.cs ================================================ using System; using ErrorHandling; namespace CommandLine.Builders { public sealed class TopLevelOption { public readonly string Description; public readonly Func CommandMethod; public TopLevelOption(string description, Func commandMethod) { Description = description; CommandMethod = commandMethod; } } } ================================================ FILE: CommandLine/Builders/ValidationExtensions.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Net; using ErrorHandling; namespace CommandLine.Builders { public static class ValidationExtensions { public static IConsoleAppValidator CheckEachFilenameExists(this IConsoleAppValidator validator, IEnumerable filePaths, string description, string commandLineOption, bool isRequired = true) { foreach (string filePath in filePaths) { validator.CheckInputFilenameExists(filePath, description, commandLineOption, isRequired); } return validator; } public static IConsoleAppValidator CheckInputFilenameExists(this IConsoleAppValidator validator, string filePath, string description, string commandLineOption, bool isRequired = true, string ignoreValue = null) { if (validator.SkipValidation) return validator; if (string.IsNullOrEmpty(filePath) && isRequired) { validator.Data.AddError( $"The {description} file was not specified. Please use the {commandLineOption} parameter.", ExitCodes.MissingCommandLineOption); } else if (isRequired && (ignoreValue == null || filePath != ignoreValue) && !File.Exists(filePath) && !CheckUrlExist(filePath)) { validator.Data.AddError($"The {description} file ({filePath}) does not exist.", ExitCodes.FileNotFound); } return validator; } private static bool CheckUrlExist(string url) { try { var webRequest = WebRequest.Create(url); webRequest.GetResponse(); } catch //If exception thrown then couldn't get response from address { return false; } return true; } public static IConsoleAppValidator CheckOutputFilenameSuffix(this IConsoleAppValidator validator, string filePath, string fileSuffix, string description) { if (validator.SkipValidation) return validator; if (!filePath.EndsWith(fileSuffix)) { validator.Data.AddError($"The {description} file ({filePath}) does not end with a {fileSuffix}.", ExitCodes.BadArguments); } return validator; } public static IConsoleAppValidator CheckDirectoryExists(this IConsoleAppValidator validator, string dirPath, string description, string commandLineOption) { if (validator.SkipValidation) return validator; if (string.IsNullOrEmpty(dirPath)) { validator.Data.AddError( $"The {description} directory was not specified. Please use the {commandLineOption} parameter.", ExitCodes.MissingCommandLineOption); } else if (!Directory.Exists(dirPath)) { validator.Data.AddError($"The {description} directory ({dirPath}) does not exist.", ExitCodes.PathNotFound); } return validator; } public static IConsoleAppValidator HasRequiredParameter(this IConsoleAppValidator validator, T parameterValue, string description, string commandLineOption) { if (validator.SkipValidation) return validator; if (EqualityComparer.Default.Equals(parameterValue, default)) { validator.Data.AddError($"The {description} was not specified. Please use the {commandLineOption} parameter.", ExitCodes.MissingCommandLineOption); } return validator; } public static IConsoleAppValidator HasRequiredDate(this IConsoleAppValidator validator, string date, string description, string commandLineOption) { if (validator.SkipValidation) return validator; validator.HasRequiredParameter(date, description, commandLineOption); if (string.IsNullOrEmpty(date)) return validator; if (!DateTime.TryParse(date, out _)) { validator.Data.AddError($"The {description} was not specified as a date (YYYY-MM-dd). Please use the {commandLineOption} parameter.", ExitCodes.BadArguments); } return validator; } } } ================================================ FILE: CommandLine/CommandLine.csproj ================================================  net6.0 ..\bin\$(Configuration) ================================================ FILE: CommandLine/NDesk.Options/Options.cs ================================================ // // Options.cs // // Authors: // Jonathan Pryor // // Copyright (C) 2008 Novell (http://www.novell.com) // // Permission is hereby granted, free of charge, to any person obtaining // a copy of this software and associated documentation files (the // "Software"), to deal in the Software without restriction, including // without limitation the rights to use, copy, modify, merge, publish, // distribute, sublicense, and/or sell copies of the Software, and to // permit persons to whom the Software is furnished to do so, subject to // the following conditions: // // The above copyright notice and this permission notice shall be // included in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // // The LINQ version just changes the implementation of // OptionSet.Parse(IEnumerable), and confers no semantic changes. // // A Getopt::Long-inspired option parsing library for C#. // // NDesk.Options.OptionSet is built upon a key/value table, where the // key is a option format string and the value is a delegate that is // invoked when the format string is matched. using System; using System.Collections.Generic; using System.Collections.ObjectModel; using System.IO; using System.Text; using System.Text.RegularExpressions; using OptimizedCore; namespace CommandLine.NDesk.Options { public sealed class OptionValueCollection { private readonly List _values = new List(); private readonly OptionContext _c; internal OptionValueCollection(OptionContext c) { _c = c; } #region ICollection public void Add(string item) { _values.Add(item); } public void Clear() { _values.Clear(); } public int Count => _values.Count; #endregion #region IList private void AssertValid(int index) { if (_c.Option == null) throw new InvalidOperationException("OptionContext.Option is null."); if (index >= _c.Option.MaxValueCount) throw new ArgumentOutOfRangeException(nameof(index)); if (_c.Option.OptionValueType == OptionValueType.Required && index >= _values.Count) throw new OptionException($"Missing required value for option '{_c.OptionName}'."); } public string this[int index] { get { AssertValid(index); return index >= _values.Count ? null : _values[index]; } } #endregion } public sealed class OptionContext { public OptionContext() { OptionValues = new OptionValueCollection(this); } public Option Option { get; set; } public string OptionName { get; set; } public int OptionIndex { get; set; } public OptionValueCollection OptionValues { get; } } public enum OptionValueType { None, Optional, Required } public abstract class Option { protected Option(string prototype, string description, int maxValueCount) { if (prototype == null) throw new ArgumentNullException(nameof(prototype)); if (prototype.Length == 0) throw new ArgumentException("Cannot be the empty string.", nameof(prototype)); if (maxValueCount < 0) throw new ArgumentOutOfRangeException(nameof(maxValueCount)); Names = prototype.Split('|'); Description = description; MaxValueCount = maxValueCount; OptionValueType = ParsePrototype(); if (MaxValueCount == 0 && OptionValueType != OptionValueType.None) throw new ArgumentException( "Cannot provide maxValueCount of 0 for OptionValueType.Required or " + "OptionValueType.Optional.", nameof(maxValueCount)); if (OptionValueType == OptionValueType.None && maxValueCount > 1) throw new ArgumentException( $"Cannot provide maxValueCount of {maxValueCount} for OptionValueType.None.", nameof(maxValueCount)); if (Array.IndexOf(Names, "<>") >= 0 && (Names.Length == 1 && OptionValueType != OptionValueType.None || Names.Length > 1 && MaxValueCount > 1)) throw new ArgumentException( "The default option handler '<>' cannot require values.", nameof(prototype)); } public string Description { get; } public OptionValueType OptionValueType { get; } public int MaxValueCount { get; } protected static T Parse(string value, OptionContext c) { T t; try { t = (T)Convert.ChangeType(value, typeof(T)); } catch (Exception e) { throw new OptionException( $"Could not convert string `{value}' to type {typeof(T).Name} for option `{c.OptionName}'.", e); } return t; } public string[] Names { get; } internal string[] ValueSeparators { get; private set; } private static readonly char[] NameTerminator = { '=', ':' }; private OptionValueType ParsePrototype() { var type = '\0'; var seps = new List(); for (var i = 0; i < Names.Length; ++i) { string name = Names[i]; if (name.Length == 0) throw new InvalidDataException($"Empty option names are not supported: {nameof(name)}"); int end = name.IndexOfAny(NameTerminator); if (end == -1) continue; Names[i] = name.Substring(0, end); if (type == '\0' || type == name[end]) type = name[end]; else throw new InvalidDataException($"Conflicting option types: '{type}' vs. '{name[end]}'."); AddSeparators(name, end, seps); } if (type == '\0') return OptionValueType.None; if (MaxValueCount <= 1 && seps.Count != 0) throw new InvalidDataException($"Cannot provide key/value separators for Options taking {MaxValueCount} value(s)."); if (MaxValueCount <= 1) return GetOptionValueType(type); switch (seps.Count) { case 0: ValueSeparators = new[] { ":", "=" }; break; case 1 when seps[0].Length == 0: ValueSeparators = null; break; default: ValueSeparators = seps.ToArray(); break; } return GetOptionValueType(type); } private static OptionValueType GetOptionValueType(char type) => type == '=' ? OptionValueType.Required : OptionValueType.Optional; private static void AddSeparators(string name, int end, ICollection seps) { int start = -1; for (int i = end + 1; i < name.Length; ++i) { switch (name[i]) { case '{': if (start != -1) throw new ArgumentException( $"Ill-formed name/value separator found in \"{name}\".", nameof(name)); start = i + 1; break; case '}': if (start == -1) throw new ArgumentException( $"Ill-formed name/value separator found in \"{name}\".", nameof(name)); seps.Add(name.Substring(start, i - start)); start = -1; break; default: if (start == -1) seps.Add(name[i].ToString()); break; } } if (start != -1) throw new ArgumentException( $"Ill-formed name/value separator found in \"{name}\".", nameof(name)); } public void Invoke(OptionContext c) { OnParseComplete(c); c.OptionName = null; c.Option = null; c.OptionValues.Clear(); } protected abstract void OnParseComplete(OptionContext c); } public sealed class OptionException : Exception { public OptionException(string message) : base(message) { } public OptionException(string message, Exception innerException) : base(message, innerException) { } } public sealed class OptionSet : KeyedCollection { protected override string GetKeyForItem(Option item) { if (item == null) throw new ArgumentNullException(nameof(item)); if (item.Names != null && item.Names.Length > 0) return item.Names[0]; // This should never happen, as it's invalid for Option to be // constructed w/o any names. throw new InvalidOperationException("Option has no names!"); } protected override void InsertItem(int index, Option item) { base.InsertItem(index, item); AddImpl(item); } private void AddImpl(Option option) { if (option == null) throw new ArgumentNullException(nameof(option)); var added = new List(option.Names.Length); try { // KeyedCollection.InsertItem/SetItem handle the 0th name. for (var i = 1; i < option.Names.Length; ++i) { Dictionary.Add(option.Names[i], option); added.Add(option.Names[i]); } } catch (Exception) { foreach (string name in added) Dictionary.Remove(name); throw; } } public new void Add(Option option) { base.Add(option); } private sealed class ActionOption : Option { private readonly Action _action; public ActionOption(string prototype, string description, int count, Action action) : base(prototype, description, count) { _action = action ?? throw new ArgumentNullException(nameof(action)); } protected override void OnParseComplete(OptionContext c) { _action(c.OptionValues); } } public void Add(string prototype, string description, Action action) { if (action == null) throw new ArgumentNullException(nameof(action)); Option p = new ActionOption(prototype, description, 1, delegate (OptionValueCollection v) { action(v[0]); }); base.Add(p); } private sealed class ActionOption : Option { private readonly Action _action; public ActionOption(string prototype, string description, Action action) : base(prototype, description, 1) { _action = action ?? throw new ArgumentNullException(nameof(action)); } protected override void OnParseComplete(OptionContext c) { _action(Parse(c.OptionValues[0], c)); } } public void Add(string prototype, string description, Action action) { Add(new ActionOption(prototype, description, action)); } private static OptionContext CreateOptionContext() { return new OptionContext(); } public List Parse(IEnumerable arguments) { OptionContext c = CreateOptionContext(); c.OptionIndex = -1; var process = true; var unprocessed = new List(); Option def = Contains("<>") ? this["<>"] : null; foreach (string argument in arguments) { ++c.OptionIndex; if (argument == "--") { process = false; continue; } if (!process) { Unprocessed(unprocessed, def, c, argument); continue; } if (!Parse(argument, c)) Unprocessed(unprocessed, def, c, argument); } c.Option?.Invoke(c); return unprocessed; } private static void Unprocessed(ICollection extra, Option def, OptionContext c, string argument) { if (def == null) { extra.Add(argument); return; } c.OptionValues.Add(argument); c.Option = def; c.Option.Invoke(c); } private readonly Regex _valueOption = new Regex( @"^(?--|-|/)(?[^:=]+)((?[:=])(?.*))?$"); private bool GetOptionParts(string argument, out string flag, out string name, out string sep, out string value) { if (argument == null) throw new ArgumentNullException(nameof(argument)); flag = name = sep = value = null; var m = _valueOption.Match(argument); if (!m.Success) return false; flag = m.Groups["flag"].Value; name = m.Groups["name"].Value; // ReSharper disable once InvertIf if (m.Groups["sep"].Success && m.Groups["value"].Success) { sep = m.Groups["sep"].Value; value = m.Groups["value"].Value; } return true; } private bool Parse(string argument, OptionContext c) { if (c.Option != null) { ParseValue(argument, c); return true; } if (!GetOptionParts(argument, out string f, out string n, out string s, out string v)) return false; if (!Contains(n)) return ParseBool(argument, n, c) || ParseBundledValue(f, n + s + v, c); var p = this[n]; c.OptionName = f + n; c.Option = p; // ReSharper disable once SwitchStatementMissingSomeCases switch (p.OptionValueType) { case OptionValueType.None: c.OptionValues.Add(n); c.Option.Invoke(c); break; case OptionValueType.Optional: case OptionValueType.Required: ParseValue(v, c); break; } return true; } private static void ParseValue(string option, OptionContext c) { if (option != null) foreach (string o in c.Option.ValueSeparators != null ? option.Split(c.Option.ValueSeparators, StringSplitOptions.None) : new[] { option }) { c.OptionValues.Add(o); } if (c.OptionValues.Count == c.Option.MaxValueCount || c.Option.OptionValueType == OptionValueType.Optional) c.Option.Invoke(c); else if (c.OptionValues.Count > c.Option.MaxValueCount) { throw new OptionException($"Error: Found {c.OptionValues.Count} option values when expecting {c.Option.MaxValueCount}."); } } private bool ParseBool(string option, string n, OptionContext c) { if (n.Length < 1 || n[n.Length - 1] != '+' && n[n.Length - 1] != '-') return false; string rn = n.Substring(0, n.Length - 1); if (!Contains(rn)) return false; var p = this[rn]; string v = n[n.Length - 1] == '+' ? option : null; c.OptionName = option; c.Option = p; c.OptionValues.Add(v); p.Invoke(c); return true; } private bool ParseBundledValue(string f, string n, OptionContext c) { if (f != "-") return false; for (var i = 0; i < n.Length; ++i) { string opt = f + n[i]; string rn = n[i].ToString(); if (!Contains(rn)) { if (i == 0) return false; throw new OptionException($"Cannot bundle unregistered option '{opt}'."); } var p = this[rn]; switch (p.OptionValueType) { case OptionValueType.None: Invoke(c, opt, n, p); break; case OptionValueType.Optional: case OptionValueType.Required: { string v = n.Substring(i + 1); c.Option = p; c.OptionName = opt; ParseValue(v.Length != 0 ? v : null, c); return true; } default: throw new InvalidOperationException("Unknown OptionValueType: " + p.OptionValueType); } } return true; } private static void Invoke(OptionContext c, string name, string value, Option option) { c.OptionName = name; c.Option = option; c.OptionValues.Add(value); option.Invoke(c); } private const int OptionWidth = 29; public void WriteOptionDescriptions(TextWriter o) { foreach (Option p in this) { var written = 0; if (!WriteOptionPrototype(o, p, ref written)) continue; if (written < OptionWidth) o.Write(new string(' ', OptionWidth - written)); else { o.WriteLine(); o.Write(new string(' ', OptionWidth)); } var indent = false; var prefix = new string(' ', OptionWidth + 2); foreach (string line in GetLines(GetDescription(p.Description))) { if (indent) o.Write(prefix); o.WriteLine(line); indent = true; } } } private static bool WriteOptionPrototype(TextWriter o, Option p, ref int written) { var names = p.Names; int i = GetNextOptionIndex(names, 0); if (i == names.Length) return false; if (names[i].Length == 1) { Write(o, ref written, " -"); Write(o, ref written, names[0]); } else { Write(o, ref written, " --"); Write(o, ref written, names[0]); } for (i = GetNextOptionIndex(names, i + 1); i < names.Length; i = GetNextOptionIndex(names, i + 1)) { Write(o, ref written, ", "); Write(o, ref written, names[i].Length == 1 ? "-" : "--"); Write(o, ref written, names[i]); } if (p.OptionValueType != OptionValueType.Optional && p.OptionValueType != OptionValueType.Required) return true; Write(o, ref written, " "); if (p.OptionValueType == OptionValueType.Optional) { Write(o, ref written, "["); } Write(o, ref written, "<" + GetArgumentName(0, p.MaxValueCount, p.Description) + '>'); string sep = p.ValueSeparators != null && p.ValueSeparators.Length > 0 ? p.ValueSeparators[0] : " "; for (var c = 1; c < p.MaxValueCount; ++c) { Write(o, ref written, sep + GetArgumentName(c, p.MaxValueCount, p.Description)); } if (p.OptionValueType == OptionValueType.Optional) { Write(o, ref written, "]"); } return true; } private static int GetNextOptionIndex(IReadOnlyList names, int i) { while (i < names.Count && names[i] == "<>") { ++i; } return i; } private static void Write(TextWriter o, ref int n, string s) { n += s.Length; o.Write(s); } private static string GetArgumentName(int index, int maxIndex, string description) { if (description == null) return maxIndex == 1 ? "VALUE" : "VALUE" + (index + 1); var nameStart = maxIndex == 1 ? new[] { "{0:", "{" } : new[] { "{" + index + ":" }; foreach (string t in nameStart) { int start, j = 0; do { start = description.IndexOf(t, j, StringComparison.Ordinal); } while (start >= 0 && j != 0 && description[j++ - 1] == '{'); if (start == -1) continue; int end = description.IndexOf("}", start, StringComparison.Ordinal); if (end == -1) continue; return description.Substring(start + t.Length, end - start - t.Length); } return maxIndex == 1 ? "VALUE" : "VALUE" + (index + 1); } private static string GetDescription(string description) { if (description == null) return string.Empty; StringBuilder sb = StringBuilderPool.Get(); int start = -1; for (var position = 0; position < description.Length; ++position) { position = ParseDescription(description, position, sb, ref start); } return StringBuilderPool.GetStringAndReturn(sb); } private static int ParseDescription(string description, int position, StringBuilder sb, ref int start) { switch (description[position]) { case '{': if (position == start) { sb.Append('{'); start = -1; break; } if (start < 0) start = position + 1; break; case '}': if (start < 0) { if (position + 1 == description.Length || description[position + 1] != '}') throw new InvalidOperationException("Invalid option description: " + description); ++position; sb.Append("}"); break; } sb.Append(description.Substring(start, position - start)); start = -1; break; case ':': if (start < 0) { sb.Append(description[position]); break; } start = position + 1; break; default: if (start < 0) sb.Append(description[position]); break; } return position; } private static IEnumerable GetLines(string description) { if (string.IsNullOrEmpty(description)) { yield return string.Empty; yield break; } description = description.Trim(); int length = 80 - OptionWidth - 1; int start = 0, end; do { end = GetLineEnd(start, length, description); char c = description[end - 1]; if (char.IsWhiteSpace(c)) --end; bool writeContinuation = end != description.Length && !IsEolChar(c); string line = description.Substring(start, end - start) + (writeContinuation ? "-" : ""); yield return line; start = end; if (char.IsWhiteSpace(c)) ++start; length = 80 - OptionWidth - 2 - 1; } while (end < description.Length); } private static bool IsEolChar(char c) { return !char.IsLetterOrDigit(c); } private static int GetLineEnd(int start, int length, string description) { int end = Math.Min(start + length, description.Length); int sep = -1; for (int i = start + 1; i < end; ++i) { if (description[i] == '\n') return i + 1; if (IsEolChar(description[i])) sep = i + 1; } if (sep == -1 || end == description.Length) return end; return sep; } } } ================================================ FILE: CommandLine/Utilities/Benchmark.cs ================================================ using System; namespace CommandLine.Utilities { public sealed class Benchmark { private DateTime _startTime; public Benchmark() => Reset(); public TimeSpan GetElapsedTime() { var stopTime = DateTime.Now; return new TimeSpan(stopTime.Ticks - _startTime.Ticks); } public static string ToHumanReadable(TimeSpan span) { return span.Days > 0 ? $"{span.Days}:{span.Hours:D2}:{span.Minutes:D2}:{span.Seconds:D2}.{span.Milliseconds/100:D1}" : $"{span.Hours:D2}:{span.Minutes:D2}:{span.Seconds:D2}.{span.Milliseconds/100:D1}"; } public static double GetElapsedIterationsPerSecond(TimeSpan span, int numUnits) => numUnits / span.TotalSeconds; public void Reset() => _startTime = DateTime.Now; } } ================================================ FILE: CommandLine/Utilities/CommandLineUtilities.cs ================================================ using System; using System.IO; using System.Reflection; namespace CommandLine.Utilities { public static class CommandLineUtilities { private static readonly string Copyright; public static readonly string Title; public static readonly string InformationalVersion; public static readonly string Version; static CommandLineUtilities() { var executingAssembly = Assembly.GetExecutingAssembly(); Copyright = GetCopyright(executingAssembly); Version = GetVersion(executingAssembly); InformationalVersion = GetInformationalVersion(executingAssembly); var entryAssembly = Assembly.GetEntryAssembly(); Title = GetTitle(entryAssembly); } private static string GetCopyright(Assembly entryAssembly) { var attr = GetAssemblyAttributes(entryAssembly); return attr?.Copyright.Replace("©", "(c)") ?? $"(c) {DateTime.Now.Year} Illumina, Inc."; } public static string GetVersion(Assembly entryAssembly) { var attr = GetAssemblyAttributes(entryAssembly); return attr?.Version; } private static string GetInformationalVersion(Assembly entryAssembly) { var attr = GetAssemblyAttributes(entryAssembly); return attr?.InformationalVersion; } private static string GetTitle(Assembly entryAssembly) { var attr = GetAssemblyAttributes(entryAssembly); return attr?.Title; } private static T GetAssemblyAttributes(Assembly entryAssembly) { var attrs = entryAssembly.GetCustomAttributes(typeof(T)) as T[]; // ReSharper disable once PossibleNullReferenceException return attrs.Length == 0 ? default : attrs[0]; } /// /// Displays the command-line banner for this program /// public static void DisplayBanner(string author) { // create the top and bottom lines const int lineLength = 75; var line = new string('-', lineLength); // create the filler string int fillerLength = lineLength - Title.Length - Copyright.Length; int fillerLength2 = lineLength - author.Length - InformationalVersion.Length; if (fillerLength < 1) { throw new InvalidOperationException("Unable to display the program banner, the program name is too long."); } if (fillerLength2 < 1) { throw new InvalidOperationException("Unable to display the program banner, the author name and version string is too long."); } var filler = new string(' ', fillerLength); var filler2 = new string(' ', fillerLength2); // display the actual banner Console.WriteLine(line); Console.ForegroundColor = ConsoleColor.Magenta; Console.Write(Title); Console.ResetColor(); Console.WriteLine("{0}{1}", filler, Copyright); Console.WriteLine("{0}{1}{2}", author, filler2, InformationalVersion); Console.WriteLine("{0}\n", line); } public static string CommandFileName => Path.GetFileName(Environment.GetCommandLineArgs()[0]); } } ================================================ FILE: CommandLine/Utilities/Help.cs ================================================ using System; using CommandLine.NDesk.Options; namespace CommandLine.Utilities { public static class Help { public static void Show(OptionSet ops, string commonOptions, string description) { OutputHelper.WriteLabel("USAGE: "); Console.WriteLine("dotnet {0} {1}", OutputHelper.GetExecutableName(), commonOptions); Console.WriteLine("{0}\n", description); OutputHelper.WriteLabel("OPTIONS:"); Console.WriteLine(); ops.WriteOptionDescriptions(Console.Out); } } } ================================================ FILE: CommandLine/Utilities/MemoryUtilities.cs ================================================ using System.Diagnostics; namespace CommandLine.Utilities { public static class MemoryUtilities { // ReSharper disable InconsistentNaming private const long NumBytesInGB = 1073741824; private const long NumBytesInMB = 1048576; private const long NumBytesInKB = 1024; // ReSharper restore InconsistentNaming /// /// shows the peak memory usage for the current process /// public static long GetPeakMemoryUsage() { return Process.GetCurrentProcess().PeakWorkingSet64; } /// /// converts the number of bytes used to a human readable format /// public static string ToHumanReadable(long numBytes) { if (numBytes > NumBytesInGB) { double gigaBytes = numBytes / (double)NumBytesInGB; return $"{gigaBytes:0.000} GB"; } if (numBytes > NumBytesInMB) { double megaBytes = numBytes / (double)NumBytesInMB; return $"{megaBytes:0.0} MB"; } // ReSharper disable once InvertIf if (numBytes > NumBytesInKB) { double kiloBytes = numBytes / (double)NumBytesInKB; return $"{kiloBytes:0.0} KB"; } return $"{numBytes} B"; } } } ================================================ FILE: CommandLine/Utilities/OutputHelper.cs ================================================ using System; using System.IO; namespace CommandLine.Utilities { public static class OutputHelper { public static void WriteLabel(string label) { Console.ForegroundColor = ConsoleColor.DarkGreen; Console.Write(label); Console.ResetColor(); } public static string GetExecutableName() { return Path.GetFileName(Environment.GetCommandLineArgs()[0]); } } } ================================================ FILE: CommandLine/VersionProviders/DefaultVersionProvider.cs ================================================ using VariantAnnotation.Interface.Providers; namespace CommandLine.VersionProviders { public sealed class DefaultVersionProvider : IVersionProvider { public string DataVersion { get; } = string.Empty; } } ================================================ FILE: CommandLine/VersionProviders/IVersionProvider.cs ================================================ namespace VariantAnnotation.Interface.Providers { public interface IVersionProvider { string DataVersion { get; } } } ================================================ FILE: CommonAssemblyInfo.props ================================================  Illumina © 2022 Illumina, Inc. 3.18.1 3.18.1 3.18.1 Stromberg, Roy, Platzer, Siddiqui, Ouyang, et al ================================================ FILE: Compression/Algorithms/ICompressionAlgorithm.cs ================================================ namespace Compression.Algorithms { public interface ICompressionAlgorithm { int Compress(byte[] source, int srcLength, byte[] destination, int destLength); int Decompress(byte[] source, int srcLength, byte[] destination, int destLength); int GetDecompressedLength(byte[] source, int srcLength); int GetCompressedBufferBounds(int srcLength); } } ================================================ FILE: Compression/Algorithms/Zlib.cs ================================================ using System; using System.Runtime.InteropServices; using Compression.Utilities; namespace Compression.Algorithms { public sealed class Zlib : ICompressionAlgorithm { private readonly int _compressionLevel; public Zlib(int compressionLevel = 1) { _compressionLevel = compressionLevel; LibraryUtilities.CheckLibrary(); } public int Compress(byte[] source, int srcLength, byte[] destination, int destLength) { if (destination == null || GetCompressedBufferBounds(srcLength) > destination.Length) { throw new InvalidOperationException("Zlib: Insufficient memory in destination buffer"); } return SafeNativeMethods.bgzf_compress(destination, destLength, source, srcLength, _compressionLevel); } public int Decompress(byte[] source, int srcLength, byte[] destination, int destLength) { if (destination == null) { throw new InvalidOperationException("Zlib: Insufficient memory in destination buffer"); } return SafeNativeMethods.bgzf_decompress(destination, destLength, source, srcLength); } public int GetDecompressedLength(byte[] source, int srcLength) { int pos = srcLength - 4; return source[pos + 3] << 24 | source[pos + 2] << 16 | source[pos + 1] << 8 | source[pos]; } public int GetCompressedBufferBounds(int srcLength) => (int)(srcLength * 1.06 + 28); private static class SafeNativeMethods { [DllImport("BlockCompression", CallingConvention = CallingConvention.Cdecl)] public static extern int bgzf_decompress(byte[] uncompressedBlock, int uncompressedSize, byte[] compressedBlock, int compressedSize); [DllImport("BlockCompression", CallingConvention = CallingConvention.Cdecl)] public static extern int bgzf_compress(byte[] compressedBlock, int compressedLen, byte[] uncompressedBlock, int uncompressedLen, int compressionLevel); } } } ================================================ FILE: Compression/Algorithms/Zstandard.cs ================================================ using System; using System.Runtime.InteropServices; using Compression.Utilities; namespace Compression.Algorithms { public sealed class Zstandard : ICompressionAlgorithm { private readonly int _compressionLevel; public Zstandard(int compressionLevel = 17) { _compressionLevel = compressionLevel; LibraryUtilities.CheckLibrary(); } public int Compress(byte[] source, int srcLength, byte[] destination, int destLength) { if (destination == null || GetCompressedBufferBounds(srcLength) > destination.Length) { throw new InvalidOperationException("Zstandard: Insufficient memory in destination buffer"); } return (int)SafeNativeMethods.ZSTD_compress(destination, (ulong)destLength, source, (ulong)srcLength, _compressionLevel); } public int Decompress(byte[] source, int srcLength, byte[] destination, int destLength) { if (destination == null) { throw new InvalidOperationException("Zstandard: Insufficient memory in destination buffer"); } return (int)SafeNativeMethods.ZSTD_decompress(destination, (ulong)destLength, source, (ulong)srcLength); } public int GetDecompressedLength(byte[] source, int srcLength) => (int)SafeNativeMethods.ZSTD_getDecompressedSize(source, srcLength); // empirically derived via polynomial regression with additional padding added public int GetCompressedBufferBounds(int srcLength) => srcLength + 32; private static class SafeNativeMethods { [DllImport("BlockCompression", CallingConvention = CallingConvention.Cdecl)] public static extern ulong ZSTD_compress(byte[] destination, ulong destinationLen, byte[] source, ulong sourceLen, int compressionLevel); [DllImport("BlockCompression", CallingConvention = CallingConvention.Cdecl)] public static extern ulong ZSTD_decompress(byte[] destination, ulong destinationLen, byte[] source, ulong sourceLen); [DllImport("BlockCompression", CallingConvention = CallingConvention.Cdecl)] public static extern ulong ZSTD_getDecompressedSize(byte[] source, int sourceLen); } } } ================================================ FILE: Compression/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: Compression/Compression.csproj ================================================  net6.0 ..\bin\$(Configuration) PreserveNewest PreserveNewest PreserveNewest ================================================ FILE: Compression/DataStructures/Block.cs ================================================ using System; using System.IO; using Compression.Algorithms; using Compression.FileHandling; using ErrorHandling.Exceptions; namespace Compression.DataStructures { public sealed class Block { private readonly ICompressionAlgorithm _compressionAlgorithm; private readonly BlockHeader _header; private readonly byte[] _compressedBlock; private readonly byte[] _uncompressedBlock; public long FileOffset { get; private set; } public int Offset { get; internal set; } internal const int DefaultSize = 16777216; private readonly int _size; private readonly int _compressedBlockSize; public bool IsFull => Offset == _size; public bool HasMoreData => Offset < _header.NumUncompressedBytes; public Block(ICompressionAlgorithm compressionAlgorithm, int size = DefaultSize) { _compressionAlgorithm = compressionAlgorithm; Offset = 0; _size = size; _uncompressedBlock = new byte[_size]; _compressedBlockSize = compressionAlgorithm.GetCompressedBufferBounds(_size); _compressedBlock = new byte[_compressedBlockSize]; _header = new BlockHeader(); } public int CopyTo(byte[] array, int offset, int count) { int copyLength = Math.Min(_size - Offset, count); if (copyLength == 0) return 0; Buffer.BlockCopy(array, offset, _uncompressedBlock, Offset, copyLength); Offset += copyLength; return copyLength; } public int CopyFrom(byte[] array, int offset, int count) { int copyLength = Math.Min(_header.NumUncompressedBytes - Offset, count); if (copyLength == 0) return 0; Buffer.BlockCopy(_uncompressedBlock, Offset, array, offset, copyLength); Offset += copyLength; return copyLength; } public void Write(Stream stream) { _header.NumUncompressedBytes = Offset; _header.NumCompressedBytes = _compressionAlgorithm.Compress(_uncompressedBlock, _header.NumUncompressedBytes, _compressedBlock, _compressedBlockSize); if (_header.NumCompressedBytes > _header.NumUncompressedBytes) { _header.NumCompressedBytes = -1; _header.Write(stream); stream.Write(_uncompressedBlock, 0, _header.NumUncompressedBytes); } else { _header.Write(stream); stream.Write(_compressedBlock, 0, _header.NumCompressedBytes); } Offset = 0; } public void WriteEof(Stream stream) { _header.NumUncompressedBytes = -1; _header.NumCompressedBytes = -1; _header.Write(stream); } public int Read(Stream stream) { FileOffset = stream.Position; _header.Read(stream); if (_header.IsEmpty) return -1; int numBytesRead = _header.NumCompressedBytes == -1 ? ReadUncompressedBlock(stream) : ReadCompressedBlock(stream); Offset = 0; return BlockHeader.HeaderSize + numBytesRead; } private int ReadCompressedBlock(Stream stream) { int numBytesRead = stream.Read(_compressedBlock, 0, _header.NumCompressedBytes); if (numBytesRead != _header.NumCompressedBytes) { throw new IOException($"Expected {_header.NumCompressedBytes} bytes from the block, but received only {numBytesRead} bytes."); } int numUncompressedBytes = _compressionAlgorithm.Decompress(_compressedBlock, _header.NumCompressedBytes, _uncompressedBlock, _size); if (numUncompressedBytes != _header.NumUncompressedBytes) { throw new CompressionException($"Expected {_header.NumUncompressedBytes} bytes after decompression, but found only {numUncompressedBytes} bytes."); } return numBytesRead; } private int ReadUncompressedBlock(Stream stream) { int numBytesRead = stream.Read(_uncompressedBlock, 0, _header.NumUncompressedBytes); if (numBytesRead != _header.NumUncompressedBytes) { throw new IOException($"Expected {_header.NumUncompressedBytes} bytes from the uncompressed block, but received only {numBytesRead} bytes."); } return numBytesRead; } } } ================================================ FILE: Compression/FileHandling/BgzBlockReader.cs ================================================ using System; using System.IO; using ErrorHandling.Exceptions; namespace Compression.FileHandling { public sealed class BgzBlockReader:IDisposable { private readonly string _filePath; private readonly Stream _stream; private readonly bool _leaveStreamOpen; public long Position => _stream.Position; public BgzBlockReader(Stream stream, bool leaveStreamOpen = false) { _filePath = "(stream)"; _stream = stream; _leaveStreamOpen = leaveStreamOpen; } //read the next compressed block into provided buffer public int ReadCompressedBlock(byte[] buffer) { if (buffer.Length < BlockGZipStream.BlockGZipFormatCommon.MaxBlockSize) throw new InsufficientMemoryException($"Pease provide a buffer at least {BlockGZipStream.BlockGZipFormatCommon.MaxBlockSize} bytes in size."); int headerSize = _stream.Read(buffer, 0, BlockGZipStream.BlockGZipFormatCommon.BlockHeaderLength); // handle the case where no data was read if (headerSize == 0) return 0; // check the header if (!BlockGZipStream.HasValidHeader(headerSize, buffer)) { throw new CompressionException($"Found an invalid header when reading the GZip block ({_filePath})"); } int blockLength = BitConverter.ToUInt16(buffer, 16) + 1; int expectedDataSize = blockLength - BlockGZipStream.BlockGZipFormatCommon.BlockHeaderLength; var dataSize = _stream.Read(buffer, BlockGZipStream.BlockGZipFormatCommon.BlockHeaderLength, expectedDataSize); // handle unexpected truncation if (expectedDataSize != dataSize) { throw new CompressionException($"Found unexpected truncation when reading the GZip block ({_filePath})"); } return headerSize+dataSize; } public void Dispose() { if (_leaveStreamOpen) return; _stream?.Dispose(); } } } ================================================ FILE: Compression/FileHandling/BgzfBlock.cs ================================================ using System; using System.IO; using System.Text; using Compression.Algorithms; using ErrorHandling.Exceptions; namespace Compression.FileHandling { public sealed class BgzfBlock { private const int MaxBlockSize = 65536; private readonly byte[] _compressedBlock = new byte[MaxBlockSize]; private readonly byte[] _uncompressedBlock = new byte[MaxBlockSize]; private readonly Zlib _bgzf = new Zlib(); public string Read(Stream stream) { int count = stream.Read(_compressedBlock, 0, BlockGZipStream.BlockGZipFormatCommon.BlockHeaderLength); if (count == 0) return string.Empty; if (!BlockGZipStream.HasValidHeader(count, _compressedBlock)) throw new InvalidDataException("Found an invalid header when reading the GZip block"); int blockLength = BitConverter.ToUInt16(_compressedBlock, 16) + 1; int remaining = blockLength - BlockGZipStream.BlockGZipFormatCommon.BlockHeaderLength; count = stream.Read(_compressedBlock, BlockGZipStream.BlockGZipFormatCommon.BlockHeaderLength, remaining); if (count != remaining) throw new InvalidDataException("Found unexpected truncation when reading the GZip block"); count = _bgzf.Decompress(_compressedBlock, blockLength, _uncompressedBlock, MaxBlockSize); if (count < 0) throw new CompressionException("Encountered an error when uncompressing the GZip block"); return Encoding.UTF8.GetString(_uncompressedBlock, 0, count); } } } ================================================ FILE: Compression/FileHandling/BgzipTextReader.cs ================================================ using System; using System.IO; using System.Reflection; using System.Text; namespace Compression.FileHandling { public sealed class BgzipTextReader : IDisposable { private readonly bool _leaveOpen; private readonly StreamReader _reader; private readonly FieldInfo _charPosInfo; private readonly FieldInfo _charLenInfo; public BgzipTextReader(BlockGZipStream stream, bool leaveOpen = false) { _leaveOpen = leaveOpen; _reader = new StreamReader(stream, Encoding.UTF8, leaveOpen); Type readerType = _reader.GetType(); _charPosInfo = readerType.GetField("_charPos", BindingFlags.NonPublic | BindingFlags.Instance); _charLenInfo = readerType.GetField("_charLen", BindingFlags.NonPublic | BindingFlags.Instance); } public long Position { get { var bufferPos = (int)_charPosInfo.GetValue(_reader); var bufferSize = (int)_charLenInfo.GetValue(_reader); return _reader.BaseStream.Position - bufferSize + bufferPos; } } public string ReadLine() => _reader.ReadLine(); public void Dispose() { if (!_leaveOpen) _reader.Dispose(); } } } ================================================ FILE: Compression/FileHandling/BgzipTextWriter.cs ================================================ using System; using System.IO; using System.Text; using OptimizedCore; namespace Compression.FileHandling { public sealed class BgzipTextWriter : StreamWriter, IDisposable { private readonly BlockGZipStream _stream; private readonly byte[] _buffer; private int _bufferIndex; private const int CharBufferSize = 8 * 1024 * 1024; private char[] _charBuffer; private byte[] _byteBuffer; private const int BufferSize = BlockGZipStream.BlockGZipFormatCommon.BlockSize; private static readonly UTF8Encoding Utf8WithoutBom = new UTF8Encoding(false); public long Position => _stream.Position + _bufferIndex; public BgzipTextWriter(BlockGZipStream stream) : base(stream, Utf8WithoutBom, BufferSize, true) { _buffer = new byte[BufferSize]; _stream = stream; _charBuffer = ExpandableArray.Get(CharBufferSize); _byteBuffer = ExpandableArray.Get(CharBufferSize * 2); } public override void Flush() { if (_bufferIndex == 0) return; _stream.Write(_buffer, 0, _bufferIndex); //here we want to close the gzip blockB _stream.CloseBlock(); _bufferIndex = 0; } public override void WriteLine() => Write("\n"); public override void WriteLine(string value) => Write(value + "\n"); public override void Write(string value) { if (string.IsNullOrEmpty(value)) return; var lineBytes = Encoding.UTF8.GetBytes(value); WriteBytes(lineBytes, lineBytes.Length); } public override void Write(StringBuilder sb) { if (sb == null || sb.Length == 0) return; if (sb.Length > _charBuffer.Length) { _charBuffer = ExpandableArray.Resize(_charBuffer, sb.Length * 2); _byteBuffer = ExpandableArray.Resize(_byteBuffer, _charBuffer.Length * 2); } sb.CopyTo(0, _charBuffer, 0, sb.Length); var length = Encoding.UTF8.GetBytes(_charBuffer, 0, sb.Length, _byteBuffer, 0); WriteBytes(_byteBuffer, length); } private void WriteBytes(byte[] lineBytes, int length) { if (length <= BufferSize - _bufferIndex) { Array.Copy(lineBytes, 0, _buffer, _bufferIndex, length); _bufferIndex += length; } else { // fill up the buffer Array.Copy(lineBytes, 0, _buffer, _bufferIndex, BufferSize - _bufferIndex); int lineIndex = BufferSize - _bufferIndex; // write it out to the stream _stream.Write(_buffer, 0, BufferSize); _bufferIndex = 0; while (lineIndex + BufferSize <= length) { _stream.Write(lineBytes, lineIndex, BufferSize); lineIndex += BufferSize; } // the leftover bytes should be saved in buffer if (lineIndex >= length) return; Array.Copy(lineBytes, lineIndex, _buffer, 0, length - lineIndex); _bufferIndex = length - lineIndex; } } public new void Dispose() { Flush(); _stream.Dispose(); ExpandableArray.Return(_charBuffer); ExpandableArray.Return(_byteBuffer); } } } ================================================ FILE: Compression/FileHandling/BlockGZipStream.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using Compression.Algorithms; using ErrorHandling.Exceptions; namespace Compression.FileHandling { // BGZF/GZIP header (specialized from RFC 1952; little endian): // +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ // | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| // +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ // BGZF/GZIP footer: // +---+---+---+---+---+---+---+---+ // | CRC| Source len| // +---+---+---+---+---+---+---+---+ public sealed class BlockGZipStream : Stream { private readonly byte[] _compressedBlock; private readonly byte[] _uncompressedBlock; private int _blockOffset; private int _blockLength; private long _blockAddress; private readonly bool _isCompressor; private readonly bool _leaveStreamOpen; private readonly string _filePath; private Stream _stream; private readonly Zlib _bgzf; private bool _isDisposed; public static class BlockGZipFormatCommon { public const int BlockSize = 65280; public const int MaxBlockSize = 65536; public const int BlockHeaderLength = 18; } #region Stream public override bool CanRead => _stream != null && !_isCompressor && _stream.CanRead; public override bool CanWrite => _stream != null && _isCompressor && _stream.CanWrite; public override bool CanSeek => _stream != null && !_isCompressor && _stream.CanSeek; public override long Length => throw new NotSupportedException(); public override long Position { get => (_blockAddress << 16) | ((long)_blockOffset & 0xffff); set => SeekVirtualFilePointer((ulong)value); } public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException(); public override void SetLength(long value) => throw new NotSupportedException(); public override void Flush() => _stream.Flush(); protected override void Dispose(bool disposing) { if (_isDisposed) return; try { if (_isCompressor) { Flush(_blockOffset); // write an empty block (as EOF marker) Flush(0); } if (!_leaveStreamOpen) { _stream.Dispose(); _stream = null; } _isDisposed = true; } finally { base.Dispose(disposing); } } #endregion public BlockGZipStream(Stream stream, CompressionMode compressionMode, bool leaveStreamOpen = false, int compressionLevel = 5) { _filePath = "(stream)"; _leaveStreamOpen = leaveStreamOpen; _stream = stream; // sanity check: make sure the stream exists if (stream == null) throw new ArgumentNullException(nameof(stream)); // sanity check: make sure we can use the stream for reading or writing _isCompressor = compressionMode == CompressionMode.Compress; if (_isCompressor && !_stream.CanWrite) throw new CompressionException("A stream lacking write capability was provided to the block GZip compressor."); if (!_isCompressor && !_stream.CanRead) throw new CompressionException("A stream lacking read capability was provided to the block GZip decompressor."); _bgzf = new Zlib(compressionLevel); _uncompressedBlock = new byte[BlockGZipFormatCommon.MaxBlockSize]; _compressedBlock = new byte[_bgzf.GetCompressedBufferBounds(BlockGZipFormatCommon.MaxBlockSize)]; } private void Flush(int uncompressedSize) { int blockLength = _bgzf.Compress(_uncompressedBlock, uncompressedSize, _compressedBlock, BlockGZipFormatCommon.MaxBlockSize); _blockOffset = 0; _stream.Write(_compressedBlock, 0, blockLength); _blockAddress = _stream.Position; } public static bool HasValidHeader(int numHeaderBytes, IReadOnlyList header) { if (numHeaderBytes != BlockGZipFormatCommon.BlockHeaderLength) return false; return header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0 && header[12] == 66 && header[13] == 67; } private void ReadBlock() { long blockAddress = _stream.CanSeek ? _stream.Position : 0; int count = _stream.Read(_compressedBlock, 0, BlockGZipFormatCommon.BlockHeaderLength); // handle the case where no data was read if (count == 0) { _blockLength = 0; return; } // check the header if (!HasValidHeader(count, _compressedBlock)) { throw new CompressionException($"Found an invalid header when reading the GZip block ({_filePath})"); } int blockLength = BitConverter.ToUInt16(_compressedBlock, 16) + 1; int remaining = blockLength - BlockGZipFormatCommon.BlockHeaderLength; count = _stream.Read(_compressedBlock, BlockGZipFormatCommon.BlockHeaderLength, remaining); // handle unexpected truncation if (count != remaining) { throw new CompressionException($"Found unexpected truncation when reading the GZip block ({_filePath})"); } count = _bgzf.Decompress(_compressedBlock, blockLength, _uncompressedBlock, BlockGZipFormatCommon.MaxBlockSize); if (count < 0) { throw new CompressionException($"Encountered an error when uncompressing the GZip block ({_filePath})"); } // Do not reset offset if this read follows a seek if (_blockLength != 0) _blockOffset = 0; _blockAddress = blockAddress; _blockLength = count; } public override int Read(byte[] buffer, int offset, int count) { if (_isCompressor) throw new CompressionException("Tried to read data from a compression BlockGZipStream."); if (count == 0) return 0; var numBytesRead = 0; int dataOffset = offset; while (numBytesRead < count) { int numBytesAvailable = _blockLength - _blockOffset; if (numBytesAvailable <= 0) { ReadBlock(); numBytesAvailable = _blockLength - _blockOffset; if (numBytesAvailable <= 0) break; } int copyLength = Math.Min(count - numBytesRead, numBytesAvailable); Buffer.BlockCopy(_uncompressedBlock, _blockOffset, buffer, dataOffset, copyLength); _blockOffset += copyLength; dataOffset += copyLength; numBytesRead += copyLength; } // ReSharper disable once InvertIf if (_blockOffset == _blockLength) { _blockAddress = _stream.CanSeek ? _stream.Position : 0; _blockOffset = _blockLength = 0; } return numBytesRead; } public override void Write(byte[] buffer, int offset, int count) { if (!_isCompressor) throw new CompressionException("Tried to write data to a decompression BlockGZipStream."); var numBytesWritten = 0; int dataOffset = offset; // copy the data to the buffer while (numBytesWritten < count) { int copyLength = Math.Min(BlockGZipFormatCommon.BlockSize - _blockOffset, count - numBytesWritten); Buffer.BlockCopy(buffer, dataOffset, _uncompressedBlock, _blockOffset, copyLength); _blockOffset += copyLength; dataOffset += copyLength; numBytesWritten += copyLength; if (_blockOffset == BlockGZipFormatCommon.BlockSize) Flush(_blockOffset); } } public void CloseBlock() => Flush(_blockOffset); private void SeekVirtualFilePointer(ulong virtualPosition) { long compressedOffset = GetCompressedOffset(virtualPosition); int uncompressedOffset = GetUncompressedOffset(virtualPosition); // if we're already in the right block, no need to reload buffer. if (_blockAddress != compressedOffset) { _blockAddress = compressedOffset; _stream.Position = _blockAddress; ReadBlock(); } _blockOffset = uncompressedOffset; } private static long GetCompressedOffset(ulong virtualPosition) { unchecked { return (long)((virtualPosition >> 16) & 0xFFFFFFFFFFFFL); } } private static int GetUncompressedOffset(ulong virtualPosition) { unchecked { return (int)(virtualPosition & 0xffff); } } } } ================================================ FILE: Compression/FileHandling/BlockHeader.cs ================================================ using System.IO; using ErrorHandling.Exceptions; namespace Compression.FileHandling { public sealed class BlockHeader { private readonly byte[] _header; public const int HeaderSize = 12; private const int HeaderId = -822411574; // cafeface public int NumUncompressedBytes; public int NumCompressedBytes; public bool IsEmpty => NumUncompressedBytes == -1 && NumCompressedBytes == -1; public BlockHeader() => _header = new byte[HeaderSize]; private int GetInt(int offset) => _header[offset] | _header[offset + 1] << 8 | _header[offset + 2] << 16 | _header[offset + 3] << 24; public void Read(Stream stream) { int numBytesRead = stream.Read(_header, 0, HeaderSize); if (numBytesRead == 0) { NumUncompressedBytes = -1; NumCompressedBytes = -1; return; } if (numBytesRead != HeaderSize) throw new IOException($"Expected {HeaderSize} bytes from the block header, but received only {numBytesRead} bytes."); int headerId = GetInt(0); if (headerId != HeaderId) throw new CompressionException($"Expected the header ID ({HeaderId}), but found the following: {headerId}"); NumUncompressedBytes = GetInt(4); NumCompressedBytes = GetInt(8); } private void SetInt(int value, int offset) { _header[offset] = (byte)value; _header[offset + 1] = (byte)(value >> 8); _header[offset + 2] = (byte)(value >> 16); _header[offset + 3] = (byte)(value >> 24); } public void Write(Stream stream) { SetInt(HeaderId, 0); SetInt(NumUncompressedBytes, 4); SetInt(NumCompressedBytes, 8); stream.Write(_header, 0, HeaderSize); } } } ================================================ FILE: Compression/FileHandling/BlockStream.cs ================================================ using System; using System.IO; using System.IO.Compression; using System.Text; using Compression.Algorithms; using Compression.DataStructures; using ErrorHandling.Exceptions; namespace Compression.FileHandling { public sealed class BlockStream : Stream { private readonly bool _isCompressor; private readonly bool _leaveStreamOpen; private Stream _stream; private BinaryWriter _writer; private Action _headerWrite; private readonly Block _block; private bool _foundEof; private bool _isDisposed; #region Stream public override bool CanRead => _stream.CanRead; public override bool CanWrite => _stream.CanWrite; public override bool CanSeek => _stream.CanSeek; public override long Length => throw new NotSupportedException(); public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException(); public override void SetLength(long value) => throw new NotSupportedException(); public override long Position { get => _stream.Position; set => throw new NotSupportedException(); } public override void Flush() { if (_block.Offset > 0) _block.Write(_stream); } protected override void Dispose(bool disposing) { if (_isDisposed) return; try { if (_isCompressor) { Flush(); _block.WriteEof(_stream); // update the header if (_headerWrite != null) { _stream.Position = 0; _headerWrite(_writer); } _writer.Dispose(); _writer = null; } if (!_leaveStreamOpen) { _stream.Dispose(); _stream = null; } _isDisposed = true; } finally { base.Dispose(disposing); } } #endregion public BlockStream(ICompressionAlgorithm compressionAlgorithm, Stream stream, CompressionMode compressionMode, bool leaveStreamOpen = false, int size = 16777216) { _stream = stream ?? throw new ArgumentNullException(nameof(stream)); _isCompressor = compressionMode == CompressionMode.Compress; _leaveStreamOpen = leaveStreamOpen; _block = new Block(compressionAlgorithm, size); // sanity check: make sure we can use the stream for reading or writing if (_isCompressor && !_stream.CanWrite) throw new ArgumentException("A stream lacking write capability was provided to the block GZip compressor."); if (!_isCompressor && !_stream.CanRead) throw new ArgumentException("A stream lacking read capability was provided to the block GZip decompressor."); if (_isCompressor) _writer = new BinaryWriter(_stream, Encoding.UTF8, true); } public void WriteHeader(Action headerWrite) { _headerWrite = headerWrite; _headerWrite(_writer); } public override int Read(byte[] buffer, int offset, int count) { if (_foundEof) return 0; if (_isCompressor) throw new CompressionException("Tried to read data from a compression BlockGZipStream."); ValidateParameters(buffer, offset, count); var numBytesRead = 0; int dataOffset = offset; while (numBytesRead < count) { if (!_block.HasMoreData) { int numBytes = _block.Read(_stream); if (numBytes == -1) { _foundEof = true; return numBytesRead; } } int copyLength = _block.CopyFrom(buffer, dataOffset, count - numBytesRead); dataOffset += copyLength; numBytesRead += copyLength; } return numBytesRead; } private void ValidateParameters(byte[] array, int offset, int count) { if (array == null) throw new ArgumentNullException(nameof(array)); if (offset < 0) throw new ArgumentOutOfRangeException(nameof(offset)); if (count < 0) throw new ArgumentOutOfRangeException(nameof(count)); if (array.Length - offset < count) throw new ArgumentException("Invalid Argument Offset Count"); } public override void Write(byte[] buffer, int offset, int count) { if (!_isCompressor) throw new CompressionException("Tried to write data to a decompression BlockGZipStream."); ValidateParameters(buffer, offset, count); var numBytesWritten = 0; int dataOffset = offset; while (numBytesWritten < count) { int copyLength = _block.CopyTo(buffer, dataOffset, count - numBytesWritten); dataOffset += copyLength; numBytesWritten += copyLength; if (_block.IsFull) _block.Write(_stream); } } public (long FileOffset, int InternalOffset) GetBlockPosition() => (_stream.Position, _block.Offset); public void SetBlockPosition(long fileOffset, int internalOffset = 0) { if (fileOffset != _block.FileOffset) { _stream.Position = fileOffset; _block.Read(_stream); } _foundEof = false; _block.Offset = internalOffset; } } } ================================================ FILE: Compression/Utilities/BlockExtensions.cs ================================================ using System; using System.Buffers; using System.IO; using Compression.Algorithms; namespace Compression.Utilities { public static class BlockExtensions { private static readonly Zstandard Zstd = new(21); public static byte[] ReadCompressedByteArray(this BinaryReader reader, ArrayPool bytePool) { int uncompressedSize = reader.ReadInt32(); int compressedSize = reader.ReadInt32(); byte[] compressedBuffer = bytePool.Rent(compressedSize); byte[] uncompressedBuffer = bytePool.Rent(uncompressedSize); reader.Read(compressedBuffer, 0, compressedSize); Zstd.Decompress(compressedBuffer, compressedSize, uncompressedBuffer, uncompressedBuffer.Length); bytePool.Return(compressedBuffer); return uncompressedBuffer; } public static void WriteCompressedByteArray(this BinaryWriter writer, byte[] uncompressed, int uncompressedSize) { ArrayPool bytePool = ArrayPool.Shared; int compressedBufferSize = Zstd.GetCompressedBufferBounds(uncompressedSize); byte[] compressedBuffer = bytePool.Rent(compressedBufferSize); int compressedSize = Zstd.Compress(uncompressed, uncompressedSize, compressedBuffer, compressedBuffer.Length); writer.Write(uncompressedSize); writer.Write(compressedSize); writer.Write(compressedBuffer, 0, compressedSize); double percentCompression = compressedSize / (double) uncompressedSize * 100.0; Console.WriteLine($"uncompressed: {uncompressedSize:N0}, compressed: {compressedSize:N0}, {percentCompression:0.0}%"); bytePool.Return(compressedBuffer); } } } ================================================ FILE: Compression/Utilities/GZipUtilities.cs ================================================ using System; using System.IO; using System.IO.Compression; using System.Text; using Compression.FileHandling; using ErrorHandling.Exceptions; using IO; namespace Compression.Utilities { public static class GZipUtilities { private const int NumHeaderBytes = 18; private enum CompressionAlgorithm { Uncompressed, GZip, BlockGZip } public static StreamReader GetAppropriateStreamReader(string filePath) => FileUtilities.GetStreamReader(GetAppropriateReadStream(filePath)); public static StreamWriter GetStreamWriter(string filePath) => new StreamWriter(GetWriteStream(filePath)); private static Stream GetWriteStream(string filePath) => new BlockGZipStream(FileUtilities.GetCreateStream(filePath), CompressionMode.Compress); private static Stream GetAppropriateStream(Stream stream, CompressionAlgorithm compressionAlgorithm) { Stream newStream; // ReSharper disable once SwitchStatementMissingSomeCases switch (compressionAlgorithm) { case CompressionAlgorithm.BlockGZip: newStream = new BlockGZipStream(stream, CompressionMode.Decompress); break; case CompressionAlgorithm.GZip: newStream = new GZipStream(stream, CompressionMode.Decompress); break; default: newStream = stream; break; } return newStream; } //todo: can have just one method for both file and http streams //used in custom annotation lambda public static Stream GetAppropriateStream(Stream stream) { byte[] header = GetHeader(stream); var compressionAlgorithm = IdentifyCompressionAlgorithm(header); stream.Position = 0; var appropriateStream = GetAppropriateStream(stream, compressionAlgorithm); return appropriateStream; } public static Stream GetAppropriateReadStream(string filePath) { CompressionAlgorithm compressionAlgorithm; using (var headerStream = PersistentStreamUtils.GetReadStream(filePath)) { byte[] header = GetHeader(headerStream); compressionAlgorithm = IdentifyCompressionAlgorithm(header); } var fileStream = PersistentStreamUtils.GetReadStream(filePath); return GetAppropriateStream(fileStream, compressionAlgorithm); } private static byte[] GetHeader(Stream stream) { byte[] header = null; try { using (var reader = new ExtendedBinaryReader(stream, Encoding.UTF8, true)) { header = reader.ReadBytes(NumHeaderBytes); } } catch (Exception e) { if (e.Message.Contains("because it is being used by another process.")) { throw new ProcessLockedFileException(e.Message); } } return header; } // ReSharper disable once SuggestBaseTypeForParameter private static CompressionAlgorithm IdentifyCompressionAlgorithm(byte[] header) { var result = CompressionAlgorithm.Uncompressed; if (header == null || header.Length != NumHeaderBytes) return result; // check if this is a gzip file if (header[0] != 31 || header[1] != 139 || header[2] != 8) return result; result = CompressionAlgorithm.GZip; // check if this is a block GZip file if ((header[3] & 4) != 0 && header[12] == 66 && header[13] == 67) result = CompressionAlgorithm.BlockGZip; return result; } } } ================================================ FILE: Compression/Utilities/LibraryUtilities.cs ================================================ using System; using System.IO; using System.Runtime.InteropServices; using ErrorHandling.Exceptions; namespace Compression.Utilities { public static class LibraryUtilities { public static void CheckLibrary() { const int expectedLibraryId = -822411574; // cafeface // check to see if we have our compression library try { int observedLibraryId = SafeNativeMethods.get_library_id(); if (observedLibraryId != expectedLibraryId) throw new InvalidDataException("Received an incorrect library ID when validating the Block Compression library."); } catch (Exception) { throw new MissingCompressionLibraryException("BlockCompression"); } } private static class SafeNativeMethods { [DllImport("BlockCompression", CallingConvention = CallingConvention.Cdecl)] public static extern int get_library_id(); } } } ================================================ FILE: CreateLambdaZips.sh ================================================ #!/usr/bin/env bash LAMBDA_DIRS=(AnnotationLambda CustomAnnotationLambda GeneAnnotationLambda NirvanaLambda SingleAnnotationLambda) OUTPUT_DIR=bin/Release/netcoreapp2.1 ARTIFACT_S3_DIR=${ARTIFACT_S3_DIR:=develop} S3_PREFIX=s3://nirvana-cloudformation/$ARTIFACT_S3_DIR # install Amazon.Lambda.Tools if it's not already there dotnet tool list -g | grep dotnet-lambda &> /dev/null if [ $? -ne 0 ]; then dotnet tool install -g Amazon.Lambda.Tools fi # get the script's directory TOP_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" # get the version VERSION=$(git describe --long | cut -c 2-) # some fancy formatting function Header () { echo -e "\n\e[91m\e[1m${1}\e[0m" } # silence pushd and popd pushd () { command pushd "$@" > /dev/null } popd () { command popd "$@" > /dev/null } # create the zip files for LAMBDA_DIR in "${LAMBDA_DIRS[@]}" do LAMBDA_PATH=$TOP_DIR/$LAMBDA_DIR pushd $LAMBDA_PATH # create the zip file dotnet lambda package //p:Version=$VERSION -c Release # upload the file to S3 Header "Uploading ${LAMBDA_DIR}:" ZIP_PATH=${LAMBDA_PATH}/${OUTPUT_DIR}/${LAMBDA_DIR}.zip aws s3 cp $ZIP_PATH ${S3_PREFIX}/${LAMBDA_DIR}-${VERSION}.zip popd done Header "All zip files have been uploaded to ${S3_PREFIX}" ================================================ FILE: CustomAnnotationLambda/CustomAnnotationLambda.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Text; using Amazon.Lambda.Core; using Cloud; using Cloud.Messages; using Cloud.Messages.Custom; using Cloud.Notifications; using Cloud.Utilities; using CommandLine.Utilities; using ErrorHandling; using ErrorHandling.Exceptions; using IO; using SAUtils.Custom; [assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.Json.JsonSerializer))] namespace CustomAnnotationLambda { // ReSharper disable once ClassNeverInstantiated.Global public sealed class CustomAnnotationLambda { // ReSharper disable once UnusedMember.Global public CustomResult Run(CustomConfig config, ILambdaContext context) { var result = new CustomResult { id = config.id }; string snsTopicArn = null; var runLog = new StringBuilder(); try { LogUtilities.UpdateLogger(context.Logger, runLog); LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion); LogUtilities.LogObject("Config", config); LogUtilities.Log(new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey }); LambdaUtilities.GarbageCollect(); LambdaUtilities.DeleteTempOutput(); snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey); config.CheckRequiredFieldsNotNull(); var s3Client = config.outputDir.GetS3Client(context.RemainingTime); config.CheckResourcesExist(); LambdaUtilities.DeleteTempOutput(); string inputFileName = config.tsvUrl.TrimEndFromFirst("?").TrimStartToLast("/"); Logger.WriteLine($"input file name is: {inputFileName}"); return IsGeneAnnotationTsv(config.tsvUrl) ? GeneAnnotationCreator.Create(config, inputFileName, result, s3Client) : VariantAnnotationCreator.Create(config, inputFileName, result, s3Client); } catch (Exception e) { result.jwtFields = config.jwtFields; result.variantCount = 0; return HandleException(runLog, result, e, snsTopicArn); } } private static bool IsGeneAnnotationTsv(string tsvUrl) { using (var customTsvStream = (PersistentStream) PersistentStreamUtils.GetReadStream(tsvUrl)) using (var reader = new StreamReader(customTsvStream)) { reader.ReadLine(); string secondLine = reader.ReadLine(); if (secondLine == null) throw new UserErrorException("The input TSV file has less than two lines"); return secondLine.StartsWith("#geneSymbol"); } } public static CustomResult GetSuccessResult(CustomConfig customSaConfig, CustomResult result, List outputFiles) { Logger.WriteLine("All files uploaded."); result.created = new FileList { bucketName = customSaConfig.outputDir.bucketName, outputDir = customSaConfig.outputDir.path, files = outputFiles.ToArray() }; result.status = LambdaUtilities.SuccessMessage; LogUtilities.LogObject("Result", result); LambdaUtilities.DeleteTempOutput(); return result; } private static CustomResult HandleException(StringBuilder runLog, CustomResult result, Exception e, string snsTopicArn) { Logger.Log(e); var errorCategory = ExceptionUtilities.ExceptionToErrorCategory(e); result.status = $"{errorCategory}: {e.Message}"; result.noValidEntries = e.Message.Contains(GeneAnnotationsParser.NoValidEntriesErrorMessage); if (errorCategory != ErrorCategory.UserError) { string snsMessage = SNS.CreateMessage(runLog.ToString(), result.status, e.StackTrace); SNS.SendMessage(snsTopicArn, snsMessage); } LogUtilities.LogObject("Result", result); LambdaUtilities.DeleteTempOutput(); return result; } } } ================================================ FILE: CustomAnnotationLambda/CustomAnnotationLambda.csproj ================================================  net6.0 true Lambda bin\$(Configuration) ================================================ FILE: CustomAnnotationLambda/CustomConfigExtensions.cs ================================================ using Cloud; using Cloud.Messages.Custom; using ErrorHandling.Exceptions; using IO; namespace CustomAnnotationLambda { public static class CustomConfigExtensions { public static void CheckRequiredFieldsNotNull(this CustomConfig config) { static string BuildErrorMessage(string message) => message + " cannot be null."; if (config.id == null) throw new UserErrorException(BuildErrorMessage("id")); if (config.tsvUrl == null) throw new UserErrorException(BuildErrorMessage("tsvUrl")); if (config.outputDir == null) throw new UserErrorException(BuildErrorMessage("outputDir")); if (config.outputDir.bucketName == null) throw new UserErrorException(BuildErrorMessage("bucketName of outputDir")); if (config.outputDir.path == null) throw new UserErrorException(BuildErrorMessage("path of outputDir")); if (config.outputDir.region == null) throw new UserErrorException(BuildErrorMessage("region of outputDir")); if (config.outputDir.accessKey == null) throw new UserErrorException(BuildErrorMessage("accessKey of outputDir")); if (config.outputDir.secretKey == null) throw new UserErrorException(BuildErrorMessage("secretKey of outputDir")); if (config.outputDir.sessionToken == null) throw new UserErrorException(BuildErrorMessage("sessionToken of outputDir")); } public static void CheckResourcesExist(this CustomConfig config) { HttpUtilities.ValidateUrl(config.tsvUrl); HttpUtilities.ValidateUrl(LambdaUrlHelper.GetUgaUrl(), false); config.outputDir.Validate(true); } } } ================================================ FILE: CustomAnnotationLambda/GeneAnnotationCreator.cs ================================================ using System.Collections.Generic; using System.IO; using System.Security.Cryptography; using Cloud; using Cloud.Messages.Custom; using Cloud.Utilities; using Compression.Utilities; using ErrorHandling.Exceptions; using IO; using SAUtils.Custom; using SAUtils.GeneIdentifiers; using VariantAnnotation.SA; namespace CustomAnnotationLambda { public static class GeneAnnotationCreator { private const string LogFileName = "unrecognizedGeneIds.txt"; public static CustomResult Create(CustomConfig config, string inputFileName, CustomResult result, IS3Client s3Client) { string inputBaseName = inputFileName.TrimEndFromFirst(".tsv"); string ngaFileName = inputBaseName + SaCommon.GeneFileSuffix; string localNgaPath = Path.Combine(Path.GetTempPath(), ngaFileName); string localSchemaPath = localNgaPath + SaCommon.JsonSchemaSuffix; string localLogPath = Path.Combine(Path.GetTempPath(), LogFileName); int variantCount = 0; HttpUtilities.ValidateUrl(LambdaUrlHelper.GetUgaUrl()); var outputFiles = new List(); using (var aes = new AesCryptoServiceProvider()) { FileMetadata ngaMetadata, schemaMetadata, logMetaData; using (var logStream = FileUtilities.GetCreateStream(localLogPath)) using (var logCryptoStream = new CryptoStream(logStream, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var logMd5Stream = new MD5Stream(logCryptoStream)) // using (var customTsvStream = (PersistentStream)PersistentStreamUtils.GetReadStream(config.tsvUrl)) using (var parser = GetGeneAnnotationsParserFromCustomTsvStream(customTsvStream)) // using (var ngaStream = FileUtilities.GetCreateStream(localNgaPath)) using (var ngaCryptoStream = new CryptoStream(ngaStream, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var ngaMd5Stream = new MD5Stream(ngaCryptoStream)) // using (var schemaStream = FileUtilities.GetCreateStream(localSchemaPath)) using (var schemaCryptoStream = new CryptoStream(schemaStream, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var schemaMd5Stream = new MD5Stream(schemaCryptoStream)) { using (var ngaWriter = CaUtilities.GetNgaWriter(ngaMd5Stream, parser, inputFileName)) using (var schemaWriter = new StreamWriter(schemaMd5Stream)) using (var logWriter = new StreamWriter(logMd5Stream)) { variantCount = ngaWriter.Write(parser.GetItems(config.skipGeneIdValidation, logWriter)); var unknownGenes = parser.GetUnknownGenes(); if (!config.skipGeneIdValidation && unknownGenes.Count > 0) { throw new UserErrorException($"{GeneAnnotationsParser.UnknownGeneIdsErrorMessage} {string.Join(',', unknownGenes)}"); } schemaWriter.Write(parser.JsonSchema); } //all the writers have to be disposed before GetFileMetaData is called ngaMetadata = ngaMd5Stream.GetFileMetadata(); schemaMetadata = schemaMd5Stream.GetFileMetadata(); logMetaData = logMd5Stream.GetFileMetadata(); } if (config.skipGeneIdValidation) { string logS3Key = string.Join('/', config.outputDir.path.Trim('/'), LogFileName); Logger.WriteLine("uploading log file to " + logS3Key); s3Client.DecryptUpload(config.outputDir.bucketName, logS3Key, localLogPath, aes, logMetaData); } string nsaS3Path = string.Join('/', config.outputDir.path.Trim('/'), ngaFileName); string schemaS3Path = nsaS3Path + SaCommon.JsonSchemaSuffix; s3Client.DecryptUpload(config.outputDir.bucketName, nsaS3Path, localNgaPath, aes, ngaMetadata); s3Client.DecryptUpload(config.outputDir.bucketName, schemaS3Path, localSchemaPath, aes, schemaMetadata); outputFiles.Add(ngaFileName); outputFiles.Add(ngaFileName + SaCommon.JsonSchemaSuffix); LambdaUtilities.DeleteTempOutput(); result.jwtFields = config.jwtFields; result.variantCount = variantCount; return CustomAnnotationLambda.GetSuccessResult(config, result, outputFiles); } } private static GeneAnnotationsParser GetGeneAnnotationsParserFromCustomTsvStream(PersistentStream customTsvStream) { var (entrezGeneIdToSymbol, ensemblGeneIdToSymbol) = GeneUtilities.ParseUniversalGeneArchive(null, LambdaUrlHelper.GetUgaUrl()); return GeneAnnotationsParser.Create(new StreamReader(GZipUtilities.GetAppropriateStream(customTsvStream)), entrezGeneIdToSymbol, ensemblGeneIdToSymbol); } } } ================================================ FILE: CustomAnnotationLambda/VariantAnnotationCreator.cs ================================================ using System.Collections.Generic; using System.IO; using System.Security.Cryptography; using Cloud; using Cloud.Messages.Custom; using Cloud.Utilities; using Compression.Utilities; using Genome; using IO; using SAUtils.Custom; using SAUtils.DataStructures; using SAUtils.Schema; using VariantAnnotation.Interface.SA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace CustomAnnotationLambda { public static class VariantAnnotationCreator { public static CustomResult Create(CustomConfig config, string inputFileName, CustomResult result, IS3Client s3Client) { string tempPath = Path.GetTempPath(); string inputBaseName = inputFileName.TrimEndFromFirst(".tsv"); string nsaFileName = inputBaseName + SaCommon.SaFileSuffix; string localNsaPath = Path.Combine(tempPath, nsaFileName); string localIndexPath = localNsaPath + SaCommon.IndexSuffix; string localSchemaPath = localNsaPath + SaCommon.JsonSchemaSuffix; int variantCount = 0; var outputFiles = new List(); using (var aes = new AesCryptoServiceProvider()) { FileMetadata nsaMetadata, indexMetadata, schemaMetadata; List intervals; string jsonTag; SaJsonSchema intervalJsonSchema; DataSourceVersion version; GenomeAssembly genomeAssembly; int nsaItemsCount; ReportFor reportFor; using (var customTsvStream = (PersistentStream) PersistentStreamUtils.GetReadStream(config.tsvUrl)) using (var parser = GetVariantAnnotationsParserFromCustomTsvStream(customTsvStream)) // using (var nsaStream = FileUtilities.GetCreateStream(localNsaPath)) using (var nsaCryptoStream = new CryptoStream(nsaStream, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var nsaMd5Stream = new MD5Stream(nsaCryptoStream)) // using (var indexStream = FileUtilities.GetCreateStream(localIndexPath)) using (var indexCryptoStream = new CryptoStream(indexStream, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var indexMd5Stream = new MD5Stream(indexCryptoStream)) // using (var schemaStream = FileUtilities.GetCreateStream(localSchemaPath)) using (var schemaCryptoStream = new CryptoStream(schemaStream, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var schemaMd5Stream = new MD5Stream(schemaCryptoStream)) { genomeAssembly = parser.Assembly; result.genomeAssembly = genomeAssembly.ToString(); reportFor = parser.ReportFor; result.jwtFields = config.jwtFields; using (var nsaWriter = CaUtilities.GetNsaWriter(nsaMd5Stream, indexMd5Stream, parser, inputFileName, parser.SequenceProvider, out version, config.skipRefBaseValidation)) using (var schemaWriter = new StreamWriter(schemaMd5Stream)) { (jsonTag, nsaItemsCount, intervalJsonSchema, intervals) = CaUtilities.WriteSmallVariants(parser, nsaWriter, schemaWriter); } variantCount += nsaItemsCount; variantCount += intervals?.Count ?? 0; nsaMetadata = nsaMd5Stream.GetFileMetadata(); indexMetadata = indexMd5Stream.GetFileMetadata(); schemaMetadata = schemaMd5Stream.GetFileMetadata(); } result.variantCount = variantCount; if (nsaItemsCount > 0) { string nsaS3Path = string.Join('/', config.outputDir.path.Trim('/'), nsaFileName); string indexS3Path = nsaS3Path + SaCommon.IndexSuffix; string schemaS3Path = nsaS3Path + SaCommon.JsonSchemaSuffix; s3Client.DecryptUpload(config.outputDir.bucketName, nsaS3Path, localNsaPath, aes, nsaMetadata); s3Client.DecryptUpload(config.outputDir.bucketName, indexS3Path, localIndexPath, aes, indexMetadata); s3Client.DecryptUpload(config.outputDir.bucketName, schemaS3Path, localSchemaPath, aes, schemaMetadata); outputFiles.Add(nsaFileName); outputFiles.Add(nsaFileName + SaCommon.IndexSuffix); outputFiles.Add(nsaFileName + SaCommon.JsonSchemaSuffix); } if (intervals == null) return CustomAnnotationLambda.GetSuccessResult(config, result, outputFiles); FileMetadata nsiMetadata, nsiSchemaMetadata; string nsiFileName = inputBaseName + SaCommon.IntervalFileSuffix; string localNsiPath = Path.Combine(tempPath, nsiFileName); string localNsiSchemaPath = localNsiPath + SaCommon.JsonSchemaSuffix; // using (var nsiStream = FileUtilities.GetCreateStream(localNsiPath)) using (var nsiCryptoStream = new CryptoStream(nsiStream, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var nsiMd5Stream = new MD5Stream(nsiCryptoStream)) // using (var nsiSchemaSteam = FileUtilities.GetCreateStream(localNsiSchemaPath)) using (var nsiSchemaCryptoStream = new CryptoStream(nsiSchemaSteam, aes.CreateEncryptor(), CryptoStreamMode.Write)) using (var nsiSchemaMd5Stream = new MD5Stream(nsiSchemaCryptoStream)) { using (var nsiWriter = CaUtilities.GetNsiWriter(nsiMd5Stream, version, genomeAssembly, jsonTag, reportFor)) using (var schemaWriter = new StreamWriter(nsiSchemaMd5Stream)) { nsiWriter.Write(intervals); schemaWriter.Write(intervalJsonSchema); } nsiMetadata = nsiMd5Stream.GetFileMetadata(); nsiSchemaMetadata = nsiSchemaMd5Stream.GetFileMetadata(); } string nsiS3Path = string.Join('/', config.outputDir.path.Trim('/'), nsiFileName); string nsiSchemaS3PathFile = nsiS3Path + SaCommon.JsonSchemaSuffix; s3Client.DecryptUpload(config.outputDir.bucketName, nsiS3Path, localNsiPath, aes, nsiMetadata); s3Client.DecryptUpload(config.outputDir.bucketName, nsiSchemaS3PathFile, localNsiSchemaPath, aes, nsiSchemaMetadata); outputFiles.Add(nsiFileName); outputFiles.Add(nsiFileName + SaCommon.JsonSchemaSuffix); } LambdaUtilities.DeleteTempOutput(); return CustomAnnotationLambda.GetSuccessResult(config, result, outputFiles); } private static VariantAnnotationsParser GetVariantAnnotationsParserFromCustomTsvStream(PersistentStream customTsvStream) { var parser = VariantAnnotationsParser.Create(new StreamReader(GZipUtilities.GetAppropriateStream(customTsvStream))); parser.SequenceProvider = new ReferenceSequenceProvider(PersistentStreamUtils.GetReadStream(LambdaUrlHelper.GetRefUrl(parser.Assembly))); return parser; } } } ================================================ FILE: CustomStrValidationLambda/CustomStrValidationLambda.cs ================================================ using System; using System.IO; using Amazon.Lambda.Core; using Cloud; using Cloud.Messages.StrValidation; using Cloud.Notifications; using Cloud.Utilities; using CommandLine.Utilities; using ErrorHandling; using ErrorHandling.Exceptions; using Genome; using IO; using Nirvana; using RepeatExpansions.IO; using VariantAnnotation.Interface.Providers; [assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.Json.JsonSerializer))] namespace CustomStrValidationLambda { public class CustomStrValidationLambda { public ValidationResult Run(ValidationConfig config, ILambdaContext context) { string snsTopicArn = null; try { LogUtilities.UpdateLogger(context.Logger, null); LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion); LogUtilities.LogObject("Config", config); LogUtilities.Log(new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey }); LambdaUtilities.GarbageCollect(); snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey); config.Validate(); GenomeAssembly genomeAssembly = GenomeAssemblyHelper.Convert(config.genomeAssembly); string nirvanaS3Ref = LambdaUrlHelper.GetRefUrl(genomeAssembly); var refProvider = ProviderUtilities.GetSequenceProvider(nirvanaS3Ref); using (var stream = PersistentStreamUtils.GetReadStream(config.customStrUrl)) TryLoadStrFile(stream, genomeAssembly, refProvider); } catch (Exception exception) { return HandleException(config.id, exception, snsTopicArn); } return GetSuccessOutput(config.id); } private static void TryLoadStrFile(Stream stream, GenomeAssembly genomeAssembly, ISequenceProvider refProvider) { try { RepeatExpansionReader.Load(stream, genomeAssembly, refProvider.RefNameToChromosome, refProvider.RefIndexToChromosome.Count); } catch (Exception exception) { throw new UserErrorException(exception.Message); } } private static ValidationResult HandleException(string id, Exception exception, string snsTopicArn) { Logger.Log(exception); string snsMessage = SNS.CreateMessage(exception.Message, "exception", exception.StackTrace); SNS.SendMessage(snsTopicArn, snsMessage); ErrorCategory errorCategory = ExceptionUtilities.ExceptionToErrorCategory(exception); var errorMessagePrefix = errorCategory == ErrorCategory.UserError ? "User error" : "Nirvana error"; return new ValidationResult { id = id, status = $"{errorMessagePrefix}: {exception.Message}" }; } private static ValidationResult GetSuccessOutput(string id) => new ValidationResult { id = id, status = LambdaUtilities.SuccessMessage }; } } ================================================ FILE: CustomStrValidationLambda/CustomStrValidationLambda.csproj ================================================  net6.0 true Lambda ================================================ FILE: Downloader/AnnotationRepository.cs ================================================ using System; using System.Collections.Generic; using System.Threading; using Downloader.Utilities; namespace Downloader { public static class AnnotationRepository { public static void DownloadMetadata(IClient client, List files) => files.ParallelExecute(client.SetMetadata, Retry, "finished", "download the file metadata"); public static void DownloadFiles(IClient client, List files) => files.ParallelExecute(client.DownloadFile, Retry, "finished", "download the file"); private static void Retry(RemoteFile file, Func clientFunc, CancellationTokenSource tokenSource, string exceptionMessage) { var numAttempts = 0; const int maxAttempts = 3; while (true) { numAttempts++; if (numAttempts == maxAttempts) { Console.WriteLine($" - Unable to {exceptionMessage} for {file.Description} after {maxAttempts} attempts."); tokenSource.Cancel(); break; } bool success = clientFunc(file); if (success) break; } } } } ================================================ FILE: Downloader/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: Downloader/Client.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Net; using System.Net.Http; using System.Net.Http.Headers; using Downloader.Utilities; namespace Downloader { public sealed class Client : IClient { private readonly HttpClient _httpClient; public Client(string hostName) { var baseUri = new Uri($"http://{hostName}"); ServicePointManager.DefaultConnectionLimit = int.MaxValue; ServicePointManager.FindServicePoint(baseUri).ConnectionLeaseTimeout = 60 * 1000; _httpClient = new HttpClient { BaseAddress = baseUri }; _httpClient.DefaultRequestHeaders.Clear(); _httpClient.DefaultRequestHeaders.ConnectionClose = false; _httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("text/plain")); _httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/octet-stream")); } public List DownloadLines(string remotePath) { var lines = new List(); using (var response = _httpClient.GetAsync(remotePath, HttpCompletionOption.ResponseHeadersRead).AsSync()) { var stream = response.Content.ReadAsStreamAsync().AsSync(); if (!response.IsSuccessStatusCode) return lines; using (var reader = new StreamReader(stream)) { while (true) { string line = reader.ReadLineAsync().AsSync(); if (line == null) break; lines.Add(line); } } } return lines; } public bool SetMetadata(RemoteFile file) { using (var response = _httpClient.GetAsync(file.RemotePath, HttpCompletionOption.ResponseHeadersRead).AsSync()) { if (response.StatusCode == HttpStatusCode.NotFound) { Console.Write(" - "); ConsoleEmbellishments.PrintWarning("WARNING: "); Console.WriteLine($"{file.Description} could not be found. Skipping this file."); file.Missing = true; file.Skipped = true; return true; } if (!response.IsSuccessStatusCode) return false; long? contentLength = response.Content.Headers.ContentLength; if (contentLength.HasValue) file.FileSize = contentLength.Value; DateTimeOffset? lastModified = response.Content.Headers.LastModified; if (lastModified.HasValue) file.LastModified = lastModified.Value; } return true; } public bool DownloadFile(RemoteFile file) { using (var response = _httpClient.GetAsync(file.RemotePath, HttpCompletionOption.ResponseHeadersRead).AsSync()) { if (!response.IsSuccessStatusCode) return false; Console.WriteLine($" - downloading {file.Description}"); var stream = response.Content.ReadAsStreamAsync().ConfigureAwait(false).GetAwaiter().GetResult(); var fileInfo = new FileInfo(file.LocalPath); using (var fileStream = fileInfo.OpenWrite()) stream.CopyTo(fileStream); } return true; } } } ================================================ FILE: Downloader/Configuration.cs ================================================ using System; using System.IO; using Cloud; using Cloud.Utilities; using Genome; using Microsoft.Extensions.Configuration; namespace Downloader { public static class Configuration { public static (string HostName, string CacheDir, string ReferencesDir, string ManifestGRCh37, string ManifestGRCh38) Load( string hostName, string manifestPrefix) { const string appSettingsFilename = "Downloader.appsettings.json"; IConfigurationRoot config = new ConfigurationBuilder() .AddJsonFile(appSettingsFilename) .Build(); IConfigurationSection dataSource = config.GetSection("DataSource"); if (string.IsNullOrEmpty(hostName)) { hostName = dataSource["HostName"]; if (string.IsNullOrEmpty(hostName)) throw new InvalidDataException($"Could not find the HostName entry in the {appSettingsFilename} file."); // this env variable will over-ride the configuration in cloud Environment.SetEnvironmentVariable(LambdaUrlHelper.UrlBaseEnvironmentVariableName, $"http://{hostName}/"); } var cloudConfiguration = new Cloud.Configuration(); string cacheDir = cloudConfiguration.CacheDirectory; if (string.IsNullOrEmpty(cacheDir)) throw new InvalidDataException($"Could not find the CacheDirectory entry in the Cloud.appsettings.json file."); string referencesDir = cloudConfiguration.ReferencesDirectory; if (string.IsNullOrEmpty(referencesDir)) throw new InvalidDataException($"Could not find the ReferencesDirectory entry in the Cloud.appsettings.json file."); string manifestGRCh37 ; string manifestGRCh38 ; if (string.IsNullOrEmpty(manifestPrefix)) { manifestGRCh37 = LambdaUtilities.GetManifestUrl(dataSource["ManifestGRCh37"], GenomeAssembly.GRCh37); if (string.IsNullOrEmpty(manifestGRCh37)) throw new InvalidDataException($"Could not find the ManifestGRCh37 entry in the {appSettingsFilename} file."); manifestGRCh38 = LambdaUtilities.GetManifestUrl(dataSource["ManifestGRCh38"], GenomeAssembly.GRCh38); if (string.IsNullOrEmpty(manifestGRCh38)) throw new InvalidDataException($"Could not find the ManifestGRCh38 entry in the {appSettingsFilename} file."); } else { manifestGRCh37 = LambdaUtilities.GetManifestUrl($"{manifestPrefix}", GenomeAssembly.GRCh37); manifestGRCh38 = LambdaUtilities.GetManifestUrl($"{manifestPrefix}", GenomeAssembly.GRCh38); } return (hostName, '/' + cacheDir, '/' + referencesDir, manifestGRCh37, manifestGRCh38); } } } ================================================ FILE: Downloader/Downloader.appsettings.json ================================================ { "DataSource": { "HostName": "annotations.nirvana.illumina.com", "ManifestGRCh37": "latest", "ManifestGRCh38": "latest" } } ================================================ FILE: Downloader/Downloader.csproj ================================================  Exe net6.0 ..\bin\$(Configuration) PreserveNewest ================================================ FILE: Downloader/DownloaderMain.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using CommandLine.Builders; using CommandLine.NDesk.Options; using CommandLine.Utilities; using Downloader.FileExtensions; using Downloader.Utilities; using ErrorHandling; using Genome; using VariantAnnotation.Interface; using GenomeAssemblyHelper = Downloader.Utilities.GenomeAssemblyHelper; namespace Downloader { public static class DownloaderMain { private static string _genomeAssembly; private static string _outputDirectory; private static string _hostName; private static string _manifestPrefix; private static ExitCodes ProgramExecution() { (string hostName, string remoteCacheDir, string remoteReferencesDir, string manifestGRCh37, string manifestGRCh38) = Configuration.Load(_hostName, _manifestPrefix); List genomeAssemblies = GenomeAssemblyHelper.GetGenomeAssemblies(_genomeAssembly); var client = new Client(hostName); Console.Write("- downloading manifest... "); Dictionary> remotePathsByGenomeAssembly = Manifest.GetRemotePaths(client, genomeAssemblies, manifestGRCh37, manifestGRCh38); (string cacheDir, string referencesDir, string saDir, List outputDirectories) = OutputDirectory.Create(_outputDirectory, genomeAssemblies); var fileList = new List(); fileList.AddCacheFiles(genomeAssemblies, remoteCacheDir, cacheDir) .AddReferenceFiles(genomeAssemblies, remoteReferencesDir, referencesDir) .AddSupplementaryAnnotationFiles(remotePathsByGenomeAssembly, saDir); Console.WriteLine($"{fileList.Count} files.\n"); // get rid of extra files in the output directories OutputDirectory.Cleanup(fileList, outputDirectories, referencesDir); // get length, checksum, and checks existence Console.WriteLine("- downloading file metadata:"); AnnotationRepository.DownloadMetadata(client, fileList); // remove obsolete files from the output directory OutputDirectory.RemoveOldFiles(fileList); // remove skipped files from our list List filesToDownload = OutputDirectory.RemoveSkippedFiles(fileList); // download the latest files if (filesToDownload.Count > 0) { long numBytesToDownload = OutputDirectory.GetNumDownloadBytes(filesToDownload); DiskSpaceUtilities.CheckAvailableDiskSpace(_outputDirectory, numBytesToDownload); Console.WriteLine($"- downloading files ({MemoryUtilities.ToHumanReadable(numBytesToDownload)}):"); AnnotationRepository.DownloadFiles(client, filesToDownload); } // sanity check OutputDirectory.CheckFiles(fileList); bool foundError = fileList.Any(x => !x.Pass); return foundError ? ExitCodes.InvalidData : ExitCodes.Success; } public static int Main(string[] args) { var ops = new OptionSet { { "ga=", "genome assembly {version}", v => _genomeAssembly = v }, { "host=", "annotation {hostname} (optional)", v => _hostName = v }, { "manifest=", "manifest {prefix} (optional)", v => _manifestPrefix = v }, { "out|o=", "top-level output {directory}", v => _outputDirectory = v } }; ExitCodes exitCode = new ConsoleAppBuilder(args, ops) .Parse() .HasRequiredParameter(_genomeAssembly, "genome assembly", "--ga") .CheckDirectoryExists(_outputDirectory, "top-level output directory", "--out") .ShowBanner(Constants.Authors) .ShowHelpMenu("Downloads the Nirvana data files from S3", "--ga --out ") .ShowErrors() .Execute(ProgramExecution); return (int) exitCode; } } } ================================================ FILE: Downloader/FileExtensions/CacheFileExtensions.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using IO; namespace Downloader.FileExtensions { public static class CacheFileExtensions { public static List AddCacheFiles(this List files, IEnumerable genomeAssemblies, string remoteCacheDirectory, string cacheDirectory) { foreach (var genomeAssembly in genomeAssemblies) { files.AddCache(genomeAssembly, remoteCacheDirectory, cacheDirectory, "transcripts"); files.AddCache(genomeAssembly, remoteCacheDirectory, cacheDirectory, "sift"); files.AddCache(genomeAssembly, remoteCacheDirectory, cacheDirectory, "polyphen"); } return files; } private static void AddCache(this ICollection files, GenomeAssembly genomeAssembly, string remoteCacheDirectory, string cacheDirectory, string type) { string filename = $"Both.{type}.ndb"; string remotePath = $"{remoteCacheDirectory}/{CacheConstants.DataVersion}/{genomeAssembly}/{filename}"; string localPath = Path.Combine(cacheDirectory, genomeAssembly.ToString(), filename); string description = $"{filename} ({genomeAssembly})"; files.Add(new RemoteFile(remotePath, localPath, description)); } } } ================================================ FILE: Downloader/FileExtensions/ReferencesFileExtensions.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using ReferenceSequence; namespace Downloader.FileExtensions { public static class ReferencesFileExtensions { public static List AddReferenceFiles(this List files, IEnumerable genomeAssemblies, string remoteReferencesDirectory, string referencesDirectory) { // ReSharper disable once LoopCanBeConvertedToQuery foreach (GenomeAssembly genomeAssembly in genomeAssemblies) { string filename = GetFilename(genomeAssembly); var remotePath = $"{remoteReferencesDirectory}/{ReferenceSequenceCommon.HeaderVersion}/{filename}"; string localPath = Path.Combine(referencesDirectory, filename); files.Add(new RemoteFile(remotePath, localPath, filename)); } return files; } public static string GetFilename(GenomeAssembly genomeAssembly) => $"Homo_sapiens.{genomeAssembly}.Nirvana.dat"; } } ================================================ FILE: Downloader/FileExtensions/SupplementaryAnnotationFileExtensions.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; namespace Downloader.FileExtensions { public static class SupplementaryAnnotationFileExtensions { private static readonly HashSet NeedsIndexSet = new HashSet(); static SupplementaryAnnotationFileExtensions() { NeedsIndexSet.Add(".nsa"); NeedsIndexSet.Add(".npd"); NeedsIndexSet.Add(".rma"); NeedsIndexSet.Add(".gsa"); } public static void AddSupplementaryAnnotationFiles(this List files, Dictionary> remotePathsByGenomeAssembly, string saDirectory) { foreach ((var genomeAssembly, List remotePaths) in remotePathsByGenomeAssembly) { files.AddDataSources(remotePaths, genomeAssembly, saDirectory); } } private static void AddDataSources(this ICollection files, IEnumerable remotePaths, GenomeAssembly genomeAssembly, string saDirectory) { foreach (string path in remotePaths) { files.AddFile(genomeAssembly, saDirectory, path); string extension = Path.GetExtension(path); if (NeedsIndexSet.Contains(extension)) files.AddFile(genomeAssembly, saDirectory, path + ".idx"); } } private static void AddFile(this ICollection files, GenomeAssembly genomeAssembly, string saDirectory, string path) { string filename = Path.GetFileName(path); string remotePath = path; string localPath = Path.Combine(saDirectory, genomeAssembly.ToString(), filename); string description = $"{filename} ({genomeAssembly})"; files.Add(new RemoteFile(remotePath, localPath, description)); } } } ================================================ FILE: Downloader/IClient.cs ================================================ using System.Collections.Generic; namespace Downloader { /// /// IClient should abstract away all network activity for improved testing /// public interface IClient { List DownloadLines(string remotePath); bool SetMetadata(RemoteFile file); bool DownloadFile(RemoteFile file); } } ================================================ FILE: Downloader/Manifest.cs ================================================ using System; using System.Collections.Generic; using Genome; namespace Downloader { public static class Manifest { public static Dictionary> GetRemotePaths(IClient client, IEnumerable genomeAssemblies, string manifestGRCh37, string manifestGRCh38) { IEnumerable<(GenomeAssembly GenomeAssembly, string ManifestPath)> genomeAssemblyPaths = CreateGenomeAssemblyPaths(manifestGRCh37, manifestGRCh38, genomeAssemblies); var remotePathsByGenomeAssembly = new Dictionary>(); foreach ((var genomeAssembly, string manifestPath) in genomeAssemblyPaths) { List remotePaths = client.DownloadLines(manifestPath); remotePathsByGenomeAssembly[genomeAssembly] = remotePaths; } return remotePathsByGenomeAssembly; } internal static IEnumerable<(GenomeAssembly GenomeAssembly, string ManifestPath)> CreateGenomeAssemblyPaths( string manifestGRCh37, string manifestGRCh38, IEnumerable genomeAssemblies) { var genomeAssemblyPaths = new List<(GenomeAssembly, string)>(); foreach (var genomeAssembly in genomeAssemblies) { // ReSharper disable once SwitchStatementMissingSomeCases switch (genomeAssembly) { case GenomeAssembly.GRCh37: genomeAssemblyPaths.Add((genomeAssembly, manifestGRCh37)); break; case GenomeAssembly.GRCh38: genomeAssemblyPaths.Add((genomeAssembly, manifestGRCh38)); break; } } return genomeAssemblyPaths; } } } ================================================ FILE: Downloader/OutputDirectory.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Downloader.FileExtensions; using Downloader.Utilities; using Genome; namespace Downloader { public static class OutputDirectory { public static (string Cache, string Reference, string SupplementaryAnnotation, List OutputDirectories) Create(string outputDirectory, List genomeAssemblies) { string cacheDirectory = Path.Combine(outputDirectory, "Cache"); string referencesDirectory = Path.Combine(outputDirectory, "References"); string saDirectory = Path.Combine(outputDirectory, "SupplementaryAnnotation"); var outputDirectories = new List {referencesDirectory}; CreateGenomeAssemblySubdirectories(cacheDirectory, genomeAssemblies, outputDirectories); CreateGenomeAssemblySubdirectories(saDirectory, genomeAssemblies, outputDirectories); Directory.CreateDirectory(referencesDirectory); return (cacheDirectory, referencesDirectory, saDirectory, outputDirectories); } private static void CreateGenomeAssemblySubdirectories(string topLevelDirectory, IEnumerable genomeAssemblies, ICollection outputDirectories) { foreach (var genomeAssembly in genomeAssemblies) { string directory = Path.Combine(topLevelDirectory, genomeAssembly.ToString()); outputDirectories.Add(directory); Directory.CreateDirectory(directory); } } public static void Cleanup(IEnumerable files, IEnumerable outputDirectories, string referencesDirectory) { IEnumerable existingFiles = GetExistingFiles(outputDirectories); IEnumerable referenceFiles = GetReferenceFiles(referencesDirectory); List desiredFiles = files.Select(x => x.LocalPath).ToList(); List filesToDelete = existingFiles.Except(desiredFiles).Except(referenceFiles).ToList(); if (filesToDelete.Count == 0) return; Console.WriteLine("- removing extra files in output directories"); foreach (string file in filesToDelete) { Console.WriteLine($" - deleting extra file: {file}"); File.Delete(file); } Console.WriteLine(); } private static IEnumerable GetReferenceFiles(string referencesDirectory) => new List { Path.Combine(referencesDirectory, ReferencesFileExtensions.GetFilename(GenomeAssembly.GRCh37)), Path.Combine(referencesDirectory, ReferencesFileExtensions.GetFilename(GenomeAssembly.GRCh38)) }; private static IEnumerable GetExistingFiles(IEnumerable outputDirectories) { var existingFiles = new List(); foreach (string outputDir in outputDirectories) { string[] files = Directory.GetFiles(outputDir, "*", SearchOption.TopDirectoryOnly); foreach (string localPath in files) { if (!localPath.StartsWith(outputDir)) continue; existingFiles.Add(localPath); } } return existingFiles; } public static void RemoveOldFiles(IEnumerable files) { var filesToDelete = new List(); foreach (var file in files) { var fileInfo = new FileInfo(file.LocalPath); if (!fileInfo.Exists || file.Skipped) continue; if (HasDifferentFileSize(fileInfo.Length, file.FileSize) || HasOlderFile(fileInfo.CreationTimeUtc, file.LastModified)) { filesToDelete.Add(file); continue; } // these files already exist and can be skipped file.Skipped = true; } if (filesToDelete.Count == 0) return; Console.WriteLine("- removing old files:"); foreach (var file in filesToDelete) { Console.WriteLine($" - deleting {file.Description}"); File.Delete(file.LocalPath); } Console.WriteLine(); } private static bool HasOlderFile(in DateTimeOffset localOffset, DateTimeOffset remoteOffset) => DateTimeOffset.Compare(remoteOffset, localOffset) == 1; private static bool HasDifferentFileSize(long localLength, long remoteLength) => localLength != remoteLength; public static long GetNumDownloadBytes(IEnumerable files) { long numBytes = 0; foreach (var file in files) numBytes += file.FileSize; return numBytes; } public static List RemoveSkippedFiles(List files) { var filesToDownload = new List(files.Count); foreach (var file in files.OrderBy(x => x.FileSize)) { if (file.Skipped) continue; filesToDownload.Add(file); } return filesToDownload; } public static void CheckFiles(IEnumerable files) { var divider = new string('-', 75); Console.WriteLine("Description Status"); Console.WriteLine(divider); foreach (var file in files.OrderBy(x => x.Description)) { string description = GetPaddedField(file.Description, 58); Console.Write($"{description} "); PrintStatus(file); Console.WriteLine(); } Console.WriteLine(divider); } private static string GetPaddedField(string s, int fieldLength) { if (s.Length > fieldLength) return s.Substring(0, fieldLength - 3) + "..."; return s.PadRight(fieldLength, ' '); } private static void PrintStatus(RemoteFile file) { if (file.Missing) { ConsoleEmbellishments.PrintWarning("Missing (server)"); return; } var fileInfo = new FileInfo(file.LocalPath); if (!fileInfo.Exists) { ConsoleEmbellishments.PrintError("Missing (local)"); return; } if (fileInfo.Length < file.FileSize) { ConsoleEmbellishments.PrintError(" Truncated"); return; } if (fileInfo.Length > file.FileSize) { ConsoleEmbellishments.PrintError(" Too large"); return; } ConsoleEmbellishments.PrintSuccess(" OK"); file.Pass = true; } } } ================================================ FILE: Downloader/RemoteFile.cs ================================================ using System; namespace Downloader { public sealed class RemoteFile { public readonly string RemotePath; public readonly string LocalPath; public readonly string Description; public DateTimeOffset LastModified; public long FileSize; public bool Skipped; // skipped from downloading public bool Missing; // missing from the server public bool Pass; // passes the checks after download public RemoteFile(string remotePath, string localPath, string description) { RemotePath = remotePath; LocalPath = localPath; Description = description; } } } ================================================ FILE: Downloader/Utilities/ConsoleEmbellishments.cs ================================================ using System; namespace Downloader.Utilities { public static class ConsoleEmbellishments { public static void PrintWarning(string s) => Highlight(s, ConsoleColor.Yellow); public static void PrintError(string s) => Highlight(s, ConsoleColor.Red); public static void PrintSuccess(string s) => Highlight(s, ConsoleColor.Green); private static void Highlight(string s, ConsoleColor color) { Console.ForegroundColor = color; Console.Write(s); Console.ResetColor(); } } } ================================================ FILE: Downloader/Utilities/DiskSpaceUtilities.cs ================================================ using System; using System.IO; using CommandLine.Utilities; namespace Downloader.Utilities { public static class DiskSpaceUtilities { public static void CheckAvailableDiskSpace(string outputDirectory, long numBytesToDownload) { string absolutePath = GetAbsolutePath(outputDirectory); DriveInfo driveInfo = GetDriveWithLongestCommonPrefix(absolutePath); // skip available disk space checking if we can't figure out which drive is being used if (driveInfo == null) return; long numAvailableBytes = driveInfo.AvailableFreeSpace; if (numBytesToDownload <= numAvailableBytes) return; string neededSpace = MemoryUtilities.ToHumanReadable(numBytesToDownload); string availableSpace = MemoryUtilities.ToHumanReadable(numAvailableBytes); ConsoleEmbellishments.PrintError("Not enough disk space available"); Console.WriteLine($" in {absolutePath}. Need: {neededSpace}, available: {availableSpace}"); Environment.Exit(1); } private static string GetAbsolutePath(string directoryPath) { var directoryInfo = new DirectoryInfo(directoryPath); string absolutePath = directoryInfo.FullName; // the absolute path in Windows doesn't always provide the drive letter in uppercase // this is benign on Linux since the root is always / string root = directoryInfo.Root.ToString().ToUpperInvariant(); return root + absolutePath.Substring(root.Length); } private static DriveInfo GetDriveWithLongestCommonPrefix(string absolutePath) { DriveInfo[] allDrives = DriveInfo.GetDrives(); var maxPrefixLength = 0; DriveInfo maxPrefixDrive = null; foreach (DriveInfo d in allDrives) { // Windows drive letters are always in uppercase if (!d.IsReady || !absolutePath.StartsWith(d.Name) || d.Name.Length <= maxPrefixLength) continue; maxPrefixLength = d.Name.Length; maxPrefixDrive = d; } return maxPrefixDrive; } } } ================================================ FILE: Downloader/Utilities/GenomeAssemblyHelper.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; using Genome; namespace Downloader.Utilities { public static class GenomeAssemblyHelper { public static List GetGenomeAssemblies(string genomeAssembly) { genomeAssembly = genomeAssembly.ToLower(); var genomeAssemblies = new List(); switch (genomeAssembly.ToLower()) { case "grch37": genomeAssemblies.Add(GenomeAssembly.GRCh37); break; case "grch38": genomeAssemblies.Add(GenomeAssembly.GRCh38); break; case "both": genomeAssemblies.Add(GenomeAssembly.GRCh37); genomeAssemblies.Add(GenomeAssembly.GRCh38); break; default: throw new UserErrorException($"Found an unknown genome assembly ({genomeAssembly}). Expected: GRCh37, GRCh38, or both"); } return genomeAssemblies; } } } ================================================ FILE: Downloader/Utilities/ParallelUtilities.cs ================================================ using System; using System.Collections.Generic; using System.Threading; using System.Threading.Tasks; using CommandLine.Utilities; using ErrorHandling.Exceptions; namespace Downloader.Utilities { public static class ParallelUtilities { private const int NumThreads = 5; public static void ParallelExecute(this List files, Func clientFunc, Action, CancellationTokenSource, string> httpAction, string finishedMessage, string exceptionMessage) { var bench = new Benchmark(); var tasks = new Task[files.Count]; var maxThread = new SemaphoreSlim(NumThreads); var tokenSource = new CancellationTokenSource(); var cancellationToken = tokenSource.Token; try { for (var i = 0; i < files.Count; i++) { maxThread.Wait(cancellationToken); var file = files[i]; tasks[i] = Task.Factory .StartNew(() => httpAction(file, clientFunc, tokenSource, exceptionMessage), TaskCreationOptions.LongRunning) .ContinueWith(task => maxThread.Release(), cancellationToken); if (cancellationToken.IsCancellationRequested) break; } Task.WaitAll(tasks); Console.WriteLine($" - {finishedMessage} ({Benchmark.ToHumanReadable(bench.GetElapsedTime())}).\n"); } catch (OperationCanceledException) { throw new UserErrorException($"Unable to {exceptionMessage}. Please verify network connection."); } } } } ================================================ FILE: Downloader/Utilities/SyncUtilities.cs ================================================ using System.Threading.Tasks; namespace Downloader.Utilities { public static class SyncUtilities { public static T AsSync(this Task task) => task.ConfigureAwait(false).GetAwaiter().GetResult(); } } ================================================ FILE: ErrorHandling/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: ErrorHandling/ErrorCategory.cs ================================================ namespace ErrorHandling { public enum ErrorCategory { UserError, NirvanaError, TimeOutError, InvocationThrottledError } } ================================================ FILE: ErrorHandling/ErrorHandling.csproj ================================================  net6.0 ..\bin\$(Configuration) ================================================ FILE: ErrorHandling/ExceptionUtilities.cs ================================================ using System; using System.Collections.Generic; using ErrorHandling.Exceptions; namespace ErrorHandling { public static class ExceptionUtilities { public const string UserError = "UserError"; public static Exception MakeUserError(this Exception e) { e.Data[UserError] = true; return e; } // define which exceptions should not include a full stack trace public static readonly HashSet UserFriendlyExceptions = new HashSet { typeof(UserErrorException), typeof(FileNotSortedException), typeof(UnauthorizedAccessException), typeof(InvalidFileFormatException), typeof(ProcessLockedFileException), typeof(OutOfMemoryException), typeof(MissingCompressionLibraryException) }; public static bool HasException(Exception e) { if (e == null) return false; return e is T || HasException(e.InnerException); } public static bool HasErrorMessage(this Exception e, string errorMessage) { if (e == null) return false; return e.Message == errorMessage|| e.InnerException.HasErrorMessage(errorMessage); } public static Exception GetInnermostException(Exception e) { while (e.InnerException != null) e = e.InnerException; return e; } public static ErrorCategory ExceptionToErrorCategory(Exception exception) => UserFriendlyExceptions.Contains(exception.GetType()) ? ErrorCategory.UserError : ErrorCategory.NirvanaError; } } ================================================ FILE: ErrorHandling/Exceptions/CompressionException.cs ================================================ using System; namespace ErrorHandling.Exceptions { public sealed class CompressionException : Exception { // constructor public CompressionException(string message) : base(message) { } } } ================================================ FILE: ErrorHandling/Exceptions/DeploymentErrorException.cs ================================================ using System; namespace ErrorHandling.Exceptions { public sealed class DeploymentErrorException : Exception { public DeploymentErrorException(string message) : base(message) { } } } ================================================ FILE: ErrorHandling/Exceptions/FileNotSortedException.cs ================================================ using System; namespace ErrorHandling.Exceptions { public sealed class FileNotSortedException : Exception { // constructor public FileNotSortedException(string message) : base(message) { } } } ================================================ FILE: ErrorHandling/Exceptions/InvalidFileFormatException.cs ================================================ using System; namespace ErrorHandling.Exceptions { public sealed class InvalidFileFormatException : Exception { // constructor public InvalidFileFormatException(string message) : base(message) { } } } ================================================ FILE: ErrorHandling/Exceptions/MissingCompressionLibraryException.cs ================================================ using System; namespace ErrorHandling.Exceptions { public sealed class MissingCompressionLibraryException : Exception { // constructor public MissingCompressionLibraryException(string missingLibraryFilename) : base(GetErrorMessage(missingLibraryFilename)) { } /// /// returns the error message given the missing compression library filename /// private static string GetErrorMessage(string missingLibraryFilename) { return $"Unable to find the block GZip compression library ({missingLibraryFilename})"; } } } ================================================ FILE: ErrorHandling/Exceptions/ProcessLockedFileException.cs ================================================ using System; namespace ErrorHandling.Exceptions { public sealed class ProcessLockedFileException : Exception { // constructor public ProcessLockedFileException(string message) : base(message) { } } } ================================================ FILE: ErrorHandling/Exceptions/UserErrorException.cs ================================================ using System; namespace ErrorHandling.Exceptions { public sealed class UserErrorException : Exception { // constructor public UserErrorException(string message) : base(message) { } } } ================================================ FILE: ErrorHandling/ExitCodeUtilities.cs ================================================ using System; using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; namespace ErrorHandling { public static class ExitCodeUtilities { private static readonly Dictionary ExceptionsToExitCodes; public const string VcfLine = "VcfLine"; public const string Line = "Line"; // constructor static ExitCodeUtilities() { // add the exception to exit code mappings ExceptionsToExitCodes = new Dictionary { { typeof(ArgumentNullException), ExitCodes.BadArguments }, { typeof(ArgumentOutOfRangeException), ExitCodes.BadArguments }, { typeof(Exception), ExitCodes.InvalidFunction }, { typeof(FileNotFoundException), ExitCodes.FileNotFound }, { typeof(FileNotSortedException), ExitCodes.FileNotSorted }, { typeof(FormatException), ExitCodes.BadFormat }, { typeof(InvalidDataException), ExitCodes.InvalidData }, { typeof(InvalidFileFormatException), ExitCodes.InvalidFileFormat }, { typeof(InvalidOperationException), ExitCodes.InvalidFunction }, { typeof(NotImplementedException), ExitCodes.CallNotImplemented }, { typeof(UserErrorException), ExitCodes.UserError }, { typeof(UnauthorizedAccessException), ExitCodes.AccessDenied }, { typeof(ProcessLockedFileException), ExitCodes.SharingViolation }, { typeof(OutOfMemoryException), ExitCodes.OutofMemory }, { typeof(MissingCompressionLibraryException), ExitCodes.MissingCompressionLibrary }, { typeof(CompressionException), ExitCodes.Compression } }; } internal static ExitCodes GetExitCode(Type exceptionType) { if (!ExceptionsToExitCodes.TryGetValue(exceptionType, out ExitCodes exitCode)) exitCode = ExitCodes.InvalidFunction; return exitCode; } /// /// Displays the details behind the exception /// Throw exceptions that are not user friendly if needed /// public static ExitCodes ShowException(Exception e) { Console.ForegroundColor = ConsoleColor.Red; Console.Write("\nERROR: "); Console.ResetColor(); e = ExceptionUtilities.GetInnermostException(e); Console.WriteLine("{0}", e.Message); var exceptionType = e.GetType(); // ReSharper disable once InvertIf if (!ExceptionUtilities.UserFriendlyExceptions.Contains(exceptionType)) { // print the stack trace Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine("\nStack trace:"); Console.ResetColor(); Console.WriteLine(e.StackTrace); // extract out the vcf line // ReSharper disable once InvertIf if (e.Data.Contains(VcfLine)) { Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine("\nVCF line:"); Console.ResetColor(); Console.WriteLine(e.Data[VcfLine]); } if (e.Data.Contains(Line)) { Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine("\nLine:"); Console.ResetColor(); Console.WriteLine(e.Data[Line]); } } return GetExitCode(exceptionType); } } } ================================================ FILE: ErrorHandling/ExitCodes.cs ================================================ namespace ErrorHandling { /// /// Common Windows Error Codes: https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382(v=vs.85).aspx /// C:\Program Files (x86)\Windows Kits\8.1\Include\shared\winerror.h /// public enum ExitCodes { // ================ // Windows-specific // ================ Success = 0, InvalidFunction = 1, FileNotFound = 2, PathNotFound = 3, AccessDenied = 5, BadFormat = 11, InvalidData = 13, OutofMemory = 14, SharingViolation = 32, CallNotImplemented = 120, BadArguments = 160, // ================= // Illumina-specific // ================= // command-line (200 - 209) UnknownCommandLineOption = 200, MissingCommandLineOption = 201, // general (210 - 219) UserError = 210, // file (220 - 229) InvalidFileFormat = 220, FileNotSorted = 221, MissingCompressionLibrary = 223, // functionality (240 - 259) Compression = 240 } } ================================================ FILE: GeneAnnotationLambda/GeneAnnotationLambda.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using Amazon.Lambda.Core; using Amazon.Lambda.Serialization.Json; using Cloud; using Cloud.Messages.Gene; using Cloud.Notifications; using Cloud.Utilities; using CommandLine.Utilities; using ErrorHandling.Exceptions; using Genome; using IO; using Jasix.DataStructures; using VariantAnnotation.GeneAnnotation; using VariantAnnotation.Interface.Providers; using VariantAnnotation.IO; using VariantAnnotation.SA; using VariantAnnotation.Utilities; [assembly: LambdaSerializer(typeof(JsonSerializer))] namespace GeneAnnotationLambda { // ReSharper disable once UnusedMember.Global // ReSharper disable once ClassNeverInstantiated.Global public class GeneAnnotationLambda { private readonly string _saPathPrefix = LambdaUrlHelper.GetBaseUrl(); // ReSharper disable once UnusedMember.Global public Stream Run(GeneConfig config, ILambdaContext context) { string snsTopicArn = null; var runLog = new StringBuilder(); try { LogUtilities.UpdateLogger(context.Logger, runLog); LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion); LogUtilities.LogObject("Config", config); LogUtilities.Log(new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey }); LambdaUtilities.GarbageCollect(); snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey); config.Validate(); // SaVersion will be provided as an environment variable. Defaults to "latest" string saVersion = Environment.GetEnvironmentVariable("SaVersion"); string saManifestUrl = LambdaUtilities.GetManifestUrl(saVersion, GenomeAssembly.GRCh38, SaCommon.SchemaVersion); string result = GetGeneAnnotation(config, saManifestUrl, _saPathPrefix); return LambdaResponse.Create(config.id, LambdaUrlHelper.SuccessMessage, result); } catch (Exception e) { return HandleException(config.id, snsTopicArn, e); } } private static Stream HandleException(string id, string snsTopicArn, Exception e) { Logger.Log(e); string snsMessage = SNS.CreateMessage(e.Message, "exception", e.StackTrace); SNS.SendMessage(snsTopicArn, snsMessage); return LambdaResponse.Create(id, e.Message, null); } public static string GetGeneAnnotation(GeneConfig input, string saManifestFilePath, string saPathPrefix) { var geneAnnotationProvider = new GeneAnnotationProvider(PersistentStreamUtils.GetStreams( GetNgaFileList(saManifestFilePath, saPathPrefix, input.ngaUrls).ToList())); var sb = new StringBuilder(1024 * 1024); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue(JasixCommons.HeaderSectionTag, GetHeader(geneAnnotationProvider), false); //not all gene symbols have annotations. So, we need to check and only output the ones that are not null var geneAnnotations = input.geneSymbols.Select(geneSymbol => geneAnnotationProvider.Annotate(geneSymbol)) .Where(annotation => !string.IsNullOrEmpty(annotation)) .ToList(); jsonObject.AddStringValues("genes", geneAnnotations, false); sb.Append(JsonObject.CloseBrace); // AWS lambda response message can not be larger than 6MB if (sb.Length > 6_000_000) throw new UserErrorException("Too many genes provided in the request. Please decrease the number of genes and try again later."); return sb.ToString(); } private static string GetHeader(IProvider geneAnnotationProvider) { var sb = new StringBuilder(); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("annotator", "Nirvana " + CommandLineUtilities.Version); jsonObject.AddStringValue("creationTime", Date.CurrentTimeStamp); jsonObject.AddIntValue("schemaVersion", SaCommon.SchemaVersion); jsonObject.AddObjectValues("dataSources", geneAnnotationProvider.DataSourceVersions); sb.Append(JsonObject.CloseBrace); return sb.ToString(); } public static IEnumerable GetNgaFileList(string saManifestPath, string saPathPrefix, string[] ngaFiles) { using (var reader = new StreamReader(PersistentStreamUtils.GetReadStream(saManifestPath))) { string line; while ((line = reader.ReadLine()) != null) { string filePath = saPathPrefix + line; string suffix = filePath.GetFileSuffix(true); if (suffix == SaCommon.GeneFileSuffix) yield return filePath; } } if (ngaFiles == null) yield break; foreach (string ngaFile in ngaFiles) yield return ngaFile; } } } ================================================ FILE: GeneAnnotationLambda/GeneAnnotationLambda.csproj ================================================  net6.0 true Lambda bin\$(Configuration) ================================================ FILE: GeneAnnotationLambda/GeneResult.cs ================================================ using System.Data; using System.IO; using System.Text; using Cloud; using Cloud.Utilities; using Newtonsoft.Json; namespace GeneAnnotationLambda { public static class LambdaResponse { private const string OutputBeforeNirvanaJson = ",\"annotation\":"; private const string OutputEnd = "}"; public static Stream Create(string id, string status, string nirvanaJson) { string statusJson = JsonConvert.SerializeObject(status); string outputStart = $"{{\"id\":\"{id}\",\"status\":{statusJson}"; string output; if (status == LambdaUrlHelper.SuccessMessage) { if (nirvanaJson == null) throw new NoNullAllowedException("Nirvana annotation cannot be null when the job is successful."); output = outputStart + OutputBeforeNirvanaJson + nirvanaJson + OutputEnd; } else { output = outputStart + OutputEnd; } LogUtilities.LogObject("Result", output); var outputStream = new MemoryStream(Encoding.UTF8.GetBytes(output)); return outputStream; } } } ================================================ FILE: Genome/Band.cs ================================================ namespace Genome { public struct Band { public readonly int Begin; public readonly int End; public readonly string Name; public Band(int begin, int end, string name) { Begin = begin; End = end; Name = name; } public int Compare(int position) { if (position < Begin) return 1; return position > End ? -1 : 0; } } } ================================================ FILE: Genome/Chromosome.cs ================================================ using System; using IO; namespace Genome { public sealed class Chromosome : IComparable { public string UcscName { get; } public string EnsemblName { get; } public string RefSeqAccession { get; } public string GenBankAccession { get; } public int FlankingLength { get; private set; } public int Length { get; } public ushort Index { get; } public const ushort UnknownReferenceIndex = ushort.MaxValue; public const int ShortFlankingLength = 100; public static Chromosome GetEmptyChromosome(string name) { return new Chromosome(name, name, name, name, 0, ushort.MaxValue) { FlankingLength = ShortFlankingLength }; } public Chromosome(string ucscName, string ensemblName, string refSeqAccession, string genBankAccession, int length, ushort index) { UcscName = ucscName; EnsemblName = ensemblName; RefSeqAccession = refSeqAccession; GenBankAccession = genBankAccession; Length = length; Index = index; // for short references (< 30 kbp), let's use a shorter flanking length const int longFlankingLength = 5_000; const int shortReferenceThreshold = 30_000_000; FlankingLength = length < shortReferenceThreshold ? ShortFlankingLength : longFlankingLength; } public void Write(ExtendedBinaryWriter writer) { writer.WriteOptAscii(UcscName); writer.WriteOptAscii(EnsemblName); writer.WriteOptAscii(RefSeqAccession); writer.WriteOptAscii(GenBankAccession); writer.WriteOpt(Length); writer.WriteOpt(Index); } public static Chromosome Read(ExtendedBinaryReader reader) { string ucscName = reader.ReadAsciiString(); string ensemblName = reader.ReadAsciiString(); string refseqAccession = reader.ReadAsciiString(); string genBankAccession = reader.ReadAsciiString(); int length = reader.ReadOptInt32(); ushort refIndex = reader.ReadOptUInt16(); return new Chromosome(ucscName, ensemblName, refseqAccession, genBankAccession, length, refIndex); } public bool Equals(Chromosome other) => Index == other.Index && Length == other.Length; public int CompareTo(Chromosome other) => Index == other.Index ? Length.CompareTo(other.Length) : Index.CompareTo(other.Index); public override int GetHashCode() { return UcscName.GetHashCode() ^ Length ^ Index; } } } ================================================ FILE: Genome/ChromosomeInterval.cs ================================================ namespace Genome { public sealed class ChromosomeInterval : IChromosomeInterval { public Chromosome Chromosome { get; } public int Start { get; } public int End { get; } public ChromosomeInterval(Chromosome chromosome, int start, int end) { Chromosome = chromosome; Start = start; End = end; } } } ================================================ FILE: Genome/ContigInfo.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; namespace Genome { public static class ContigInfo { private static readonly (string, int)[] ChromLengthsGrch37 = { ("1", 249250621), ("2", 243199373), ("3", 198022430), ("4", 191154276), ("5", 180915260), ("6", 171115067), ("7", 159138663), ("8", 146364022), ("9", 141213431), ("10", 135534747), ("11", 135006516), ("12", 133851895), ("13", 115169878), ("14", 107349540), ("15", 102531392), ("16", 90354753), ("17", 81195210), ("18", 78077248), ("19", 59128983), ("20", 63025520), ("21", 48129895), ("22", 51304566), ("X", 155270560), ("Y", 59373566) }; private static readonly (string, int)[] ChromLengthsGrch38 = { ("1", 248956422), ("2", 242193529), ("3", 198295559), ("4", 190214555), ("5", 181538259), ("6", 170805979), ("7", 159345973), ("8", 145138636), ("9", 138394717), ("10", 133797422), ("11", 135086622), ("12", 133275309), ("13", 114364328), ("14", 107043718), ("15", 101991189), ("16", 90338345), ("17", 83257441), ("18", 80373285), ("19", 58617616), ("20", 64444167), ("21", 46709983), ("22", 50818468), ("X", 156040895), ("Y", 57227415) }; private static readonly Dictionary> ChromLengthToAssembly = GetChromLengthToAssembly(); private static Dictionary> GetChromLengthToAssembly() { var chromLengthToAssembly = new Dictionary>(); foreach ((string chrom, int length) in ChromLengthsGrch37) { chromLengthToAssembly[chrom] = new Dictionary { { length, GenomeAssembly.GRCh37 } }; } foreach ((string contig, int length) in ChromLengthsGrch38) { chromLengthToAssembly[contig][length] = GenomeAssembly.GRCh38; } chromLengthToAssembly["MT"] = new Dictionary { { 16569, GenomeAssembly.rCRS } }; return chromLengthToAssembly; } public static GenomeAssembly GetGenomeAssembly(Chromosome chromosome, int length) { if (!ChromLengthToAssembly.TryGetValue(chromosome.EnsemblName, out var lengthToAssembly)) return GenomeAssembly.Unknown; if (lengthToAssembly.TryGetValue(length, out GenomeAssembly assembly)) return assembly; if (chromosome.EnsemblName == "MT") return GenomeAssembly.Unknown; throw new UserErrorException($"Invalid length provided in VCF header: chromosome {chromosome.EnsemblName}, length {length}"); } } } ================================================ FILE: Genome/CytogeneticBands.cs ================================================ namespace Genome { public static class CytogeneticBands { public static string Find(this Band[] bands, Chromosome chromosome, int start, int end) { if (chromosome.IsEmpty()) return null; string startCytogeneticBand = bands.GetCytogeneticBand(start); if (startCytogeneticBand == null) return null; // handle the single coordinate case if (start == end) return $"{chromosome.EnsemblName}{startCytogeneticBand}"; // handle the dual coordinate case string endCytogeneticBand = bands.GetCytogeneticBand(end); if (endCytogeneticBand == null) return null; return startCytogeneticBand == endCytogeneticBand ? $"{chromosome.EnsemblName}{startCytogeneticBand}" : $"{chromosome.EnsemblName}{startCytogeneticBand}-{endCytogeneticBand}"; } private static string GetCytogeneticBand(this Band[] bands, int pos) { int index = BinarySearch(bands, pos); return index < 0 ? null : bands[index].Name; } private static int BinarySearch(Band[] array, int position) { var begin = 0; int end = array.Length - 1; while (begin <= end) { int index = begin + (end - begin >> 1); int ret = array[index].Compare(position); if (ret == 0) return index; if (ret < 0) begin = index + 1; else end = index - 1; } return ~begin; } } } ================================================ FILE: Genome/Genome.csproj ================================================ net6.0 ..\bin\$(Configuration) ================================================ FILE: Genome/GenomeAssembly.cs ================================================ namespace Genome { // ReSharper disable InconsistentNaming public enum GenomeAssembly : byte { Unknown, GRCh37, GRCh38, hg19, rCRS, // Revised Cambridge Reference Sequence (rCRS) of the Human Mitochondrial DNA SARSCoV2 } // ReSharper restore InconsistentNaming } ================================================ FILE: Genome/GenomeAssemblyHelper.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; namespace Genome { public static class GenomeAssemblyHelper { public static readonly HashSet AutosomeAndAllosomeAssemblies = new HashSet { GenomeAssembly.GRCh37, GenomeAssembly.GRCh38, GenomeAssembly.hg19,GenomeAssembly.SARSCoV2 }; public static GenomeAssembly Convert(string genomeAssembly) { GenomeAssembly ret; switch (string.IsNullOrEmpty(genomeAssembly) ? string.Empty : genomeAssembly.ToLower()) { case "grch37": ret = GenomeAssembly.GRCh37; break; case "grch38": ret = GenomeAssembly.GRCh38; break; case "hg19": ret = GenomeAssembly.hg19; break; case "rcrs": ret = GenomeAssembly.rCRS; break; case "sarscov2": ret = GenomeAssembly.SARSCoV2; break; case "": ret = GenomeAssembly.Unknown; break; default: throw new UserErrorException($"Unknown genome assembly was specified: {genomeAssembly}"); } return ret; } } } ================================================ FILE: Genome/GenomicPosition.cs ================================================ namespace Genome { public struct GenomicPosition { public readonly Chromosome Chromosome; public readonly int Position; public GenomicPosition(Chromosome chromosome, int position) { Chromosome = chromosome; Position = position; } } } ================================================ FILE: Genome/GenomicRange.cs ================================================ namespace Genome { public sealed class GenomicRange { public GenomicPosition Start { get; } public GenomicPosition? End { get; } public GenomicRange(GenomicPosition start, GenomicPosition? end) { Start = start; End = end; } } } ================================================ FILE: Genome/GenomicRangeChecker.cs ================================================ namespace Genome { public sealed class GenomicRangeChecker { private readonly GenomicRange _genomicRange; private bool _reachedLastChromosome; public GenomicRangeChecker(GenomicRange genomicRange) { _genomicRange = genomicRange; } public bool OutOfRange(Chromosome chromosome, int position) { if (_genomicRange?.End == null) return false; if (!_reachedLastChromosome && chromosome.Equals(_genomicRange.End?.Chromosome)) _reachedLastChromosome = true; return _reachedLastChromosome && (position > _genomicRange.End?.Position || !chromosome.Equals(_genomicRange.End?.Chromosome)) ; } } } ================================================ FILE: Genome/IChromosomeInterval.cs ================================================ using Intervals; namespace Genome { public interface IChromosomeInterval : IInterval { Chromosome Chromosome { get; } } } ================================================ FILE: Genome/ISequence.cs ================================================ namespace Genome { public interface ISequence { int Length { get; } Band[] CytogeneticBands { get; } string Substring(int offset, int length); } } ================================================ FILE: Genome/ReferenceNameUtilities.cs ================================================ using System.Collections.Generic; using System.IO; namespace Genome { public static class ReferenceNameUtilities { public static Chromosome GetChromosome(Dictionary refNameToChromosome, string referenceName) { if (referenceName == null) return Chromosome.GetEmptyChromosome(string.Empty); return !refNameToChromosome.TryGetValue(referenceName, out Chromosome chromosome) ? Chromosome.GetEmptyChromosome(referenceName) : chromosome; } public static Chromosome GetChromosome(Dictionary refIndexToChromosome, ushort referenceIndex) { if (!refIndexToChromosome.TryGetValue(referenceIndex, out Chromosome chromosome)) { throw new InvalidDataException($"Unable to find the reference index ({referenceIndex}) in the refIndexToChromosome dictionary."); } return chromosome; } public static bool IsEmpty(this Chromosome chromosome) => chromosome.Index == ushort.MaxValue; } } ================================================ FILE: Genome/SequenceUtilities.cs ================================================ using System.Collections.Generic; using System.Linq; namespace Genome { public static class SequenceUtilities { private static readonly char[] ReverseComplementLookupTable; private static readonly HashSet CanonicalBases; static SequenceUtilities() { // initialize the reverse complement code const string forwardBases = "ABCDGHKMRTVYabcdghkmrtvy"; const string reverseBases = "TVGHCDMKYABRTVGHCDMKYABR"; ReverseComplementLookupTable = new char[256]; for (var i = 0; i < 256; i++) ReverseComplementLookupTable[i] = 'N'; for (var i = 0; i < forwardBases.Length; i++) { ReverseComplementLookupTable[forwardBases[i]] = reverseBases[i]; } CanonicalBases = new HashSet { 'A', 'C', 'G', 'T', '-' }; } /// /// returns the reverse complement of the given bases /// public static string GetReverseComplement(string bases) { // sanity check if (bases == null) return null; int numBases = bases.Length; var reverseChars = new char[numBases]; for (var i = 0; i < numBases; ++i) { reverseChars[i] = ReverseComplementLookupTable[bases[numBases - i - 1]]; } return new string(reverseChars); } /// /// returns true if we have a base other than the 4 standard bases: A, C, G, and T /// public static bool HasNonCanonicalBase(string bases) => !string.IsNullOrEmpty(bases) && bases.Any(c => !CanonicalBases.Contains(c)); } } ================================================ FILE: IO/BufferedBinaryReader.cs ================================================ using System; using System.IO; using System.Text; namespace IO { public sealed class BufferedBinaryReader : IBufferedBinaryReader { private const int BufferSize = 10_485_760; private const int ShortLen = 2; private const int IntLen = 4; private readonly Stream _stream; private readonly byte[] _buffer; private bool _foundEof; private int _bufferLen; private int _bufferPos; private readonly bool _leaveOpen; public BufferedBinaryReader(Stream stream, bool leaveOpen = false, int bufferSize = BufferSize) { if (stream == null) throw new ArgumentNullException(nameof(stream)); if (!stream.CanRead) throw new ArgumentException("A non-readable stream was supplied.", nameof(stream)); if (bufferSize <= 0) throw new ArgumentOutOfRangeException(nameof(bufferSize)); _stream = stream; _buffer = new byte[bufferSize]; _leaveOpen = leaveOpen; FillBuffer(); } private void FillBuffer() { int numRemainingBytes = _bufferLen - _bufferPos; if (numRemainingBytes > 0) Buffer.BlockCopy(_buffer, _bufferPos, _buffer, 0, numRemainingBytes); _bufferPos = 0; _bufferLen = numRemainingBytes; int numBytesRead = _stream.Read(_buffer, numRemainingBytes, _buffer.Length - numRemainingBytes); _bufferLen = numRemainingBytes + numBytesRead; if (_bufferPos == 0 && _bufferLen == 0) _foundEof = true; } public string ReadAsciiString() { int numBytes = ReadOptInt32(); return numBytes == 0 ? null : Encoding.ASCII.GetString(ReadBytes(numBytes)); } public bool ReadBoolean() { if (_bufferPos == _bufferLen) FillBuffer(); return _buffer[_bufferPos++] != 0; } public byte ReadByte() { if (_bufferPos == _bufferLen) FillBuffer(); return _buffer[_bufferPos++]; } public byte[] ReadBytes(int numBytes) { if (numBytes == 1) return new[] { ReadByte() }; var values = new byte[numBytes]; Read(values, numBytes); return values; } private void Read(byte[] buffer, int numBytes) { var offset = 0; int numBytesRemaining = numBytes; while (numBytesRemaining > 0) { if (_bufferPos == _bufferLen) { FillBuffer(); if (_foundEof) break; } int numBytesAvailable = _bufferLen - _bufferPos; int copyLength = numBytesRemaining < numBytesAvailable ? numBytesRemaining : numBytesAvailable; Buffer.BlockCopy(_buffer, _bufferPos, buffer, offset, copyLength); offset += copyLength; _bufferPos += copyLength; numBytesRemaining -= copyLength; } } public int ReadOptInt32() { if (_bufferPos > _bufferLen - 5) FillBuffer(); var count = 0; var shift = 0; while (shift != 35) { byte b = _buffer[_bufferPos++]; count |= (b & sbyte.MaxValue) << shift; shift += 7; if ((b & 128) == 0) return count; } throw new FormatException("Unable to read the 7-bit encoded integer"); } public ushort ReadOptUInt16() { if (_bufferPos > _bufferLen - 3) FillBuffer(); ushort count = 0; var shift = 0; while (shift != 21) { byte b = ReadByte(); count |= (ushort)((b & sbyte.MaxValue) << shift); shift += 7; if ((b & 128) == 0) return count; } throw new FormatException("Unable to read the 7-bit encoded unsigned short"); } public unsafe ushort ReadUInt16() { if (_bufferPos > _bufferLen - ShortLen) FillBuffer(); ushort value; fixed (byte* pBuffer = &_buffer[_bufferPos]) { value = (ushort)(pBuffer[0] | pBuffer[1] << 8); _bufferPos += ShortLen; } return value; } public unsafe uint ReadUInt32() { if (_bufferPos > _bufferLen - IntLen) FillBuffer(); uint value; fixed (byte* pBuffer = &_buffer[_bufferPos]) { value = (uint)(pBuffer[0] | pBuffer[1] << 8 | pBuffer[2] << 16 | pBuffer[3] << 24); _bufferPos += IntLen; } return value; } public void Dispose() { if (!_leaveOpen) _stream?.Dispose(); } } } ================================================ FILE: IO/CacheConstants.cs ================================================ namespace IO { //todo: create cache utils project and move it there public static class CacheConstants { public const uint GuardInt = 4041327495; // 87c3e1f0 public const string Identifier = "NirvanaDB"; // increment the schema version when the file structures are updated // N.B. we only need to regenerate unit tests when the schema version is incremented // e.g. adding a new feature like regulatory elements public const ushort SchemaVersion = 21; // increment the data version when the contents are updated // e.g. a bug is fixed in SIFT parsing or if transcripts are filtered differently public const ushort DataVersion = 27; public static string TranscriptPath(string prefix) => Combine(prefix, ".transcripts.ndb"); public static string SiftPath(string prefix) => Combine(prefix, ".sift.ndb"); public static string PolyPhenPath(string prefix) => Combine(prefix, ".polyphen.ndb"); public static string BasesPath(string prefix) => Combine(prefix, ".bases"); private static string Combine(string prefix, string suffix) => prefix == null ? null : prefix + suffix; } } ================================================ FILE: IO/ExtendedBinaryReader.cs ================================================ using System; using System.IO; using System.Text; namespace IO { public sealed class ExtendedBinaryReader : BinaryReader { public ExtendedBinaryReader(Stream s) : this(s, new UTF8Encoding()) { } public ExtendedBinaryReader(Stream input, Encoding encoding, bool leaveOpen = false) : base(input, encoding, leaveOpen) {} /// /// returns an unsigned short from the binary reader /// public ushort ReadOptUInt16() { ushort count = 0; var shift = 0; while (shift != 21) { byte b = ReadByte(); count |= (ushort)((b & sbyte.MaxValue) << shift); shift += 7; if ((b & 128) == 0) return count; } throw new FormatException("Unable to read the 7-bit encoded unsigned short"); } /// /// returns an integer from the binary reader /// public int ReadOptInt32() { var count = 0; var shift = 0; while (shift != 35) { byte b = ReadByte(); count |= (b & sbyte.MaxValue) << shift; shift += 7; if ((b & 128) == 0) return count; } throw new FormatException("Unable to read the 7-bit encoded integer"); } /// /// returns a long from the binary reader /// public long ReadOptInt64() { long count = 0; var shift = 0; while (shift != 70) { byte b = ReadByte(); count |= (long)(b & sbyte.MaxValue) << shift; shift += 7; if ((b & 128) == 0) return count; } throw new FormatException("Unable to read the 7-bit encoded long"); } /// /// returns an ASCII string from the binary reader /// public string ReadAsciiString() { int numBytes = ReadOptInt32(); // grab the ASCII characters // ReSharper disable once AssignNullToNotNullAttribute return numBytes == 0 ? null : Encoding.ASCII.GetString(ReadBytes(numBytes)); } } } ================================================ FILE: IO/ExtendedBinaryWriter.cs ================================================ using System.IO; using System.Text; namespace IO { public sealed class ExtendedBinaryWriter : BinaryWriter, IExtendedBinaryWriter { public ExtendedBinaryWriter(Stream output) : this(output, new UTF8Encoding(false, true)) { } public ExtendedBinaryWriter(Stream output, Encoding encoding, bool leaveOpen = false) : base(output, encoding, leaveOpen) { } /// /// writes an unsigned short to the binary writer /// public void WriteOpt(ushort value) { ushort num = value; while (num >= 128U) { Write((byte)(num | 128U)); num >>= 7; } Write((byte)num); } /// /// writes an integer to the binary writer /// public void WriteOpt(int value) { var num = (uint)value; while (num >= 128U) { Write((byte)(num | 128U)); num >>= 7; } Write((byte)num); } public void WriteOpt(uint value) { uint num = value; while (num >= 128U) { Write((byte)(num | 128U)); num >>= 7; } Write((byte)num); } /// /// writes a long to the binary writer /// public void WriteOpt(long value) { var num = (ulong)value; while (num >= 128U) { Write((byte)(num | 128U)); num >>= 7; } Write((byte)num); } /// /// writes an ASCII string to the binary writer /// public void WriteOptAscii(string s) { int numBytes = s?.Length ?? 0; WriteOpt(numBytes); // sanity check: handle null strings if (s == null) return; // write the ASCII bytes Write(Encoding.ASCII.GetBytes(s)); } } } ================================================ FILE: IO/FilePathUtilities.cs ================================================ using System; namespace IO { public static class StringExtensions { public static string TrimStartToLast(this string s, string value, bool includeSeparator = false) { int extPos = s.LastIndexOf(value, StringComparison.Ordinal); if (extPos == -1) return s; return includeSeparator ? s.Substring(extPos) : s.Substring(extPos + value.Length); } public static string TrimEndFromFirst(this string s, string value, bool includeSeparator = false) { int extPos = s.IndexOf(value, StringComparison.Ordinal); if (extPos == -1) return s; return includeSeparator ? s.Substring(0, extPos + value.Length) : s.Substring(0, extPos); } public static string GetFileSuffix(this string s, bool includeDot) => HttpUtilities.IsUrl(s) ? s.TrimEndFromFirst("?").TrimStartToLast(".", includeDot) : s.TrimStartToLast(".", includeDot); } } ================================================ FILE: IO/FileUtilities.cs ================================================ using System.IO; using System.Text; namespace IO { public static class FileUtilities { private const int StreamReaderBufferSize = 10_485_760; public static FileStream GetReadStream(string path) => new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read); public static FileStream GetCreateStream(string path) => new FileStream(path, FileMode.Create); public static StreamReader GetStreamReader(Stream stream, bool leaveOpen = false) => new StreamReader(stream, Encoding.Default, true, StreamReaderBufferSize, leaveOpen); public static string[] GetFileNamesInDir(string directory, string pattern = null) { if (!Directory.Exists(directory)) { throw new FileNotFoundException($"{directory} doesn't exist."); } return pattern == null ? Directory.GetFiles(directory) : Directory.GetFiles(directory, pattern); } } } ================================================ FILE: IO/HttpUtilities.cs ================================================ using System; using System.Collections.Generic; using System.Globalization; using System.Net; using System.Xml.Linq; using System.Linq; using System.Threading; using ErrorHandling.Exceptions; namespace IO { public static class HttpUtilities { private static readonly string[] AuthenticationErrorCodes = { "InvalidAccessKeyId", "SignatureDoesNotMatch" }; private static readonly string[] ResourceNotExistErrorCodes = { "NoSuchKey", "NoSuchBucket" }; private const int MaxRetryCount = 10; public static long GetLength(string url) { var response = TryGetResponse(url); long contentLength = response.ContentLength; response.Close(); return contentLength; } // Only throw exceptions when all the three tries failed. private static HttpWebResponse TryGetResponse(string url) { var exceptions = new List(); for (var retryCounter = 0; retryCounter < MaxRetryCount; retryCounter++) { try { if (retryCounter > 0) { Console.WriteLine($"Attempt {retryCounter+1} to get response from {url}"); Thread.Sleep(2_000); } var request = (HttpWebRequest) WebRequest.Create(url); if (retryCounter > 0) { Console.WriteLine($"Succeeded at attempt#: {retryCounter+1}"); } return (HttpWebResponse) request.GetResponse(); } catch (Exception e) { Logger.WriteLine($"TryGetResponse exception found when connecting to {url}"); Logger.Log(e); exceptions.Add(ProcessHttpRequestWebProtocolErrorException(e, url)); } } throw new AggregateException(exceptions); } public static void ValidateUrl(string url, bool isUserProvided = true) { try { var response = TryGetResponse(url); response.Close(); } catch (Exception) { if (isUserProvided) throw new UserErrorException($"Unable to validate the URL for {UrlUtilities.GetFileName(url)}"); throw new DeploymentErrorException($"Deployment issue detected. Unable to validate the URL for {url}."); } } public static bool IsWebProtocolErrorException(Exception exception) { if (exception is not WebException webException) return false; return webException.Status == WebExceptionStatus.ProtocolError; } public static Exception ProcessHttpRequestWebProtocolErrorException(Exception exception, string url) { if (!IsWebProtocolErrorException(exception)) return exception; string urlPath = UrlUtilities.GetPath(url); var webException = (WebException)exception; (string errorCode, string errorMessage) = GetWebExceptionMessage(webException); // Expired URL is always a user error if (errorMessage == "Request has expired") return new UserErrorException($"The provided URL for {urlPath} has expired."); // Authentication error is always considered as a user error if (AuthenticationErrorCodes.Contains(errorCode)) return new UserErrorException($"Authentication error while reading from URL for {urlPath}."); // Resource not exist error is always considered as a user error if (ResourceNotExistErrorCodes.Contains(errorCode)) return new UserErrorException($"An invalid URL for {urlPath} was specified."); // Sometimes it is difficult to figure out whether the error is caused by the user or not. // For example, the AccessDenied error code could be triggered by either incorrect credentials provided by the user, or network congestion while reading from S3. // Therefore, such errors are treated as general exceptions. // And we don't pass through the general error to end user to avoid possible confusion. Logger.WriteLine($"The following error occurred while reading from {url}: {errorMessage}. Exception: {exception.Message}"); return new WebException($"An error occurred while reading from the URL for {urlPath} ({exception.GetType()})"); } private static (string Code, string Message) GetWebExceptionMessage(WebException exception) { using (var stream = exception.Response.GetResponseStream()) { if (stream == null) return (null, null); var xElement = XElement.Load(stream); return (xElement.Element("Code")?.Value, xElement.Element("Message")?.Value); } } public static bool IsUrl(string path) => path.StartsWith("http", true, CultureInfo.InvariantCulture); } } ================================================ FILE: IO/IBufferedBinaryReader.cs ================================================ using System; namespace IO { public interface IBufferedBinaryReader : IDisposable { string ReadAsciiString(); bool ReadBoolean(); byte ReadByte(); int ReadOptInt32(); ushort ReadOptUInt16(); uint ReadUInt32(); } } ================================================ FILE: IO/IConnect.cs ================================================ using System.IO; using System.Net; namespace IO { public interface IConnect { (HttpWebResponse Response, Stream Stream) Connect(long position); } } ================================================ FILE: IO/IExtendedBinaryWriter.cs ================================================ namespace IO { public interface IExtendedBinaryWriter { void Write(bool b); void Write(byte b); void Write(byte[] buffer); void Write(ushort value); void Write(uint value); void WriteOpt(ushort value); void WriteOpt(int value); void WriteOpt(long value); void WriteOptAscii(string s); } } ================================================ FILE: IO/IO.csproj ================================================  net6.0 ..\bin\$(Configuration) true ================================================ FILE: IO/IS3Client.cs ================================================ using System.Threading.Tasks; using Amazon.S3.Model; namespace IO { public interface IS3Client { Task GetObjectAsync(GetObjectRequest getRequest); Task PutObjectAsync(PutObjectRequest putRequest); } } ================================================ FILE: IO/ISerializable.cs ================================================ namespace IO { public interface ISerializable { void Write(IExtendedBinaryWriter writer); } } ================================================ FILE: IO/LengthStream.cs ================================================ using System; using System.IO; namespace IO { /// /// /// The S3 PutObjectRequest object requires an input stream that supports length and position. /// Neither of these are typically available from the CryptoStream /// public sealed class LengthStream : Stream { private readonly Stream _stream; private long _position; public LengthStream(Stream stream, long length) { _stream = stream; Length = length; } public override int Read(byte[] buffer, int offset, int count) { _position += count; return _stream.Read(buffer, offset, count); } public override long Position { get => _position; set => throw new NotSupportedException(); } public override void Flush() => throw new NotSupportedException(); public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException(); public override void SetLength(long value) => throw new NotSupportedException(); public override void Write(byte[] buffer, int offset, int count) => throw new NotSupportedException(); public override bool CanRead => _stream.CanRead; public override bool CanSeek => _stream.CanSeek; public override bool CanWrite => _stream.CanWrite; public override long Length { get; } } } ================================================ FILE: IO/Logger.cs ================================================ using System; using System.Text; namespace IO { public static class Logger { // can be redirected to any logger public static Action WriteLine { get; set; } public static Action Write { get; set; } public const string Url = "Url"; static Logger() { WriteLine = Console.WriteLine; Write = Console.Write; } public static void SetBold() => Console.ForegroundColor = ConsoleColor.Yellow; public static void ResetColor() => Console.ResetColor(); public static void Silence() { WriteLine = s => { }; Write = s => { }; } public static void Log(Exception e) { var sb = new StringBuilder(); var line = new string('-', 80); sb.AppendLine(line); const string vcfLine = "VcfLine"; const string errorLine = "Line"; while (e != null) { sb.AppendLine($"{e.GetType()}: {e.Message}"); sb.AppendLine($"Stack trace: {e.StackTrace}"); if (e.Data.Contains(vcfLine)) sb.AppendLine($"VCF line: {e.Data[vcfLine]}"); if (e.Data.Contains(errorLine)) sb.AppendLine($"Line: {e.Data[errorLine]}"); if (e.Data.Contains(Url)) sb.AppendLine($"URL: {e.Data[Url]}"); sb.AppendLine(errorLine); e = e.InnerException; } WriteLine(sb.ToString()); } } } ================================================ FILE: IO/MD5Stream.cs ================================================ using System; using System.IO; using System.Security.Cryptography; namespace IO { public sealed class MD5Stream : Stream { private readonly Stream _stream; private readonly IncrementalHash _md5 = IncrementalHash.CreateHash(HashAlgorithmName.MD5); private FileMetadata _metadata; private long _length; /// public MD5Stream(Stream stream) => _stream = stream; public override void Write(byte[] buffer, int offset, int count) { _stream.Write(buffer, offset, count); _md5.AppendData(buffer, offset, count); _length += count; } public FileMetadata GetFileMetadata() { if (_metadata != null) return _metadata; _metadata = new FileMetadata(_md5.GetHashAndReset(), _length); return _metadata; } public override long Position { get => _length; set => throw new NotSupportedException(); } public override void Flush() => _stream.Flush(); public override int Read(byte[] buffer, int offset, int count) => throw new NotSupportedException(); public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException(); public override void SetLength(long value) => throw new NotSupportedException(); public override bool CanRead => _stream.CanRead; public override bool CanSeek => _stream.CanSeek; public override bool CanWrite => _stream.CanWrite; public override long Length => _stream.Length; } public sealed class FileMetadata { public byte[] MD5 { get; } public long Length { get; } public FileMetadata(byte[] md5, long length) { MD5 = md5; Length = length; } } } ================================================ FILE: IO/PersistentConnect.cs ================================================ using System; using System.IO; using System.Net; namespace IO { public sealed class PersistentConnect : IConnect { private readonly string _url; public PersistentConnect(string url) => _url = url; public (HttpWebResponse Response, Stream Stream) Connect(long position) { if (position < 0) throw new ArgumentOutOfRangeException(nameof(position)); try { var request = WebRequest.CreateHttp(_url); request.AddRange(position); request.Timeout = 10_000; request.ReadWriteTimeout = 15_000; var response = (HttpWebResponse)request.GetResponse(); var stream = response.GetResponseStream(); return (response, stream); } catch (Exception e) { e.Data[Logger.Url] = _url; throw; } } } } ================================================ FILE: IO/PersistentStream.cs ================================================ using System; using System.IO; using System.Net; using System.Runtime.CompilerServices; using System.Threading; namespace IO { public sealed class PersistentStream : Stream { private readonly IConnect _connect; private HttpWebResponse _response; private Stream _stream; private long _position; private const int MaxRetryAttempts = 5; private const int NumRetryMilliseconds = 2_000; public override bool CanRead => _stream.CanRead; public override bool CanSeek => _stream.CanSeek; public override bool CanWrite => _stream.CanWrite; public override long Length => _stream.Length; public override void Flush() => _stream.Flush(); public override long Seek(long offset, SeekOrigin origin) => _stream.Seek(offset, origin); public override void SetLength(long value) => _stream.SetLength(value); public override void Write(byte[] buffer, int offset, int count) => _stream.Write(buffer, offset, count); public override long Position { get => _position; set { Disconnect(); ConnectWithRetries(value); _position = value; } } public PersistentStream(IConnect connect, long position) { _position = position; _connect = connect; ConnectWithRetries(_position); } private void ConnectWithRetries(long position) { if (position < 0) throw new ArgumentOutOfRangeException(nameof(position)); var keepTrying = true; var numRetries = 0; while (keepTrying) { try { (_response, _stream) = _connect.Connect(position); keepTrying = false; } catch (Exception e) { Log(MethodName(), e); if (numRetries == MaxRetryAttempts) throw; Disconnect(); Thread.Sleep(NumRetryMilliseconds); numRetries++; } } } private void Disconnect() { _response?.Dispose(); _stream?.Dispose(); } public override int Read(byte[] buffer, int offset, int count) { var numBytesRead = 0; while (count > 0) { int cnt = PersistentRead(buffer, offset, count); if (cnt == 0) return numBytesRead; offset += cnt; numBytesRead += cnt; _position += cnt; count -= cnt; } return numBytesRead; } private int PersistentRead(byte[] buffer, int offset, int count) { var keepTrying = true; var numRetries = 0; var numBytesRead = 0; while (keepTrying) { try { numBytesRead = _stream.Read(buffer, offset, count); keepTrying = false; } catch (Exception e) { Log(MethodName(), e); if (numRetries == MaxRetryAttempts) throw; Disconnect(); Thread.Sleep(NumRetryMilliseconds); ConnectWithRetries(_position); numRetries++; } } return numBytesRead; } private static void Log(string methodName, Exception e) { Logger.WriteLine($"Retrying exception found in {methodName}"); Logger.Log(e); } private static string MethodName([CallerMemberName] string caller = null) => caller; protected override void Dispose(bool disposing) { try { if (disposing) Disconnect(); _response = null; _stream = null; } finally { base.Dispose(disposing); } } } } ================================================ FILE: IO/PersistentStreamUtils.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; namespace IO { public static class PersistentStreamUtils { public static Stream GetReadStream(string urlOrPath, long position = 0) { if (string.IsNullOrEmpty(urlOrPath)) return null; if (!HttpUtilities.IsUrl(urlOrPath)) return File.Exists(urlOrPath) ? FileUtilities.GetReadStream(urlOrPath) : null; return new PersistentStream(new PersistentConnect(urlOrPath), position); } public static List GetStreams(List locations) { if (locations == null) return null; var streams = new List(locations.Count); streams.AddRange(locations.Select(urlOrPath => GetReadStream(urlOrPath))); return streams; } } } ================================================ FILE: IO/SpanBufferBinaryReader.cs ================================================ using System; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text; namespace IO { public static class SpanBufferBinaryReader { private const int MostSignificantBit = 128; private const int VlqBitShift = 7; public static ushort ReadOptUInt16(ref ReadOnlySpan byteSpan) { ushort value = 0; var shift = 0; var index = 0; while (shift != 21) { byte b = byteSpan[index++]; value |= (ushort)((b & sbyte.MaxValue) << shift); shift += VlqBitShift; // ReSharper disable once InvertIf if ((b & MostSignificantBit) == 0) { byteSpan = byteSpan.Slice(index); return value; } } throw new FormatException("Unable to read the 7-bit encoded unsigned short"); } public static int ReadOptInt32(ref ReadOnlySpan byteSpan) { var value = 0; var shift = 0; var index = 0; while (shift != 35) { byte b = byteSpan[index++]; value |= (b & sbyte.MaxValue) << shift; shift += VlqBitShift; // ReSharper disable once InvertIf if ((b & MostSignificantBit) == 0) { byteSpan = byteSpan.Slice(index); return value; } } throw new FormatException("Unable to read the 7-bit encoded integer"); } public static uint ReadOptUInt32(ref ReadOnlySpan byteSpan) { uint value = 0; var shift = 0; var index = 0; while (shift != 35) { byte b = byteSpan[index++]; value |= (uint)((b & sbyte.MaxValue) << shift); shift += VlqBitShift; // ReSharper disable once InvertIf if ((b & MostSignificantBit) == 0) { byteSpan = byteSpan.Slice(index); return value; } } throw new FormatException("Unable to read the 7-bit encoded unsigned integer"); } public static long ReadOptInt64(ref ReadOnlySpan byteSpan) { long value = 0; var shift = 0; var index = 0; while (shift != 70) { byte b = byteSpan[index++]; value |= (long) (b & sbyte.MaxValue) << shift; shift += VlqBitShift; // ReSharper disable once InvertIf if ((b & MostSignificantBit) == 0) { byteSpan = byteSpan.Slice(index); return value; } } throw new FormatException("Unable to read the 7-bit encoded long"); } public static ulong ReadOptUInt64(ref ReadOnlySpan byteSpan) { ulong value = 0; var shift = 0; var index = 0; while (shift != 70) { byte b = byteSpan[index++]; value |= (ulong) (b & sbyte.MaxValue) << shift; shift += VlqBitShift; // ReSharper disable once InvertIf if ((b & MostSignificantBit) == 0) { byteSpan = byteSpan.Slice(index); return value; } } throw new FormatException("Unable to read the 7-bit encoded ulong"); } public static string ReadUtf8String(ref ReadOnlySpan byteSpan) { int numBytes = ReadOptInt32(ref byteSpan); if (numBytes == 0) return string.Empty; string value = Encoding.UTF8.GetString(byteSpan[..numBytes]); byteSpan = byteSpan.Slice(numBytes); return value; } public static string ReadAsciiString(ref ReadOnlySpan byteSpan) { int numBytes = ReadOptInt32(ref byteSpan); if (numBytes == 0) return string.Empty; string value = Encoding.ASCII.GetString(byteSpan[..numBytes]); byteSpan = byteSpan.Slice(numBytes); return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void SkipString(ref ReadOnlySpan byteSpan) { int numBytes = ReadOptInt32(ref byteSpan); if (numBytes == 0) return; byteSpan = byteSpan.Slice(numBytes); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static byte ReadByte(ref ReadOnlySpan byteSpan) { byte value = byteSpan[0]; byteSpan = byteSpan.Slice(1); return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static ReadOnlySpan ReadBytes(ref ReadOnlySpan byteSpan, int numBytes) { ReadOnlySpan value = byteSpan[..numBytes]; byteSpan = byteSpan.Slice(numBytes); return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int ReadInt32(ref ReadOnlySpan byteSpan) { var value = MemoryMarshal.Read(byteSpan); byteSpan = byteSpan.Slice(4); return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static ulong ReadUInt64(ref ReadOnlySpan byteSpan) { var value = MemoryMarshal.Read(byteSpan); byteSpan = byteSpan.Slice(8); return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static ushort ReadUInt16(ref ReadOnlySpan byteSpan) { var value = MemoryMarshal.Read(byteSpan); byteSpan = byteSpan.Slice(2); return value; } } } ================================================ FILE: IO/UrlUtilities.cs ================================================ using System; using System.IO; namespace IO { public static class UrlUtilities { private const char UriSeparator = '/'; public static string GetPath(string url) => new Uri(url).LocalPath.TrimStart(UriSeparator); public static string GetFileName(string url) => Path.GetFileName(GetPath(url)); public static string UrlCombine(this string prefix, string suffix) => prefix.TrimEnd(UriSeparator) + UriSeparator + suffix.TrimStart(UriSeparator); } } ================================================ FILE: IO/v2/FileType.cs ================================================ namespace IO.v2 { public enum FileType : ushort { // reference Reference = 1, // cache GeneSymbol = 1000, Gene = 1100, EnsemblTranscript = 1200, RefseqTranscript = 1300, SIFT = 1400, PolyPhen = 1500, // supplementary annotation SaVariants = 2000, SaIntervals = 3000, SaGenes = 4000, FusionCatcher = 4100, GeneFusionJson = 4101, PhyloP = 5000, GsaWriter = 6000, GsaIndex = 6500, } } ================================================ FILE: IO/v2/Header.cs ================================================ using System.IO; namespace IO.v2 { /// /// Common header for all our Nirvana file formats /// public sealed record Header(FileType FileType, ushort FileFormatVersion) { // see http://www.libpng.org/pub/png/spec/1.2/PNG-Rationale.html#R.PNG-file-signature // decimal 137 78 73 82 13 10 26 10 // hexadecimal 89 4E 49 52 0d 0a 1a 0a // ASCII C notation \211 N I R \r \n \032 \n private const ulong NirvanaSignature = 727905342105144969; public const uint NirvanaFooter = 4283582798; // N I R 0xFF public static Header Read(BinaryReader reader) { ulong signature = reader.ReadUInt64(); var fileType = (FileType) reader.ReadUInt16(); ushort fileFormatVersion = reader.ReadUInt16(); if (signature != NirvanaSignature) throw new InvalidDataException("Invalid Nirvana file signature. Is this the correct file?"); return new Header(fileType, fileFormatVersion); } public void Write(BinaryWriter writer) { writer.Write(NirvanaSignature); writer.Write((ushort) FileType); writer.Write(FileFormatVersion); } } } ================================================ FILE: Intervals/Extensions.cs ================================================ namespace Intervals { public static class Extensions { /// /// interval 2 is overlapped with interval 1 +/- flanking length /// public static bool Overlaps(this IInterval interval1, IInterval interval2, int flankingLength = 0) => Utilities.Overlaps(interval1.Start - flankingLength, interval1.End + flankingLength, interval2.Start, interval2.End); public static bool Overlaps(this IInterval interval, int start, int end) => Utilities.Overlaps( interval.Start, interval.End, start, end); public static bool Contains(this IInterval interval1, IInterval interval2) => Utilities.Contains( interval1.Start, interval1.End, interval2.Start, interval2.End); public static Interval Intersects(this IInterval interval1, IInterval interval2) { (int start, int end) = Utilities.Intersects(interval1.Start, interval1.End, interval2.Start, interval2.End); return new Interval(start, end); } } } ================================================ FILE: Intervals/IInterval.cs ================================================ namespace Intervals { public interface IInterval { int Start { get; } int End { get; } } } ================================================ FILE: Intervals/IIntervalForest.cs ================================================ namespace Intervals { public interface IIntervalForest { bool OverlapsAny(ushort refIndex, int begin, int end); T[] GetAllOverlappingValues(ushort refIndex, int begin, int end); } } ================================================ FILE: Intervals/IIntervalSearch.cs ================================================ namespace Intervals { public interface IIntervalSearch { T[] GetAllOverlappingValues(int begin, int end); } public struct Interval { public readonly int Begin; public readonly int End; public readonly T Value; public int Max; public Interval(int begin, int end, T value) { Begin = begin; End = end; Value = value; Max = -1; } /// /// our compare function /// public int CompareMax(int position) { if (position < Max) return -1; return position > Max ? 1 : 0; } /// /// returns true if this interval overlaps with the specified interval /// public bool Overlaps(int intervalBegin, int intervalEnd) { return End >= intervalBegin && Begin <= intervalEnd; } } } ================================================ FILE: Intervals/Interval.cs ================================================ namespace Intervals { public struct Interval : IInterval { public int Start { get; } public int End { get; } public Interval(int start, int end) { Start = start; End = end; } } } ================================================ FILE: Intervals/IntervalArray.cs ================================================ using System.Collections.Generic; namespace Intervals { public sealed class IntervalArray : IIntervalSearch { public readonly Interval[] Array; public IntervalArray(Interval[] array) { Array = array; SetMaxIntervals(); } /// /// returns true if there are any overlapping intervals in the specified region /// public bool OverlapsAny(int begin, int end) => GetFirstIndexAny(begin, end) >= 0; /// /// returns values for all intervals that overlap the specified interval /// public T[] GetAllOverlappingValues(int begin, int end) { int firstIndex = GetFirstIndex(begin, end); return firstIndex == -1 ? null : AddOverlappingValues(firstIndex, begin, end); } public Interval[] GetAllOverlappingIntervals(int begin, int end) { var intervals = new List>(); int firstIndex = GetFirstIndex(begin, end); if (firstIndex == -1) return null; for (int index = firstIndex; index < Array.Length; index++) { Interval interval = Array[index]; if (interval.Begin > end) break; if (interval.Overlaps(begin, end)) intervals.Add(interval); } return intervals.ToArray(); } /// /// adds the overlapping values for all intervals overlapping the specified interval /// private T[] AddOverlappingValues(int firstIndex, int begin, int end) { var values = new List(); for (int index = firstIndex; index < Array.Length; index++) { var interval = Array[index]; if (interval.Begin > end) break; if (interval.Overlaps(begin, end)) values.Add(interval.Value); } return values.ToArray(); } /// /// finds the first index that overlaps on the interval [begin, max) /// private int GetFirstIndex(int intervalBegin, int intervalEnd) { var begin = 0; int end = Array.Length - 1; var lastOverlapIndex = -1; while (begin <= end) { int index = begin + (end - begin >> 1); if (Array[index].Overlaps(intervalBegin, intervalEnd)) lastOverlapIndex = index; int ret = Array[index].CompareMax(intervalBegin); if (ret <= 0) end = index - 1; else begin = index + 1; } return lastOverlapIndex; } /// /// finds the first index that overlaps on the interval [begin, max) /// private int GetFirstIndexAny(int intervalBegin, int intervalEnd) { var begin = 0; int end = Array.Length - 1; while (begin <= end) { int index = begin + (end - begin >> 1); if (Array[index].Overlaps(intervalBegin, intervalEnd)) return index; int ret = Array[index].CompareMax(intervalBegin); if (ret <= 0) end = index - 1; else begin = index + 1; } return ~begin; } /// /// sets the max endpoint for each interval element /// private void SetMaxIntervals() { var currentMax = int.MinValue; for (var i = 0; i < Array.Length; i++) { if (Array[i].End > currentMax) currentMax = Array[i].End; Array[i].Max = currentMax; } } } } ================================================ FILE: Intervals/IntervalForest.cs ================================================ namespace Intervals { public sealed class IntervalForest : IIntervalForest { private readonly IntervalArray[] _intervalArrays; private readonly ushort _maxIndex; public IntervalForest(IntervalArray[] intervalArrays) { _intervalArrays = intervalArrays; _maxIndex = (ushort)(intervalArrays.Length - 1); } /// /// returns whether there is any interval that overlaps the specified interval /// public bool OverlapsAny(ushort refIndex, int begin, int end) { if (refIndex > _maxIndex) return false; var intervalArray = _intervalArrays[refIndex]; if (intervalArray == null) return false; return intervalArray.OverlapsAny(begin, end); } /// /// returns values for all intervals that overlap the specified interval /// public T[] GetAllOverlappingValues(ushort refIndex, int begin, int end) { if (refIndex > _maxIndex) return null; var intervalArray = _intervalArrays[refIndex]; return intervalArray?.GetAllOverlappingValues(begin, end); } public Interval[] GetAllOverlappingIntervals(ushort refIndex, int begin, int end) { if (refIndex > _maxIndex) return null; var intervalArray = _intervalArrays[refIndex]; return intervalArray?.GetAllOverlappingIntervals(begin, end); } } } ================================================ FILE: Intervals/Intervals.csproj ================================================ net6.0 ..\bin\$(Configuration) ================================================ FILE: Intervals/NullIntervalSearch.cs ================================================ namespace Intervals { public sealed class NullIntervalSearch : IIntervalForest, IIntervalSearch { #region IIntervalForest public bool OverlapsAny(ushort refIndex, int begin, int end) { return false; } public T[] GetAllOverlappingValues(ushort refIndex, int begin, int end) { return null; } #endregion #region IIntervalSearch public T[] GetAllOverlappingValues(int begin, int end) { return null; } #endregion } } ================================================ FILE: Intervals/OverlapType.cs ================================================ namespace Intervals { public enum OverlapType { None, CompletelyOverlaps, CompletelyWithin, Partial } public enum EndpointOverlapType { None, Start, End, Both } } ================================================ FILE: Intervals/Utilities.cs ================================================ using System; namespace Intervals { public static class Utilities { public static bool Overlaps(int firstStart, int firstEnd, int secondStart, int secondEnd) => firstStart <= secondEnd && secondStart <= firstEnd; public static bool Contains(int firstStart, int firstEnd, int secondStart, int secondEnd) => firstStart <= secondStart && secondEnd <= firstEnd; // given two intervals T and V, describe how V overlaps T public static OverlapType GetOverlapType(int tStart, int tEnd, int vStart, int vEnd) { if (tEnd < vStart || vEnd < tStart) return OverlapType.None; if (vStart >= tStart && vEnd <= tEnd) return OverlapType.CompletelyWithin; if (tStart >= vStart && tEnd <= vEnd) return OverlapType.CompletelyOverlaps; return OverlapType.Partial; } public static EndpointOverlapType GetEndpointOverlapType(int tStart, int tEnd, int vStart, int vEnd) { bool overlapsStart = Overlaps(tStart, tStart, vStart, vEnd); bool overlapsEnd = Overlaps(tEnd, tEnd, vStart, vEnd); if (!overlapsStart && !overlapsEnd) return EndpointOverlapType.None; if (overlapsStart && overlapsEnd) return EndpointOverlapType.Both; return overlapsStart ? EndpointOverlapType.Start : EndpointOverlapType.End; } public static (int Start, int End) Intersects(int firstStart, int firstEnd, int secondStart, int secondEnd) => Overlaps(firstStart, firstEnd, secondStart, secondEnd) ? (Math.Max(firstStart, secondStart), Math.Min(firstEnd, secondEnd)) : (-1, -1); } } ================================================ FILE: Jasix/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: Jasix/DataStructures/JasixChrIndex.cs ================================================ using System.Collections.Generic; using System.Linq; using ErrorHandling.Exceptions; using Intervals; using IO; namespace Jasix.DataStructures { public sealed class JasixChrIndex { public readonly string ReferenceSequence; private readonly List _nodes; private JasixNode _currentNode; private readonly List> _largeVariants; private IntervalArray _intervalArray; public JasixChrIndex(string refName) { ReferenceSequence = refName; _nodes = new List(); _largeVariants = new List>(); _intervalArray = null; } public JasixChrIndex(ExtendedBinaryReader reader) : this("") { ReferenceSequence = reader.ReadAsciiString(); int count = reader.ReadOptInt32(); for (var i = 0; i < count; i++) _nodes.Add(new JasixNode(reader)); int intervalCount = reader.ReadOptInt32(); if (intervalCount == 0) return; for (var i = 0; i < intervalCount; i++) _largeVariants.Add(ReadInterval(reader)); _intervalArray = new IntervalArray(_largeVariants.ToArray()); } private static Interval ReadInterval(ExtendedBinaryReader reader) { int begin = reader.ReadOptInt32(); int end = reader.ReadOptInt32(); long position = reader.ReadOptInt64(); return new Interval(begin, end, position); } public void Write(IExtendedBinaryWriter writer) { if (_currentNode != null) _nodes.Add(_currentNode); writer.WriteOptAscii(ReferenceSequence); writer.WriteOpt(_nodes.Count); foreach (var node in _nodes) { node.Write(writer); } writer.WriteOpt(_largeVariants.Count); if (_largeVariants.Count == 0) return; foreach (Interval interval in _largeVariants.OrderBy(x => x.Begin).ThenBy(x => x.End)) { WriteInterval(interval, writer); } } private static void WriteInterval(Interval interval, IExtendedBinaryWriter writer) { writer.WriteOpt(interval.Begin); writer.WriteOpt(interval.End); writer.WriteOpt(interval.Value); } public void Add(int begin, int end, long filePosition) { if (begin > end) throw new UserErrorException($"start position {begin} is greater than end position {end}"); if (Utilities.IsLargeVariant(begin,end)) { _largeVariants.Add(new Interval(begin, end, filePosition)); end = begin;// large variants will be recorded as SNVs so that we can query for all entries from a given position } if (_currentNode == null) { _currentNode = new JasixNode(begin, end, filePosition); return; } if (_currentNode.TryAdd(begin, end)) return; _nodes.Add(_currentNode); _currentNode = new JasixNode(begin, end, filePosition); } public void Flush() { if (_currentNode != null) _nodes.Add(_currentNode); if (_largeVariants.Count != 0) _intervalArray = new IntervalArray(_largeVariants.ToArray()); } public long FindFirstSmallVariant(int start, int end) { var searchNode = new JasixNode(start, end, 0); var firstOverlappingNode = FindFirstOverlappingNode(searchNode); return firstOverlappingNode?.FileLocation ?? -1; } private JasixNode FindFirstOverlappingNode(JasixNode searchNode) { int index = _nodes.BinarySearch(searchNode); if (index < 0) index = ~index; // if it is to the left of the first node, check if the end overlaps if (index == 0) { return _nodes[index].Overlaps(searchNode) ? _nodes[index] : null; } if (index == _nodes.Count) { // if range overlaps the last node location of the last node, otherwise, -1 return _nodes[index - 1].Overlaps(searchNode) ? _nodes[index - 1] : null; } // if some intervals from the previous node overlaps the range if (_nodes[index - 1].Overlaps(searchNode)) return _nodes[index - 1]; return _nodes[index].Overlaps(searchNode) ? _nodes[index] : null; } public long[] FindLargeVariants(int begin, int end) { long[] positions = _intervalArray?.GetAllOverlappingValues(begin, end); if (positions == null || positions.Length == 0) return null; return positions; } } } ================================================ FILE: Jasix/DataStructures/JasixCommons.cs ================================================  namespace Jasix.DataStructures { public static class JasixCommons { public const int Version = 1; public const string FileExt = ".jsi"; public const string GenesSectionTag = "genes"; public const string HeaderSectionTag = "header"; public const string PositionsSectionTag = "positions"; private const int MaxVariantLength = 50; public const int MinNodeWidth = MaxVariantLength; public const int PreferredNodeCount = MaxVariantLength*2; } } ================================================ FILE: Jasix/DataStructures/JasixIndex.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using ErrorHandling.Exceptions; using IO; namespace Jasix.DataStructures { public struct FileRange { public readonly long Begin; public long End; public FileRange(long begin, long end = long.MaxValue) { Begin = begin; End = end; } } public sealed class JasixIndex:IDisposable { private readonly Stream _stream; private readonly Dictionary _chrIndices; private readonly Dictionary _synonymToChrName; private readonly Dictionary _sectionRanges; // the json file might contain sections. We want to be able to index these sections too public JasixIndex() { _chrIndices = new Dictionary(); _synonymToChrName = new Dictionary(); _sectionRanges = new Dictionary(); } public JasixIndex(Stream stream) : this() { _stream = stream; using (var reader = new ExtendedBinaryReader(stream)) { int version = reader.ReadOptInt32(); if (version != JasixCommons.Version) throw new InvalidDataException($"Invalid Jasix version: Observed {version}, expected{JasixCommons.Version}"); int count = reader.ReadOptInt32(); for (var i = 0; i < count; i++) { var chrIndex = new JasixChrIndex(reader); _chrIndices[chrIndex.ReferenceSequence] = chrIndex; } int synonymCount = reader.ReadOptInt32(); for (var i = 0; i < synonymCount; i++) { string synonym = reader.ReadAsciiString(); string indexName = reader.ReadAsciiString(); _synonymToChrName[synonym] = indexName; } int sectionCount = reader.ReadOptInt32(); for (var i = 0; i < sectionCount; i++) { string sectionName = reader.ReadAsciiString(); long begin = reader.ReadOptInt64(); long end = reader.ReadOptInt64(); _sectionRanges[sectionName] = new FileRange(begin, end); } } } public void Write(Stream writeStream) { var writer = new ExtendedBinaryWriter(writeStream); writer.WriteOpt(JasixCommons.Version); writer.WriteOpt(_chrIndices.Count); foreach (var chrIndex in _chrIndices.Values) { chrIndex.Write(writer); } writer.WriteOpt(_synonymToChrName.Count); foreach ((string key, string value) in _synonymToChrName) { writer.Write(key); writer.Write(value); } writer.WriteOpt(_sectionRanges.Count); foreach ((string name, FileRange sectionRange) in _sectionRanges) { writer.WriteOptAscii(name); writer.WriteOpt(sectionRange.Begin); writer.WriteOpt(sectionRange.End); } } public void Flush() { foreach (var chrIndex in _chrIndices.Values) { chrIndex.Flush(); } } public void Add(string chr, int start, int end, long fileLoc, string chrSynonym=null) { if (!string.IsNullOrEmpty(chrSynonym)) { _synonymToChrName[chrSynonym] = chr; } if (_chrIndices.TryGetValue(chr, out var chrIndex)) { chrIndex.Add(start, end, fileLoc); } else { _chrIndices[chr] = new JasixChrIndex(chr); _chrIndices[chr].Add(start, end, fileLoc); } } public void BeginSection(string section, long fileLoc) { if (_sectionRanges.ContainsKey(section)) throw new UserErrorException($"Multiple beginning for section: {section}!!"); _sectionRanges[section] = new FileRange(fileLoc); } public void EndSection(string section, long fileLoc) { if (!_sectionRanges.TryGetValue(section, out var fileRange)) return; // throw new UserErrorException($"Attempting to close section:{section} before opening it!!"); if (fileRange.End!=long.MaxValue) throw new UserErrorException($"Multiple closing for section {section} !!"); fileRange.End = fileLoc; _sectionRanges[section] = fileRange; } //returns file location of the first node that overlapping the given position chr:start-end public long GetFirstVariantPosition(string chr, int start, int end) { if (_chrIndices == null || _chrIndices.Count == 0) return -1; if (_synonymToChrName.TryGetValue(chr, out string indexName)) chr = indexName; if (_chrIndices.TryGetValue(chr, out var chrIndex)) { return chrIndex.FindFirstSmallVariant(start, end); } return -1; } public long[] LargeVariantPositions(string chr, int begin, int end) { if (_chrIndices == null || _chrIndices.Count == 0) return null; if (_synonymToChrName.TryGetValue(chr, out string indexName)) chr = indexName; return _chrIndices.TryGetValue(chr, out var chrIndex) ? chrIndex.FindLargeVariants(begin, end) : null; } public IEnumerable GetChromosomeList() { return _chrIndices.Keys; } public bool ContainsChr(string chr) { return _chrIndices.Keys.Contains(_synonymToChrName.TryGetValue(chr, out string indexName) ? indexName : chr); } public string GetIndexChromName(string chromName) { if (_chrIndices.ContainsKey(chromName)) return chromName; return _synonymToChrName.TryGetValue(chromName, out string indexName) ? indexName : null; } public long GetSectionBegin(string section) { return _sectionRanges.ContainsKey(section)? _sectionRanges[section].Begin: -1; } public long GetSectionEnd(string section) { return _sectionRanges.ContainsKey(section)? _sectionRanges[section].End: -1; } public void Dispose() { _stream?.Dispose(); } public IEnumerable GetSections() => _sectionRanges.Keys; } } ================================================ FILE: Jasix/DataStructures/JasixNode.cs ================================================ using System; using IO; namespace Jasix.DataStructures { public sealed class JasixNode : IComparable { private readonly int _start; private int _end; public readonly long FileLocation; private int _count; public JasixNode(int start, int end, long location) { _start = start; _end = end; _count = 1; FileLocation = location; } public JasixNode(ExtendedBinaryReader reader) { _start = reader.ReadOptInt32(); //on disk we will store the end as an offset to save space _end = _start + reader.ReadOptInt32(); FileLocation = reader.ReadOptInt64(); } public bool Overlaps(JasixNode other) { return other._end >= _start && other._start <= _end; } public int CompareTo(JasixNode other) { if (other == null) return -1; // ReSharper disable once ImpureMethodCallOnReadonlyValueField return _start.CompareTo(other._start); } public bool TryAdd(int start, int end) { if (start < _start) return false; if (end - _start > JasixCommons.MinNodeWidth && _count >= JasixCommons.PreferredNodeCount) return false; _end = end; _count++; return true; } public void Write(IExtendedBinaryWriter writer) { writer.WriteOpt(_start); writer.WriteOpt(_end-_start); writer.WriteOpt(FileLocation); } } } ================================================ FILE: Jasix/DataStructures/JsonSchema.cs ================================================ using System.Collections.Generic; using Intervals; // ReSharper disable InconsistentNaming // The names have to be this way as they have to match the json schema exactly namespace Jasix.DataStructures { // ReSharper disable once ClassNeverInstantiated.Global public sealed class JsonSchema : IInterval { // ReSharper disable UnassignedField.Global public string chromosome; public int position; public string refAllele; public List altAlleles; public int svEnd; // ReSharper restore UnassignedField.Global public int Start => position; public int End => Utilities.GetJsonEntryEnd(this); } } ================================================ FILE: Jasix/DataStructures/Utilities.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text.RegularExpressions; using ErrorHandling.Exceptions; using Newtonsoft.Json.Linq; namespace Jasix.DataStructures { public static class Utilities { private const char DoubleQuote = '\"'; private const char OpenBracket = '['; private const char CloseBracket = ']'; public static (string Chromosome, int Start, int End) ParseQuery(string position) { //chr1:100-101 //chr1:100 //chr1 - report all entries for chr1 var regexPos = new Regex(@"^(\w+)(?::(\d+)(?:-(\d+))?)?$", RegexOptions.Compiled); string trimmedPos = position.Trim(' '); var match = regexPos.Match(trimmedPos); if (!match.Success) throw new UserErrorException($"region {trimmedPos} is not valid, please specify a valid region, e.g., chr1, 1, 1:1234 or 1:1234-4567"); string chromosome = match.Groups[1].ToString(); if (!match.Groups[2].Success && !match.Groups[3].Success) return (chromosome, 1, int.MaxValue); int start = Convert.ToInt32(match.Groups[2].ToString()); int end = match.Groups[3].Success ? Convert.ToInt32(match.Groups[3].ToString()) : start; return (chromosome, start, end); } public static void PrintQuerySectionOpening(string sectionName, StreamWriter writer) { writer.Write(DoubleQuote + sectionName + DoubleQuote+ ":" + OpenBracket + Environment.NewLine); } public static void PrintQuerySectionClosing(StreamWriter writer) { writer.Write(Environment.NewLine + CloseBracket); } public static void PrintJsonEntry(string entry, bool needComma, StreamWriter writer) { if (needComma) writer.Write("," + writer.NewLine); var jObject = JObject.Parse(entry); writer.Write(jObject.ToString()); } public static bool IsLargeVariant(int start, int end) { return end - start + 1 > JasixCommons.MinNodeWidth; } public static int GetJsonEntryEnd(JsonSchema jsonEntry) { if (jsonEntry.svEnd > 0) return jsonEntry.svEnd; List altAlleles = jsonEntry.altAlleles; int altAlleleOffset = altAlleles != null && altAlleles.All(IsNucleotideAllele) && altAlleles.Any(x => x.Length > 1) ? 1 : 0; return Math.Max(jsonEntry.refAllele.Length - 1, altAlleleOffset) + jsonEntry.position; } public static bool IsNucleotideAllele(string altAllele) { return string.IsNullOrEmpty(altAllele) || altAllele.ToCharArray().All(x => x == 'A' || x == 'T' || x == 'C' || x == 'G'); } } } ================================================ FILE: Jasix/IndexCreator.cs ================================================ using System; using System.IO; using System.IO.Compression; using CommandLine.Utilities; using Compression.FileHandling; using ErrorHandling.Exceptions; using IO; using Jasix.DataStructures; using Newtonsoft.Json; using OptimizedCore; namespace Jasix { public sealed class IndexCreator : IDisposable { private readonly BgzipTextReader _reader; private readonly Stream _writeStream; private readonly Benchmark _chromBenchmark; private readonly Benchmark _benchmark; public IndexCreator(BlockGZipStream readStream, Stream writeStream) { _reader = new BgzipTextReader(readStream); _writeStream = writeStream; _chromBenchmark = new Benchmark(); _benchmark = new Benchmark(); } public IndexCreator(string fileName) : this( new BlockGZipStream(FileUtilities.GetReadStream(fileName), CompressionMode.Decompress), FileUtilities.GetCreateStream(fileName + JasixCommons.FileExt)) {} public void CreateIndex() { var index = new JasixIndex(); IndexHeader(index); string lastLine = IndexPositions(index); IndexGenes(lastLine, index); index.Write(_writeStream); Console.WriteLine(); long peakMemoryUsageBytes = MemoryUtilities.GetPeakMemoryUsage(); var wallTimeSpan = _benchmark.GetElapsedTime(); Console.WriteLine(); if (peakMemoryUsageBytes > 0) Console.WriteLine("Peak memory usage: {0}", MemoryUtilities.ToHumanReadable(peakMemoryUsageBytes)); Console.WriteLine("Time: {0}", Benchmark.ToHumanReadable(wallTimeSpan)); } private string IndexPositions(JasixIndex index) { // we need the location before accessing the line long linePosition = _reader.Position; index.BeginSection(JasixCommons.PositionsSectionTag, linePosition); Console.WriteLine($"section:{JasixCommons.PositionsSectionTag} starts at {linePosition}"); var previousChr = ""; var previousPos = 0; string line; while ((line = _reader.ReadLine()) != null) { if (line.OptimizedStartsWith(']')) { index.EndSection(JasixCommons.PositionsSectionTag, linePosition); Console.WriteLine($"section:{JasixCommons.PositionsSectionTag} ends at {linePosition}"); break; } line = line.TrimEnd(','); (string chr, int position, int end) = GetChromPosition(line); CheckSorting(chr, position, previousChr, previousPos); index.Add(chr, position, end, linePosition); linePosition = _reader.Position; previousChr = chr; previousPos = position; } return line; } private void IndexGenes(string lastLine, JasixIndex index) { if (lastLine == null) return; do { long linePosition = _reader.Position; if (lastLine.EndsWith($",\"{JasixCommons.GenesSectionTag}\":[")) { index.BeginSection(JasixCommons.GenesSectionTag, _reader.Position); Console.WriteLine($"section:{JasixCommons.GenesSectionTag} starts at {_reader.Position}"); } if (lastLine.EndsWith("]}")) { index.EndSection(JasixCommons.GenesSectionTag, linePosition); Console.WriteLine($"section:{JasixCommons.GenesSectionTag} ends at {linePosition}"); break; } } while ((lastLine = _reader.ReadLine()) != null); } private void IndexHeader(JasixIndex index) { string searchTag = $"\"{JasixCommons.PositionsSectionTag}\":["; string headerTag = $"{{\"{JasixCommons.HeaderSectionTag}\":"; string line; long previousPosition = _reader.Position; while ((line = _reader.ReadLine()) != null) { if (line.StartsWith(headerTag)) { index.BeginSection(JasixCommons.HeaderSectionTag, previousPosition); Console.WriteLine($"section:{JasixCommons.HeaderSectionTag} starts at {previousPosition}"); } if (line.EndsWith(searchTag)) { { index.EndSection(JasixCommons.HeaderSectionTag, previousPosition); Console.WriteLine($"section:{JasixCommons.HeaderSectionTag} ends at {previousPosition}"); } break; } previousPosition = _reader.Position; } } // ReSharper disable once UnusedParameter.Local // ReSharper disable once ParameterOnlyUsedForPreconditionCheck.Local private void CheckSorting(string chr, int pos, string previousChr, int previousPos) { if (chr == previousChr && pos < previousPos) { throw new UserErrorException($"the Json file is not sorted at {chr}: {pos}"); } if (chr == previousChr || previousChr == "") return; Console.WriteLine($"Ref Sequence {previousChr} indexed in {Benchmark.ToHumanReadable(_chromBenchmark.GetElapsedTime())}"); _chromBenchmark.Reset(); } internal static (string chr, int position, int end) GetChromPosition(string line) { JsonSchema jsonEntry; try { jsonEntry = JsonConvert.DeserializeObject(line); } catch (Exception) { Console.WriteLine($"Error in line:\n{line}"); throw; } int end = Utilities.GetJsonEntryEnd(jsonEntry); return (jsonEntry.chromosome, jsonEntry.position, end); } public void Dispose() { _reader?.Dispose(); _writeStream?.Dispose(); } } } ================================================ FILE: Jasix/Jasix.csproj ================================================  Exe net6.0 ..\bin\$(Configuration) ================================================ FILE: Jasix/JasixMain.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using ErrorHandling.Exceptions; using IO; using Jasix.DataStructures; using VariantAnnotation.Interface; namespace Jasix { public static class JasixMain { private static string _inputJson; private static string _outputFile; private static readonly List Queries = new List(); private static string _section; private static bool _printHeader; private static bool _printHeaderOnly; private static bool _list; private static bool _createIndex; public static int Main(string[] args) { var ops = new OptionSet { { "header|t", "print also the header lines", v => _printHeader = v != null }, { "only-header|H", "print only the header lines", v => _printHeaderOnly = v != null }, { "list|l", "list chromosome and section names", v => _list = v != null }, { "index|c", "create index", v => _createIndex = v != null }, { "in|i=", "input", v => _inputJson = v }, { "out|o=", "compressed output file name (default:console)", v => _outputFile = v }, { "query|q=", "query range", v => Queries.Add(v) }, { "section|s=", "complete section (positions or genes) to output", v => _section = v } }; var exitCode = new ConsoleAppBuilder(args, ops) .Parse() .CheckInputFilenameExists(_inputJson, "input Json file", "[in.json.gz]") .DisableOutput(!_createIndex && _outputFile == null) .ShowBanner(Constants.Authors) .ShowHelpMenu("Indexes a Nirvana annotated JSON file", "-i in.json.gz [options]") .ShowErrors() .Execute(ProgramExecution); return (int)exitCode; } private static ExitCodes ProgramExecution() { if (_createIndex) { using (var indexCreator = new IndexCreator(_inputJson)) { indexCreator.CreateIndex(); } return ExitCodes.Success; } string indexFileName = _inputJson + JasixCommons.FileExt; ValidateIndexFile(indexFileName); var writer = string.IsNullOrEmpty(_outputFile) ? null : GZipUtilities.GetStreamWriter(_outputFile); using (var queryProcessor = new QueryProcessor(GZipUtilities.GetAppropriateStreamReader(_inputJson), FileUtilities.GetReadStream(indexFileName), writer)) { if (_list) { queryProcessor.ListChromosomesAndSections(); return ExitCodes.Success; } if (_printHeaderOnly) { queryProcessor.PrintHeaderOnly(); return ExitCodes.Success; } if (!string.IsNullOrEmpty(_section)) { queryProcessor.PrintSection(_section); return ExitCodes.Success; } if (Queries == null) { Console.WriteLine("Please specify query region(s)"); return ExitCodes.BadArguments; } queryProcessor.ProcessQuery(Queries, _printHeader); } return ExitCodes.Success; } private static void ValidateIndexFile(string indexFileName) { if (!File.Exists(indexFileName)) throw new UserErrorException("No index file found,please generate index file first."); //var indexFileCreateTime = File.GetCreationTime(indexFileName).Ticks; //var fileCreateTime = File.GetCreationTime(_inputJson).Ticks; //if (fileCreateTime > indexFileCreateTime - 1000) // adding a 100ms buffer // throw new UserErrorException("Index file is older than the input file, please re-generate the index."); } } } ================================================ FILE: Jasix/OnTheFlyIndexCreator.cs ================================================ using System; using System.IO; using System.Linq; using ErrorHandling.Exceptions; using Jasix.DataStructures; using VariantAnnotation.Interface.Positions; namespace Jasix { public sealed class OnTheFlyIndexCreator : IDisposable { private readonly Stream _indexStream; private readonly JasixIndex _jasixIndex; private int _lastPosition; private string _lastChromName; public OnTheFlyIndexCreator(Stream indexStream) { _indexStream = indexStream; _jasixIndex = new JasixIndex(); } public void Add(IPosition position, long fileLocation) { string chromName = position.Chromosome.EnsemblName; int start = position.Start; int? end = position.InfoData?.End; if (chromName == _lastChromName && start < _lastPosition) { throw new UserErrorException($"The Json file is not sorted at {position.Chromosome.UcscName}: {start}"); } _lastPosition = start; _lastChromName = chromName; if (end == null) { string[] altAlleles = position.AltAlleles; int altAlleleOffset = altAlleles != null && altAlleles.All(Utilities.IsNucleotideAllele) && altAlleles.Any(x => x.Length > 1) ? 1 : 0; end = Math.Max(position.RefAllele.Length - 1, altAlleleOffset) + start; } _jasixIndex.Add(position.Chromosome.EnsemblName, start, end.Value, fileLocation, position.Chromosome.UcscName); } public void BeginSection(string sectionName, long fileLocation) { _jasixIndex.BeginSection(sectionName, fileLocation); } public void EndSection(string sectionName, long fileLocation) { _jasixIndex.EndSection(sectionName, fileLocation); } public void Dispose() { Flush(); _indexStream.Dispose(); } public void Flush() { _jasixIndex.Write(_indexStream); _indexStream.Flush(); } } } ================================================ FILE: Jasix/QueryProcessor.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Compression.FileHandling; using Intervals; using Jasix.DataStructures; using Newtonsoft.Json; using OptimizedCore; using Utilities = Jasix.DataStructures.Utilities; namespace Jasix { public sealed class QueryProcessor:IDisposable { #region members private readonly StreamReader _jsonReader; private readonly StreamWriter _writer; private readonly Stream _indexStream; private readonly JasixIndex _jasixIndex; #endregion #region IDisposable public void Dispose() { _jsonReader?.Dispose(); _writer?.Dispose(); _indexStream?.Dispose(); } #endregion public QueryProcessor(StreamReader jsonReader, Stream indexStream, StreamWriter writer=null) { _jsonReader = jsonReader; _writer = writer ?? new StreamWriter(Console.OpenStandardOutput()); _indexStream = indexStream; _jasixIndex = new JasixIndex(_indexStream); } public void ListChromosomesAndSections() { foreach (string chrName in _jasixIndex.GetChromosomeList()) { _writer.WriteLine(chrName); } foreach (var section in _jasixIndex.GetSections()) { _writer.WriteLine(section); } } public void PrintHeaderOnly() { string headerString = "{"+GetHeader()+"}"; Utilities.PrintJsonEntry(headerString, false, _writer); } public void PrintSection(string section) { _writer.WriteLine("["); var needComma = false; foreach (var line in GetSectionLines(section)) { Utilities.PrintJsonEntry(line.TrimEnd(','), needComma,_writer); needComma = true; } _writer.WriteLine("]"); } public int ProcessQuery(IEnumerable queryStrings, bool printHeader = false) { if (printHeader) { _writer.Write("{\n\"header\":"); var headerContent = GetHeader().Split(':',2)[1]; Utilities.PrintJsonEntry(headerContent, false, _writer); _writer.WriteLine(","); } else _writer.Write("{"); Utilities.PrintQuerySectionOpening(JasixCommons.PositionsSectionTag, _writer); var count = 0; foreach (string queryString in queryStrings) { var query = Utilities.ParseQuery(queryString); query.Chromosome = _jasixIndex.GetIndexChromName(query.Chromosome); if (!_jasixIndex.ContainsChr(query.Chromosome)) continue; count += PrintLargeVariantsExtendingIntoQuery(query); count += PrintAllVariantsFromQueryBegin(query, count > 0); } Utilities.PrintQuerySectionClosing(_writer); _writer.WriteLine("}"); return count; } private int PrintAllVariantsFromQueryBegin((string, int, int) query, bool needComma) { var count = 0; foreach (string line in ReadOverlappingJsonLines(query)) { Utilities.PrintJsonEntry(line, needComma, _writer); needComma = true; count++; } return count; } private int PrintLargeVariantsExtendingIntoQuery((string, int, int) query) { var count = 0; foreach (string line in ReadJsonLinesExtendingInto(query)) { Utilities.PrintJsonEntry(line, count>0, _writer); count++; } return count; } internal IEnumerable ReadJsonLinesExtendingInto((string Chr, int Start, int End) query) { // query for large variants like chr1:100-99 returns all overlapping large variants that start before 100 (string chr, int start, _) = query; long[] locations = _jasixIndex.LargeVariantPositions(chr, start, start - 1); if (locations == null || locations.Length == 0) yield break; foreach (long location in locations) { RepositionReader(location); string line; while ((line = _jsonReader.ReadLine()) != null) { if (!line.OptimizedStartsWith(',')) { //buffer starts with ',\n', skip this first line line = line.TrimEnd(','); yield return line; break; } } } } private void RepositionReader(long location) { _jsonReader.DiscardBufferedData(); _jsonReader.BaseStream.Position = location; } public string GetHeader() { long headerLocation = _jasixIndex.GetSectionBegin(JasixCommons.HeaderSectionTag); RepositionReader(headerLocation); string headerLine = _jsonReader.ReadLine(); string additionalTail = $",\"{JasixCommons.PositionsSectionTag}\":["; return headerLine?.Substring(1, headerLine.Length - 1 - additionalTail.Length); } private static readonly byte[] BgzBlock = new byte[BlockGZipStream.BlockGZipFormatCommon.MaxBlockSize]; public IEnumerable GetSectionLines(string section) { if (_jasixIndex.GetSectionBegin(section) == -1) yield break; long sectionBegin = _jasixIndex.GetSectionBegin(section); RepositionReader(sectionBegin); string line = _jsonReader.ReadLine(); // at the end of both positions and genes section, we have a line that closes the array. // So, our terminating condition can be the following while (line != null && !line.StartsWith("]")) { yield return line; line = _jsonReader.ReadLine(); } } internal IEnumerable ReadOverlappingJsonLines((string Chr, int Start, int End) query) { (string chr, int start, int end) = query; long position = _jasixIndex.GetFirstVariantPosition(chr, start, end); if (position == -1) yield break; RepositionReader(position); string line; while ((line = _jsonReader.ReadLine()) != null && !line.OptimizedStartsWith(']')) //The array of positions entry end with "]," Going past it will cause the json deserializer to crash { line = line.TrimEnd(','); if (string.IsNullOrEmpty(line)) continue; JsonSchema jsonEntry = ParseJsonEntry(line); string jsonChrom = _jasixIndex.GetIndexChromName(jsonEntry.chromosome); if (jsonChrom != chr) break; if (jsonEntry.Start > end) break; if (!jsonEntry.Overlaps(start, end)) continue; // if there is an SV that starts before the query start that is printed by the large variant printer if (Utilities.IsLargeVariant(jsonEntry.Start, jsonEntry.End) && jsonEntry.Start < start) continue; yield return line; } } private static JsonSchema ParseJsonEntry(string line) { JsonSchema jsonEntry; try { jsonEntry = JsonConvert.DeserializeObject(line); } catch (Exception) { Console.WriteLine($"Error in line:\n{line}"); throw; } return jsonEntry; } } } ================================================ FILE: Jist/Jist.csproj ================================================ Exe net6.0 ..\bin\$(Configuration) ================================================ FILE: Jist/JistMain.cs ================================================ using System; using System.IO; using ErrorHandling; using IO; using Jasix.DataStructures; namespace Jist { public class JistMain { public static int Main(string[] args) { Console.WriteLine("Running Nirvana Json Stitching tool"); if (args.Length < 1) { Console.WriteLine("Usage: dotnet jist.dll input-json.gz-prefix output-json.gz "); Environment.Exit((int)ExitCodes.UserError); } var inputFilePrefix = args[0]; var outputFileName = args[1]; var directory = Path.GetDirectoryName(inputFilePrefix); if (string.IsNullOrEmpty(directory)) directory = "."; var prefix = Path.GetFileName(inputFilePrefix); var inputFiles = Directory.GetFiles(directory, prefix+"*.json.gz"); Array.Sort(inputFiles); Console.WriteLine("Files to stitch"); foreach (var file in inputFiles) { Console.WriteLine(file); if (!File.Exists(file + JasixCommons.FileExt)) { Console.WriteLine($"Cannot find {file +JasixCommons.FileExt}. Please provide corresponding {JasixCommons.FileExt} files for each input JSON"); return (int)ExitCodes.UserError; } } if (inputFiles.Length == 0) { Console.WriteLine($"Found {inputFiles.Length} files to stitch. Need at least 1."); Environment.Exit((int)ExitCodes.UserError); } if (inputFiles.Length == 1) { Console.WriteLine("Found only one input JSON. Copying it to output file..."); File.Copy(inputFiles[0], outputFileName, true); return (int)ExitCodes.Success; } var inputStreams = new Stream[inputFiles.Length]; var indexStreams = new Stream[inputFiles.Length]; for (var i = 0; i < inputFiles.Length; i++) { inputStreams[i] = FileUtilities.GetReadStream(inputFiles[i]); indexStreams[i] = FileUtilities.GetReadStream(inputFiles[i] + JasixCommons.FileExt); } using(var outputStream = FileUtilities.GetCreateStream(outputFileName)) using (var stitcher = new JsonStitcher(inputStreams, indexStreams, outputStream)) { return stitcher.Stitch(); } } } } ================================================ FILE: Jist/JistUtilities.cs ================================================ using System.IO; using System.IO.Compression; using Compression.FileHandling; namespace Jist { public static class JistUtilities { public static byte[] GetCompressedBlock(string s, int compressionLevel=1) { using (var stream = new MemoryStream()) { using(var memStream = new BlockGZipStream(stream, CompressionMode.Compress, true)) using (var writer = new StreamWriter(memStream)) { writer.Write(s); } return stream.ToArray(); } } } } ================================================ FILE: Jist/JsonStitcher.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using System.Text; using Compression.FileHandling; using ErrorHandling; using Jasix.DataStructures; namespace Jist { public sealed class JsonStitcher:IDisposable { private readonly Stream[] _jsonStreams; private readonly Stream[] _jasixStreams; private readonly Stream _outStream; private readonly bool _leaveOutStreamOpen; private readonly HashSet _geneLines; public JsonStitcher(Stream[] jsonStreams, Stream[] jasixStreams, Stream outStream, bool leaveOutStreamOpen=false) { _jasixStreams = jasixStreams; _jsonStreams = jsonStreams; _outStream = outStream; _leaveOutStreamOpen = leaveOutStreamOpen; _geneLines = new HashSet(); } public const string GeneHeaderLine = "\n],\"genes\":["; public const string FooterLine = "]}"; private bool _isFirstHeaderBlock = true; private static readonly byte[] BgzBlock = new byte[BlockGZipStream.BlockGZipFormatCommon.MaxBlockSize]; private static readonly byte[] CommaBlock = JistUtilities.GetCompressedBlock(",\n");//will be added to the end of a block when needed public int Stitch() { var positionBlockCount = 0; var geneLineCount = 0; using (var writer = new BinaryWriter(_outStream, Encoding.Default, _leaveOutStreamOpen)) { var needsCommaBlock = false; for (var i=0; i < _jsonStreams.Length; i++) { if (needsCommaBlock) writer.Write(CommaBlock, 0, CommaBlock.Length); var jsonStream = _jsonStreams[i]; var jasixStream = _jasixStreams[i]; positionBlockCount+= WritePositionBlocks(jsonStream, jasixStream, writer); geneLineCount+= ReadGeneLines(jsonStream); //after the first file, every file will need a comma block to maintain valid json after positions block // and after each gene block needsCommaBlock = true; } writer.Flush(); //write out the gene blocks WriteGeneBlocks(_outStream); } Console.WriteLine($"Total position blocks written: {positionBlockCount}"); Console.WriteLine($"Gene lines read: {geneLineCount}"); Console.WriteLine($"Unique gene lines: {_geneLines.Count}"); return (int) ExitCodes.Success; } private int ReadGeneLines(Stream jsonStream) { var lineCount = 0; using (var bGzipStream = new BlockGZipStream(jsonStream, CompressionMode.Decompress)) using(var reader = new StreamReader(bGzipStream)) { string line; while ((line= reader.ReadLine())!= null) { if (line==string.Empty) continue; if (line == FooterLine) break; if (!line.EndsWith(',')) line += ','; lineCount++; _geneLines.Add(line); } } return lineCount; } private void WriteGeneBlocks(Stream stream) { using (var bGzipStream = new BlockGZipStream(stream, CompressionMode.Compress, _leaveOutStreamOpen)) using(var writer = new StreamWriter(bGzipStream)) { var count = _geneLines.Count; if (count == 0) { writer.WriteLine(FooterLine); return; } writer.WriteLine(GeneHeaderLine); var i = 0; foreach (var geneLine in _geneLines.OrderBy(x=>x)) { i++; //the last gene line shouldn't have a comma at the end writer.WriteLine(i == count ? geneLine.TrimEnd(',') : geneLine); } writer.WriteLine(FooterLine); } } private int WritePositionBlocks(Stream jsonStream, Stream jasixStream, BinaryWriter writer) { var blockCount = 0; using (var reader = new BgzBlockReader(jsonStream, true)) using (var jasixIndex = new JasixIndex(jasixStream)) { var positionSectionBegin = jasixIndex.GetSectionBegin(JasixCommons.PositionsSectionTag); if (positionSectionBegin == -1) return 0;//no positions found. and therefore, cannot have genes either. var positionSectionEnd = jasixIndex.GetSectionEnd(JasixCommons.PositionsSectionTag); var geneSectionBegin = jasixIndex.GetSectionBegin(JasixCommons.GenesSectionTag); var isFirstBlock = true; for (int count = reader.ReadCompressedBlock(BgzBlock); count > 0; count=reader.ReadCompressedBlock(BgzBlock)) { if (isFirstBlock) { if (_isFirstHeaderBlock) { writer.Write(BgzBlock, 0, count); _isFirstHeaderBlock = false; } isFirstBlock = false; continue; } //we need the following check because there is one block between the positions and the genes block that we want to skip // the block that contains: ],"genes":[... // the 16 bit left shift is due to the representation of the position in bgzip file if(reader.Position >= positionSectionEnd >>16) { //we have read the last position block blockCount++; writer.Write(BgzBlock, 0, count); if(geneSectionBegin!=-1) jsonStream.Position = geneSectionBegin >> 16; return blockCount; } blockCount++; writer.Write(BgzBlock, 0, count); } } return blockCount; } public void Dispose() { if (_jsonStreams != null) { foreach (Stream jsonStream in _jsonStreams) { jsonStream?.Dispose(); } } if (_jasixStreams != null) { foreach (Stream jasixStream in _jasixStreams) { jasixStream?.Dispose(); } } if (_leaveOutStreamOpen) { _outStream.Flush(); return; } _outStream?.Dispose(); } } } ================================================ FILE: LICENSE ================================================ # PolyForm Strict License 1.0.0 ## Acceptance In order to get any license under these terms, you must agree to them as both strict obligations and conditions to all your licenses. ## Copyright License The licensor grants you a copyright license for the software to do everything you might do with the software that would otherwise infringe the licensor's copyright in it for any permitted purpose, other than distributing the software or making changes or new works based on the software. ## Patent License The licensor grants you a patent license for the software that covers patent claims the licensor can license, or becomes able to license, that you would infringe by using the software. ## Noncommercial Purposes Any noncommercial purpose is a permitted purpose. ## Personal Uses Personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, amateur pursuits, or religious observance, without any anticipated commercial application, is use for a permitted purpose. ## Noncommercial Organizations Use by any charitable organization, educational institution, public research organization, public safety or health organization, environmental protection organization, or government institution is use for a permitted purpose regardless of the source of funding or obligations resulting from the funding. ## Fair Use You may have "fair use" rights for the software under the law. These terms do not limit them. ## No Other Rights These terms do not allow you to sublicense or transfer any of your licenses to anyone else, or prevent the licensor from granting licenses to anyone else. These terms do not imply any other licenses. ## Patent Defense If you make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company. ## Violations The first time you are notified in writing that you have violated any of these terms, or done anything with the software not covered by your licenses, your licenses can nonetheless continue if you come into full compliance with these terms, and take practical steps to correct past violations, within 32 days of receiving notice. Otherwise, all your licenses end immediately. ## No Liability ***As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.*** ## Definitions The **licensor** is the individual or entity offering these terms, and the **software** is the software the licensor makes available under these terms. **You** refers to the individual or entity agreeing to these terms. **Your company** is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. **Control** means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect. **Your licenses** are all the licenses granted to you for the software under these terms. **Use** means anything you do with the software requiring one of your licenses. ================================================ FILE: MitoHeteroplasmy/MitoHeteroplasmy.csproj ================================================ net6.0 ================================================ FILE: MitoHeteroplasmy/MitoHeteroplasmyProvider.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using Genome; using RepeatExpansions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using Variants; namespace MitoHeteroplasmy { public sealed class MitoHeteroplasmyProvider : IMitoHeteroplasmyProvider { public string Name { get; } = "MitochondrialHeteroplasmy"; public GenomeAssembly Assembly { get; } = GenomeAssembly.rCRS; public IEnumerable DataSourceVersions { get; } private const string Version = "20180410"; private const string Description = "Variant read frequency percentiles for the Mitochondrial reference"; private const string MitoChromUcscName = "chrM"; private static readonly long CreateDateTicks = new DateTime(2020, 5, 21).Ticks; private static readonly Dictionary AlleleToInt = new Dictionary { { "A", 0 }, { "C", 1 }, { "G", 2 }, { "T", 3 } }; private const int SequenceLengthMax = int.MaxValue / 4; private readonly Dictionary _alleleToDistribution = new Dictionary(); public MitoHeteroplasmyProvider() { var dataSourceVersion = new DataSourceVersion(Name, Version, CreateDateTicks, Description); DataSourceVersions = new[] {dataSourceVersion}; } public void Add(int position, string altAllele, double[] vrfs, int[] alleleDepths) { double[] percentiles = PercentileUtilities.ComputePercentiles(vrfs.Length, alleleDepths); _alleleToDistribution[EncodeMitoPositionAndAltAllele(position, altAllele)] = (vrfs, percentiles); } public double?[] GetVrfPercentiles(IVariant[] variants, double[] vrfs) { if (vrfs == null) return null; if (variants == null || variants.Length == 0) return null; if (variants[0].Chromosome.UcscName != MitoChromUcscName) return null; var percentiles = vrfs.Zip(variants, (vrf, variant) => GetVrfPercentile(variant, vrf)).ToArray(); return percentiles.All(x => x == null) ? null : percentiles; } private double? GetVrfPercentile(IVariant variant, double vrf) { var position = variant.Start; var altAllele = variant.AltAllele; if (string.IsNullOrEmpty(altAllele) || !AlleleToInt.ContainsKey(altAllele)) return null; var positionAndAltAlleleIntForm = EncodeMitoPositionAndAltAllele(position, altAllele); if (!_alleleToDistribution.TryGetValue(positionAndAltAlleleIntForm, out (double[] Vrfs, double[] Percentiles) data)) return null; if (vrf > 0.999) vrf = 0.999; return PercentileUtilities.GetPercentile(vrf, data.Vrfs, data.Percentiles); } private static int EncodeMitoPositionAndAltAllele(int position, string altAllele) => SequenceLengthMax * AlleleToInt[altAllele] + position; private static double ToRoundedVrf(double vrf) => Math.Round(vrf, 3, MidpointRounding.AwayFromZero); } } ================================================ FILE: MitoHeteroplasmy/MitoHeteroplasmyReader.cs ================================================ using System; using System.IO; using System.IO.Compression; using System.Linq; using System.Reflection; using OptimizedCore; namespace MitoHeteroplasmy { public static class MitoHeteroplasmyReader { private const int PositionIndex = 1; private const int RefIndex = 2; private const int AltIndex = 3; private const int VrfBinsIndex = 4; private const int VrfCountsIndex = 5; private const string ResourceName = "MitoHeteroplasmy.Resources.MitoHeteroplasmy.tsv.gz"; public static MitoHeteroplasmyProvider GetProvider() { var assembly = Assembly.GetExecutingAssembly(); using var stream = assembly.GetManifestResourceStream(ResourceName); if (stream == null) throw new NullReferenceException("Unable to read from the Mitochondrial Heteroplasmy file"); using var gzStream = new GZipStream(stream, CompressionMode.Decompress); using var reader = new StreamReader(gzStream); string line; var heteroplasmyProvider = new MitoHeteroplasmyProvider(); while ((line = reader.ReadLine())!=null) { if(line.StartsWith("#")) continue; var fields = line.OptimizedSplit('\t'); var position = int.Parse(fields[PositionIndex]); var refAllele = fields[RefIndex]; var altAllele = fields[AltIndex]; if (altAllele=="." || !(refAllele.Length == 1 && altAllele.Length == 1)) continue; var vrfs = fields[VrfBinsIndex].Split(',').Select(double.Parse); var alleleDepths = fields[VrfCountsIndex].Split(',').Select(int.Parse).ToArray(); heteroplasmyProvider.Add(position, altAllele, vrfs.ToArray(), alleleDepths); } return heteroplasmyProvider; } } } ================================================ FILE: Nirvana/AnnotationFiles.cs ================================================ using System.Collections.Generic; using System.IO; using Cloud; using Cloud.Messages; using IO; using VariantAnnotation.ProteinConservation; using VariantAnnotation.SA; namespace Nirvana { public sealed class AnnotationFiles { public List<(string Nsa, string Idx)> NsaFiles { get; } = new(); public List<(string Gsa, string Idx)> GsaFiles { get; } = new(); public List NsiFiles { get; } = new(); public List NgaFiles { get; } = new(); public List GeneFusionSourceFiles { get; } = new(); public List GeneFusionJsonFiles { get; } = new(); public (string Npd, string Idx) PhylopFile { get; private set; } public string LowComplexityRegionFile { get; private set; } public string ProteinConservationFile { get; private set; } public (string Rma, string Idx) RefMinorFile { get; private set; } public void AddFiles(string saDirectoryPath) { foreach (string filePath in GetFiles(saDirectoryPath)) { // ReSharper disable once SwitchStatementMissingSomeCases switch (filePath.GetFileSuffix(true)) { case SaCommon.SaFileSuffix: NsaFiles.Add((filePath, filePath + SaCommon.IndexSuffix)); break; case SaCommon.GsaFileSuffix: GsaFiles.Add((filePath, filePath + SaCommon.IndexSuffix)); break; case SaCommon.IntervalFileSuffix: NsiFiles.Add(filePath); break; case SaCommon.GeneFileSuffix: NgaFiles.Add(filePath); break; case SaCommon.PhylopFileSuffix: PhylopFile = (filePath, filePath + SaCommon.IndexSuffix); break; case ProteinConservationCommon.FileSuffix: ProteinConservationFile = filePath; break; case SaCommon.LcrFileSuffix: LowComplexityRegionFile = filePath; break; case SaCommon.RefMinorFileSuffix: RefMinorFile = (filePath, filePath + SaCommon.IndexSuffix); break; case SaCommon.GeneFusionSourceSuffix: GeneFusionSourceFiles.Add(filePath); break; case SaCommon.GeneFusionJsonSuffix: GeneFusionJsonFiles.Add(filePath); break; } } } public void AddFiles(SaUrls saUrls) { switch (saUrls.SaType) { case CustomSaType.Nsa: NsaFiles.Add((saUrls.nsaUrl, saUrls.idxUrl)); break; case CustomSaType.Nsi: NsiFiles.Add(saUrls.nsiUrl); break; case CustomSaType.Nga: NgaFiles.Add(saUrls.ngaUrl); break; default: throw new InvalidDataException("Unknown custom SA type."); } } private static IEnumerable GetFiles(string directoryOrManifestFilePath) { if (HttpUtilities.IsUrl(directoryOrManifestFilePath)) { using (var reader = new StreamReader(PersistentStreamUtils.GetReadStream(directoryOrManifestFilePath))) { string line; while ((line = reader.ReadLine()) != null) { yield return LambdaUrlHelper.GetBaseUrl() + line; } } } else { foreach (string file in Directory.GetFiles(directoryOrManifestFilePath)) yield return file; } } } } ================================================ FILE: Nirvana/AnnotationResources.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using Cloud.Messages; using CommandLine.Utilities; using Genome; using IO; using MitoHeteroplasmy; using RepeatExpansions; using VariantAnnotation; using VariantAnnotation.Interface; using VariantAnnotation.Interface.GeneAnnotation; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Vcf.VariantCreator; namespace Nirvana { public sealed class AnnotationResources : IAnnotationResources { private Dictionary> _variantPositions; public ISequenceProvider SequenceProvider { get; } public ITranscriptAnnotationProvider TranscriptAnnotationProvider { get; } private ProteinConservationProvider ProteinConservationProvider { get; } public IAnnotationProvider SaProvider { get; } public IAnnotationProvider GsaProvider { get; } public IAnnotationProvider ConservationProvider { get; } public IRefMinorProvider RefMinorProvider { get; } public IAnnotationProvider LcrProvider { get; } public IGeneAnnotationProvider GeneAnnotationProvider { get; } public IMitoHeteroplasmyProvider MitoHeteroplasmyProvider { get; } public IAnnotator Annotator { get; } public IVariantIdCreator VidCreator { get; } public List DataSourceVersions { get; } public string VepDataVersion { get; } public long InputStartVirtualPosition { get; set; } public string AnnotatorVersionTag { get; set; } = "Nirvana " + CommandLineUtilities.Version; public bool ForceMitochondrialAnnotation { get; } public readonly PerformanceMetrics Metrics; public AnnotationResources(string refSequencePath, string inputCachePrefix, List saDirectoryPaths, List customAnnotations, string customStrTsvPath, bool forceMitochondrialAnnotation, bool useLegacyVids, PerformanceMetrics metrics) { Metrics = metrics; PerformanceMetrics.ShowInitializationHeader(); SequenceProvider = ProviderUtilities.GetSequenceProvider(refSequencePath); var annotationFiles = new AnnotationFiles(); saDirectoryPaths?.ForEach(x => annotationFiles.AddFiles(x)); customAnnotations?.ForEach(x => annotationFiles.AddFiles(x)); ProteinConservationProvider = ProviderUtilities.GetProteinConservationProvider(annotationFiles); ProteinConservationProvider?.Load(); metrics.Cache.Start(); TranscriptAnnotationProvider = ProviderUtilities.GetTranscriptAnnotationProvider(inputCachePrefix, SequenceProvider, ProteinConservationProvider); metrics.ShowCacheLoad(); SaProvider = ProviderUtilities.GetNsaProvider(annotationFiles); GsaProvider = ProviderUtilities.GetGsaProvider(annotationFiles); ConservationProvider = ProviderUtilities.GetConservationProvider(annotationFiles); LcrProvider = ProviderUtilities.GetLcrProvider(annotationFiles); RefMinorProvider = ProviderUtilities.GetRefMinorProvider(annotationFiles); GeneAnnotationProvider = ProviderUtilities.GetGeneAnnotationProvider(annotationFiles); IRepeatExpansionProvider repeatExpansionProvider = GetRepeatExpansionProvider(SequenceProvider.Assembly, SequenceProvider.RefNameToChromosome, SequenceProvider.RefIndexToChromosome.Count, customStrTsvPath); MitoHeteroplasmyProvider = MitoHeteroplasmyReader.GetProvider(); Annotator = new Annotator( TranscriptAnnotationProvider, SequenceProvider, SaProvider, ConservationProvider, LcrProvider, GeneAnnotationProvider, repeatExpansionProvider, GsaProvider ); if (useLegacyVids) VidCreator = new LegacyVariantId(SequenceProvider.RefNameToChromosome); else VidCreator = new VariantId(); DataSourceVersions = GetDataSourceVersions( TranscriptAnnotationProvider, SaProvider, GsaProvider, GeneAnnotationProvider, ConservationProvider, LcrProvider, MitoHeteroplasmyProvider ) .ToList(); VepDataVersion = TranscriptAnnotationProvider.VepVersion + "." + CacheConstants.DataVersion + "." + SaCommon.DataVersion; ForceMitochondrialAnnotation = forceMitochondrialAnnotation; } private static IRepeatExpansionProvider GetRepeatExpansionProvider(GenomeAssembly genomeAssembly, Dictionary refNameToChromosome, int numRefSeqs, string customStrTsvPath) { if (genomeAssembly != GenomeAssembly.GRCh37 && genomeAssembly != GenomeAssembly.GRCh38) return null; return new RepeatExpansionProvider(genomeAssembly, refNameToChromosome, numRefSeqs, customStrTsvPath); } private static IEnumerable GetDataSourceVersions(params IProvider[] providers) { var dataSourceVersions = new List(); foreach (IProvider provider in providers) if (provider != null) dataSourceVersions.AddRange(provider.DataSourceVersions); return dataSourceVersions.ToHashSet(new DataSourceVersionComparer()); } public void SingleVariantPreLoad(IPosition position) { var chromToPositions = new Dictionary>(); PreLoadUtilities.TryAddPosition(chromToPositions, position.Chromosome, position.Start, position.RefAllele, position.VcfFields[VcfCommon.AltIndex], SequenceProvider.Sequence); _variantPositions = chromToPositions; PreLoad(position.Chromosome); } public void GetVariantPositions(Stream vcfStream, GenomicRange genomicRange) { if (genomicRange != null) vcfStream.Position = Tabix.VirtualPosition.From(InputStartVirtualPosition).BlockOffset; int numPositions; Metrics.SaPositionScan.Start(); (_variantPositions, numPositions) = PreLoadUtilities.GetPositions(vcfStream, genomicRange, SequenceProvider, RefMinorProvider); Metrics.ShowSaPositionScanLoad(numPositions); } public void PreLoad(Chromosome chromosome) { SequenceProvider.LoadChromosome(chromosome); if (_variantPositions == null || !_variantPositions.TryGetValue(chromosome, out List positions)) return; SaProvider?.PreLoad(chromosome, positions); } public void Dispose() { SequenceProvider?.Dispose(); TranscriptAnnotationProvider?.Dispose(); SaProvider?.Dispose(); GsaProvider?.Dispose(); ConservationProvider?.Dispose(); RefMinorProvider?.Dispose(); GeneAnnotationProvider?.Dispose(); } } } ================================================ FILE: Nirvana/Nirvana.cs ================================================ using System; using System.Collections.Generic; using System.IO.Compression; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.FileHandling; using Compression.Utilities; using ErrorHandling; using IO; using Jasix.DataStructures; using OptimizedCore; using VariantAnnotation; using VariantAnnotation.Interface; using VariantAnnotation.Providers; using Vcf; namespace Nirvana { public static class Nirvana { private static string _inputCachePrefix; private static readonly List SupplementaryAnnotationDirectories = new List(); private static string _vcfPath; private static string _refSequencePath; private static string _outputFileName; private static string _customStrTsv; private static string _customInfoKeysString; private static string _customSampleInfoKeysString; private static bool _forceMitochondrialAnnotation; private static bool _useLegacyVids; private static bool _enableDq; private static ExitCodes ProgramExecution() { var annotationResources = GetAnnotationResources(); string jasixFileName = _outputFileName == "-" ? null : _outputFileName + ".json.gz" + JasixCommons.FileExt; var customInfoKeys = string.IsNullOrEmpty(_customInfoKeysString) ? null: new HashSet(_customInfoKeysString.OptimizedSplit(',')); var customSampleInfoKeys = string.IsNullOrEmpty(_customSampleInfoKeysString) ? null: new HashSet(_customSampleInfoKeysString.OptimizedSplit(',')); using (var inputVcfStream = _vcfPath == "-" ? Console.OpenStandardInput() : GZipUtilities.GetAppropriateReadStream(_vcfPath)) using (var outputJsonStream = _outputFileName == "-" ? Console.OpenStandardOutput() : new BlockGZipStream(FileUtilities.GetCreateStream(_outputFileName + ".json.gz"), CompressionMode.Compress)) using (var outputJsonIndexStream = jasixFileName == null ? null : FileUtilities.GetCreateStream(jasixFileName)) return StreamAnnotation.Annotate(null, inputVcfStream, outputJsonStream, outputJsonIndexStream, annotationResources, new NullVcfFilter(), false, _enableDq, customInfoKeys, customSampleInfoKeys).exitCode; } private static AnnotationResources GetAnnotationResources() { if (_outputFileName == "-") Logger.Silence(); var metrics = new PerformanceMetrics(); var annotationResources = new AnnotationResources(_refSequencePath, _inputCachePrefix, SupplementaryAnnotationDirectories, null, _customStrTsv, _forceMitochondrialAnnotation, _useLegacyVids, metrics); if (SupplementaryAnnotationDirectories.Count == 0) return annotationResources; using (var preloadVcfStream = GZipUtilities.GetAppropriateStream(PersistentStreamUtils.GetReadStream(_vcfPath))) { annotationResources.GetVariantPositions(preloadVcfStream, null); } return annotationResources; } public static int Main(string[] args) { var ops = new OptionSet { { "cache|c=", "input cache {prefix}", v => _inputCachePrefix = v }, { "in|i=", "input VCF {path}", v => _vcfPath = v }, { "out|o=", "output {file path}", v => _outputFileName = v }, { "ref|r=", "input compressed reference sequence {path}", v => _refSequencePath = v }, { "sd=", "input supplementary annotation {directory}", v => SupplementaryAnnotationDirectories.Add(v) }, { "force-mt", "forces to annotate mitochondrial variants", v => _forceMitochondrialAnnotation = v != null }, { "legacy-vids", "enables support for legacy VIDs", v => _useLegacyVids = v != null }, { "enable-dq", "report DQ from VCF samples field", v => _enableDq = v != null }, { "str=", "user provided STR annotation TSV file", v => _customStrTsv = v }, { "vcf-info=", "additional vcf info field keys (comma separated) desired in the output", v => _customInfoKeysString = v }, { "vcf-sample-info=", "additional vcf format field keys (comma separated) desired in the output", v => _customSampleInfoKeysString = v } }; var exitCode = new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .CheckInputFilenameExists(_vcfPath, "vcf", "--in", true, "-") .CheckInputFilenameExists(_refSequencePath, "reference sequence", "--ref") .CheckInputFilenameExists(CacheConstants.TranscriptPath(_inputCachePrefix), "transcript cache", "--cache") .CheckInputFilenameExists(CacheConstants.SiftPath(_inputCachePrefix), "SIFT cache", "--cache") .CheckInputFilenameExists(CacheConstants.PolyPhenPath(_inputCachePrefix), "PolyPhen cache", "--cache") .CheckInputFilenameExists(_customStrTsv, "custom STR annotation TSV", "--str", false) .HasRequiredParameter(_outputFileName, "output file stub", "--out") .DisableOutput(_outputFileName == "-") .ShowBanner(Constants.Authors) .ShowHelpMenu("Annotates a set of variants", "-i -c --sd -r -o ") .ShowErrors() .Execute(ProgramExecution); return (int)exitCode; } } } ================================================ FILE: Nirvana/Nirvana.csproj ================================================  Exe net6.0 ..\bin\$(Configuration) ================================================ FILE: Nirvana/PreLoadUtilities.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace Nirvana { public static class PreLoadUtilities { public static (Dictionary> PositionsByChromosome, int Count) GetPositions(Stream vcfStream, GenomicRange genomicRange, ISequenceProvider sequenceProvider, IRefMinorProvider refMinorProvider) { var positionsByChromosome = new Dictionary>(); var rangeChecker = new GenomicRangeChecker(genomicRange); var refNameToChrom = sequenceProvider.RefNameToChromosome; using (var reader = new StreamReader(vcfStream)) { string line; string currentReferenceName = ""; Chromosome chromosome = null; while ((line = reader.ReadLine()) != null) { if (line.StartsWith('#')) continue; string[] cols = line.OptimizedSplit('\t'); string referenceName = cols[VcfCommon.ChromIndex]; if (referenceName != currentReferenceName) { if (!refNameToChrom.TryGetValue(referenceName, out chromosome)) continue; currentReferenceName = referenceName; } (int position, bool foundError) = cols[VcfCommon.PosIndex].OptimizedParseInt32(); if (foundError) throw new InvalidDataException($"Unable to convert the VCF position to an integer: {cols[VcfCommon.PosIndex]}"); if (rangeChecker.OutOfRange(chromosome, position)) break; string refAllele = cols[VcfCommon.RefIndex]; string altAllele = cols[VcfCommon.AltIndex]; if (altAllele == "." && !IsRefMinor(refMinorProvider, chromosome, position)) continue; sequenceProvider.LoadChromosome(chromosome); TryAddPosition(positionsByChromosome, chromosome, position, refAllele, altAllele, sequenceProvider.Sequence); } } int count = SortPositionsAndGetCount(positionsByChromosome); return (positionsByChromosome, count); } private static bool IsRefMinor(IRefMinorProvider refMinorProvider, Chromosome chrom, int position) { if (refMinorProvider == null) return false; return !string.IsNullOrEmpty(refMinorProvider.GetGlobalMajorAllele(chrom, position)); } public static void TryAddPosition(Dictionary> chromPositions, Chromosome chromosome, int position, string refAllele, string altAllele, ISequence refSequence) { if (!chromPositions.ContainsKey(chromosome)) chromPositions.Add(chromosome, new List(16 * 1024)); foreach (string allele in altAllele.OptimizedSplit(',')) { if (allele.OptimizedStartsWith('<') && allele != "") continue; (int shiftedPos, string _, string _) = VariantUtils.TrimAndLeftAlign(position, refAllele, allele, refSequence); chromPositions[chromosome].Add(shiftedPos); } } private static int SortPositionsAndGetCount(Dictionary> positionsByChromosome) { var count = 0; foreach (var positions in positionsByChromosome.Values) { positions.Sort(); count += positions.Count; } return count; } } } ================================================ FILE: Nirvana/ProviderUtilities.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using ErrorHandling.Exceptions; using IO; using VariantAnnotation.GeneAnnotation; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.GenericScore; using VariantAnnotation.Interface.GeneAnnotation; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; namespace Nirvana { public static class ProviderUtilities { public static ISequenceProvider GetSequenceProvider(string compressedReferencePath) { return new ReferenceSequenceProvider(PersistentStreamUtils.GetReadStream(compressedReferencePath)); } public static ProteinConservationProvider GetProteinConservationProvider(AnnotationFiles files) => files == null || string.IsNullOrEmpty(files.ProteinConservationFile) ? null : new ProteinConservationProvider(PersistentStreamUtils.GetReadStream(files.ProteinConservationFile)); public static IAnnotationProvider GetConservationProvider(AnnotationFiles files) { if (files == null || files.PhylopFile == default) return null; (Stream phylopStream, Stream indexStream) = GetDataAndIndexStreams(files.PhylopFile.Npd, files.PhylopFile.Idx); return new ConservationScoreProvider() .AddPhylopReader(phylopStream, indexStream); } private static (Stream, Stream) GetDataAndIndexStreams(string dataFilePath, string indexPath) { var dataStream = PersistentStreamUtils.GetReadStream(dataFilePath); var indexStream = PersistentStreamUtils.GetReadStream(indexPath); if (dataStream == null) { throw new UserErrorException($"Unable to open data file {dataFilePath}"); } if (indexStream == null) { throw new UserErrorException($"Unable to open index file {indexPath}"); } return (dataStream, indexStream); } public static IAnnotationProvider GetLcrProvider(AnnotationFiles files) => files?.LowComplexityRegionFile == null ? null : new LcrProvider(PersistentStreamUtils.GetReadStream(files.LowComplexityRegionFile)); public static IRefMinorProvider GetRefMinorProvider(AnnotationFiles files) { if( files == null || files.RefMinorFile == default) return null; return new RefMinorProvider(PersistentStreamUtils.GetReadStream(files.RefMinorFile.Rma), PersistentStreamUtils.GetReadStream(files.RefMinorFile.Idx)); } public static IGeneAnnotationProvider GetGeneAnnotationProvider(AnnotationFiles files) => files?.NsiFiles == null ? null : new GeneAnnotationProvider(PersistentStreamUtils.GetStreams(files.NgaFiles)); public static IAnnotationProvider GetNsaProvider(AnnotationFiles files) { if (files == null) return null; INsaReader[] nsaReaders = GetNsaReaders(files.NsaFiles); INsiReader[] nsiReaders = GetNsiReaders(files.NsiFiles); IGeneFusionSaReader[] fusionReaders = GetGeneFusionReaders(files.GeneFusionSourceFiles, files.GeneFusionJsonFiles); int numReaders = nsaReaders.Length + nsiReaders.Length + fusionReaders.Length; return numReaders == 0 ? null : new NsaProvider(nsaReaders, nsiReaders, fusionReaders); } private static INsaReader[] GetNsaReaders(IReadOnlyCollection<(string Nsa, string Idx)> filePaths) { var readers = new List(filePaths.Count); foreach ((string nsaPath, string idxPath) in filePaths) { var (nsaStream, idxStream) = GetDataAndIndexStreams(nsaPath, idxPath); readers.Add(new NsaReader(nsaStream, idxStream)); } return readers.SortByJsonKey(); } public static IAnnotationProvider GetGsaProvider(AnnotationFiles files) { if (files?.GsaFiles == null || files.GsaFiles.Count == 0) return null; List<(string Gsa, string Idx)> filePaths = files.GsaFiles; var readers = new ScoreReader[filePaths.Count]; var i = 0; foreach ((string gsaPath, string idxPath) in filePaths) { var (gsaStream, idxStream) = GetDataAndIndexStreams(gsaPath, idxPath); readers[i] = ScoreReader.Read(gsaStream, idxStream); i++; } readers = readers.SortByJsonKey(); return new ScoreProvider(readers); } private static INsiReader[] GetNsiReaders(IReadOnlyCollection filePaths) { var readers = new List(filePaths.Count); foreach (string filePath in filePaths) readers.Add(NsiReader.Read(PersistentStreamUtils.GetReadStream(filePath))); return readers.SortByJsonKey(); } private static IGeneFusionSaReader[] GetGeneFusionReaders(IReadOnlyCollection sourceFilePaths, IReadOnlyCollection jsonFilePaths) { var readers = new List(jsonFilePaths.Count); foreach (string filePath in sourceFilePaths) readers.Add(new GeneFusionSourceReader(PersistentStreamUtils.GetReadStream(filePath))); foreach (string filePath in jsonFilePaths) readers.Add(new GeneFusionJsonReader(PersistentStreamUtils.GetReadStream(filePath))); return readers.SortByJsonKey(); } private static T[] SortByJsonKey(this IEnumerable entries) where T : ISaMetadata => entries.OrderBy(x => x.JsonKey, StringComparer.Ordinal).ToArray(); public static ITranscriptAnnotationProvider GetTranscriptAnnotationProvider(string path, ISequenceProvider sequenceProvider, ProteinConservationProvider proteinConservationProvider) => new TranscriptAnnotationProvider(path, sequenceProvider, proteinConservationProvider); } } ================================================ FILE: Nirvana/StreamAnnotation.cs ================================================ using System; using System.Collections.Generic; using System.IO; using ErrorHandling; using ErrorHandling.Exceptions; using Genome; using IO; using MitoHeteroplasmy; using OptimizedCore; using VariantAnnotation; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.IO; using VariantAnnotation.Pools; using VariantAnnotation.Utilities; using Variants; using Vcf; namespace Nirvana { public static class StreamAnnotation { public static (int variantCount, ExitCodes exitCode) Annotate(Stream headerStream, Stream inputVcfStream, Stream outputJsonStream, Stream outputJsonIndexStream, AnnotationResources annotationResources, IVcfFilter vcfFilter, bool ignoreEmptyChromosome, bool enableDq = false, HashSet customInfoKeys=null, HashSet customSampleInfoKeys=null) { var metrics = annotationResources.Metrics; PerformanceMetrics.ShowAnnotationHeader(); Chromosome currentChromosome = Chromosome.GetEmptyChromosome("dummy"); int numVariants = 0; int variantCount = 0; IMitoHeteroplasmyProvider mitoHeteroplasmyProvider = MitoHeteroplasmyReader.GetProvider(); using (var vcfReader = GetVcfReader(headerStream, inputVcfStream, annotationResources, vcfFilter, mitoHeteroplasmyProvider, enableDq, customInfoKeys, customSampleInfoKeys)) using (var jsonWriter = new JsonWriter(outputJsonStream, outputJsonIndexStream, annotationResources, Date.CurrentTimeStamp, vcfReader.GetSampleNames(), false)) { try { CheckGenomeAssembly(annotationResources, vcfReader); SetMitochondrialAnnotationBehavior(annotationResources, vcfReader); IPosition position; while ((position = vcfReader.GetNextPosition()) != null) { Chromosome chromosome = position.Chromosome; if (ignoreEmptyChromosome && chromosome.IsEmpty()) continue; if (chromosome.Index != currentChromosome.Index) { if (!currentChromosome.IsEmpty()) metrics.ShowAnnotationEntry(currentChromosome, numVariants); numVariants = 0; metrics.Preload.Start(); annotationResources.PreLoad(chromosome); metrics.Preload.Stop(); metrics.Annotation.Start(); currentChromosome = chromosome; } var annotatedPosition = position.Variants != null ? annotationResources.Annotator.Annotate(position) : null; var jsb = annotatedPosition?.GetJsonStringBuilder(); if (jsb != null) jsonWriter.WritePosition(annotatedPosition.Position, jsb); StringBuilderPool.Return(jsb); ReturnPoolObjects(annotatedPosition); numVariants++; variantCount += position.Variants?.Length ?? 0; } jsonWriter.WriteGenes(annotationResources.Annotator.GetGeneAnnotations()); } catch (Exception e) { e.Data[ExitCodeUtilities.VcfLine] = vcfReader.VcfLine; throw; } } if (!currentChromosome.IsEmpty()) metrics.ShowAnnotationEntry(currentChromosome, numVariants); metrics.ShowSummaryTable(); return (variantCount, ExitCodes.Success); } private static void ReturnPoolObjects(IAnnotatedPosition annotatedPosition) { if (annotatedPosition?.AnnotatedVariants != null) foreach (var annotatedVariant in annotatedPosition.AnnotatedVariants) { if (annotatedVariant.Transcripts != null) { foreach (IAnnotatedTranscript annotatedTranscript in annotatedVariant.Transcripts) { AnnotatedTranscriptPool.Return((AnnotatedTranscript) annotatedTranscript); } } var variant = annotatedVariant.Variant; if (variant is Variant) VariantPool.Return((Variant) annotatedVariant.Variant); AnnotatedVariantPool.Return((AnnotatedVariant) annotatedVariant); } PositionPool.Return((Position) annotatedPosition?.Position); AnnotatedPositionPool.Return((AnnotatedPosition) annotatedPosition); } private static void CheckGenomeAssembly(IAnnotationResources annotationResources, VcfReader vcfReader) { if (vcfReader.InferredGenomeAssembly != GenomeAssembly.Unknown && vcfReader.InferredGenomeAssembly != annotationResources.Annotator.Assembly) throw new UserErrorException($"A mismatch between genome assemblies was found. The input VCF uses {vcfReader.InferredGenomeAssembly} whereas annotation was configured for {annotationResources.Annotator.Assembly}."); } private static void SetMitochondrialAnnotationBehavior(IAnnotationResources annotationResources, IVcfReader vcfReader) { if (vcfReader.IsRcrsMitochondrion && annotationResources.Annotator.Assembly == GenomeAssembly.GRCh37 || annotationResources.Annotator.Assembly == GenomeAssembly.GRCh38 || annotationResources.ForceMitochondrialAnnotation) annotationResources.Annotator.EnableMitochondrialAnnotation(); } private static VcfReader GetVcfReader(Stream headerStream, Stream vcfStream, IAnnotationResources annotationResources, IVcfFilter vcfFilter, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, bool enableDq = false, HashSet customInfoKeys=null , HashSet customSampleInfoKeys=null) { var vcfReader = FileUtilities.GetStreamReader(vcfStream); StreamReader headerReader; if (headerStream == null) headerReader = vcfReader; else { headerReader = FileUtilities.GetStreamReader(headerStream); vcfStream.Position = Tabix.VirtualPosition.From(annotationResources.InputStartVirtualPosition).BlockOffset; } return VcfReader.Create(headerReader, vcfReader, annotationResources.SequenceProvider, annotationResources.RefMinorProvider, vcfFilter, annotationResources.VidCreator, mitoHeteroplasmyProvider, enableDq, customInfoKeys, customSampleInfoKeys); } } } ================================================ FILE: Nirvana.sln ================================================  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.29201.188 MinimumVisualStudioVersion = 10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "VariantAnnotation.Interface", "VariantAnnotation.Interface\VariantAnnotation.Interface.csproj", "{248C8736-3A76-4F45-A131-A776BD3257C9}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Nirvana", "Nirvana\Nirvana.csproj", "{84CD8FB5-1071-47D5-AF1A-E028134D3C70}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Vcf", "Vcf\Vcf.csproj", "{0DF48817-8AED-449A-AA87-CB91040D8439}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Compression", "Compression\Compression.csproj", "{8E2CD866-DFCF-4486-A289-32DEFA050E87}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "VariantAnnotation", "VariantAnnotation\VariantAnnotation.csproj", "{155E28ED-122E-49DD-A8F0-FE3F670073B8}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ErrorHandling", "ErrorHandling\ErrorHandling.csproj", "{A65F4919-CDB8-49C5-ADA4-66055A3F4923}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UnitTests", "UnitTests\UnitTests.csproj", "{0CB1644A-BEA1-4CF6-AD5F-E544512769C2}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CommandLine", "CommandLine\CommandLine.csproj", "{147C336A-6A6E-43F4-BDDC-8C8B72199C5D}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CacheUtils", "CacheUtils\CacheUtils.csproj", "{986CF15B-DFAE-4C39-98D0-75A15271B34A}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SAUtils", "SAUtils\SAUtils.csproj", "{F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Jasix", "Jasix\Jasix.csproj", "{ECC7869C-1B21-42C1-B8BD-4190F15B3B6F}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "OptimizedCore", "OptimizedCore\OptimizedCore.csproj", "{76FEE3B3-FB8E-4421-A63F-CA659FB1ACA0}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Intervals", "Intervals\Intervals.csproj", "{82CA75B3-37DF-40DA-AA1B-70888CF3ED05}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Variants", "Variants\Variants.csproj", "{0A94104A-71E7-4925-B667-C29C18E3356D}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Genome", "Genome\Genome.csproj", "{3B5C30A5-FBBC-4247-BE62-2B64960213FD}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "IO", "IO\IO.csproj", "{654069F3-3B86-4325-823F-BC78946A26FF}" ProjectSection(ProjectDependencies) = postProject {82CA75B3-37DF-40DA-AA1B-70888CF3ED05} = {82CA75B3-37DF-40DA-AA1B-70888CF3ED05} EndProjectSection EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tabix", "Tabix\Tabix.csproj", "{F337E3F6-72AA-44B4-B11F-D69EE14B6152}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cloud", "Cloud\Cloud.csproj", "{E93914C8-2599-46BE-BE18-6229E53F581B}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ReferenceSequence", "ReferenceSequence\ReferenceSequence.csproj", "{234765A8-2B5C-4FD5-ACBA-6D48002E9074}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Downloader", "Downloader\Downloader.csproj", "{5B81B762-8A86-466A-A947-AC2CA53EE40D}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AnnotationLambda", "AnnotationLambda\AnnotationLambda.csproj", "{374D5D10-98DF-4D18-9ECF-D20B5C19D258}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CustomAnnotationLambda", "CustomAnnotationLambda\CustomAnnotationLambda.csproj", "{FFC36924-DA37-41E1-8FA8-5FF54AC84CC0}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "GeneAnnotationLambda", "GeneAnnotationLambda\GeneAnnotationLambda.csproj", "{9A0F21D6-D0B0-4074-BACA-5FF179E83007}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NirvanaLambda", "NirvanaLambda\NirvanaLambda.csproj", "{37EEEA52-94F8-4B27-A044-1CD6DBF2F86E}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SingleAnnotationLambda", "SingleAnnotationLambda\SingleAnnotationLambda.csproj", "{C9B4E16E-FF30-4CE0-A617-F833696FBE10}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RepeatExpansions", "RepeatExpansions\RepeatExpansions.csproj", "{E586F712-DEDA-4CA2-AE97-96DE0180DB0E}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Jist", "Jist\Jist.csproj", "{62109AB0-2E66-4C84-8D62-7A8C9B7E335A}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CustomStrValidationLambda", "CustomStrValidationLambda\CustomStrValidationLambda.csproj", "{F3E60E51-EE07-4768-8EC3-E3A323DFA547}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MitoHeteroplasmy", "MitoHeteroplasmy\MitoHeteroplasmy.csproj", "{387E4C8D-6A27-40DE-A305-F3F047B8D865}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Release|Any CPU = Release|Any CPU EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {248C8736-3A76-4F45-A131-A776BD3257C9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {248C8736-3A76-4F45-A131-A776BD3257C9}.Debug|Any CPU.Build.0 = Debug|Any CPU {248C8736-3A76-4F45-A131-A776BD3257C9}.Release|Any CPU.ActiveCfg = Release|Any CPU {248C8736-3A76-4F45-A131-A776BD3257C9}.Release|Any CPU.Build.0 = Release|Any CPU {84CD8FB5-1071-47D5-AF1A-E028134D3C70}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {84CD8FB5-1071-47D5-AF1A-E028134D3C70}.Debug|Any CPU.Build.0 = Debug|Any CPU {84CD8FB5-1071-47D5-AF1A-E028134D3C70}.Release|Any CPU.ActiveCfg = Release|Any CPU {84CD8FB5-1071-47D5-AF1A-E028134D3C70}.Release|Any CPU.Build.0 = Release|Any CPU {0DF48817-8AED-449A-AA87-CB91040D8439}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {0DF48817-8AED-449A-AA87-CB91040D8439}.Debug|Any CPU.Build.0 = Debug|Any CPU {0DF48817-8AED-449A-AA87-CB91040D8439}.Release|Any CPU.ActiveCfg = Release|Any CPU {0DF48817-8AED-449A-AA87-CB91040D8439}.Release|Any CPU.Build.0 = Release|Any CPU {8E2CD866-DFCF-4486-A289-32DEFA050E87}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {8E2CD866-DFCF-4486-A289-32DEFA050E87}.Debug|Any CPU.Build.0 = Debug|Any CPU {8E2CD866-DFCF-4486-A289-32DEFA050E87}.Release|Any CPU.ActiveCfg = Release|Any CPU {8E2CD866-DFCF-4486-A289-32DEFA050E87}.Release|Any CPU.Build.0 = Release|Any CPU {155E28ED-122E-49DD-A8F0-FE3F670073B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {155E28ED-122E-49DD-A8F0-FE3F670073B8}.Debug|Any CPU.Build.0 = Debug|Any CPU {155E28ED-122E-49DD-A8F0-FE3F670073B8}.Release|Any CPU.ActiveCfg = Release|Any CPU {155E28ED-122E-49DD-A8F0-FE3F670073B8}.Release|Any CPU.Build.0 = Release|Any CPU {A65F4919-CDB8-49C5-ADA4-66055A3F4923}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {A65F4919-CDB8-49C5-ADA4-66055A3F4923}.Debug|Any CPU.Build.0 = Debug|Any CPU {A65F4919-CDB8-49C5-ADA4-66055A3F4923}.Release|Any CPU.ActiveCfg = Release|Any CPU {A65F4919-CDB8-49C5-ADA4-66055A3F4923}.Release|Any CPU.Build.0 = Release|Any CPU {0CB1644A-BEA1-4CF6-AD5F-E544512769C2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {0CB1644A-BEA1-4CF6-AD5F-E544512769C2}.Debug|Any CPU.Build.0 = Debug|Any CPU {0CB1644A-BEA1-4CF6-AD5F-E544512769C2}.Release|Any CPU.ActiveCfg = Release|Any CPU {0CB1644A-BEA1-4CF6-AD5F-E544512769C2}.Release|Any CPU.Build.0 = Release|Any CPU {147C336A-6A6E-43F4-BDDC-8C8B72199C5D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {147C336A-6A6E-43F4-BDDC-8C8B72199C5D}.Debug|Any CPU.Build.0 = Debug|Any CPU {147C336A-6A6E-43F4-BDDC-8C8B72199C5D}.Release|Any CPU.ActiveCfg = Release|Any CPU {147C336A-6A6E-43F4-BDDC-8C8B72199C5D}.Release|Any CPU.Build.0 = Release|Any CPU {986CF15B-DFAE-4C39-98D0-75A15271B34A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {986CF15B-DFAE-4C39-98D0-75A15271B34A}.Debug|Any CPU.Build.0 = Debug|Any CPU {986CF15B-DFAE-4C39-98D0-75A15271B34A}.Release|Any CPU.ActiveCfg = Release|Any CPU {986CF15B-DFAE-4C39-98D0-75A15271B34A}.Release|Any CPU.Build.0 = Release|Any CPU {F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}.Debug|Any CPU.Build.0 = Debug|Any CPU {F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}.Release|Any CPU.ActiveCfg = Release|Any CPU {F1F05D39-1BE0-4CFD-AD60-F27FB31D925A}.Release|Any CPU.Build.0 = Release|Any CPU {ECC7869C-1B21-42C1-B8BD-4190F15B3B6F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {ECC7869C-1B21-42C1-B8BD-4190F15B3B6F}.Debug|Any CPU.Build.0 = Debug|Any CPU {ECC7869C-1B21-42C1-B8BD-4190F15B3B6F}.Release|Any CPU.ActiveCfg = Release|Any CPU {ECC7869C-1B21-42C1-B8BD-4190F15B3B6F}.Release|Any CPU.Build.0 = Release|Any CPU {76FEE3B3-FB8E-4421-A63F-CA659FB1ACA0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {76FEE3B3-FB8E-4421-A63F-CA659FB1ACA0}.Debug|Any CPU.Build.0 = Debug|Any CPU {76FEE3B3-FB8E-4421-A63F-CA659FB1ACA0}.Release|Any CPU.ActiveCfg = Release|Any CPU {76FEE3B3-FB8E-4421-A63F-CA659FB1ACA0}.Release|Any CPU.Build.0 = Release|Any CPU {82CA75B3-37DF-40DA-AA1B-70888CF3ED05}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {82CA75B3-37DF-40DA-AA1B-70888CF3ED05}.Debug|Any CPU.Build.0 = Debug|Any CPU {82CA75B3-37DF-40DA-AA1B-70888CF3ED05}.Release|Any CPU.ActiveCfg = Release|Any CPU {82CA75B3-37DF-40DA-AA1B-70888CF3ED05}.Release|Any CPU.Build.0 = Release|Any CPU {0A94104A-71E7-4925-B667-C29C18E3356D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {0A94104A-71E7-4925-B667-C29C18E3356D}.Debug|Any CPU.Build.0 = Debug|Any CPU {0A94104A-71E7-4925-B667-C29C18E3356D}.Release|Any CPU.ActiveCfg = Release|Any CPU {0A94104A-71E7-4925-B667-C29C18E3356D}.Release|Any CPU.Build.0 = Release|Any CPU {3B5C30A5-FBBC-4247-BE62-2B64960213FD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {3B5C30A5-FBBC-4247-BE62-2B64960213FD}.Debug|Any CPU.Build.0 = Debug|Any CPU {3B5C30A5-FBBC-4247-BE62-2B64960213FD}.Release|Any CPU.ActiveCfg = Release|Any CPU {3B5C30A5-FBBC-4247-BE62-2B64960213FD}.Release|Any CPU.Build.0 = Release|Any CPU {654069F3-3B86-4325-823F-BC78946A26FF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {654069F3-3B86-4325-823F-BC78946A26FF}.Debug|Any CPU.Build.0 = Debug|Any CPU {654069F3-3B86-4325-823F-BC78946A26FF}.Release|Any CPU.ActiveCfg = Release|Any CPU {654069F3-3B86-4325-823F-BC78946A26FF}.Release|Any CPU.Build.0 = Release|Any CPU {F337E3F6-72AA-44B4-B11F-D69EE14B6152}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {F337E3F6-72AA-44B4-B11F-D69EE14B6152}.Debug|Any CPU.Build.0 = Debug|Any CPU {F337E3F6-72AA-44B4-B11F-D69EE14B6152}.Release|Any CPU.ActiveCfg = Release|Any CPU {F337E3F6-72AA-44B4-B11F-D69EE14B6152}.Release|Any CPU.Build.0 = Release|Any CPU {E93914C8-2599-46BE-BE18-6229E53F581B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {E93914C8-2599-46BE-BE18-6229E53F581B}.Debug|Any CPU.Build.0 = Debug|Any CPU {E93914C8-2599-46BE-BE18-6229E53F581B}.Release|Any CPU.ActiveCfg = Release|Any CPU {E93914C8-2599-46BE-BE18-6229E53F581B}.Release|Any CPU.Build.0 = Release|Any CPU {234765A8-2B5C-4FD5-ACBA-6D48002E9074}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {234765A8-2B5C-4FD5-ACBA-6D48002E9074}.Debug|Any CPU.Build.0 = Debug|Any CPU {234765A8-2B5C-4FD5-ACBA-6D48002E9074}.Release|Any CPU.ActiveCfg = Release|Any CPU {234765A8-2B5C-4FD5-ACBA-6D48002E9074}.Release|Any CPU.Build.0 = Release|Any CPU {5B81B762-8A86-466A-A947-AC2CA53EE40D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {5B81B762-8A86-466A-A947-AC2CA53EE40D}.Debug|Any CPU.Build.0 = Debug|Any CPU {5B81B762-8A86-466A-A947-AC2CA53EE40D}.Release|Any CPU.ActiveCfg = Release|Any CPU {5B81B762-8A86-466A-A947-AC2CA53EE40D}.Release|Any CPU.Build.0 = Release|Any CPU {374D5D10-98DF-4D18-9ECF-D20B5C19D258}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {374D5D10-98DF-4D18-9ECF-D20B5C19D258}.Debug|Any CPU.Build.0 = Debug|Any CPU {374D5D10-98DF-4D18-9ECF-D20B5C19D258}.Release|Any CPU.ActiveCfg = Release|Any CPU {374D5D10-98DF-4D18-9ECF-D20B5C19D258}.Release|Any CPU.Build.0 = Release|Any CPU {FFC36924-DA37-41E1-8FA8-5FF54AC84CC0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {FFC36924-DA37-41E1-8FA8-5FF54AC84CC0}.Debug|Any CPU.Build.0 = Debug|Any CPU {FFC36924-DA37-41E1-8FA8-5FF54AC84CC0}.Release|Any CPU.ActiveCfg = Release|Any CPU {FFC36924-DA37-41E1-8FA8-5FF54AC84CC0}.Release|Any CPU.Build.0 = Release|Any CPU {9A0F21D6-D0B0-4074-BACA-5FF179E83007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {9A0F21D6-D0B0-4074-BACA-5FF179E83007}.Debug|Any CPU.Build.0 = Debug|Any CPU {9A0F21D6-D0B0-4074-BACA-5FF179E83007}.Release|Any CPU.ActiveCfg = Release|Any CPU {9A0F21D6-D0B0-4074-BACA-5FF179E83007}.Release|Any CPU.Build.0 = Release|Any CPU {37EEEA52-94F8-4B27-A044-1CD6DBF2F86E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {37EEEA52-94F8-4B27-A044-1CD6DBF2F86E}.Debug|Any CPU.Build.0 = Debug|Any CPU {37EEEA52-94F8-4B27-A044-1CD6DBF2F86E}.Release|Any CPU.ActiveCfg = Release|Any CPU {37EEEA52-94F8-4B27-A044-1CD6DBF2F86E}.Release|Any CPU.Build.0 = Release|Any CPU {C9B4E16E-FF30-4CE0-A617-F833696FBE10}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {C9B4E16E-FF30-4CE0-A617-F833696FBE10}.Debug|Any CPU.Build.0 = Debug|Any CPU {C9B4E16E-FF30-4CE0-A617-F833696FBE10}.Release|Any CPU.ActiveCfg = Release|Any CPU {C9B4E16E-FF30-4CE0-A617-F833696FBE10}.Release|Any CPU.Build.0 = Release|Any CPU {E586F712-DEDA-4CA2-AE97-96DE0180DB0E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {E586F712-DEDA-4CA2-AE97-96DE0180DB0E}.Debug|Any CPU.Build.0 = Debug|Any CPU {E586F712-DEDA-4CA2-AE97-96DE0180DB0E}.Release|Any CPU.ActiveCfg = Release|Any CPU {E586F712-DEDA-4CA2-AE97-96DE0180DB0E}.Release|Any CPU.Build.0 = Release|Any CPU {62109AB0-2E66-4C84-8D62-7A8C9B7E335A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {62109AB0-2E66-4C84-8D62-7A8C9B7E335A}.Debug|Any CPU.Build.0 = Debug|Any CPU {62109AB0-2E66-4C84-8D62-7A8C9B7E335A}.Release|Any CPU.ActiveCfg = Release|Any CPU {62109AB0-2E66-4C84-8D62-7A8C9B7E335A}.Release|Any CPU.Build.0 = Release|Any CPU {F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Debug|Any CPU.Build.0 = Debug|Any CPU {F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Release|Any CPU.ActiveCfg = Release|Any CPU {F3E60E51-EE07-4768-8EC3-E3A323DFA547}.Release|Any CPU.Build.0 = Release|Any CPU {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Debug|Any CPU.Build.0 = Debug|Any CPU {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Release|Any CPU.ActiveCfg = Release|Any CPU {387E4C8D-6A27-40DE-A305-F3F047B8D865}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {BA40BDB6-7E39-4F75-AC8A-EB65F7FC8209} EndGlobalSection EndGlobal ================================================ FILE: Nirvana.sln.DotSettings ================================================  SOLUTION WARNING WARNING DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW DO_NOT_SHOW True False True True True True True True True True True True True True 150 UseVarWhenEvident UseVarWhenEvident UseVarWhenEvident CNV GR IO LI LZ MD RNA SNS SV False <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="I" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="T" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> <Policy Inspect="True" Prefix="" Suffix="" Style="AaBb" /> C:\Users\Michael\AppData\Local\JetBrains\Transient\ReSharperPlatformVs15\v08_b4a306d0\SolutionCaches MarkersAndFullLine LIVE_MONITOR LIVE_MONITOR LIVE_MONITOR DO_NOTHING LIVE_MONITOR LIVE_MONITOR LIVE_MONITOR LIVE_MONITOR LIVE_MONITOR LIVE_MONITOR LIVE_MONITOR LIVE_MONITOR DO_NOTHING LIVE_MONITOR True True True True True 10 <data /> <data><IncludeFilters /><ExcludeFilters /></data> True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True ================================================ FILE: NirvanaLambda/AnnotationJob.cs ================================================ using System; using System.IO; using System.Threading.Tasks; using Amazon.Lambda; using Amazon.Lambda.Core; using Amazon.Lambda.Model; using Cloud.Messages.Annotation; using Cloud.Utilities; using ErrorHandling; using IO; namespace NirvanaLambda { public sealed class AnnotationJob { private const int MinAnnotationTime = 5_000; private const int ReservedPostAnnotationTime = 10_000; private const int WaitBeforeRetry = 2_000; private const string UnknownErrorMessage = "Unknown error -1"; private int _numRetries; private double _annotationTimeOut; private readonly ILambdaContext _lambdaContext; private readonly int _jobIndex; private ErrorCategory? _errorCategory; public AnnotationJob(ILambdaContext context, int jobIndex) { _lambdaContext = context; _jobIndex = jobIndex; } public AnnotationResultSummary Invoke(string functionArn, string functionInput) { try { return InvokeAndRetryWhenThrottled(functionArn, functionInput).Result; } catch (Exception e) { Logger.Log(e); return GetResultSummaryFromFailedInvocation(e); } } private async Task InvokeAndRetryWhenThrottled(string functionArn, string functionInput) { AnnotationResultSummary resultSummary; while (true) { try { var invokeRequest = new InvokeRequest { FunctionName = functionArn, Payload = functionInput, InvocationType = "RequestResponse" }; var payload = GetAnnotationResult(invokeRequest); resultSummary = GetResultSummaryFromSuccessInvocation(payload); break; } catch (Exception e) when (ExceptionUtilities.HasException(e)) { Logger.WriteLine($"Job {_jobIndex}: Invocation is throttled. Retry in {WaitBeforeRetry} ms."); _numRetries++; await Task.Delay(WaitBeforeRetry); } catch (Exception e) when (e.HasErrorMessage(UnknownErrorMessage)) { Logger.WriteLine($"Job {_jobIndex}: {UnknownErrorMessage}. Retry in {WaitBeforeRetry} ms."); _numRetries++; await Task.Delay(WaitBeforeRetry); } } return resultSummary; } internal static AnnotationResultSummary GetResultSummaryFromSuccessInvocation(MemoryStream payload) { var annotationResult = JsonUtilities.Deserialize(payload); string errorMessage = annotationResult.errorCategory == null ? null : annotationResult.status; return AnnotationResultSummary.Create(annotationResult, annotationResult.errorCategory, errorMessage); } private MemoryStream GetAnnotationResult(InvokeRequest invokeRequest) { CheckRemainingTime(); var config = new AmazonLambdaConfig { ResignRetries = true, Timeout = TimeSpan.FromMilliseconds(_annotationTimeOut) }; InvokeResponse response; using (var lambdaClient = new AmazonLambdaClient(config)) { response = lambdaClient.InvokeAsync(invokeRequest).Result; } CheckResponse(response); return response.Payload; } private void CheckRemainingTime() { double currentRemainingTime = _lambdaContext.RemainingTime.TotalMilliseconds; if (currentRemainingTime < MinAnnotationTime + ReservedPostAnnotationTime) { if (_numRetries > 0) { _errorCategory = ErrorCategory.InvocationThrottledError; throw new Exception($"Invocation is still throttled after {_numRetries} retries."); } _errorCategory = ErrorCategory.TimeOutError; throw new Exception($"Only {currentRemainingTime} ms left. No enough time for annotation job."); } _annotationTimeOut = currentRemainingTime - ReservedPostAnnotationTime; } // ReSharper disable once ParameterOnlyUsedForPreconditionCheck.Global internal void CheckResponse(InvokeResponse response) { if (response == null) { _errorCategory = ErrorCategory.NirvanaError; throw new Exception("Failed to get the response from annotation job"); } if (response.FunctionError == "Unhandled") { _errorCategory = ErrorCategory.NirvanaError; throw new Exception("There is unhandled error in annotation job. A possible reason for this is the out-of-memory issue."); } } internal AnnotationResultSummary GetResultSummaryFromFailedInvocation(Exception e) { var additionalDescription = ""; if (ExceptionUtilities.HasException(e)) { _errorCategory = ErrorCategory.TimeOutError; additionalDescription = $" Annotation job was not finished in {_annotationTimeOut} milliseconds."; } if (_errorCategory == null) _errorCategory = ExceptionUtilities.ExceptionToErrorCategory(e); e = ExceptionUtilities.GetInnermostException(e); string errorMessage = $"Failed job when invoking the annotation job: {e.Message}.{additionalDescription}"; return AnnotationResultSummary.Create(null, _errorCategory, errorMessage); } } } ================================================ FILE: NirvanaLambda/AnnotationResultSummary.cs ================================================ using System.IO; using Cloud.Messages.Annotation; using ErrorHandling; namespace NirvanaLambda { public sealed class AnnotationResultSummary { public string ErrorMessage; public string FileName; public int VariantCount; public ErrorCategory? ErrorCategory; public static AnnotationResultSummary Create(AnnotationResult annotationResult, ErrorCategory? errorCategory, string errorMessage) { string fileName = Path.GetFileName(annotationResult?.filePath); return new AnnotationResultSummary { ErrorCategory = errorCategory, ErrorMessage = errorMessage, FileName = fileName, VariantCount = annotationResult?.variantCount ?? 0 }; } } } ================================================ FILE: NirvanaLambda/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: NirvanaLambda/NirvanaLambda.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using System.Text; using System.Threading.Tasks; using Amazon.Lambda.Core; using Cloud; using Cloud.Messages; using Cloud.Messages.Annotation; using Cloud.Messages.Nirvana; using Cloud.Notifications; using Cloud.Utilities; using CommandLine.Utilities; using Compression.FileHandling; using ErrorHandling; using ErrorHandling.Exceptions; using Genome; using Intervals; using IO; using Tabix; using VariantAnnotation.Caches.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Providers; using JsonSerializer = Amazon.Lambda.Serialization.Json.JsonSerializer; [assembly: LambdaSerializer(typeof(JsonSerializer))] namespace NirvanaLambda { // ReSharper disable once UnusedMember.Global // ReSharper disable once ClassNeverInstantiated.Global public sealed class NirvanaLambda { private const string AnnotationLambdaFailedStatus = "One or more annotation Lambdas failed"; private const string AnnotationLambdaKey = "annotation_lambda_arn"; private const string TryAgainMessage = "Please try again later."; private const int MaxNumPartitions = 30; private const int MinNumPartitions = 6; private const int MinPartitionSize = 10_000_000; private readonly HashSet _supportedAssemblies = new HashSet { GenomeAssembly.GRCh37, GenomeAssembly.GRCh38 }; // ReSharper disable once UnusedMember.Global public NirvanaResult Run(NirvanaConfig config, ILambdaContext context) { NirvanaResult result; string snsTopicArn = null; var runLog = new StringBuilder(); try { LogUtilities.UpdateLogger(context.Logger, runLog); LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion); LogUtilities.LogObject("Config", config); LogUtilities.Log(new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey, "annotation_lambda_arn" }); LambdaUtilities.GarbageCollect(); snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey); string annotationLambdaArn = LambdaUtilities.GetEnvironmentVariable(AnnotationLambdaKey); config.Validate(); var genomeAssembly = GenomeAssemblyHelper.Convert(config.genomeAssembly); if (!_supportedAssemblies.Contains(genomeAssembly)) throw new UserErrorException($"Unsupported assembly: {config.genomeAssembly}"); AnnotationRange[] annotationRanges = GetAnnotationRanges(config, genomeAssembly); result = GetNirvanaResult(annotationRanges, config, annotationLambdaArn, context, runLog, snsTopicArn); } catch (Exception exception) { result = HandleException(runLog, config, exception, snsTopicArn); } LogUtilities.LogObject("Result", result); return result; } private static AnnotationRange[] GetAnnotationRanges(NirvanaConfig config, GenomeAssembly genomeAssembly) { string cachePathPrefix = LambdaUtilities.GetCachePathPrefix(genomeAssembly); using Stream tabixStream = PersistentStreamUtils.GetReadStream(config.tabixUrl); using var tabixReader = new BinaryReader(new BlockGZipStream(tabixStream, CompressionMode.Decompress)); using Stream referenceStream = PersistentStreamUtils.GetReadStream(LambdaUrlHelper.GetRefUrl(genomeAssembly)); using var sequenceProvider = new ReferenceSequenceProvider(referenceStream); long vcfSize = HttpUtilities.GetLength(config.vcfUrl); int numPartitions = Math.Max(Math.Min((int) ((vcfSize - 1) / MinPartitionSize + 1), MaxNumPartitions), MinNumPartitions); Tabix.Index tabixIndex = Reader.Read(tabixReader, sequenceProvider.RefNameToChromosome); List blockOffsets = PartitionUtilities.GetFileOffsets(config.vcfUrl, numPartitions, tabixIndex); // stop early if we're going to annotate the entire file if (blockOffsets.Count == 1 && blockOffsets[0] == 0) return null; using var taProvider = new TranscriptAnnotationProvider(cachePathPrefix, sequenceProvider, null); IntervalArray[] transcriptIntervalArrays = taProvider.TranscriptIntervalArrays; IntervalForest geneIntervalForest = GeneForestGenerator.GetGeneForest(transcriptIntervalArrays); Dictionary refNameToChromosome = sequenceProvider.RefNameToChromosome; return PartitionUtilities.GenerateAnnotationRanges(blockOffsets, config.vcfUrl, geneIntervalForest, refNameToChromosome); } private static NirvanaResult HandleException(StringBuilder runLog, NirvanaConfig config, Exception e, string snsTopicArn) { Logger.Log(e); var errorCategory = ExceptionUtilities.ExceptionToErrorCategory(e); return GetNirvanaFailResult(runLog, config, errorCategory, e.Message, e.StackTrace, snsTopicArn); } private static NirvanaResult GetNirvanaFailResult(StringBuilder runLog, NirvanaConfig config, ErrorCategory errorCategory, string errorMessage, string stackTrace, string snsTopicArn) { string status = GetFailedRunStatus(errorCategory, errorMessage); if (errorCategory != ErrorCategory.UserError) { string snsMessage = SNS.CreateMessage(runLog.ToString(), status, stackTrace); SNS.SendMessage(snsTopicArn, snsMessage); } return new NirvanaResult { id = config.id, status = status, variantCount = 0, jwtFields = config.jwtFields }; } internal static string GetFailedRunStatus(ErrorCategory errorCategory, string errorMessage) { // ReSharper disable once SwitchStatementMissingSomeCases switch (errorCategory) { case ErrorCategory.TimeOutError: return "Timeout error: annotation of the VCF was not finished on time due to network congestion. " + TryAgainMessage; case ErrorCategory.InvocationThrottledError: return "Invocation throttled error: there are too many lambdas currently running in this account. " + TryAgainMessage; case ErrorCategory.UserError: return "User error: " + FirstCharToLower(errorMessage); default: return "Nirvana error: an unexpected annotation error occurred while annotating this VCF."; } } private static NirvanaResult GetNirvanaResult(AnnotationRange[] annotationRanges, NirvanaConfig config, string annotationLambdaArn, ILambdaContext context, StringBuilder runLog, string snsTopicArn) { Task[] annotationTasks = CallAnnotationLambdas(config, annotationLambdaArn, context, annotationRanges); AnnotationResultSummary[] processedAnnotationResults = Task.WhenAll(annotationTasks).Result; (ErrorCategory? errorCategory, string errorMessage) = GetMostSevereErrorCategoryAndMessage(processedAnnotationResults); if (errorCategory != null) return GetNirvanaFailResult(runLog, config, errorCategory.Value, errorMessage, null, snsTopicArn); string[] fileNames = processedAnnotationResults.Select(x => x.FileName).ToArray(); int variantCount = processedAnnotationResults.Sum(x => x.VariantCount); return new NirvanaResult { id = config.id, status = LambdaUrlHelper.SuccessMessage, created = new FileList { bucketName = config.outputDir.bucketName, outputDir = config.outputDir.path, files = fileNames }, variantCount = variantCount, jwtFields = config.jwtFields }; } private static (ErrorCategory?, string) GetMostSevereErrorCategoryAndMessage(IEnumerable annotationResultSummaries) { List<(AnnotationResultSummary Item, int Index)> failedJobs = annotationResultSummaries .Select(x => x ?? AnnotationResultSummary.Create(null, ErrorCategory.NirvanaError, "No result summary available for the annotation job.")) .Select((x, i) => (Item: x, Index: i)).Where(x => x.Item.ErrorCategory != null).ToList(); if (failedJobs.Count == 0) return (null, null); Logger.WriteLine(AnnotationLambdaFailedStatus); failedJobs.ForEach(x => Logger.WriteLine($"Job {x.Index + 1}: {x.Item.ErrorCategory} {x.Item.ErrorMessage}")); ErrorCategory? mostSevereError = failedJobs.Select(x => x.Item.ErrorCategory).Min(); string errorMessage = mostSevereError == ErrorCategory.UserError ? string.Join(";", failedJobs.Where(x => x.Item.ErrorCategory == mostSevereError).Select(x => x.Item.ErrorMessage).Distinct()) : ""; return (mostSevereError, errorMessage); } private static Task[] CallAnnotationLambdas(NirvanaConfig config, string annotationLambdaArn, ILambdaContext context, IEnumerable annotationRanges) => annotationRanges?.Select((x, i) => RunAnnotationJob(config, annotationLambdaArn, context, x, i + 1)).ToArray() ?? new[] {RunAnnotationJob(config, annotationLambdaArn, context, null, 1)}; private static Task RunAnnotationJob(NirvanaConfig config, string annotationLambdaArn, ILambdaContext context, AnnotationRange range, int jobIndex) { var annotationConfig = GetAnnotationConfig(config, range, jobIndex); Logger.WriteLine($"Job: {jobIndex}, Annotation region: {DescribeAnnotationRegion(range)}"); string configString = JsonUtilities.Stringify(annotationConfig); var annotationJob = new AnnotationJob(context, jobIndex); return Task.Run(() => annotationJob.Invoke(annotationLambdaArn, configString)); } private static string DescribeAnnotationRegion(AnnotationRange ar) { if (ar == null) return "Whole VCF"; string ret = $"{ar.Start.Chromosome}:{ar.Start.Position}-"; return ar.End == null ? ret : $"{ret}{ar.End?.Chromosome}:{ar.End?.Position}"; } private static AnnotationConfig GetAnnotationConfig(NirvanaConfig config, AnnotationRange annotationRange, int jobIndex) => new() { id = config.id + $"_job{jobIndex}", genomeAssembly = config.genomeAssembly, vcfUrl = config.vcfUrl, tabixUrl = config.tabixUrl, outputDir = config.outputDir, outputPrefix = GetIndexedPrefix(config.vcfUrl, jobIndex), customAnnotations = config.customAnnotations, desiredVcfInfo = config.desiredVcfInfo, desiredVcfSampleInfo = config.desiredVcfSampleInfo, customStrUrl = config.customStrUrl, annotationRange = annotationRange }; internal static string GetIndexedPrefix(string inputVcfPath, int jobIndex) => inputVcfPath.TrimEndFromFirst("?").TrimStartToLast("/").TrimEndFromFirst(".vcf") + "_" + jobIndex.ToString("00000"); private static string FirstCharToLower(string input) => string.IsNullOrEmpty(input) || char.IsLower(input[0]) ? input : char.ToLowerInvariant(input[0]) + input.Substring(1); } } ================================================ FILE: NirvanaLambda/NirvanaLambda.csproj ================================================  net6.0 true Lambda bin\$(Configuration) ================================================ FILE: NirvanaLambda/PartitionUtilities.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using Cloud.Messages.Annotation; using Compression.FileHandling; using Genome; using Intervals; using IO; using Tabix; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.IO; namespace NirvanaLambda { public static class PartitionUtilities { public static List GetFileOffsets(string vcfUrl, int numPartitions, Tabix.Index tabixIndex) { long fileSize = HttpUtilities.GetLength(vcfUrl); long[] sizeBasedOffsets = GetEqualSizeOffsets(fileSize, numPartitions); return GetBlockOffsets(sizeBasedOffsets, tabixIndex); } private static List GetBlockOffsets(long[] sizeBasedOffsets, Tabix.Index tabixIndex) { long[] allLinearOffsets = GetAllLinearFileOffsets(tabixIndex); return FindEqualOrClosestSmallerOffsets(sizeBasedOffsets, allLinearOffsets); } internal static List FindEqualOrClosestSmallerOffsets(long[] sizeBasedOffsets, long[] allLinearOffsets) { if (sizeBasedOffsets == null || allLinearOffsets == null) return new List(); var closestOffsets = new List(); var startIndex = 0; foreach (long offset in sizeBasedOffsets) { int searchedIndex = Array.BinarySearch(allLinearOffsets, startIndex, allLinearOffsets.Length - startIndex, offset); if (searchedIndex < 0) searchedIndex = ~searchedIndex - 1; if (searchedIndex < 0) searchedIndex = 0; // only add new offset if it is different from the last one in the list if (closestOffsets.Count == 0 || startIndex != searchedIndex) closestOffsets.Add(allLinearOffsets[searchedIndex]); startIndex = searchedIndex; } return closestOffsets; } internal static long[] GetEqualSizeOffsets(long fileSize, int numPartitions) { var offsets = new long[numPartitions]; long baseSize = fileSize / numPartitions; //put all the extra {fileSize%numPartitions} bytes to the last partition for (var i = 0; i < numPartitions; i++) offsets[i] = baseSize * i; return offsets; } private static long[] GetAllLinearFileOffsets(Tabix.Index tabixIndex) { if (tabixIndex.ReferenceSequences.Length == 0) return new long[] {0}; return MergeConsecutiveEqualValues( tabixIndex.ReferenceSequences.SelectMany(x => x.LinearFileOffsets.Select(y => VirtualPosition.From((long) y).FileOffset))).ToArray(); } public static IEnumerable MergeConsecutiveEqualValues(IEnumerable values) { var isFirstValue = true; T lastValue = default; foreach (var value in values) { if (!isFirstValue && lastValue.Equals(value)) continue; isFirstValue = false; lastValue = value; yield return value; } } public static AnnotationRange[] GenerateAnnotationRanges(List blockBasedOffsets, string vcfUrl, IntervalForest geneIntervalForest, Dictionary refNameToChromosome) { // There may be less intervals for annotation Lambda after the adjustment AnnotationPosition[] adjustedStarts = AdjustPartitionGenomicStarts(blockBasedOffsets, vcfUrl, geneIntervalForest, refNameToChromosome); return GetRanges(adjustedStarts); } private static AnnotationPosition[] AdjustPartitionGenomicStarts(IReadOnlyList blockBasedOffsets, string vcfUrl, IIntervalForest geneIntervalForest, Dictionary refNameToChromosome) { var allAdjustedStarts = new AnnotationPosition[blockBasedOffsets.Count]; for (var index = 0; index < blockBasedOffsets.Count; index++) { long blockBasedOffset = blockBasedOffsets[index]; using (var stream = PersistentStreamUtils.GetReadStream(vcfUrl, blockBasedOffset)) using (var gzipStream = new BlockGZipStream(stream, CompressionMode.Decompress)) { var annotationPosition = GetFirstGenomicPosition(gzipStream, index == 0); allAdjustedStarts[index] = FindProperStartPosition(annotationPosition, geneIntervalForest, refNameToChromosome); } } AnnotationPosition[] adjustedStarts = MergeConsecutiveEqualValues(allAdjustedStarts).ToArray(); return adjustedStarts; } private static AnnotationRange[] GetRanges(AnnotationPosition[] adjustedStarts) { int numStarts = adjustedStarts.Length; int lastIndex = numStarts - 1; if (numStarts == 1) return null; var ranges = new AnnotationRange[numStarts]; for (var i = 0; i < lastIndex; i++) //The end position in an annotation range can be smaller than 1, which indicate it ends at the end of previous chromosome ranges[i] = new AnnotationRange(adjustedStarts[i], new AnnotationPosition(adjustedStarts[i + 1].Chromosome, adjustedStarts[i + 1].Position - 1)); ranges[lastIndex] = new AnnotationRange(adjustedStarts[lastIndex], null); return ranges; } private static AnnotationPosition GetFirstGenomicPosition(Stream vcfStream, bool isFirstBlock) { if (vcfStream == null) throw new ArgumentNullException(nameof(vcfStream),"The VCF stream trying to read is null."); using (var streamReader = new StreamReader(vcfStream)) { // Discard the first line if this is not the first block, as it may be a partial VCF line if (!isFirstBlock) streamReader.ReadLine(); string line; while ((line = streamReader.ReadLine()) != null) { if (line.StartsWith('#')) continue; string[] splits = line.Split('\t', 3); if (splits.Length < 3) continue; string chrom = splits[VcfCommon.ChromIndex]; string positionString = splits[VcfCommon.PosIndex]; if (!int.TryParse(positionString, out int position)) throw new InvalidDataException($"Position {positionString} in VCF line {line} is not a number."); return new AnnotationPosition(chrom, position); } throw new InvalidDataException("No variant found in the VCF stream."); } } private static AnnotationPosition FindProperStartPosition(AnnotationPosition genomicPosition, IIntervalForest geneIntervalForest, Dictionary refNameToChromosome) { var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, genomicPosition.Chromosome); int currentPosition = genomicPosition.Position; IGene[] overlappingGenes; while ((overlappingGenes = geneIntervalForest.GetAllOverlappingValues(chromosome.Index, currentPosition, currentPosition)) != null) { if (overlappingGenes.Length > 0) currentPosition = overlappingGenes.Select(x => x.Start).Min() - 1; } // Always return the position right before the overlapping genes to KISS return new AnnotationPosition(genomicPosition.Chromosome, currentPosition < 1 ? 1 : currentPosition); } } } ================================================ FILE: OptimizedCore/ExpandableArray.cs ================================================ using System.Buffers; namespace OptimizedCore { public static class ExpandableArray { public static T[] Get(int size) { var pool = ArrayPool.Shared; return pool.Rent(size); } public static T[] Resize(T[] array, int newSize) { var pool = ArrayPool.Shared; pool.Return(array); return pool.Rent(newSize); } public static void Return(T[] array) { var pool = ArrayPool.Shared; pool.Return(array); } } } ================================================ FILE: OptimizedCore/NullSequenceEqual.cs ================================================ namespace OptimizedCore { public static class NullSequenceEqual { public static bool ArrayEqual(this T[] first, T[] second) { if (ReferenceEquals(first, second)) return true; if (first == null || second == null) return false; if (first.Length != second.Length) return false; for (var i = 0; i < first.Length; i++) if (!first[i].Equals(second[i])) return false; return true; } } } ================================================ FILE: OptimizedCore/OptimizedCore.csproj ================================================ net6.0 ..\bin\$(Configuration) true ================================================ FILE: OptimizedCore/StringBuilderPool.cs ================================================ using System.Text; using Microsoft.Extensions.ObjectPool; namespace OptimizedCore { public static class StringBuilderPool { private static readonly ObjectPool Pool = new DefaultObjectPool(new StringBuilderPooledObjectPolicy(), 1024); public static StringBuilder Get() { var sb = Pool.Get(); sb.Clear(); return sb; } public static string GetStringAndReturn(StringBuilder sb) { var s = sb.ToString(); Return(sb); return s; } public static void Return(StringBuilder sb) { if (sb == null) return; Pool.Return(sb); } } } ================================================ FILE: OptimizedCore/StringExtensions.cs ================================================ using System; namespace OptimizedCore { public static class StringExtensions { public static unsafe string[] OptimizedSplit(this string s, char delimiter, int numColumns = -1) { var numReplaces = 0; int sLen = s.Length; var sepList = new int[s.Length]; // find the locations of our tab delimiter fixed (char* chPtr = s) { for (var index = 0; index < sLen; ++index) { if (chPtr[index] == delimiter) sepList[numReplaces++] = index; } } // extract our columns var startIndex = 0; var colIndex = 0; int numDelimitedColumns = numReplaces + 1; if (numColumns < numDelimitedColumns) numColumns = numDelimitedColumns; var columns = new string[numColumns]; for (var index = 0; index < numReplaces && startIndex < sLen; ++index) { columns[colIndex++] = s.Substring(startIndex, sepList[index] - startIndex); startIndex = sepList[index] + 1; } // handle the last column if (startIndex < sLen && numReplaces >= 0) columns[colIndex] = s.Substring(startIndex); else if (colIndex == numReplaces) columns[colIndex] = string.Empty; return columns; } public static (string Key, string Value) OptimizedKeyValue(this string s) { int equalPos = s.IndexOf('='); return equalPos == -1 ? (s, null) : (s.Substring(0, equalPos), s.Substring(equalPos + 1)); } /// /// handles -2_147_483_647 to +2_147_483_647 /// public static unsafe (int Number, bool FoundError) OptimizedParseInt32(this string s) { var number = 0; // 2_147_483_647 if (string.IsNullOrEmpty(s) || s.Length > 11) return (0, true); try { fixed (char* chPtr = s) { int index = s.Length - 1; var ptr = chPtr; var applyNegative = false; if (*ptr == '-') { applyNegative = true; ptr++; index--; } while (index >= 0) { if (*ptr < 48 || *ptr > 57) return (0, true); checked { number *= 10; number += *ptr++ - '0'; } index--; } if (applyNegative) number = -number; } } catch (OverflowException) { return (0, true); } return (number, false); } public static bool OptimizedStartsWith(this string s, char ch) => s.Length > 0 && s[0] == ch; public static bool OptimizedEndsWith(this string s, char ch) => s.Length > 0 && s[s.Length - 1] == ch; } } ================================================ FILE: Phantom/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: Phantom/Phantom.csproj ================================================  net6.0 ..\bin\$(Configuration) ================================================ FILE: README.md ================================================ > [!IMPORTANT] > Nirvana is no longer actively maintained as an open sourced tool. Please visit [Illumina Connected Annotations](https://developer.illumina.com/illumina-connected-annotations) for the latest version. Latest documentation can be found [here](https://illumina.github.io/IlluminaConnectedAnnotationsDocumentation/) Nirvana is licensed under [PolyForm Strict License 1.0.0](https://github.com/Illumina/Nirvana/blob/main/LICENSE) # Nirvana Nirvana provides **clinical-grade annotation of genomic variants** (SNVs, MNVs, insertions, deletions, indels, and SVs (including CNVs). It can be run as a stand-alone package or integrated into larger software tools that require variant annotation. The input to Nirvana are VCFs and the output is a structured JSON representation of all annotation and sample information (as extracted from the VCF). Optionally, a subset of the annotated data is available in VCF and/or gVCF files. Nirvana handles multiple alternate alleles and multiple samples with ease. The software is being developed under a rigorous SDLC and testing process to ensure accuracy of the results and enable embedding in other software with regulatory needs. Nirvana uses a continuous integration pipeline where millions of variant annotations are monitored against baseline values on a daily basis. Backronym: **NI**mble and **R**obust **VA**riant a**N**not**A**tor
## Resources *Please note that our documentation site has moved.* * [Documentation](https://illumina.github.io/NirvanaDocumentation/) * [Release Notes](https://github.com/Illumina/Nirvana/releases) ================================================ FILE: ReferenceSequence/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: ReferenceSequence/Commands/CreateReferenceMain.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using Genome; using IO; using ReferenceSequence.Common; using ReferenceSequence.Compression; using ReferenceSequence.Creation; using ReferenceSequence.IO; namespace ReferenceSequence.Commands { public static class CreateReferenceMain { private static string _fastaPrefix; private static string _genomeAssemblyReportPath; private static string _cytogeneticBandPath; private static string _referenceNamesPath; private static string _genomeAssembly; private static string _outputCompressedPath; private static byte _patchLevel; private static ExitCodes ProgramExecution() { var genomeAssembly = GenomeAssemblyHelper.Convert(_genomeAssembly); Console.Write("- loading previous reference names... "); List oldChromosomes = ReferenceNamesReader.GetReferenceNames(FileUtilities.GetReadStream(_referenceNamesPath)); Console.WriteLine("finished."); Dictionary oldRefNameToChromosome = ReferenceDictionaryUtils.GetRefNameToChromosome(oldChromosomes); Console.Write("- reading the genome assembly report... "); List chromosomes = AssemblyReader.GetChromosomes(FileUtilities.GetReadStream(_genomeAssemblyReportPath), oldRefNameToChromosome, oldChromosomes.Count); int numRefSeqs = chromosomes.Count; Console.WriteLine($"{numRefSeqs} references found."); Console.Write("- checking reference index contiguity... "); CheckReferenceIndexContiguity(chromosomes, oldChromosomes); Console.WriteLine("contiguous."); Dictionary refNameToChromosome = ReferenceDictionaryUtils.GetRefNameToChromosome(chromosomes); Console.Write("- reading cytogenetic bands... "); List[] cytogeneticBandsByRef = CytogeneticBandsReader.GetCytogeneticBands(FileUtilities.GetReadStream(_cytogeneticBandPath), numRefSeqs, refNameToChromosome); Console.WriteLine("finished."); Console.WriteLine("- reading FASTA files:"); List fastaSequences = GetFastaSequences(_fastaPrefix, refNameToChromosome); long genomeLength = GetGenomeLength(fastaSequences); Console.WriteLine($"- genome length: {genomeLength:N0}"); Console.Write("- check if chrY has PAR masking... "); CheckChrYPadding(fastaSequences); Console.WriteLine("unmasked."); Console.Write("- applying 2-bit compression... "); List referenceSequences = CreateReferenceSequences(fastaSequences, cytogeneticBandsByRef); Console.WriteLine("finished."); Console.Write("- creating reference sequence file... "); CreateReferenceSequenceFile(genomeAssembly, _patchLevel, chromosomes, referenceSequences); long fileSize = new FileInfo(_outputCompressedPath).Length; Console.WriteLine($"{fileSize:N0} bytes"); return ExitCodes.Success; } private static long GetGenomeLength(IEnumerable fastaSequences) => fastaSequences.Aggregate(0, (current, fastaSequence) => current + fastaSequence.Bases.Length); private static List CreateReferenceSequences(IEnumerable fastaSequences, IReadOnlyList> cytogeneticBandsByRef) { var referenceSequences = new List(); foreach (var fastaSequence in fastaSequences) { Band[] cytogeneticBands = cytogeneticBandsByRef[fastaSequence.Chromosome.Index].ToArray(); (byte[] buffer, MaskedEntry[] maskedEntries) = TwoBitCompressor.Compress(fastaSequence.Bases); var referenceSequence = new Creation.ReferenceSequence(buffer, maskedEntries, cytogeneticBands, 0, fastaSequence.Bases.Length); referenceSequences.Add(referenceSequence); } return referenceSequences; } private static void CheckChrYPadding(IEnumerable fastaSequences) { FastaSequence chrY = fastaSequences.FirstOrDefault(s => s.Chromosome.UcscName == "chrY"); if (chrY == null) return; int numN = CountNs(chrY.Bases); if (numN > 33720001) { throw new InvalidDataException($"Found a large number of Ns ({numN}) in the Y chromosome. Are you sure the PAR region is unmasked?"); } } private static List GetFastaSequences(string fastaPrefix, Dictionary refNameToChromosome) { string directory = Path.GetDirectoryName(fastaPrefix); string prefix = Path.GetFileName(fastaPrefix); string[] fastaFiles = Directory.GetFiles(directory, $"{prefix}*.fa.gz"); var references = new List(); foreach (string filePath in fastaFiles) { Console.Write($" - parsing {Path.GetFileName(filePath)}... "); FastaReader.AddReferenceSequences(new GZipStream(FileUtilities.GetReadStream(filePath), CompressionMode.Decompress), refNameToChromosome, references); Console.WriteLine($"total: {references.Count} sequences"); } return references.OrderBy(x => x.Chromosome.Index).ToList(); } private static void CheckReferenceIndexContiguity(IEnumerable chromosomes, IReadOnlyList oldChromosomes) { ushort testRefIndex = 0; foreach (var chromosome in chromosomes) { if (chromosome.Index != testRefIndex) { Console.WriteLine($"Found a non-contiguous entry at test refIndex: {testRefIndex} vs chromosome.Index: {chromosome.Index}"); Console.WriteLine($"NEW: RefIndex: {chromosome.Index}, Ensembl: {chromosome.EnsemblName}, UCSC: {chromosome.UcscName}, GenBank: {chromosome.GenBankAccession}, RefSeq: {chromosome.RefSeqAccession}"); Console.WriteLine($"OLD: RefIndex: {oldChromosomes[testRefIndex].Index}, Ensembl: {oldChromosomes[testRefIndex].EnsemblName}, UCSC: {oldChromosomes[testRefIndex].UcscName}, GenBank: {oldChromosomes[testRefIndex].GenBankAccession}, RefSeq: {oldChromosomes[testRefIndex].RefSeqAccession}"); Environment.Exit(1); } testRefIndex++; } } private static void CreateReferenceSequenceFile(GenomeAssembly genomeAssembly, byte patchLevel, IReadOnlyCollection chromosomes, List referenceSequences) { using (var writer = new ReferenceSequenceWriter(FileUtilities.GetCreateStream(_outputCompressedPath), chromosomes, genomeAssembly, patchLevel)) { writer.Write(referenceSequences); } } private static int CountNs(string s) { var numN = 0; foreach (char c in s) if (c == 'N') numN++; return numN; } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "cb|c=", "cytogenetic band {filename}", v => _cytogeneticBandPath = v }, { "ga=", "genome assembly {version}", v => _genomeAssembly = v }, { "gar|g=", "genome assembly report {filename}", v => _genomeAssemblyReportPath = v }, { "in|i=", "FASTA {prefix}", v => _fastaPrefix = v }, { "patch=", "patch {level}", (byte v) => _patchLevel = v }, { "rn=", "reference names {filename}", v => _referenceNamesPath = v }, { "out|o=", "output compressed reference {filename}", v => _outputCompressedPath = v } }; string commandLineExample = $"{command} --in --gar --cb --rn --ga --out "; return new ConsoleAppBuilder(args, ops) .Parse() .CheckInputFilenameExists(_genomeAssemblyReportPath, "genome assembly report", "--gar") .CheckInputFilenameExists(_cytogeneticBandPath, "cytogenetic band", "--cb") .CheckInputFilenameExists(_referenceNamesPath, "reference names", "--rn") .HasRequiredParameter(_fastaPrefix, "FASTA prefix", "--in") .HasRequiredParameter(_genomeAssembly, "genome assembly", "--ga") .HasRequiredParameter(_patchLevel, "patch level", "--patch") .HasRequiredParameter(_outputCompressedPath, "output reference", "--out") .SkipBanner() .ShowHelpMenu("Converts a FASTA file to the Nirvana reference format.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: ReferenceSequence/Commands/CreateSubstringMain.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using Genome; using IO; using ReferenceSequence.Common; using ReferenceSequence.Compression; using ReferenceSequence.Creation; using ReferenceSequence.IO; namespace ReferenceSequence.Commands { public static class CreateSubstringMain { private static string _fastaPath; private static string _genomeAssemblyReportPath; private static string _cytogeneticBandPath; private static string _genomeAssembly; private static string _outputCompressedPath; private static int _beginPosition; private static int _endPosition; private static ExitCodes ProgramExecution() { var genomeAssembly = GenomeAssemblyHelper.Convert(_genomeAssembly); Console.Write("- reading the genome assembly report... "); var dummyRefNameToChromosome = new Dictionary(); List chromosomes = AssemblyReader.GetChromosomes(FileUtilities.GetReadStream(_genomeAssemblyReportPath), dummyRefNameToChromosome, 0); int numRefSeqs = chromosomes.Count; Console.WriteLine($"{numRefSeqs} references found."); Dictionary refNameToChromosome = ReferenceDictionaryUtils.GetRefNameToChromosome(chromosomes); Console.Write("- reading FASTA file... "); var fastaSequence = GetFastaSequence(_fastaPath, refNameToChromosome); Console.WriteLine($"- sequence length: {fastaSequence.Bases.Length:N0}"); Console.Write("- reading cytogenetic bands... "); List cytogeneticBands = GetCytogeneticBands(fastaSequence.Chromosome.Index, numRefSeqs, refNameToChromosome); Console.WriteLine("finished."); Console.Write("- applying 2-bit compression... "); var referenceSequence = CreateReferenceSequence(fastaSequence, cytogeneticBands); Console.WriteLine("finished."); Console.Write("- creating reference sequence file... "); var minimalChromosomes = new List { fastaSequence.Chromosome }; CreateReferenceSequenceFile(genomeAssembly, minimalChromosomes, referenceSequence); long fileSize = new FileInfo(_outputCompressedPath).Length; Console.WriteLine($"{fileSize:N0} bytes"); return ExitCodes.Success; } private static List GetCytogeneticBands(ushort refIndex, int numRefSeqs, Dictionary refNameToChromosome) { List chrBands = CytogeneticBandsReader.GetCytogeneticBands(FileUtilities.GetReadStream(_cytogeneticBandPath), numRefSeqs, refNameToChromosome)[refIndex]; int substringBegin = _beginPosition; int substringEnd = _beginPosition + _endPosition - 1; return chrBands.Where(band => Intervals.Utilities.Overlaps(substringBegin, substringEnd, band.Begin, band.End)) .ToList(); } private static void CreateReferenceSequenceFile(GenomeAssembly genomeAssembly, IReadOnlyCollection chromosomes, Creation.ReferenceSequence referenceSequence) { using (var writer = new ReferenceSequenceWriter(FileUtilities.GetCreateStream(_outputCompressedPath), chromosomes, genomeAssembly, 0)) { writer.Write(new List {referenceSequence}); } } private static Creation.ReferenceSequence CreateReferenceSequence(FastaSequence fastaSequence, List cytogeneticBands) { Band[] bands = cytogeneticBands.ToArray(); (byte[] buffer, MaskedEntry[] maskedEntries) = TwoBitCompressor.Compress(fastaSequence.Bases); return new Creation.ReferenceSequence(buffer, maskedEntries, bands, _beginPosition - 1, fastaSequence.Bases.Length); } private static FastaSequence GetFastaSequence(string fastaPath, Dictionary refNameToChromosome) { var references = new List(); FastaReader.AddReferenceSequences(new GZipStream(FileUtilities.GetReadStream(fastaPath), CompressionMode.Decompress), refNameToChromosome, references); if (references.Count != 1) { throw new InvalidDataException($"Expected 1 reference, but found {references.Count} references."); } var reference = references[0]; int length = _endPosition - _beginPosition + 1; string substring = reference.Bases.Substring(_beginPosition - 1, length); return new FastaSequence(reference.Chromosome, substring); } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "begin=", "begin {position}", (int v) => _beginPosition = v }, { "cb|c=", "cytogenetic band {filename}", v => _cytogeneticBandPath = v }, { "end=", "end {position}", (int v) => _endPosition = v }, { "ga=", "genome assembly {version}", v => _genomeAssembly = v }, { "gar|g=", "genome assembly report {filename}", v => _genomeAssemblyReportPath = v }, { "in|i=", "FASTA {filename}", v => _fastaPath = v }, { "out|o=", "output compressed reference {filename}", v => _outputCompressedPath = v } }; string commandLineExample = $"{command} --in --gar --cb --rn --ga --out "; return new ConsoleAppBuilder(args, ops) .Parse() .CheckInputFilenameExists(_genomeAssemblyReportPath, "genome assembly report", "--gar") .CheckInputFilenameExists(_cytogeneticBandPath, "cytogenetic band", "--cb") .HasRequiredParameter(_fastaPath, "FASTA prefix", "--in") .HasRequiredParameter(_genomeAssembly, "genome assembly", "--ga") .HasRequiredParameter(_outputCompressedPath, "output reference", "--out") .HasRequiredParameter(_beginPosition, "offset", "--begin") .HasRequiredParameter(_endPosition, "length", "--end") .SkipBanner() .ShowHelpMenu("Converts a FASTA file to the Nirvana reference format.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: ReferenceSequence/Commands/CreateTestSeqMain.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using Genome; using IO; using ReferenceSequence.Common; using ReferenceSequence.Compression; using ReferenceSequence.Creation; using ReferenceSequence.IO; namespace ReferenceSequence.Commands { public static class CreateTestSeqMain { private static string _outputCompressedPath; private static ExitCodes ProgramExecution() { var testSeqChromosome = new Chromosome("chrTestSeq", "TestSeq", null, null, 1, 0); var chromosomes = new List {testSeqChromosome}; Console.Write("- creating FASTA sequence... "); var fastaSequence = new FastaSequence(testSeqChromosome, "NNATGTTTCCACTTTCTCCTCATTAGANNNTAACGAATGGGTGATTTCCCTAN"); Console.WriteLine($"- sequence length: {fastaSequence.Bases.Length:N0}"); Console.Write("- applying 2-bit compression... "); var referenceSequence = CreateReferenceSequence(fastaSequence); Console.WriteLine("finished."); Console.Write("- creating reference sequence file... "); CreateReferenceSequenceFile(GenomeAssembly.GRCh37, chromosomes, referenceSequence); long fileSize = new FileInfo(_outputCompressedPath).Length; Console.WriteLine($"{fileSize:N0} bytes"); return ExitCodes.Success; } private static void CreateReferenceSequenceFile(GenomeAssembly genomeAssembly, IReadOnlyCollection chromosomes, Creation.ReferenceSequence referenceSequence) { using (var writer = new ReferenceSequenceWriter(FileUtilities.GetCreateStream(_outputCompressedPath), chromosomes, genomeAssembly, 0)) { writer.Write(new List { referenceSequence }); } } private static Creation.ReferenceSequence CreateReferenceSequence(FastaSequence fastaSequence) { (byte[] buffer, MaskedEntry[] maskedEntries) = TwoBitCompressor.Compress(fastaSequence.Bases); return new Creation.ReferenceSequence(buffer, maskedEntries, new Band[0], 0, fastaSequence.Bases.Length); } public static ExitCodes Run(string command, string[] args) { var ops = new OptionSet { { "out|o=", "output compressed reference {filename}", v => _outputCompressedPath = v } }; string commandLineExample = $"{command} --out "; return new ConsoleAppBuilder(args, ops) .Parse() .HasRequiredParameter(_outputCompressedPath, "output reference", "--out") .SkipBanner() .ShowHelpMenu("Creates a TestSeq_reference.dat file.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); } } } ================================================ FILE: ReferenceSequence/Common/IndexEntry.cs ================================================ namespace ReferenceSequence.Common { internal sealed class IndexEntry { public readonly ushort RefIndex; public readonly long FileOffset; public const int Size = 10; internal IndexEntry(ushort refIndex, long fileOffset) { RefIndex = refIndex; FileOffset = fileOffset; } } } ================================================ FILE: ReferenceSequence/Common/MaskedEntry.cs ================================================ namespace ReferenceSequence.Common { internal sealed class MaskedEntry { public readonly int Begin; public readonly int End; internal MaskedEntry(int begin, int end) { Begin = begin; End = end; } } } ================================================ FILE: ReferenceSequence/Common/Sequence.cs ================================================ using Genome; using Intervals; using ReferenceSequence.IO; namespace ReferenceSequence.Common { public sealed class Sequence : ISequence { public int Length { get; private set; } public Band[] CytogeneticBands { get; private set; } public GenomeAssembly Assembly { get; set; } private int _sequenceOffset; private byte[] _buffer; private char[] _decompressBuffer; private IIntervalSearch _maskedIntervalSearch; private readonly char[] _convertNumberToBase; private bool _useNSequence; public Sequence() { const string bases = "GCTA"; _convertNumberToBase = bases.ToCharArray(); _decompressBuffer = new char[1024]; } private static (int BaseIndex, int Shift) GetBaseIndexAndShift(int referencePosition) { int refPos = referencePosition + 1; var baseIndex = (int)(refPos / 4.0); int shift = (3 - refPos % 4) * 2; return (baseIndex, shift); } internal static int GetNumBufferBytes(int numBases) => (int)((double)numBases / ReferenceSequenceCommon.NumBasesPerByte + 1); public void EnableNSequence() => _useNSequence = true; internal void Set(int length, int sequenceOffset, byte[] twoBitBuffer, IntervalArray maskedEntryIntervalArray, Band[] cytogeneticBands) { Length = length; _buffer = twoBitBuffer; _maskedIntervalSearch = maskedEntryIntervalArray; _sequenceOffset = sequenceOffset; CytogeneticBands = cytogeneticBands; _useNSequence = false; } public string Substring(int offset, int length) { if (_useNSequence) return new string('N', length); offset -= _sequenceOffset; // handle negative offsets and lengths if (offset < 0 || length < 1 || offset >= Length) return null; // sanity check: avoid going past the end of the sequence if (offset + length > Length) length = Length - offset; // allocate more memory if needed if (length > _decompressBuffer.Length) _decompressBuffer = new char[length]; // set the initial state of the buffer (int bufferIndex, int bufferShift) = GetBaseIndexAndShift(offset - 1); byte currentBufferSeed = _buffer[bufferIndex]; // get the overlapping masked interval MaskedEntry[] maskedEntries = _maskedIntervalSearch.GetAllOverlappingValues(offset, offset + length - 1); // get the first masked interval var currentOffset = 0; bool hasMaskedIntervals = maskedEntries != null; int numIntervals = maskedEntries?.Length ?? 0; var currentMaskedEntry = hasMaskedIntervals ? maskedEntries[0] : null; for (var baseIndex = 0; baseIndex < length; baseIndex++) { int currentPosition = offset + baseIndex; if (hasMaskedIntervals && currentPosition >= currentMaskedEntry.Begin && currentPosition <= currentMaskedEntry.End) { int numMaskedBases = MaskBases(offset, length, baseIndex, currentMaskedEntry); baseIndex += numMaskedBases - 1; (bufferIndex, bufferShift) = GetBaseIndexAndShift(offset + baseIndex); currentBufferSeed = _buffer[bufferIndex]; currentOffset++; hasMaskedIntervals = currentOffset < numIntervals; currentMaskedEntry = hasMaskedIntervals ? maskedEntries[currentOffset] : null; continue; } // evaluate normal bases _decompressBuffer[baseIndex] = _convertNumberToBase[(currentBufferSeed >> bufferShift) & 3]; bufferShift -= 2; if (bufferShift < 0) { bufferShift = CompressedSequenceReader.MaxShift; bufferIndex++; currentBufferSeed = _buffer[bufferIndex]; } } return new string(_decompressBuffer, 0, length); } private int MaskBases(int offset, int length, int baseIndex, MaskedEntry currentInterval) { var numBasesMasked = 0; for (; baseIndex <= currentInterval.End - offset && baseIndex < length; baseIndex++, numBasesMasked++) _decompressBuffer[baseIndex] = 'N'; return numBasesMasked; } } } ================================================ FILE: ReferenceSequence/Compression/TwoBitCompressor.cs ================================================ using System.Collections.Generic; using ReferenceSequence.Common; namespace ReferenceSequence.Compression { internal static class TwoBitCompressor { private static readonly byte[] ConvertBaseToNumber = new byte[256]; private const string Bases = "GCTA"; static TwoBitCompressor() { for (var index = 0; index < 256; ++index) ConvertBaseToNumber[index] = 10; for (var index = 0; index < Bases.Length; ++index) { ConvertBaseToNumber[Bases[index]] = (byte)index; ConvertBaseToNumber[char.ToLower(Bases[index])] = (byte)index; } } private static int GetNumBufferBytes(int numBases) => (int)(numBases / 4.0 + 1.0); public static (byte[] Buffer, MaskedEntry[] MaskedEntries) Compress(string bases) { int numBufferBases = GetNumBufferBytes(bases.Length); var buffer = new byte[numBufferBases]; byte num1 = 0; var index1 = 0; var num2 = 0; foreach (char index2 in bases) { byte num3 = ConvertBaseToNumber[index2]; if (num3 == 10) num3 = 0; num1 = (byte)((uint)num1 << 2 | num3); ++num2; if (num2 != 4) continue; buffer[index1] = num1; num1 = 0; num2 = 0; ++index1; } if (num2 != 0) buffer[index1] = (byte)((uint)num1 << (4 - num2) * 2); var maskedEntries = new List(); for (var index2 = 0; index2 < bases.Length; ++index2) { if (bases[index2] != 'N') continue; int begin = index2; int end = index2; for (++index2; index2 < bases.Length && bases[index2] == 'N'; ++index2) end = index2; maskedEntries.Add(new MaskedEntry(begin, end)); } return (buffer, maskedEntries.ToArray()); } } } ================================================ FILE: ReferenceSequence/Creation/FastaSequence.cs ================================================ using Genome; namespace ReferenceSequence.Creation { internal sealed class FastaSequence { public readonly Chromosome Chromosome; public readonly string Bases; internal FastaSequence(Chromosome chromosome, string bases) { Chromosome = chromosome; Bases = bases; } } } ================================================ FILE: ReferenceSequence/Creation/ReferenceBuffer.cs ================================================ namespace ReferenceSequence.Creation { public sealed class ReferenceBuffer { public readonly ushort RefIndex; public readonly byte[] Buffer; public readonly int BufferSize; public ReferenceBuffer(ushort refIndex, byte[] buffer, int bufferSize) { RefIndex = refIndex; Buffer = buffer; BufferSize = bufferSize; } } } ================================================ FILE: ReferenceSequence/Creation/ReferenceDictionaryUtils.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; namespace ReferenceSequence.Creation { internal static class ReferenceDictionaryUtils { internal static Dictionary GetRefNameToChromosome(IEnumerable chromosomes) { var refNameToChromosome = new Dictionary(); foreach (var chromosome in chromosomes) { bool isUcscEmpty = string.IsNullOrEmpty(chromosome.UcscName); bool isEnsemblEmpty = string.IsNullOrEmpty(chromosome.EnsemblName); bool isRefSeqAccessionEmpty = string.IsNullOrEmpty(chromosome.RefSeqAccession); bool isGenBankAccessionEmpty = string.IsNullOrEmpty(chromosome.GenBankAccession); if (isUcscEmpty && isEnsemblEmpty && isRefSeqAccessionEmpty && isGenBankAccessionEmpty) throw new InvalidDataException("Expected at least one chromosome field to be non-empty."); if (!isUcscEmpty) refNameToChromosome[chromosome.UcscName] = chromosome; if (!isEnsemblEmpty) refNameToChromosome[chromosome.EnsemblName] = chromosome; if (!isRefSeqAccessionEmpty) refNameToChromosome[chromosome.RefSeqAccession] = chromosome; if (!isGenBankAccessionEmpty) refNameToChromosome[chromosome.GenBankAccession] = chromosome; } return refNameToChromosome; } } } ================================================ FILE: ReferenceSequence/Creation/ReferenceSequence.cs ================================================ using System.IO; using System.Text; using Genome; using IO; using ReferenceSequence.Common; namespace ReferenceSequence.Creation { internal sealed class ReferenceSequence { private readonly byte[] _buffer; private readonly MaskedEntry[] _maskedEntries; private readonly Band[] _cytogeneticBands; private readonly int _sequenceOffset; private readonly int _numBases; internal ReferenceSequence(byte[] buffer, MaskedEntry[] maskedEntries, Band[] cytogeneticBands, int sequenceOffset, int numBases) { _buffer = buffer; _maskedEntries = maskedEntries; _cytogeneticBands = cytogeneticBands; _sequenceOffset = sequenceOffset; _numBases = numBases; } internal ReferenceBuffer GetReferenceBuffer(ushort refIndex) { int bufferSize; byte[] buffer; using (var ms = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { writer.Write(ReferenceSequenceCommon.ReferenceStartTag); WriteMetadata(writer); WriteBuffer(writer); WriteMaskedEntries(writer); WriteCytogeneticBands(writer); } bufferSize = (int) ms.Position; buffer = ms.ToArray(); } return new ReferenceBuffer(refIndex, buffer, bufferSize); } private void WriteMetadata(IExtendedBinaryWriter writer) { writer.WriteOpt(_sequenceOffset); writer.WriteOpt(_numBases); } private void WriteCytogeneticBands(IExtendedBinaryWriter writer) { writer.WriteOpt(_cytogeneticBands.Length); foreach (var band in _cytogeneticBands) { writer.WriteOpt(band.Begin); writer.WriteOpt(band.End); writer.WriteOptAscii(band.Name); } } private void WriteMaskedEntries(IExtendedBinaryWriter writer) { writer.WriteOpt(_maskedEntries.Length); foreach (var maskedEntry in _maskedEntries) { writer.WriteOpt(maskedEntry.Begin); writer.WriteOpt(maskedEntry.End); } } private void WriteBuffer(IExtendedBinaryWriter writer) { writer.WriteOpt(_buffer.Length); writer.Write(_buffer); } } } ================================================ FILE: ReferenceSequence/IO/AssemblyReader.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using OptimizedCore; namespace ReferenceSequence.IO { public static class AssemblyReader { private const int EnsemblIndex = 0; private const int GenBankAccessionIndex = 4; private const int RefSeqAccessionIndex = 6; private const int LengthIndex = 8; private const int UcscIndex = 9; public static List GetChromosomes(Stream stream, Dictionary oldRefNameToChromosome, int oldNumRefSeqs) { var nextRefIndex = (ushort)oldNumRefSeqs; var chromosomes = new List(); using (var reader = new StreamReader(stream)) { while (true) { string line = reader.ReadLine(); if (line == null) break; if (line.OptimizedStartsWith('#')) continue; string[] cols = line.OptimizedSplit('\t'); string ensemblName = cols[EnsemblIndex].Sanitize(); string genBankAccession = cols[GenBankAccessionIndex].Sanitize(); string refSeqAccession = cols[RefSeqAccessionIndex].Sanitize(); int length = int.Parse(cols[LengthIndex]); string ucscName = cols[UcscIndex].Sanitize(); ushort refIndex = GetRefIndex(oldRefNameToChromosome, ensemblName, ucscName, genBankAccession, refSeqAccession, ref nextRefIndex); chromosomes.Add(new Chromosome(ucscName, ensemblName, refSeqAccession, genBankAccession, length, refIndex)); } } return chromosomes.OrderBy(x => x.Index).ToList(); } private static string Sanitize(this string s) => s == "na" ? null : s; private static ushort GetRefIndex(Dictionary refNameToChromosome, string ensemblName, string ucscName, string genBankAccession, string refSeqAccession, ref ushort nextRefIndex) { if (!string.IsNullOrEmpty(ensemblName) && refNameToChromosome.TryGetValue(ensemblName, out var chromosome)) return chromosome.Index; if (!string.IsNullOrEmpty(ucscName) && refNameToChromosome.TryGetValue(ucscName, out chromosome)) return chromosome.Index; if (!string.IsNullOrEmpty(genBankAccession) && refNameToChromosome.TryGetValue(genBankAccession, out chromosome)) return chromosome.Index; if (!string.IsNullOrEmpty(refSeqAccession) && refNameToChromosome.TryGetValue(refSeqAccession, out chromosome)) return chromosome.Index; ushort refIndex = nextRefIndex; nextRefIndex++; return refIndex; } } } ================================================ FILE: ReferenceSequence/IO/CytogeneticBandsReader.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; namespace ReferenceSequence.IO { public static class CytogeneticBandsReader { public static List[] GetCytogeneticBands(Stream stream, int numRefSeqs, Dictionary refNameToChromosome) { var bandLists = new List[numRefSeqs]; for (var i = 0; i < numRefSeqs; i++) bandLists[i] = new List(); using (var reader = new StreamReader(stream)) { while (true) { string line = reader.ReadLine(); if (string.IsNullOrEmpty(line)) break; string[] cols = line.Split('\t'); const int expectedNumColumns = 5; if (cols.Length != expectedNumColumns) { throw new InvalidDataException($"Expected {expectedNumColumns} columns, but found {cols.Length} columns: [{line}]"); } string ucscName = cols[0]; int begin = int.Parse(cols[1]) + 1; int end = int.Parse(cols[2]); string name = cols[3]; var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, ucscName); if (chromosome.IsEmpty()) continue; bandLists[chromosome.Index].Add(new Band(begin, end, name)); } } return bandLists; } } } ================================================ FILE: ReferenceSequence/IO/FastaReader.cs ================================================ using System.Collections.Generic; using System.IO; using System.Text; using System.Text.RegularExpressions; using ErrorHandling.Exceptions; using Genome; using ReferenceSequence.Creation; namespace ReferenceSequence.IO { internal static class FastaReader { private static readonly Regex NameRegex = new Regex("^>(\\S+)", RegexOptions.Compiled); // >gi|224589823|ref|NC_000024.9| private static readonly Regex NcbiRegex = new Regex("^>gi\\|\\d+\\|ref\\|([^|]+)\\|", RegexOptions.Compiled); // >ref|NC_000013.11| Homo sapiens chromosome 13, GRCh38.p12 Primary Assembly private static readonly Regex NcbiRegex2 = new Regex("^>ref\\|([^|]+)\\|", RegexOptions.Compiled); internal static void AddReferenceSequences(Stream stream, Dictionary refNameToChromosome, List references) { var sb = new StringBuilder(); using (var reader = new StreamReader(stream)) { var queue = new Queue(); while (true) { string input = queue.Count > 0 ? queue.Dequeue() : reader.ReadLine(); if (input == null) break; if (!input.StartsWith(">")) throw new UserErrorException($"Encountered a FASTA header that did not start with '>': {input}"); string name = GetName(input); var chromosome = GetChromosome(refNameToChromosome, name); string bases = GetBases(sb, reader, queue); references.Add(new FastaSequence(chromosome, bases)); } } } private static string GetBases(StringBuilder sb, StreamReader reader, Queue queue) { sb.Clear(); while (true) { string line = reader.ReadLine(); if (line == null) break; if (line.StartsWith('>')) { queue.Enqueue(line); break; } sb.Append(line); } return sb.ToString(); } private static Chromosome GetChromosome(Dictionary refNameToChromosome, string name) { var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, name); if (chromosome.IsEmpty()) { throw new InvalidDataException($"Could not find the chromosome ({name}) in the reference name dictionary."); } return chromosome; } private static string GetName(string s) { var match = NcbiRegex2.Match(s); if (match.Success) return match.Groups[1].Value; match = NcbiRegex.Match(s); if (match.Success) return match.Groups[1].Value; match = NameRegex.Match(s); if (match.Success) return match.Groups[1].Value; throw new InvalidDataException($"Unable to match the regex to the chromosome name ({s})"); } } } ================================================ FILE: ReferenceSequence/IO/ReferenceNamesReader.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using OptimizedCore; namespace ReferenceSequence.IO { public static class ReferenceNamesReader { private const int RefIndex = 0; private const int EnsemblIndex = 1; private const int UcscIndex = 2; public static List GetReferenceNames(Stream stream) { var names = new List(); using (var reader = new StreamReader(stream)) { while (true) { string line = reader.ReadLine(); if (line == null) break; string[] cols = line.OptimizedSplit('\t'); ushort refIndex = ushort.Parse(cols[RefIndex]); string ensemblName = cols[EnsemblIndex]; string ucscName = cols[UcscIndex]; names.Add(new Chromosome(ucscName, ensemblName, null, null, 0, refIndex)); } } return names.OrderBy(x => x.Index).ToList(); } } } ================================================ FILE: ReferenceSequence/IO/ReferenceSequenceReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using Genome; using Intervals; using IO; using ReferenceSequence.Common; namespace ReferenceSequence.IO { public sealed class CompressedSequenceReader : IDisposable { public readonly Dictionary RefNameToChromosome = new Dictionary(); public readonly Dictionary RefIndexToChromosome = new Dictionary(); private readonly Dictionary _refIndexToIndex = new Dictionary(); private readonly IndexEntry[] _indexEntries; public readonly Sequence Sequence = new Sequence(); public ushort NumRefSeqs { get; private set; } public const int MaxShift = 6; private readonly ExtendedBinaryReader _reader; private readonly Stream _stream; public GenomeAssembly Assembly => Sequence.Assembly; public byte PatchLevel; // we'll use this in future version providers public CompressedSequenceReader(Stream stream) { _stream = stream; _reader = new ExtendedBinaryReader(stream); CheckHeaderVersion(); LoadHeader(); AddChromosomes(); _indexEntries = LoadIndex(); } public void Dispose() { _reader?.Dispose(); _stream?.Dispose(); } private void CheckHeaderVersion() { string headerTag = _reader.ReadString(); int headerVersion = _reader.ReadInt32(); if (headerTag != ReferenceSequenceCommon.HeaderTag || headerVersion != ReferenceSequenceCommon.HeaderVersion) { throw new InvalidFileFormatException($"The header identifiers do not match the expected values: Obs: {headerTag} {headerVersion} vs Exp: {ReferenceSequenceCommon.HeaderTag} {ReferenceSequenceCommon.HeaderVersion}"); } } public void GetCompressedSequence(Chromosome chromosome) { if (chromosome.IsEmpty() || !_refIndexToIndex.TryGetValue(chromosome.Index, out int index)) { Sequence.EnableNSequence(); return; } var indexEntry = _indexEntries[index]; _stream.Position = indexEntry.FileOffset; uint tag = _reader.ReadUInt32(); if (tag != ReferenceSequenceCommon.ReferenceStartTag) { throw new InvalidDataException($"The reference start tag does not match the expected values: Obs: {tag} vs Exp: {ReferenceSequenceCommon.ReferenceStartTag}"); } (int sequenceOffset, int numBases) = GetMetadata(_reader); byte[] twoBitBuffer = GetTwoBitBuffer(_reader); IntervalArray maskedEntryIntervalArray = GetMaskedEntries(_reader); Band[] cytogeneticBands = GetCytogeneticBands(_reader); Sequence.Set(numBases, sequenceOffset, twoBitBuffer, maskedEntryIntervalArray, cytogeneticBands); } private static (int SequenceOffset, int NumBases) GetMetadata(ExtendedBinaryReader reader) { int sequenceOffset = reader.ReadOptInt32(); int numBases = reader.ReadOptInt32(); return (sequenceOffset, numBases); } private static Band[] GetCytogeneticBands(ExtendedBinaryReader reader) { int numBands = reader.ReadOptInt32(); var bands = new Band[numBands]; for (var i = 0; i < numBands; i++) { int begin = reader.ReadOptInt32(); int end = reader.ReadOptInt32(); string name = reader.ReadAsciiString(); bands[i] = new Band(begin, end, name); } return bands; } private static IntervalArray GetMaskedEntries(ExtendedBinaryReader reader) { int numEntries = reader.ReadOptInt32(); var maskedEntries = new Interval[numEntries]; for (var i = 0; i < numEntries; i++) { int begin = reader.ReadOptInt32(); int end = reader.ReadOptInt32(); maskedEntries[i] = new Interval(begin, end, new MaskedEntry(begin, end)); } return new IntervalArray(maskedEntries); } private static byte[] GetTwoBitBuffer(ExtendedBinaryReader reader) { int numBytes = reader.ReadOptInt32(); return reader.ReadBytes(numBytes); } private void LoadHeader() { Sequence.Assembly = (GenomeAssembly)_reader.ReadByte(); PatchLevel = _reader.ReadByte(); NumRefSeqs = (ushort)_reader.ReadOptInt32(); } private void AddChromosomes() { for (var i = 0; i < NumRefSeqs; i++) { var chromosome = Chromosome.Read(_reader); AddReferenceName(chromosome); } } private IndexEntry[] LoadIndex() { uint tag = _reader.ReadUInt32(); if (tag != ReferenceSequenceCommon.IndexStartTag) { throw new InvalidDataException($"The index start tag does not match the expected values: Obs: {tag} vs Exp: {ReferenceSequenceCommon.IndexStartTag}"); } int numEntries = _reader.ReadInt32(); var indexEntries = new IndexEntry[numEntries]; for (var i = 0; i < numEntries; i++) { ushort refIndex = _reader.ReadUInt16(); long fileOffset = _reader.ReadInt64(); indexEntries[i] = new IndexEntry(refIndex, fileOffset); _refIndexToIndex[refIndex] = i; } return indexEntries; } private void AddReferenceName(Chromosome chromosome) { if (!string.IsNullOrEmpty(chromosome.UcscName)) RefNameToChromosome[chromosome.UcscName] = chromosome; if (!string.IsNullOrEmpty(chromosome.EnsemblName)) RefNameToChromosome[chromosome.EnsemblName] = chromosome; if (!string.IsNullOrEmpty(chromosome.RefSeqAccession)) RefNameToChromosome[chromosome.RefSeqAccession] = chromosome; if (!string.IsNullOrEmpty(chromosome.GenBankAccession)) RefNameToChromosome[chromosome.GenBankAccession] = chromosome; RefIndexToChromosome[chromosome.Index] = chromosome; } } } ================================================ FILE: ReferenceSequence/IO/ReferenceSequenceWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using IO; using ReferenceSequence.Common; using ReferenceSequence.Creation; namespace ReferenceSequence.IO { internal sealed class ReferenceSequenceWriter : IDisposable { private readonly Stream _stream; private readonly ExtendedBinaryWriter _writer; internal ReferenceSequenceWriter(Stream stream, IReadOnlyCollection chromosomes, GenomeAssembly genomeAssembly, byte patchLevel) { _stream = stream; _writer = new ExtendedBinaryWriter(stream); WriteHeader(genomeAssembly, patchLevel, chromosomes); } public void Dispose() => _writer.Dispose(); private void WriteHeader(GenomeAssembly genomeAssembly, byte patchLevel, IReadOnlyCollection chromosomes) { _writer.Write(ReferenceSequenceCommon.HeaderTag); _writer.Write(ReferenceSequenceCommon.HeaderVersion); _writer.Write((byte)genomeAssembly); _writer.Write(patchLevel); _writer.WriteOpt(chromosomes.Count); foreach (var chromosome in chromosomes) chromosome.Write(_writer); } internal void Write(List referenceSequences) { _writer.Flush(); long indexOffset = _stream.Position; int indexSize = 8 + IndexEntry.Size * referenceSequences.Count; var buffers = new List(referenceSequences.Count); ushort refIndex = 0; foreach (var referenceSequence in referenceSequences) { buffers.Add(referenceSequence.GetReferenceBuffer(refIndex)); refIndex++; } IndexEntry[] index = CreateIndex(buffers, indexOffset, indexSize); WriteIndex(index); WriteReferenceBuffers(buffers); } private static IndexEntry[] CreateIndex(IReadOnlyCollection referenceBuffers, long indexOffset, int indexSize) { var indexEntries = new IndexEntry[referenceBuffers.Count]; long referenceOffset = indexOffset + indexSize; var index = 0; foreach (var block in referenceBuffers) { indexEntries[index] = new IndexEntry(block.RefIndex, referenceOffset); referenceOffset += block.BufferSize; index++; } return indexEntries; } private void WriteIndex(IReadOnlyCollection indexEntries) { _writer.Write(ReferenceSequenceCommon.IndexStartTag); _writer.Write(indexEntries.Count); foreach (var indexEntry in indexEntries) { _writer.Write(indexEntry.RefIndex); _writer.Write(indexEntry.FileOffset); } } private void WriteReferenceBuffers(IEnumerable referenceBuffers) { foreach (var referenceBuffer in referenceBuffers) { _writer.Write(referenceBuffer.Buffer, 0, referenceBuffer.BufferSize); } } } } ================================================ FILE: ReferenceSequence/ReferenceSequence.csproj ================================================  Exe net6.0 ..\bin\$(Configuration) ================================================ FILE: ReferenceSequence/ReferenceSequenceCommon.cs ================================================ namespace ReferenceSequence { public static class ReferenceSequenceCommon { public const string HeaderTag = "NirvanaReference"; public const int HeaderVersion = 7; public const uint IndexStartTag = 0x4CF76E2F; public const uint ReferenceStartTag = 0xA7D8212A; public const int NumBasesPerByte = 4; } } ================================================ FILE: ReferenceSequence/ReferenceUtilsMain.cs ================================================ using System.Collections.Generic; using CommandLine.Builders; using ReferenceSequence.Commands; using VariantAnnotation.Interface; namespace ReferenceSequence { internal static class ReferenceUtilsMain { private static int Main(string[] args) { var ops = new Dictionary { ["create"] = new TopLevelOption("creates a full reference file", CreateReferenceMain.Run), ["substring"] = new TopLevelOption("creates a reference substring file", CreateSubstringMain.Run), ["testseq"] = new TopLevelOption("creates a TestSeq_reference.dat file", CreateTestSeqMain.Run) }; var exitCode = new TopLevelAppBuilder(args, ops) .Parse() .ShowBanner(Constants.Authors) .ShowHelpMenu("Utilities focused on creating the reference files") .ShowErrors() .Execute(); return (int)exitCode; } } } ================================================ FILE: ReferenceSequence/Utilities/SequenceHelper.cs ================================================ using System.Collections.Generic; using Genome; using IO; using ReferenceSequence.IO; namespace ReferenceSequence.Utilities { public static class SequenceHelper { public static (Dictionary refIndexToChromosome, Dictionary refNameToChromosome, int numRefSeqs) GetDictionaries(string referencePath) { Dictionary refIndexToChromosome; Dictionary refNameToChromosome; int numRefSeqs; using (var reader = new CompressedSequenceReader(PersistentStreamUtils.GetReadStream(referencePath))) { refIndexToChromosome = reader.RefIndexToChromosome; refNameToChromosome = reader.RefNameToChromosome; numRefSeqs = reader.NumRefSeqs; } return (refIndexToChromosome, refNameToChromosome, numRefSeqs); } } } ================================================ FILE: RepeatExpansions/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: RepeatExpansions/IO/RepeatExpansionReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using ErrorHandling; using ErrorHandling.Exceptions; using Genome; using Intervals; using OptimizedCore; namespace RepeatExpansions.IO { public static class RepeatExpansionReader { private const int ChromIndex = 0; private const int StartIndex = 1; private const int EndIndex = 2; private const int PhenotypeIndex = 3; private const int OmimIndex = 4; private const int RepeatNumbersIndex = 5; private const int AlleleCountsIndex = 6; private const int CategoriesIndex = 7; private const int CategoryRangesIndex = 8; private const int MinNumberOfColumns = 9; public static IIntervalForest Load(Stream stream, GenomeAssembly desiredGenomeAssembly, Dictionary refNameToChromosome, int numRefSeqs) { var intervalLists = new List>[numRefSeqs]; for (var i = 0; i < numRefSeqs; i++) intervalLists[i] = new List>(); using (stream) { using (var reader = new StreamReader(stream)) { CheckHeader(reader, desiredGenomeAssembly); while (true) { string line = reader.ReadLine(); if (line == null) break; if(line == string.Empty) continue; try { (ushort refIndex, Interval phenotypeInterval) = GetPhenotype(line, refNameToChromosome); if(refIndex == ushort.MaxValue) throw new InvalidDataException("Unknown chromosome encountered in STR file."); intervalLists[refIndex].Add(phenotypeInterval); } catch (Exception e) { e.Data[ExitCodeUtilities.Line] = line; throw; } } } } var refIntervalArrays = new IntervalArray[numRefSeqs]; for (var i = 0; i < numRefSeqs; i++) { refIntervalArrays[i] = new IntervalArray(intervalLists[i].ToArray()); } return new IntervalForest(refIntervalArrays); } private static (ushort RefIndex, Interval Interval) GetPhenotype(string line, Dictionary refNameToChromosome) { string[] cols = line.OptimizedSplit('\t'); if (cols.Length < MinNumberOfColumns) throw new InvalidDataException($"Expected at least {MinNumberOfColumns} columns in the STR data file, but found only {cols.Length}."); string chromosomeString = cols[ChromIndex]; int start = int.Parse(cols[StartIndex]); int end = int.Parse(cols[EndIndex]); string phenotype = cols[PhenotypeIndex]; string omimId = cols[OmimIndex]; int[] repeatNumbers = cols[RepeatNumbersIndex].Split(',').Select(int.Parse).ToArray(); int[] alleleCounts = cols[AlleleCountsIndex].Split(',').Select(int.Parse).ToArray(); string[] classifications = cols[CategoriesIndex].Split(',').ToArray(); Interval[] classificationRanges = cols[CategoryRangesIndex].Split(',').Select(GetInterval).ToArray(); if (repeatNumbers.Length != alleleCounts.Length) throw new InvalidDataException($"Inconsistent number of repeat numbers ({repeatNumbers.Length}) vs. allele counts ({alleleCounts.Length})"); if (classifications.Length != classificationRanges.Length) throw new InvalidDataException($"Inconsistent number of values of classifications ({classifications.Length}) vs. classification ranges ({classificationRanges.Length})"); var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, chromosomeString); var chromosomeInterval = new ChromosomeInterval(chromosome, start, end); double[] percentiles = PercentileUtilities.ComputePercentiles(repeatNumbers.Length, alleleCounts); var rePhenotype = new RepeatExpansionPhenotype(chromosomeInterval, phenotype, omimId, repeatNumbers, percentiles, classifications, classificationRanges); return (chromosome.Index, new Interval(start, end, rePhenotype)); } private static Interval GetInterval(string s) { string[] cols = s.OptimizedSplit('-'); int begin = cols[0] == "inf" ? int.MaxValue : int.Parse(cols[0]); int end = cols[1] == "inf" ? int.MaxValue : int.Parse(cols[1]); return new Interval(begin, end); } private static void CheckHeader(TextReader reader, GenomeAssembly desiredGenomeAssembly) { string line = reader.ReadLine(); while (line == string.Empty) line = reader.ReadLine(); if(line==null) throw new UserErrorException("The custom STR file provided is empty."); GenomeAssembly genomeAssembly = GenomeAssembly.Unknown; var headerNum = 0; while (line!=null && line.StartsWith("#")) { headerNum++; line = line.Trim(); var columns = line.Split('=','\t'); var tag = columns[0].ToLower(); switch (headerNum) { case 1: if(tag != "#assembly") throw new UserErrorException("First line in STR data file has to contain assembly. For example: #assembly=GRCh38"); genomeAssembly = GenomeAssemblyHelper.Convert(columns[1]); if (genomeAssembly != desiredGenomeAssembly) throw new UserErrorException($"Expected {desiredGenomeAssembly} in the STR data file, but found {genomeAssembly}"); break; case 2: if(tag!="#chrom") throw new UserErrorException("Second line in TSV has to contain column labels. For example: #Chrom\tStart\tEnd\tPhenotype\t..."); return; // we should not read the next line default: throw new UserErrorException($"Unexpected header tag observed:\n{line}"); } line = reader.ReadLine(); } if(genomeAssembly == GenomeAssembly.Unknown) throw new UserErrorException("Genome assembly not specified in STR header. It is a required field."); } } } ================================================ FILE: RepeatExpansions/IRepeatExpansionProvider.cs ================================================ using VariantAnnotation.Interface.AnnotatedPositions; namespace RepeatExpansions { public interface IRepeatExpansionProvider { void Annotate(IAnnotatedPosition annotatedPosition); } } ================================================ FILE: RepeatExpansions/Matcher.cs ================================================ using System.Collections.Generic; using Intervals; using VariantAnnotation.Interface.SA; using Variants; namespace RepeatExpansions { public sealed class Matcher { private readonly IIntervalForest _phenotypeForest; public Matcher(IIntervalForest phenotypeForest) => _phenotypeForest = phenotypeForest; public ISupplementaryAnnotation GetMatchingAnnotations(RepeatExpansion variant) { RepeatExpansionPhenotype[] variantPhenotypes = _phenotypeForest.GetAllOverlappingValues(variant.Chromosome.Index, variant.Start, variant.End); if (variantPhenotypes == null) return null; var jsonEntries = new List(); // ReSharper disable once LoopCanBeConvertedToQuery foreach (var variantPhenotype in variantPhenotypes) { if (!ExactMatch(variant, variantPhenotype.ChromosomeInterval)) continue; string json = variantPhenotype.GetAnnotation(variant.RepeatCount); jsonEntries.Add(json); } return jsonEntries.Count == 0 ? null : new RepeatExpansionSupplementaryAnnotation(jsonEntries); } private static bool ExactMatch(IInterval variant, IInterval variantPhenotype) => variant.Start == variantPhenotype.Start && variant.End == variantPhenotype.End; } } ================================================ FILE: RepeatExpansions/PercentileUtilities.cs ================================================ using System; using System.Collections.Generic; using System.Linq; namespace RepeatExpansions { public static class PercentileUtilities { public static double[] ComputePercentiles(int valueCount, IReadOnlyList alleleCounts) { var percentiles = new double[valueCount]; var smallerValueCount = 0; int totalCount = alleleCounts.Sum(); percentiles[0] = 0; for (var i = 1; i < valueCount; i++) { smallerValueCount += alleleCounts[i - 1]; percentiles[i] = 100.0 * smallerValueCount / totalCount; } return percentiles; } public static double GetPercentile(T inputValue, T[] referenceValues, double[] referencePercentiles) { int index = Array.BinarySearch(referenceValues, inputValue); if (index >= 0) return referencePercentiles[index]; index = ~index; return index == referenceValues.Length ? 100.00 : referencePercentiles[index]; } } } ================================================ FILE: RepeatExpansions/RepeatExpansionPhenotype.cs ================================================ using System.Collections.Generic; using System.Linq; using Genome; using Intervals; namespace RepeatExpansions { public sealed class RepeatExpansionPhenotype { public readonly ChromosomeInterval ChromosomeInterval; // used directly in JSON output private readonly string _phenotype; private readonly string _omimId; // used during annotation private readonly int[] _repeatNumbers; private readonly double[] _percentiles; private readonly string[] _classifications; private readonly Interval[] _classificationRanges; public RepeatExpansionPhenotype(ChromosomeInterval chromosomeInterval, string phenotype, string omimId, int[] repeatNumbers, double[] percentiles, string[] classifications, Interval[] classificationRanges) { ChromosomeInterval = chromosomeInterval; _phenotype = phenotype; _omimId = omimId; _repeatNumbers = repeatNumbers; _percentiles = percentiles; _classifications = classifications; _classificationRanges = classificationRanges; } public string GetAnnotation(int repeatNumber) { double percentile = PercentileUtilities.GetPercentile(repeatNumber, _repeatNumbers, _percentiles); IEnumerable classifications = GetClassifications(repeatNumber); return GetJson(percentile, classifications); } private string GetJson(double percentile, IEnumerable classifications) { // in net6.0, the compiler gets confused if you have }}}. Should the first two }s be a closing brace or the second? This results in a bug. // we can circumvent it by taking the leading and trailing parenthesis out of the main expression and adding them separately const char openCurly = '{'; const char closeCurly = '}'; string joined = string.Join(",", classifications.Select(classification => "\"" + classification + "\"")); return $"{openCurly}\"phenotype\":\"{_phenotype}\",\"omimId\":{_omimId},\"classifications\":[{joined}],\"percentile\":{percentile:0.00}{closeCurly}"; } private IEnumerable GetClassifications(int repeatNumber) { var classifications = new List(); for (var i = 0; i < _classificationRanges.Length; i++) { var range = _classificationRanges[i]; if (range.Start <= repeatNumber && repeatNumber <= range.End) classifications.Add(_classifications[i]); } return classifications; } } } ================================================ FILE: RepeatExpansions/RepeatExpansionProvider.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using Intervals; using IO; using RepeatExpansions.IO; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace RepeatExpansions { public sealed class RepeatExpansionProvider : IRepeatExpansionProvider { private readonly Matcher _matcher; public RepeatExpansionProvider(GenomeAssembly genomeAssembly, Dictionary refNameToChromosome, int numRefSeqs, string customTsvPath) { using ( Stream stream = GetTsvStream(genomeAssembly, customTsvPath)) { IIntervalForest phenotypeForest = RepeatExpansionReader.Load(stream, genomeAssembly, refNameToChromosome, numRefSeqs); _matcher = new Matcher(phenotypeForest); } } private static Stream GetTsvStream(GenomeAssembly genomeAssembly, string customTsvPath) { //since we are using the executing assembly, we cannot move the following lines about getting stream further upstream. var assembly = System.Reflection.Assembly.GetExecutingAssembly(); string resourceName = $"RepeatExpansions.Resources.RepeatExpansions.{genomeAssembly}.tsv"; var stream = customTsvPath != null ? PersistentStreamUtils.GetReadStream(customTsvPath) : assembly.GetManifestResourceStream(resourceName); if (stream == null) throw new NullReferenceException("Unable to read from the STR resource file"); return stream; } public void Annotate(IAnnotatedPosition annotatedPosition) { foreach (var variant in annotatedPosition.AnnotatedVariants) { if (variant.Variant.Type != VariantType.short_tandem_repeat_variation) continue; var repeatExpansion = (RepeatExpansion)variant.Variant; var phenotypes = _matcher.GetMatchingAnnotations(repeatExpansion); if (phenotypes == null) continue; variant.RepeatExpansionPhenotypes = phenotypes; } } } } ================================================ FILE: RepeatExpansions/RepeatExpansionSupplementaryAnnotation.cs ================================================ using System.Collections.Generic; using System.Text; using VariantAnnotation.Interface.SA; namespace RepeatExpansions { public sealed class RepeatExpansionSupplementaryAnnotation : ISupplementaryAnnotation { private readonly List _jsonEntries; public string JsonKey => "repeatExpansionPhenotypes"; public RepeatExpansionSupplementaryAnnotation(List jsonEntries) => _jsonEntries = jsonEntries; public void SerializeJson(StringBuilder sb) => sb.Append($"[{string.Join(',', _jsonEntries)}]"); } } ================================================ FILE: RepeatExpansions/RepeatExpansions.csproj ================================================  net6.0 ..\bin\$(Configuration) ================================================ FILE: RepeatExpansions/Resources/RepeatExpansions.GRCh37.tsv ================================================ #assembly=GRCh37 #Chrom Start End Phenotype OMIM_ID Repeat_numbers Allele_counts Classifications Classification_ranges X 66765159 66765227 Spinal and bulbar muscular atrophy of Kennedy 313200 9,14,15,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32 1,1,2,6,19,18,20,18,28,24,13,18,13,7,4,4,3,1,1 Normal, Expanded 0-34, 35-inf 12 7045880 7045936 Dentatorubro-pallidoluysian atrophy 125370 7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,30 1,2,1,16,7,23,5,29,49,16,57,26,36,19,6,5,1,1 Normal, Expanded 0-35, 36-inf 22 46191235 46191304 Spinocerebellar ataxia 10 603516 7,9,11,12,13,14,15,16,17,18,19 2,1,7,43,75,87,43,21,11,6,4 Normal, Expanded 0-32, 33-inf 6 16327865 16327954 Spinocerebellar ataxia 1 164400 19,20,21,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38 1,1,1,1,2,1,26,20,43,59,84,28,21,5,1,2,3,1 Normal, Expanded 0-35, 36-inf 12 112036754 112036822 Spinocerebellar ataxia 2 183090 19,21,22,23,24,26,27,29,30,31,33 2,1,237,48,4,1,1,2,2,1,1 Normal, Expanded 0-31, 32-inf 14 92537354 92537386 Machado-Joseph disease 109150 9,11,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,35,36 1,71,2,11,10,14,24,46,14,3,7,30,17,7,6,6,5,9,8,6,1,1,1 Normal, Expanded 0-44, 45-inf 3 63898361 63898390 Spinocerebellar ataxia 7 164500 1,2,3,4,7,8,9,10,11,12,13,15,21 1,1,5,1,5,2,4,217,25,29,8,1,1 Normal, Expanded 0-27, 28-inf 13 70713516 70713560 Spinocerebellar ataxia 8 608768 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,33,36,37,38,71,81,118 4,55,3,19,40,6,21,54,18,26,10,13,8,9,5,2,1,1,1,1,1,1,1 Normal, Expanded 0-50, 51-inf 9 27573527 27573544 Frontotemporal dementia and/or amyotrophic lateral sclerosis 1 105550 2,3,4,5,6,7,8,9,10,11,12,13,14,17,19,23,27,41 147,2,22,17,24,25,26,9,13,1,4,3,2,1,1,1,1,1 Normal, Expanded 0-25, 26-inf 19 13318673 13318711 Spinocerebellar ataxia 6 183086 4,7,8,10,11,12,13,14,15,38 2,28,1,4,102,62,85,13,2,1 Normal, Expanded 0-18, 19-inf 11 119077000 119077032 Jacobsen syndrome 147791 4,7,8,10,11,12,13,14,15,16,17,18,19,20,22,25 1,1,21,6,192,36,8,10,8,3,3,1,3,4,1,2 Normal, Expanded 0-80, 81-inf 3 128891420 128891499 Myotonic dystrophy 2 602668 6,8,10,11,12,15,16,17,18,19,20,21,22,24,26,27,28,30,41 3,1,2,5,1,130,68,38,12,19,4,7,4,1,1,1,1,1,1 Normal, Expanded 0-50, 51-inf 21 45196325 45196360 Epilepsy, progressive myoclonic 1A (Unverricht and Lundborg) 254800 2,3,4,6,7,10,11,13 133,160,2,1,1,1,1,1 Normal, Expanded 0-3, 4-inf 19 46273463 46273522 Myotonic dystrophy 1 160900 5,6,7,8,9,10,11,12,13,14,15,16,17,19,21,22,24,26,27,32,33 110,1,4,1,2,11,29,55,38,16,13,4,1,1,5,2,3,1,1,1,1 Normal, Expanded 0-34, 35-inf X 146993569 146993628 Fragile X syndrome 300624 8,11,15,20,22,23,24,25,28,29,30,31,32,33,34,35,36,37,38,39,41,43,47,52,55,56,57,60 1,1,1,3,7,5,3,2,1,60,50,20,5,4,1,2,9,6,1,4,1,1,1,8,1,1,1,1 Normal, Expanded 0-44, 45-inf 9 71652203 71652220 Friedreich ataxia 229300 5,6,7,8,9,10,13,14,16,17,18,19,20,21,23,24,25,26 9,7,2,113,135,1,4,1,4,5,4,6,1,3,1,1,1,2 Normal, Expanded 0-33, 34-inf 4 3076604 3076660 Huntington disease 143100 9,11,12,15,16,17,18,19,20,21,22,23,24,25,27,28 1,1,2,33,25,99,46,31,20,10,10,8,6,2,3,3 Normal, Expanded 0-26, 27-inf 16 87637894 87637935 Huntington disease-like 2 606438 5,11,12,13,14,15,16,17,18,19,20,22,23,26,27,28,29,33 1,5,1,20,139,41,49,17,9,5,1,1,1,4,2,1,2,1 Normal, Expanded 0-28, 29-inf 20 2633380 2633403 Spinocerebellar ataxia 36 614153 4,5,6,7,8,9,10,11 72,37,32,122,11,21,2,3 Normal, Expanded 0-14, 15-inf 5 146258291 146258320 Spinocerebellar ataxia 12 604326 9,10,11,13,14,15,16,17,18,19,20,23 31,129,3,43,30,28,17,13,1,3,1,1 Normal, Expanded 0-32, 33-inf 18 53253387 53253458 Fuchs' Corneal Dystrophy 613267 5,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,33,34,35,36,37,46,52,54 1,76,5,4,18,32,12,32,9,6,5,6,12,12,20,11,9,10,4,4,2,2,2,1,1,1,1,1,1 Normal, Expanded 0-39, 40-inf 15 23086367 23086390 Amyotrophic lateral sclerosis 600363 6,7,8,9,10,19,20,24,33 1,127,162,2,4,1,1,1,1 Normal, Expanded 0-8, 9-inf 2 191745600 191745646 Glutaminase deficiency 618412 7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,25 1,99,6,12,5,7,21,53,49,13,11,6,8,4,2,1,1,1 Normal, Expanded 0-89, 90-inf ================================================ FILE: RepeatExpansions/Resources/RepeatExpansions.GRCh38.tsv ================================================ #assembly=GRCh38 #Chrom Start End Phenotype OMIM_ID Repeat_numbers Allele_counts Classifications Classification_ranges chrX 67545317 67545385 Spinal and bulbar muscular atrophy of Kennedy 313200 9,14,15,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32 1,1,2,6,19,18,20,18,28,24,13,18,13,7,4,4,3,1,1 Normal, Expanded 0-34, 35-inf chr12 6936717 6936773 Dentatorubro-pallidoluysian atrophy 125370 7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,30 1,2,1,16,7,23,5,29,49,16,57,26,36,19,6,5,1,1 Normal, Expanded 0-35, 36-inf chr22 45795355 45795424 Spinocerebellar ataxia 10 603516 7,9,11,12,13,14,15,16,17,18,19 2,1,7,43,75,87,43,21,11,6,4 Normal, Expanded 0-32, 33-inf chr6 16327634 16327723 Spinocerebellar ataxia 1 164400 19,20,21,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38 1,1,1,1,2,1,26,20,43,59,84,28,21,5,1,2,3,1 Normal, Expanded 0-35, 36-inf chr12 111598950 111599018 Spinocerebellar ataxia 2 183090 19,21,22,23,24,26,27,29,30,31,33 2,1,237,48,4,1,1,2,2,1,1 Normal, Expanded 0-31, 32-inf chr14 92071010 92071042 Machado-Joseph disease 109150 9,11,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,35,36 1,71,2,11,10,14,24,46,14,3,7,30,17,7,6,6,5,9,8,6,1,1,1 Normal, Expanded 0-44, 45-inf chr3 63912685 63912714 Spinocerebellar ataxia 7 164500 1,2,3,4,7,8,9,10,11,12,13,15,21 1,1,5,1,5,2,4,217,25,29,8,1,1 Normal, Expanded 0-27, 28-inf chr13 70139384 70139428 Spinocerebellar ataxia 8 608768 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,33,36,37,38,71,81,118 4,55,3,19,40,6,21,54,18,26,10,13,8,9,5,2,1,1,1,1,1,1,1 Normal, Expanded 0-50, 51-inf chr9 27573529 27573546 Frontotemporal dementia and/or amyotrophic lateral sclerosis 1 105550 2,3,4,5,6,7,8,9,10,11,12,13,14,17,19,23,27,41 147,2,22,17,24,25,26,9,13,1,4,3,2,1,1,1,1,1 Normal, Expanded 0-25, 26-inf chr19 13207859 13207897 Spinocerebellar ataxia 6 183086 4,7,8,10,11,12,13,14,15,38 2,28,1,4,102,62,85,13,2,1 Normal, Expanded 0-18, 19-inf chr11 119206290 119206322 Jacobsen syndrome 147791 4,7,8,10,11,12,13,14,15,16,17,18,19,20,22,25 1,1,21,6,192,36,8,10,8,3,3,1,3,4,1,2 Normal, Expanded 0-80, 81-inf chr3 129172577 129172656 Myotonic dystrophy 2 602668 6,8,10,11,12,15,16,17,18,19,20,21,22,24,26,27,28,30,41 3,1,2,5,1,130,68,38,12,19,4,7,4,1,1,1,1,1,1 Normal, Expanded 0-50, 51-inf chr21 43776444 43776479 Epilepsy, progressive myoclonic 1A (Unverricht and Lundborg) 254800 2,3,4,6,7,10,11,13 133,160,2,1,1,1,1,1 Normal, Expanded 0-3, 4-inf chr19 45770205 45770264 Myotonic dystrophy 1 160900 5,6,7,8,9,10,11,12,13,14,15,16,17,19,21,22,24,26,27,32,33 110,1,4,1,2,11,29,55,38,16,13,4,1,1,5,2,3,1,1,1,1 Normal, Expanded 0-34, 35-inf chrX 147912051 147912110 Fragile X syndrome 300624 8,11,15,20,22,23,24,25,28,29,30,31,32,33,34,35,36,37,38,39,41,43,47,52,55,56,57,60 1,1,1,3,7,5,3,2,1,60,50,20,5,4,1,2,9,6,1,4,1,1,1,8,1,1,1,1 Normal, Expanded 0-44, 45-inf chr9 69037287 69037304 Friedreich ataxia 229300 5,6,7,8,9,10,13,14,16,17,18,19,20,21,23,24,25,26 9,7,2,113,135,1,4,1,4,5,4,6,1,3,1,1,1,2 Normal, Expanded 0-33, 34-inf chr4 3074877 3074933 Huntington disease 143100 9,11,12,15,16,17,18,19,20,21,22,23,24,25,27,28 1,1,2,33,25,99,46,31,20,10,10,8,6,2,3,3 Normal, Expanded 0-26, 27-inf chr16 87604288 87604329 Huntington disease-like 2 606438 5,11,12,13,14,15,16,17,18,19,20,22,23,26,27,28,29,33 1,5,1,20,139,41,49,17,9,5,1,1,1,4,2,1,2,1 Normal, Expanded 0-28, 29-inf chr20 2652734 2652757 Spinocerebellar ataxia 36 614153 4,5,6,7,8,9,10,11 72,37,32,122,11,21,2,3 Normal, Expanded 0-14, 15-inf chr5 146878728 146878757 Spinocerebellar ataxia 12 604326 9,10,11,13,14,15,16,17,18,19,20,23 31,129,3,43,30,28,17,13,1,3,1,1 Normal, Expanded 0-32, 33-inf chr18 55586156 55586227 Fuchs' Corneal Dystrophy 613267 5,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,33,34,35,36,37,46,52,54 1,76,5,4,18,32,12,32,9,6,5,6,12,12,20,11,9,10,4,4,2,2,2,1,1,1,1,1,1 Normal, Expanded 0-39, 40-inf chr15 22786678 22786701 Amyotrophic lateral sclerosis 600363 6,7,8,9,10,19,20,24,33 1,127,162,2,4,1,1,1,1 Normal, Expanded 0-8, 9-inf chr2 190880874 190880920 Glutaminase deficiency 618412 7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,25 1,99,6,12,5,7,21,53,49,13,11,6,8,4,2,1,1,1 Normal, Expanded 0-89, 90-inf ================================================ FILE: SAUtils/AAConservation/AaConservationMain.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Caches; using VariantAnnotation.ProteinConservation; using VariantAnnotation.Providers; namespace SAUtils.AAConservation { public static class AaConservationMain { private static string _scoresFile; private static string _compressedReference; private static string _transcriptCachePrefix; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "cache|c=", "Transcript cache prefix", v => _transcriptCachePrefix = v }, { "scr|s=", "input file path with conservation scores", v => _scoresFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckInputFilenameExists(CacheConstants.TranscriptPath(_transcriptCachePrefix), "transcript cache prefix", "--cache") .CheckInputFilenameExists(_scoresFile, "input file path with conservation scores", "--src") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { using var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); TranscriptCacheData transcriptData = AaConservationUtilities.GetTranscriptData(referenceProvider.RefIndexToChromosome, _transcriptCachePrefix);// we will use the transcript data to validate the protein sequence var version = DataSourceVersionReader.GetSourceVersion(_scoresFile + ".version"); string outFileName = $"{version.Name}_{version.Version}"; //read multi-alignments using (var stream = GZipUtilities.GetAppropriateReadStream(_scoresFile)) using(var parser = new ProteinConservationParser(stream)) using(var outStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName+ProteinConservationCommon.FileSuffix))) using(var groupStream = FileUtilities.GetCreateStream("transcriptGroups.txt")) using(var writer = new ProteinConservationWriter(outStream, groupStream, transcriptData, version)) { writer.Write(parser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/AAConservation/AaConservationUtilities.cs ================================================ using System.Collections.Generic; using Genome; using IO; using VariantAnnotation.Caches; using VariantAnnotation.IO.Caches; namespace SAUtils.AAConservation { public static class AaConservationUtilities { public static TranscriptCacheData GetTranscriptData(Dictionary refIndexToChromosome, string transcriptCachePrefix) { using var transcriptCacheReader = new TranscriptCacheReader( FileUtilities.GetReadStream(CacheConstants.TranscriptPath(transcriptCachePrefix))); return transcriptCacheReader.Read(refIndexToChromosome); } } } ================================================ FILE: SAUtils/AAConservation/ProteinConservationParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using OptimizedCore; using VariantAnnotation.ProteinConservation; namespace SAUtils.AAConservation { public sealed class ProteinConservationParser:IDisposable { private readonly Stream _stream; private int _ensemblIdsIndex = -1; private int _chromIndex = -1; private int _scoresIndex = -1; private int _proteinSeqIndex = -1; private const string EnsemblIdsTag = "Ensembl"; private const string ProteinSequenceTag = "ProteinSequence"; private const string ChromTag = "Chromosome"; private const string ScoresTag = "Percent Conservation at each AA residue"; public ProteinConservationParser(Stream stream) { _stream = stream; } public IEnumerable GetItems() { using (var reader = new StreamReader(_stream)) { string line; while ((line = reader.ReadLine()) != null) { var columns = line.OptimizedSplit('\t'); if (line.StartsWith("#")) { ParseHeader(line); continue; } var transcriptId = columns[_ensemblIdsIndex]; var proteinSequence = columns[_proteinSeqIndex]; var chromosome = columns[_chromIndex]; var scores = columns[_scoresIndex].OptimizedSplit(',').Select(x => (byte) int.Parse(x)) .ToArray(); yield return new ProteinConservationItem(chromosome, transcriptId, proteinSequence, scores); } } } private void ParseHeader(string line) { var columnTags = line.TrimStart('#').OptimizedSplit('\t'); _ensemblIdsIndex = Array.IndexOf(columnTags, EnsemblIdsTag); _chromIndex = Array.IndexOf(columnTags, ChromTag); _scoresIndex = Array.IndexOf(columnTags, ScoresTag); _proteinSeqIndex = Array.IndexOf(columnTags, ProteinSequenceTag); } public void Dispose()=>_stream?.Dispose(); } } ================================================ FILE: SAUtils/AAConservation/ProteinConservationWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using IO; using VariantAnnotation.Caches; using VariantAnnotation.ProteinConservation; using VariantAnnotation.Providers; namespace SAUtils.AAConservation { public sealed class ProteinConservationWriter:IDisposable { private readonly Stream _transcriptGroupStream; private readonly GenomeAssembly _assembly; private readonly ExtendedBinaryWriter _writer; private readonly DataSourceVersion _version; private readonly TranscriptCacheData _transcriptCacheData; //some transcripts have multiple locations in the genome and may have conflicting scores // so, we need to load them up and check for duplicates and resolve them. public ProteinConservationWriter(Stream stream, Stream groupStream, TranscriptCacheData transcriptData, DataSourceVersion version) { _transcriptGroupStream = groupStream; _writer = new ExtendedBinaryWriter(stream); _transcriptCacheData = transcriptData; _version = version; } public void Write(IEnumerable items) { if (items == null) return; _writer.WriteOpt(ProteinConservationCommon.SchemaVersion); _writer.Write((byte) _assembly); _version.Write(_writer); var alignedProteinsAndScores = GetProteinWithUniqueScores(items); var nirvanaProteins = new HashSet(_transcriptCacheData.PeptideSeqs); CheckProteinSetOverlap(alignedProteinsAndScores, nirvanaProteins); var transcriptScores = new Dictionary(); //protein sequence -> transcript ids mapping var transcriptGroupsByProtein = new Dictionary>(alignedProteinsAndScores.Count); foreach (var protein in alignedProteinsAndScores.Keys) { transcriptGroupsByProtein.Add(protein, new List()); } foreach (var transcriptIntervalArray in _transcriptCacheData.TranscriptIntervalArrays) { if (transcriptIntervalArray == null) continue;//may happen since for GRCh38 decoy contigs, there may be none foreach (var transcriptInterval in transcriptIntervalArray.Array) { var transcript = transcriptInterval.Value; if(transcript.Translation == null) continue; var peptideSeq = transcript.Translation.PeptideSeq; if(!alignedProteinsAndScores.TryGetValue(transcript.Translation.PeptideSeq, out var scores)) continue; transcriptScores.TryAdd(transcript.Id.WithVersion, scores); transcriptGroupsByProtein[peptideSeq].Add(transcript.Id.WithVersion); } } foreach (var (transcriptId, scores) in transcriptScores) { var transcriptScore = new TranscriptConservationScores(transcriptId, scores); transcriptScore.Write(_writer); } WriteTranscriptGroups(transcriptGroupsByProtein); Console.WriteLine($"Recorded conservation scores for {transcriptScores.Count} transcripts."); //writing an empty item to indicate end of records var endOfRecordItem = TranscriptConservationScores.GetEmptyItem(); endOfRecordItem.Write(_writer); } private void WriteTranscriptGroups(Dictionary> transcriptGroupsByProtein) { using (var writer = new StreamWriter(_transcriptGroupStream)) { var ensemblIds = new List(); var refseqIds = new List(); writer.WriteLine("#EnsemblIds\tRefSeqIds\tPeptide sequence"); foreach (var (protein,ids) in transcriptGroupsByProtein) { if(ids.Count == 0) continue; ensemblIds.Clear(); refseqIds.Clear(); foreach (var id in ids) { if(id.StartsWith("ENST")) ensemblIds.Add(id); else refseqIds.Add(id); } writer.WriteLine($"{string.Join(',',ensemblIds)}\t{string.Join(',',refseqIds)}\t{protein}"); } } } private static void CheckProteinSetOverlap(Dictionary proteinAndScores, HashSet nirvanaProteins) { var count = 0; foreach (var protein in proteinAndScores.Keys) { if (nirvanaProteins.Contains(protein)) count++; } Console.WriteLine($"{count} aligned proteins were also in Nirvana cache"); } private static Dictionary GetProteinWithUniqueScores(IEnumerable items) { var proteinAndScores = new Dictionary(); var multiAlignProteins = new HashSet(); var proteinCount = 0; foreach (var item in items) { if (proteinAndScores.TryAdd(item.ProteinSequence, item.Scores)) proteinCount++; else { if (item.Chromosome == "chrX" || item.Chromosome == "X") { proteinAndScores[item.ProteinSequence] = item.Scores; } if (!item.Scores.SequenceEqual(proteinAndScores[item.ProteinSequence])) multiAlignProteins.Add(item.ProteinSequence); } } foreach (var protein in multiAlignProteins) { proteinAndScores.Remove(protein); } Console.WriteLine($"Found {proteinCount} proteins with unique scores."); return proteinAndScores; } public void Dispose()=>_writer?.Dispose(); } } ================================================ FILE: SAUtils/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: SAUtils/ClinGen/DosageMapRegionItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.ClinGen { public sealed class DosageMapRegionItem : ISuppIntervalItem { public Chromosome Chromosome { get; } public int Start { get; } public int End { get; } public readonly int HiScore; public readonly int TsScore; public DosageMapRegionItem(Chromosome chromosome, int start, int end, int hiScore, int tsScore) { Chromosome = chromosome; Start = start; End = end; HiScore = hiScore; TsScore = tsScore; } public string GetJsonString() { var sb= StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("chromosome", Chromosome.EnsemblName); jsonObject.AddIntValue("begin", Start); jsonObject.AddIntValue("end", End); jsonObject.AddStringValue("haploinsufficiency", Data.ScoreToDescription[HiScore]); jsonObject.AddStringValue("triplosensitivity", Data.ScoreToDescription[TsScore]); return StringBuilderPool.GetStringAndReturn(sb); } } } ================================================ FILE: SAUtils/ClinGen/DosageMapRegionParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using OptimizedCore; using VariantAnnotation.IO; using Newtonsoft.Json.Linq; using SAUtils.DataStructures; namespace SAUtils.ClinGen { public sealed class DosageMapRegionParser : IDisposable { private readonly Stream _stream; private readonly Dictionary _refNameToChromosome; private const string GenomicLocation = "Genomic Location"; private const string HaploInsufficiencyScoreTag = "Haploinsufficiency Score"; private const string TriploSensitivityScoreTag = "Triplosensitivity Score"; private int _genomicLocationIndex = -1; private int _haploInsufficiencyScoreIndex = -1; private int _triploSensitivityScoreIndex = -1; private static int _unknownRegion = 0; public DosageMapRegionParser(Stream stream, Dictionary refNameToChromosome) { _stream = stream; _refNameToChromosome = refNameToChromosome; } public void Dispose() { _stream?.Dispose(); } public IEnumerable GetItems() { var dosageMapRegionItems = new List(); using (var reader = new StreamReader(_stream)) { string line; while ((line = reader.ReadLine()) != null) { if (line.StartsWith("#")) { ParseHeaderLine(line); } else { var item = GetDosageMapRegionItem(line, _refNameToChromosome); if (item != null) dosageMapRegionItems.Add(item); } } } ReportStatistics(dosageMapRegionItems); return dosageMapRegionItems; } private DosageMapRegionItem GetDosageMapRegionItem(string line, Dictionary refNameToChromosome) { var fields = line.OptimizedSplit('\t'); string genomicLocation = fields[_genomicLocationIndex]; (string chromName, int start, int end) = ParseGenomeLocation(genomicLocation); if (chromName == null) return null; if (!refNameToChromosome.TryGetValue(chromName, out var chrom)) return null; string haploInsufficiencyScore = fields[_haploInsufficiencyScoreIndex]; string triploSensitivityScore = fields[_triploSensitivityScoreIndex]; if (!int.TryParse(haploInsufficiencyScore, out int hiScore)) hiScore = -1; if (!int.TryParse(triploSensitivityScore, out int tsScore)) tsScore = -1; return new DosageMapRegionItem(chrom, start, end, hiScore, tsScore); } private void ParseHeaderLine(string line) { if (line.StartsWith("#ISCA ID")) GetColumnIndices(line); } private void GetColumnIndices(string line) { var cols = line.OptimizedSplit('\t'); _genomicLocationIndex = Array.IndexOf(cols, GenomicLocation); _haploInsufficiencyScoreIndex = Array.IndexOf(cols, HaploInsufficiencyScoreTag); _triploSensitivityScoreIndex = Array.IndexOf(cols, TriploSensitivityScoreTag); if (_genomicLocationIndex == -1 || _haploInsufficiencyScoreIndex == -1 || _triploSensitivityScoreIndex == -1) throw new InvalidDataException("Column indices not set!!"); } private static (string chromName, int Start, int End) ParseGenomeLocation(string genomeLocation) { int index1 = genomeLocation.IndexOf(':'); int index2 = genomeLocation.IndexOf('-'); if (index1 < 0 || index2 < 0) { Console.WriteLine($"Not able to parse {genomeLocation}"); _unknownRegion ++; return (null, -1, -1); } string chromName = genomeLocation.Substring(0, index1); int start = int.Parse(genomeLocation.Substring(index1 + 1, index2 - index1 - 1)); int end = int.Parse(genomeLocation.Substring(index2 + 1)); return (chromName, start, end); } private void ReportStatistics(IEnumerable items) { var description = new List(Data.ScoreToDescription.Values); KeyCounts hiScore = new KeyCounts(description); KeyCounts tsScore = new KeyCounts(description); foreach (DosageMapRegionItem item in items) { hiScore.Increment(Data.ScoreToDescription[item.HiScore]); tsScore.Increment(Data.ScoreToDescription[item.TsScore]); } var sb = StringBuilderPool.Get(); var jo = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jo.AddIntValue("genomeLocationCount", items.Count()); jo.AddIntValue("unparsableGenomeLocationCount", _unknownRegion); jo.AddObjectValue("haploinsufficiency", hiScore); jo.AddObjectValue("triplosensitivity", tsScore); sb.Append(JsonObject.CloseBrace); Console.WriteLine(JObject.Parse(StringBuilderPool.GetStringAndReturn(sb))); } } } ================================================ FILE: SAUtils/ClinGen/DosageMapRegions.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using ErrorHandling.Exceptions; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.ClinGen { public static class DosageMapRegions { private static string _outputDirectory; private static string _dosageMapRegionFile; private static string _inputReferencePath; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "tsv|t=", "input tsv file", v => _dosageMapRegionFile = v }, { "ref|r=", "input reference {filename}", v => _inputReferencePath = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_dosageMapRegionFile, "dosage map region TSV file", "--tsv") .CheckInputFilenameExists(_inputReferencePath, "reference sequence file", "--tsv") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates an interval annotation database from dbVar data", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var versionFileNames = Directory.GetFiles(".", "*.version"); if (versionFileNames.Length != 1) { throw new UserErrorException($"Multiple version files found in directory: {Directory.GetCurrentDirectory()}"); } var sourceVersion = DataSourceVersionReader.GetSourceVersion(versionFileNames[0]); string outFileName = $"{sourceVersion.Name.Replace(' ', '_')}_{sourceVersion.Version}"; var referenceProvider = new ReferenceSequenceProvider(GZipUtilities.GetAppropriateReadStream(_inputReferencePath)); using (var dosageSensitivityParser = new DosageMapRegionParser(GZipUtilities.GetAppropriateReadStream(_dosageMapRegionFile), referenceProvider.RefNameToChromosome)) using (var stream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.IntervalFileSuffix))) using (var nsiWriter = new NsiWriter(stream, sourceVersion, referenceProvider.Assembly, SaCommon.DosageSensitivityTag, ReportFor.StructuralVariants, SaCommon.SchemaVersion)) { nsiWriter.Write(dosageSensitivityParser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/ClinGen/DosageSensitivity.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using ErrorHandling.Exceptions; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.SA; namespace SAUtils.ClinGen { public static class DosageSensitivity { private static string _outputDirectory; private static string _dosageSensitivityFile; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "tsv|t=", "input tsv file", v => _dosageSensitivityFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .CheckInputFilenameExists(_dosageSensitivityFile, "dosage sensitivity TSV file", "--tsv") .SkipBanner() .ShowHelpMenu("Creates a gene annotation database from dbVar data", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var versionFileNames = Directory.GetFiles(".", "*.version"); if (versionFileNames.Length != 1) { throw new UserErrorException($"Multiple version files found in directory: {Directory.GetCurrentDirectory()}"); } var sourceVersion = DataSourceVersionReader.GetSourceVersion(versionFileNames[0]); string outFileName = $"{sourceVersion.Name.Replace(' ','_')}_{sourceVersion.Version}"; using (var dosageSensitivityParser= new DosageSensitivityParser(GZipUtilities.GetAppropriateReadStream(_dosageSensitivityFile))) using (var stream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.GeneFileSuffix))) using (var ngaWriter = new NgaWriter(stream, sourceVersion, SaCommon.DosageSensitivityTag, SaCommon.SchemaVersion, false)) { ngaWriter.Write(dosageSensitivityParser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/ClinGen/DosageSensitivityItem.cs ================================================ using System.IO; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.ClinGen { public sealed class DosageSensitivityItem : ISuppGeneItem { public string GeneSymbol { get; } public readonly int HiScore; public readonly int TsScore; public DosageSensitivityItem(string geneSymbol, int hiScore, int tsScore) { GeneSymbol = geneSymbol; HiScore = hiScore; TsScore = tsScore; if (!Data.ScoreToDescription.ContainsKey(HiScore) || !Data.ScoreToDescription.ContainsKey(TsScore)) { throw new InvalidDataException($"Unexpected score ({HiScore}, {TsScore}) observed for gene: {geneSymbol}"); } } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("haploinsufficiency", Data.ScoreToDescription[HiScore]); jsonObject.AddStringValue("triplosensitivity", Data.ScoreToDescription[TsScore]); sb.Append(JsonObject.CloseBrace); return StringBuilderPool.GetStringAndReturn(sb); } } } ================================================ FILE: SAUtils/ClinGen/DosageSensitivityParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; using Newtonsoft.Json.Linq; using SAUtils.DataStructures; namespace SAUtils.ClinGen { public sealed class DosageSensitivityParser:IDisposable { private readonly Stream _stream; private const string GeneSymbolTag = "#Gene Symbol"; private const string GeneIdTag = "Gene ID"; private const string HaploInsufficiencyScoreTag = "Haploinsufficiency Score"; private const string TriploSensitivityScoreTag = "Triplosensitivity Score"; private int _geneSymbolIndex = -1; private int _geneIdIndex = -1; private int _haploInsufficiencyScoreIndex = -1; private int _triploSensitivityScoreIndex = -1; public DosageSensitivityParser(Stream stream) { _stream = stream; } public void Dispose() { _stream?.Dispose(); } public Dictionary> GetItems() { var geneAnnotations = new Dictionary>(); var duplicateGenes = new HashSet(); using (var reader = new StreamReader(_stream)) { string line; while ((line = reader.ReadLine()) != null) { if (line.StartsWith("#")) { ParseHeaderLine(line); } else { if (MissingIndices()) throw new InvalidDataException("Column indices not set!!"); var geneAnnotation = GetGeneAndScores(line); bool isDuplicate = geneAnnotations.TryAdd(geneAnnotation.GeneSymbol, new List { geneAnnotation }); if (!isDuplicate) { duplicateGenes.Add(geneAnnotation.GeneSymbol); if (geneAnnotation.GetJsonString() != geneAnnotations[geneAnnotation.GeneSymbol][0].GetJsonString()) { Console.WriteLine(geneAnnotation.GetJsonString()); Console.WriteLine(geneAnnotations[geneAnnotation.GeneSymbol][0].GetJsonString()); throw new DataMisalignedException($"Duplicate gene entries have conflicting informatioin."); } } } } Console.WriteLine($"WARNING: Duplicate entries found for genes:{string.Join(',', duplicateGenes)}. But the contents were identical."); } ReportStatistics(geneAnnotations); return geneAnnotations; } private ISuppGeneItem GetGeneAndScores(string line) { var cols = line.OptimizedSplit('\t'); var gene = cols[_geneSymbolIndex]; if (!int.TryParse(cols[_haploInsufficiencyScoreIndex], out var hiScore)) hiScore = -1; if (!int.TryParse(cols[_triploSensitivityScoreIndex], out var tsScore)) tsScore = -1; return new DosageSensitivityItem(gene, hiScore, tsScore); } private bool MissingIndices() { return _geneSymbolIndex == -1 || _geneIdIndex == -1 || _haploInsufficiencyScoreIndex == -1 || _triploSensitivityScoreIndex == -1; } private void ParseHeaderLine(string line) { if (line.StartsWith("#Gene Symbol")) GetColumnIndices(line); } private void GetColumnIndices(string line) { var cols = line.OptimizedSplit('\t'); _geneSymbolIndex = Array.IndexOf(cols, GeneSymbolTag); _geneIdIndex = Array.IndexOf(cols, GeneIdTag); _haploInsufficiencyScoreIndex = Array.IndexOf(cols, HaploInsufficiencyScoreTag); _triploSensitivityScoreIndex = Array.IndexOf(cols, TriploSensitivityScoreTag); } private void ReportStatistics(Dictionary> items) { var genes = new List(items.Keys); var description = new List(Data.ScoreToDescription.Values); KeyCounts hiScore = new KeyCounts(description); KeyCounts tsScore = new KeyCounts(description); foreach (string gene in genes) { var item = (DosageSensitivityItem) items[gene][0]; hiScore.Increment(Data.ScoreToDescription[item.HiScore]); tsScore.Increment(Data.ScoreToDescription[item.TsScore]); } var sb = StringBuilderPool.Get(); var jo = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jo.AddIntValue("geneCount", items.Count); jo.AddObjectValue("haploinsufficiency", hiScore); jo.AddObjectValue("triplosensitivity", tsScore); sb.Append(JsonObject.CloseBrace); Console.WriteLine(JObject.Parse(StringBuilderPool.GetStringAndReturn(sb))); } } } ================================================ FILE: SAUtils/ClinGen/GeneDiseaseValidity.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using ErrorHandling.Exceptions; using IO; using OptimizedCore; using SAUtils.InputFileParsers; using VariantAnnotation.SA; namespace SAUtils.ClinGen { public static class GeneDiseaseValidity { private static string _outputDirectory; private static string _ugaFile; private static string _diseaseValidityFile; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "tsv|t=", "ClinGen gene validity file path", v => _diseaseValidityFile = v }, { "uga|u=", "UGA file path", v => _ugaFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .CheckInputFilenameExists(_diseaseValidityFile, "disease validity TSV file", "--tsv") .CheckInputFilenameExists(_ugaFile, "UGA file path", "--uga") .SkipBanner() .ShowHelpMenu("Creates a gene annotation database from ClinGen gene validity data", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var versionFileNames = Directory.GetFiles(".","*.version"); if (versionFileNames.Length != 1) { throw new UserErrorException($"Multiple version files found in directory: {Directory.GetCurrentDirectory()}"); } var sourceVersion = DataSourceVersionReader.GetSourceVersion(versionFileNames[0]); string outFileName = $"{sourceVersion.Name.Replace(' ', '_')}_{sourceVersion.Version}"; // read uga file to get hgnc id to gene symbols dictionary using (var diseaseValidityParser = new GeneDiseaseValidityParser(GZipUtilities.GetAppropriateReadStream(_diseaseValidityFile), GetHgncIdToGeneSymbols())) using (var stream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.GeneFileSuffix))) using (var ngaWriter = new NgaWriter(stream, sourceVersion, SaCommon.DiseaseValidityTag, SaCommon.SchemaVersion, true)) { ngaWriter.Write(diseaseValidityParser.GetItems()); } return ExitCodes.Success; } private static Dictionary GetHgncIdToGeneSymbols() { var idToSymbols = new Dictionary(); using (var ugaStream = GZipUtilities.GetAppropriateReadStream(_ugaFile)) using(var reader = new StreamReader(ugaStream)) { string line= reader.ReadLine();//first line has the count of entries while ((line = reader.ReadLine()) != null) { var splits = line.OptimizedSplit('\t'); var symbol = splits[2]; var hgncId = int.Parse(splits[8]); if(hgncId == -1) continue; if (idToSymbols.TryAdd(hgncId, symbol)) continue; if(symbol != idToSymbols[hgncId]) Console.WriteLine($"Different symbol for the same id({hgncId}). Existing: {idToSymbols[hgncId]}. New: {symbol}"); } } return idToSymbols; } } } ================================================ FILE: SAUtils/ClinGen/GeneDiseaseValidityItem.cs ================================================ using System; using System.Globalization; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.ClinGen { public sealed class GeneDiseaseValidityItem: ISuppGeneItem { public string GeneSymbol { get; } public readonly string DiseaseId; private readonly string _disease; private readonly string _classification; private readonly string _classificationDate; public GeneDiseaseValidityItem(string geneSymbol, string diseaseId, string disease, string classification, string classificationDate) { GeneSymbol = geneSymbol; DiseaseId = diseaseId; _disease = disease; _classification = classification; _classificationDate = classificationDate; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("diseaseId", DiseaseId); jsonObject.AddStringValue("disease", _disease); jsonObject.AddStringValue("classification", _classification); jsonObject.AddStringValue("classificationDate", _classificationDate); sb.Append(JsonObject.CloseBrace); return StringBuilderPool.GetStringAndReturn(sb); } public int CompareDate(GeneDiseaseValidityItem other) { var date = DateTime.ParseExact(_classificationDate, "yyyy-MM-dd", CultureInfo.InvariantCulture); var otherDate = DateTime.ParseExact(other._classificationDate, "yyyy-MM-dd", CultureInfo.InvariantCulture); return date.CompareTo(otherDate); } } } ================================================ FILE: SAUtils/ClinGen/GeneDiseaseValidityParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; using Newtonsoft.Json.Linq; namespace SAUtils.ClinGen { public sealed class GeneDiseaseValidityParser: IDisposable { private readonly Stream _stream; private readonly Dictionary _hgncIdToSymbols; private readonly HashSet _unknownIds = new HashSet(); private readonly HashSet _classificationSet = new HashSet { "no reported evidence", "disputed", "limited", "moderate", "definitive", "strong", "refuted", "no known disease relationship" }; public GeneDiseaseValidityParser(Stream stream, Dictionary hgncIdToSymbols) { _stream = stream; _hgncIdToSymbols = hgncIdToSymbols; } public void Dispose() { _stream?.Dispose(); } public Dictionary> GetItems() { var geneAnnotations = new Dictionary>(); using (var reader = new StreamReader(_stream)) { string line; bool isComments = true; bool isHeaderLine = false; while ((line = reader.ReadLine()) != null) { if (isComments) { //the header starts with a bunch of '+' signs if (!line.StartsWith("++++")) continue; isComments = false; isHeaderLine = true; continue; } if (isHeaderLine) { ParseHeaderLine(line); isHeaderLine = false; line = reader.ReadLine();//reading end of header line if (line.StartsWith("++++")) continue; } if (MissingIndices()) throw new InvalidDataException("Column indices not set!!"); var geneAnnotation = GetAnnotationItem(line); if(geneAnnotation == null) continue; if (geneAnnotations.TryGetValue(geneAnnotation.GeneSymbol, out var annotations)) AddLatest(annotations, geneAnnotation); else geneAnnotations.Add(geneAnnotation.GeneSymbol, new Dictionary {{geneAnnotation.DiseaseId, geneAnnotation}}); } } Console.WriteLine($"Number of geneIds missing from the cache:{_unknownIds.Count} ({100.0*_unknownIds.Count/_hgncIdToSymbols.Count}%)"); var items = GetLatestAnnotations(geneAnnotations); ReportStatistics(items); return items; } private static Dictionary> GetLatestAnnotations(Dictionary> annotationByDiseaseIds) { var latestAnnotations = new Dictionary>(); foreach (var annotation in annotationByDiseaseIds) { var geneAnnotation = new List(); foreach (var geneAnno in annotation.Value.Values) { geneAnnotation.Add(geneAnno); } latestAnnotations.Add(annotation.Key, geneAnnotation); } return latestAnnotations; } private static void AddLatest(Dictionary annotations, GeneDiseaseValidityItem geneAnnotation) { if(!annotations.TryGetValue(geneAnnotation.DiseaseId, out var diseaseItem)) annotations.Add(geneAnnotation.DiseaseId, geneAnnotation); else { if (diseaseItem.CompareDate(geneAnnotation) < 0) annotations[geneAnnotation.DiseaseId] = geneAnnotation; } } private GeneDiseaseValidityItem GetAnnotationItem(string line) { var cols = line.OptimizedSplit('\t'); var geneId = int.Parse(cols[_geneIdIndex].OptimizedSplit(':')[1]); if (!_hgncIdToSymbols.TryGetValue(geneId, out var geneSymbol)) { _unknownIds.Add(geneId); return null; } var disease = cols[_diseaseIndex].Trim('\"'); var diseaseId = cols[_diseaseIdIndex]; var classification = cols[_classificationIndex].ToLower(); if (!_classificationSet.Contains(classification)) { throw new InvalidDataException($"Unknown classification found: {classification}"); } var classificationDate = cols[_classificationDateIndex].OptimizedSplit('T')[0];//2018-06-07T14:37:47.175Z return new GeneDiseaseValidityItem(geneSymbol, diseaseId, disease, classification, classificationDate); } private void ReportStatistics(Dictionary> items) { var sb = StringBuilderPool.Get(); var jo = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jo.AddIntValue("geneIdsCount", items.Count); jo.AddIntValue("unknownGeneIdsCount", _unknownIds.Count); sb.Append(JsonObject.CloseBrace); Console.WriteLine(JObject.Parse(StringBuilderPool.GetStringAndReturn(sb))); } private int _geneIdIndex = -1; private int _diseaseIdIndex = -1; private int _diseaseIndex = -1; private int _classificationIndex = -1; private int _classificationDateIndex = -1; private const string GeneIdTag = "GENE ID (HGNC)"; private const string DiseaseTag = "DISEASE LABEL"; private const string DiseaseIdTag = "DISEASE ID (MONDO)"; private const string ClassificationTag = "CLASSIFICATION"; private const string ClassificationDateTag = "CLASSIFICATION DATE"; private bool MissingIndices() { return _geneIdIndex == -1 || _diseaseIdIndex == -1 || _diseaseIndex == -1 || _classificationIndex == -1 || _classificationDateIndex== -1; } private void ParseHeaderLine(string line) { var cols = line.OptimizedSplit('\t'); _geneIdIndex = Array.IndexOf(cols, GeneIdTag); _diseaseIndex = Array.IndexOf(cols, DiseaseTag); _diseaseIdIndex = Array.IndexOf(cols, DiseaseIdTag); _classificationIndex = Array.IndexOf(cols, ClassificationTag); _classificationDateIndex = Array.IndexOf(cols, ClassificationDateTag); } } } ================================================ FILE: SAUtils/ClinGen/ScoreToDescription.cs ================================================ using System.Collections.Generic; namespace SAUtils.ClinGen { public static class Data { public static Dictionary ScoreToDescription { get; } = new Dictionary { {-1, "Not yet evaluated"}, {0, "no evidence to suggest that dosage sensitivity is associated with clinical phenotype"}, {1, "little evidence suggesting dosage sensitivity is associated with clinical phenotype"}, {2, "emerging evidence suggesting dosage sensitivity is associated with clinical phenotype"}, {3, "sufficient evidence suggesting dosage sensitivity is associated with clinical phenotype"}, {30, "gene associated with autosomal recessive phenotype"}, {40, "dosage sensitivity unlikely"} }; } } ================================================ FILE: SAUtils/CosmicGeneFusions/Cache/ReferenceLoader.cs ================================================ using System.Collections.Generic; using Genome; using IO; using VariantAnnotation.Providers; namespace SAUtils.CosmicGeneFusions.Cache { public static class ReferenceLoader { public static Dictionary GetRefIndexToChromosome(string referencePath) { var sequenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(referencePath)); return sequenceProvider.RefIndexToChromosome; } } } ================================================ FILE: SAUtils/CosmicGeneFusions/Cache/TranscriptCache.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using Intervals; using VariantAnnotation.Caches; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; using VariantAnnotation.Utilities; namespace SAUtils.CosmicGeneFusions.Cache { public sealed class TranscriptCache { private readonly Dictionary _idToTranscript; public TranscriptCache(Dictionary idToTranscript) => _idToTranscript = idToTranscript; public static TranscriptCache Create(Stream stream, Dictionary refIndexToChromosome) { using var reader = new TranscriptCacheReader(stream); TranscriptCacheData cacheData = reader.Read(refIndexToChromosome); return new TranscriptCache(GetTranscriptIdToTranscript(cacheData.TranscriptIntervalArrays)); } // ReSharper disable once ParameterTypeCanBeEnumerable.Local internal static Dictionary GetTranscriptIdToTranscript(IntervalArray[] transcriptIntervalArrays) { var transcriptIdToTranscript = new Dictionary(); foreach (IntervalArray refTranscriptIntervals in transcriptIntervalArrays) { if (refTranscriptIntervals == null) continue; foreach (Interval transcriptInterval in refTranscriptIntervals.Array) { ITranscript transcript = transcriptInterval.Value; if (transcript.Source != Source.Ensembl) continue; if (!transcriptIdToTranscript.ContainsKey(transcript.Id.WithVersion)) transcriptIdToTranscript[transcript.Id.WithVersion] = transcript; if (!transcriptIdToTranscript.ContainsKey(transcript.Id.WithoutVersion)) transcriptIdToTranscript[transcript.Id.WithoutVersion] = transcript; } } return transcriptIdToTranscript; } public (string GeneId, string GeneSymbol) GetGene(string transcriptId) { string shortTranscriptId = FormatUtilities.SplitVersion(transcriptId).Id; return _idToTranscript.TryGetValue(shortTranscriptId, out ITranscript transcript) ? (transcript.Gene.EnsemblId.WithoutVersion, transcript.Gene.Symbol) : HandleMissingTranscripts(transcriptId); } // In GRCh38, we're missing some of the transcripts specified by COSMIC. However, it's fine to substitute // these transcripts with others belonging to the same gene. These are generally from transcripts that are // no longer used. internal static (string GeneId, string GeneSymbol) HandleMissingTranscripts(string transcriptId) => transcriptId switch { "ENST00000646891.1" => ("ENSG00000157764", "BRAF"), "ENST00000242365.4" => ("ENSG00000122778", "KIAA1549"), "ENST00000311979.3" => ("ENSG00000172660", "TAF15"), "ENST00000529193.1" => ("ENSG00000157613", "CREB3L1"), "ENST00000312675.4" => ("ENSG00000145012", "LPP"), "ENST00000556625.1" => ("ENSG00000258389", "DUX4"), _ => throw new InvalidDataException($"Found an unhandled transcript ID in HandleMissingTranscripts: {transcriptId}") }; } } ================================================ FILE: SAUtils/CosmicGeneFusions/Conversion/CosmicConverter.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using SAUtils.CosmicGeneFusions.Cache; namespace SAUtils.CosmicGeneFusions.Conversion { public static class CosmicConverter { public static Dictionary Convert(Dictionary> fusionIdToEntries, TranscriptCache transcriptCache) { var fusionKeyToJsonList = new Dictionary>(); foreach ((int fusionId, HashSet fusionEntries) in fusionIdToEntries) { (ulong fusionKey, string json) = GetCosmicGeneFusion(fusionId, fusionEntries, transcriptCache); if (json == null) continue; if (!fusionKeyToJsonList.TryGetValue(fusionKey, out List jsonEntries)) { jsonEntries = new List(); fusionKeyToJsonList[fusionKey] = jsonEntries; } jsonEntries.Add(json); } return fusionKeyToJsonList.ToJsonArray(); } internal static Dictionary ToJsonArray(this Dictionary> geneKeyToJsonList) { var geneKeyToJson = new Dictionary(); foreach ((ulong geneKey, List jsonList) in geneKeyToJsonList) geneKeyToJson[geneKey] = jsonList.ToArray(); return geneKeyToJson; } internal static (ulong FusionKey, string Json) GetCosmicGeneFusion(int fusionId, HashSet fusionEntries, TranscriptCache transcriptCache) { (int[] pubMedIds, int numSamples, string hgvsNotation) = AggregateRawCosmicGeneFusions(fusionEntries); if (hgvsNotation == null) return (0, null); var id = $"COSF{fusionId}"; CosmicCount[] histologies = Histology.GetCounts(fusionEntries, numSamples); CosmicCount[] sites = Site.GetCounts(fusionEntries, numSamples); (string[] geneSymbols, ulong fusionKey) = HgvsRnaParser.GetTranscripts(hgvsNotation, transcriptCache); var geneFusion = new CosmicGeneFusion(id, numSamples, geneSymbols, hgvsNotation, histologies, sites, pubMedIds); var json = geneFusion.ToString(); return (fusionKey, json); } internal static (int[] PubMedIds, int NumSamples, string HgvsNotation) AggregateRawCosmicGeneFusions( // ReSharper disable once ParameterTypeCanBeEnumerable.Local HashSet fusionEntries) { var sampleIds = new HashSet(); var pubMedIds = new HashSet(); var hgvsEntries = new HashSet(); foreach (RawCosmicGeneFusion fusionEntry in fusionEntries) { pubMedIds.Add(fusionEntry.PubMedId); sampleIds.Add(fusionEntry.SampleId); hgvsEntries.Add(fusionEntry.HgvsNotation); } if (hgvsEntries.Count != 1) throw new InvalidDataException($"Expected one HGVS entry for the gene fusion, but found {hgvsEntries.Count}"); string hgvsr = HgvsRnaFixer.Fix(hgvsEntries.First()); return (pubMedIds.OrderBy(x => x).ToArray(), sampleIds.Count, hgvsr); } } } ================================================ FILE: SAUtils/CosmicGeneFusions/Conversion/CosmicGeneFusion.cs ================================================ // ReSharper disable InconsistentNaming // ReSharper disable SuggestBaseTypeForParameter using System.Text.Json; namespace SAUtils.CosmicGeneFusions.Conversion { public sealed record CosmicGeneFusion(string id, int numSamples, string[] geneSymbols, string hgvsr, CosmicCount[] histologies, CosmicCount[] sites, int[] pubMedIds) { public override string ToString() { string json = JsonSerializer.Serialize(this); return json.Substring(1, json.Length - 2); } } public sealed record CosmicCount(string name, int numSamples); } ================================================ FILE: SAUtils/CosmicGeneFusions/Conversion/HgvsRnaFixer.cs ================================================ using System; using System.Text; namespace SAUtils.CosmicGeneFusions.Conversion { public static class HgvsRnaFixer { // COSMIC isn't using the correct HGVS notation, so we're just going to add the proper junction string (::) between each transcript public static string Fix(string hgvsNotation) { var sb = new StringBuilder(); ReadOnlySpan delimiter = "_ENST".AsSpan(); ReadOnlySpan hgvsSpan = hgvsNotation.AsSpan(); var numTranscripts = 0; while (true) { int index = hgvsSpan.IndexOf(delimiter); numTranscripts++; if (index == -1) { sb.Append(hgvsSpan); break; } sb.Append(hgvsSpan.Slice(0, index)); sb.Append("::"); hgvsSpan = hgvsSpan.Slice(index + 1); } // this is to capture HGVS entries like "ENST00000283243.12(PLA2R1):r.1_2802" which is not actually a gene fusion return numTranscripts == 1 ? null : sb.ToString(); } } } ================================================ FILE: SAUtils/CosmicGeneFusions/Conversion/HgvsRnaParser.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using System.Text.RegularExpressions; using SAUtils.CosmicGeneFusions.Cache; using VariantAnnotation.GeneFusions.Utilities; namespace SAUtils.CosmicGeneFusions.Conversion { public static class HgvsRnaParser { private static readonly Regex HgvsRegex = new(@"(ENST[^\(]+)", RegexOptions.Compiled); public static (string[] GeneSymbols, ulong FusionKey) GetTranscripts(string hgvsNotation, TranscriptCache transcriptCache) { (string transcriptId5, string transcriptId3) = Parse(hgvsNotation); (string geneId5, string geneSymbol5) = transcriptCache.GetGene(transcriptId5); (string geneId3, string geneSymbol3) = transcriptCache.GetGene(transcriptId3); ulong fusionKey = GeneFusionKey.Create(GeneFusionKey.CreateGeneKey(geneId5), GeneFusionKey.CreateGeneKey(geneId3)); return (new[] {geneSymbol5, geneSymbol3}, fusionKey); } public static (string TranscriptId5, string TranscriptId3) Parse(string hgvsString) { // the only gene fusion involving 3 transcripts. The middle one is a bit suspicious, so we'll use the other two. (GRCh37) if (hgvsString == "ENST00000305877.8(BCR):r.1_2866::ENST00000372348.2(ABL1):r.511-?_511-?::ENST00000318560.5(ABL1):r.461_5766") return ("ENST00000305877.8", "ENST00000318560.5"); // same situation in GRCh38 if (hgvsString == "ENST00000305877.12(BCR):r.1_2866::ENST00000372348.6(ABL1):r.511-?_511-?::ENST00000318560.5(ABL1):r.461_5766") return ("ENST00000305877.12", "ENST00000318560.5"); var transcriptIds = new List(); foreach (Match match in HgvsRegex.Matches(hgvsString)) transcriptIds.Add(match.Value); string[] uniqueTranscriptIds = transcriptIds.Distinct().ToArray(); if (uniqueTranscriptIds.Length != 2) throw new InvalidDataException($"Could not identify 2 transcripts in HGVS RNA parser: {hgvsString}"); return (uniqueTranscriptIds[0], uniqueTranscriptIds[1]); } } } ================================================ FILE: SAUtils/CosmicGeneFusions/Conversion/Histology.cs ================================================ using System.Collections.Generic; using System.IO; using SAUtils.CosmicGeneFusions.IO; using SAUtils.CosmicGeneFusions.Utilities; namespace SAUtils.CosmicGeneFusions.Conversion { public static class Histology { // ReSharper disable once ParameterTypeCanBeEnumerable.Global public static CosmicCount[] GetCounts(HashSet fusionEntries, int numSamples) { var histologyCountDict = new Dictionary(); var totalCount = 0; foreach (RawCosmicGeneFusion fusionEntry in fusionEntries) { string histology = GetMostSpecificValue(fusionEntry.PrimaryHistology, fusionEntry.HistologySubtype1); if (histology == CosmicGeneFusionParser.MissingValue) continue; if (histologyCountDict.TryGetValue(histology, out int count)) histologyCountDict[histology] = count + 1; else histologyCountDict[histology] = 1; totalCount++; } if (totalCount != numSamples) { throw new InvalidDataException($"Found different histology count total ({totalCount}) than samples ({numSamples})."); } return histologyCountDict.GetCosmicCounts(); } private static string GetMostSpecificValue(string primary, string subtype1) => subtype1 != CosmicGeneFusionParser.MissingValue ? subtype1 : primary; } } ================================================ FILE: SAUtils/CosmicGeneFusions/Conversion/RawCosmicGeneFusion.cs ================================================ namespace SAUtils.CosmicGeneFusions.Conversion { public sealed record RawCosmicGeneFusion(int SampleId, int FusionId, string PrimarySite, string SiteSubtype1, string PrimaryHistology, string HistologySubtype1, string HgvsNotation, int PubMedId); } ================================================ FILE: SAUtils/CosmicGeneFusions/Conversion/Site.cs ================================================ using System.Collections.Generic; using System.IO; using SAUtils.CosmicGeneFusions.IO; using SAUtils.CosmicGeneFusions.Utilities; namespace SAUtils.CosmicGeneFusions.Conversion { public static class Site { // ReSharper disable once ParameterTypeCanBeEnumerable.Global public static CosmicCount[] GetCounts(HashSet fusionEntries, int numSamples) { var siteCountDict = new Dictionary(); var totalCount = 0; foreach (RawCosmicGeneFusion fusionEntry in fusionEntries) { string site = CombineLevels(fusionEntry.PrimarySite, fusionEntry.SiteSubtype1); if (site == CosmicGeneFusionParser.MissingValue) continue; if (siteCountDict.TryGetValue(site, out int count)) siteCountDict[site] = count + 1; else siteCountDict[site] = 1; totalCount++; } // this can be less if we had missing values if (totalCount > numSamples) throw new InvalidDataException($"Found more total sites ({totalCount}) than samples ({numSamples})."); return siteCountDict.GetCosmicCounts(); } private static string CombineLevels(string primary, string subtype1) => subtype1 != CosmicGeneFusionParser.MissingValue ? $"{primary} ({subtype1})" : primary; } } ================================================ FILE: SAUtils/CosmicGeneFusions/CreateCosmicGeneFusions.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using Genome; using IO; using SAUtils.CosmicGeneFusions.Cache; using SAUtils.CosmicGeneFusions.Conversion; using SAUtils.CosmicGeneFusions.IO; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.CosmicGeneFusions { public static class CreateCosmicGeneFusions { private static string _transcriptCachePath; private static string _cosmicGeneFusionsPath; private static string _referencePath; private static string _outputDirectory; private static string _releaseDate; private static string _cosmicVersion; private static ExitCodes ProgramExecution() { Console.Write("- loading reference sequence... "); Dictionary refIndexToChromosome = ReferenceLoader.GetRefIndexToChromosome(_referencePath); Console.WriteLine("finished."); Console.Write("- loading transcript cache... "); using FileStream cacheStream = FileUtilities.GetReadStream(_transcriptCachePath); var transcriptCache = TranscriptCache.Create(cacheStream, refIndexToChromosome); Console.WriteLine("finished."); Console.Write("- parsing COSMIC gene fusions... "); using StreamReader cosmicReader = GZipUtilities.GetAppropriateStreamReader(_cosmicGeneFusionsPath); Dictionary> fusionIdToEntries = CosmicGeneFusionParser.Parse(cosmicReader); Console.WriteLine($"{fusionIdToEntries.Count:N0} fusion IDs loaded"); Console.Write("- converting COSMIC entries... "); Dictionary fusionKeyToJson = CosmicConverter.Convert(fusionIdToEntries, transcriptCache); Console.WriteLine($"{fusionKeyToJson.Count:N0} gene pairs converted"); DataSourceVersion version = CreateDataSourceVersion(_cosmicVersion, _releaseDate); WriteGeneFusions(_outputDirectory, fusionKeyToJson, version); Console.WriteLine(); Console.WriteLine($"Total: {fusionKeyToJson.Count:N0} gene pairs in database."); return ExitCodes.Success; } // ReSharper disable once SuggestBaseTypeForParameter private static void WriteGeneFusions(string outputDirectory, Dictionary geneKeyToJson, DataSourceVersion version) { Console.Write("- writing gene fusions SA file... "); string outputPath = Path.Combine(outputDirectory, $"COSMIC_GeneFusions_{version.Version}{SaCommon.GeneFusionJsonSuffix}"); using var writer = new GeneFusionJsonWriter(FileUtilities.GetCreateStream(outputPath), "cosmicGeneFusions", version); writer.Write(geneKeyToJson); Console.WriteLine("finished."); } internal static DataSourceVersion CreateDataSourceVersion(string version, string releaseDate) { long releaseTicks = DateTime.Parse(releaseDate).Ticks; return new DataSourceVersion("COSMIC gene fusions", version, releaseTicks, "manually curated somatic gene fusions"); } public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "cache|c=", "transcript cache {path}", v => _transcriptCachePath = v }, { "in|i=", "COSMIC gene fusions {path}", v => _cosmicGeneFusionsPath = v }, { "out|o=", "output {directory}", v => _outputDirectory = v }, { "ref|r=", "input reference sequence {path}", v => _referencePath = v }, { "releaseDate=", "release {date} (YYYY-MM-dd)", v => _releaseDate = v }, { "cosmicVersion=", "COSMIC {version} (e.g. 92)", v => _cosmicVersion = v } }; var commandLineExample = $"{command} [options]"; ExitCodes exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_referencePath, "reference sequence", "--ref") .CheckInputFilenameExists(_transcriptCachePath, "transcript cache", "--cache") .CheckInputFilenameExists(_cosmicGeneFusionsPath, "COSMIC gene fusions", "--in") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .HasRequiredDate(_releaseDate, "COSMIC release date", "--date") .HasRequiredParameter(_cosmicVersion, "COSMIC version", "--version") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with COSMIC gene fusion annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } } } ================================================ FILE: SAUtils/CosmicGeneFusions/IO/CosmicGeneFusionParser.cs ================================================ using System.Collections.Generic; using System.IO; using SAUtils.CosmicGeneFusions.Conversion; namespace SAUtils.CosmicGeneFusions.IO { public static class CosmicGeneFusionParser { public const string MissingValue = "NS"; public static Dictionary> Parse(StreamReader reader) { var fusionEntries = new List(); // skip the first line reader.ReadLine(); while (true) { string line = reader.ReadLine(); if (line == null) break; string[] cols = line.Split('\t'); if (cols.Length != 32) throw new InvalidDataException($"Expected 32 columns in the COSMIC gene fusions file, but found {cols.Length}"); string fusionIdString = cols[10]; // skip entries that are missing the fusion ID if (string.IsNullOrEmpty(fusionIdString)) continue; int sampleId = int.Parse(cols[0]); string primarySite = RemoveUnderlines(cols[2]); string siteSubtype1 = RemoveUnderlines(cols[3]); string primaryHistology = RemoveUnderlines(cols[6]); string histologySubtype1 = RemoveUnderlines(cols[7]); int fusionId = int.Parse(fusionIdString); string hgvsNotation = cols[11]; int pubMedId = int.Parse(cols[31]); fusionEntries.Add(new RawCosmicGeneFusion(sampleId, fusionId, primarySite, siteSubtype1, primaryHistology, histologySubtype1, hgvsNotation, pubMedId)); } return fusionEntries.GroupByFusionId(); } // ReSharper disable once ParameterTypeCanBeEnumerable.Local private static Dictionary> GroupByFusionId(this List fusionEntries) { var fusionIdToEntries = new Dictionary>(); foreach (RawCosmicGeneFusion fusionEntry in fusionEntries) { if (!fusionIdToEntries.TryGetValue(fusionEntry.FusionId, out HashSet fusionEntrySet)) { fusionEntrySet = new HashSet(); fusionIdToEntries[fusionEntry.FusionId] = fusionEntrySet; } fusionEntrySet.Add(fusionEntry); } return fusionIdToEntries; } internal static string RemoveUnderlines(string s) => s.Replace('_', ' '); } } ================================================ FILE: SAUtils/CosmicGeneFusions/IO/GeneFusionJsonWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Text; using Compression.Utilities; using IO; using IO.v2; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.Interface.Providers; namespace SAUtils.CosmicGeneFusions.IO { public sealed class GeneFusionJsonWriter : IDisposable { private readonly ExtendedBinaryWriter _writer; public GeneFusionJsonWriter(Stream stream, string jsonKey, IDataSourceVersion version, bool leaveOpen = false) { _writer = new ExtendedBinaryWriter(stream, Encoding.UTF8, leaveOpen); WriteHeader(); _writer.Write(jsonKey); version.Write(_writer); } private void WriteHeader() { var header = new Header(FileType.GeneFusionJson, GeneFusionJsonReader.SupportedFileFormatVersion); header.Write(_writer); } public void Write(Dictionary geneKeyToJson) { using var ms = new MemoryStream(); using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { writer.WriteOpt(geneKeyToJson.Count); foreach ((ulong geneKey, string[] jsonArray) in geneKeyToJson) { writer.Write(geneKey); writer.WriteOpt(jsonArray.Length); foreach (string json in jsonArray) writer.Write(json); } } byte[] bytes = ms.ToArray(); _writer.WriteCompressedByteArray(bytes, bytes.Length); } public void Dispose() => _writer.Dispose(); } } ================================================ FILE: SAUtils/CosmicGeneFusions/Utilities/CosmicCountUtilities.cs ================================================ using System.Collections.Generic; using SAUtils.CosmicGeneFusions.Conversion; namespace SAUtils.CosmicGeneFusions.Utilities { public static class CosmicCountUtilities { public static CosmicCount[] GetCosmicCounts(this Dictionary countDict) { var counts = new List(countDict.Count); foreach ((string histology, int count) in countDict) counts.Add(new CosmicCount(histology, count)); return counts.ToArray(); } } } ================================================ FILE: SAUtils/CreateClinvarDb/ClinVarMain.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.ClinVar; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.CreateClinvarDb { public static class ClinVarMain { private static string _rcvFile; private static string _vcvFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "rcv|i=", "ClinVar Full release XML file", v => _rcvFile = v }, { "vcv|c=", "ClinVar Variation release XML file", v => _vcvFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckInputFilenameExists(_rcvFile, "ClinVar full release XML file", "--rcv") .CheckInputFilenameExists(_vcvFile, "ClinVar variation release XML file", "--vcv") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with ClinVar annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var version = DataSourceVersionReader.GetSourceVersion(_rcvFile + ".version"); string outFileName = $"{version.Name}_{version.Version}"; using (var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference))) using (var clinvarReader = new ClinVarParser(GZipUtilities.GetAppropriateReadStream(_rcvFile), GZipUtilities.GetAppropriateReadStream(_vcvFile), referenceProvider)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName+SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.ClinvarTag, false, true, SaCommon.SchemaVersion, false)) using (var schemaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.JsonSchemaSuffix))) using (var schemaWriter = new StreamWriter(schemaStream)) { nsaWriter.Write(clinvarReader.GetItems()); schemaWriter.Write(clinvarReader.JsonSchema); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/CreateClinvarDb/ClinVarStats.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using System.Text; using IO; using OptimizedCore; using SAUtils.DataStructures; using SAUtils.InputFileParsers.ClinVar; using VariantAnnotation.Interface.IO; using VariantAnnotation.IO; namespace SAUtils.CreateClinvarDb; public class ClinVarStats { public int RcvCount = 0; public int VcvCount = 0; public int InvalidRefAlleleCount = 0; public readonly KeyCounts RcvPathogenicityCounts = new KeyCounts(ClinVarCommon.ValidPathogenicity); public readonly KeyCounts RcvReviewStatusCounts = new KeyCounts(ClinVarCommon.ReviewStatusStrings.Values); public readonly KeyCounts VcvPathogenicityCounts = new KeyCounts(ClinVarCommon.ValidPathogenicity); public readonly KeyCounts VcvReviewStatusCounts = new KeyCounts(ClinVarCommon.ReviewStatusStrings.Values); public void GetClinvarSaItemsStats(List items) { foreach (IClinVarSaItem item in items) { if (item.Id.StartsWith("RCV")) { RcvCount++; foreach (string significance in item.Significances) { RcvPathogenicityCounts.Increment(significance); } RcvReviewStatusCounts.Increment(ClinVarCommon.ReviewStatusStrings[item.ReviewStatus]); } else { VcvCount++; foreach (string significance in item.Significances) { VcvPathogenicityCounts.Increment(significance); } VcvReviewStatusCounts.Increment(ClinVarCommon.ReviewStatusStrings[item.ReviewStatus]); } } } public override string ToString() { var sb = StringBuilderPool.Get(); var jo = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jo.AddIntValue("rcvCount", RcvCount); jo.AddObjectValue("rcvPathogenicity", RcvPathogenicityCounts); jo.AddObjectValue("rcvReviewStatus", RcvReviewStatusCounts); jo.AddIntValue("vcvCount", VcvCount); jo.AddObjectValue("vcvPathogenicity", VcvPathogenicityCounts); jo.AddObjectValue("vcvReviewStatus", VcvReviewStatusCounts); sb.Append(JsonObject.CloseBrace); return StringBuilderPool.GetStringAndReturn(sb); } } ================================================ FILE: SAUtils/CreateCosmicDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.Cosmic; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.CreateCosmicDb { public static class Main { private static string _vcfFile; private static string _tsvFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "COSMIC VCF file", v => _vcfFile = v }, { "tsv|t=", "COSMIC TSV file", v => _tsvFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_vcfFile, "COSMIC VCF file", "--in") .CheckInputFilenameExists(_vcfFile, "COSMIC VCF file", "--in") .HasRequiredParameter(_tsvFile, "COSMIC TSV file", "--tsv") .CheckInputFilenameExists(_tsvFile, "COSMIC TSV file", "--tsv") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with COSMIC annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var cosmicReader = new MergedCosmicReader(_vcfFile, _tsvFile, referenceProvider); var version = DataSourceVersionReader.GetSourceVersion(_vcfFile + ".version"); string outFileName = $"{version.Name}_{version.Version}"; using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.CosmicTag, false, true, SaCommon.SchemaVersion, false)) { nsaWriter.Write(cosmicReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/CreateDbsnpDb/Main.cs ================================================ using System; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.DbSnp; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.CreateDbsnpDb { public static class Main { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input VCF file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "dbSNP VCF file", "--in") .CheckInputFilenameExists(_inputFile, "dbSNP VCF file", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}"; using (var dbSnpReader = new DbSnpReader(GZipUtilities.GetAppropriateReadStream(_inputFile), referenceProvider)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.DbsnpTag, true, true, SaCommon.SchemaVersion, false)) { var count = nsaWriter.Write(dbSnpReader.GetItems()); Console.WriteLine($"{{\n \"totalCount\":{count} \n}}"); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/CreateDecipherDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.Decipher; using VariantAnnotation.Providers; using VariantAnnotation.SA; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; namespace SAUtils.CreateDecipherDb { public static class Main { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input txt file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "Decipher txt file", "--in") .CheckInputFilenameExists(_inputFile, "Decipher txt file", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with Decipher", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}".Replace(' ','_'); using (var decipherParser = new DecipherParser(GZipUtilities.GetAppropriateStreamReader(_inputFile), referenceProvider.RefNameToChromosome)) using (FileStream nsiStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.IntervalFileSuffix))) using (var nsiWriter = new NsiWriter(nsiStream, version, referenceProvider.Assembly, SaCommon.DecipherTag, ReportFor.StructuralVariants, SaCommon.SchemaVersion)) { nsiWriter.Write(decipherParser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/CreateGlobalAllelesDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.DbSnp; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.CreateGlobalAllelesDb { public static class Main { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input VCF file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "dbSNP VCF file", "--in") .CheckInputFilenameExists(_inputFile, "dbSNP VCF file", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var globalMinorReader = new GlobalMinorReader(GZipUtilities.GetAppropriateReadStream(_inputFile), referenceProvider.RefNameToChromosome); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}_globalMinor"; using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.GlobalAlleleTag, true, false, SaCommon.SchemaVersion, true)) { nsaWriter.Write(globalMinorReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/CreateGmeDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.Gme; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.CreateGmeDb { public static class Main { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input TSV file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "GME TSV file", "--in") .CheckInputFilenameExists(_inputFile, "GME TSV file", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}"; using (var gmeReader = new GmeParser(GZipUtilities.GetAppropriateStreamReader(_inputFile), referenceProvider)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.GmeTag, true, false, SaCommon.SchemaVersion, false)) { nsaWriter.Write(gmeReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/CreateOneKgDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.OneKGen; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.CreateOneKgDb { public static class Main { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input VCF file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "OneK Gen VCF file", "--in") .CheckInputFilenameExists(_inputFile, "OneK Gen VCF file", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}".Replace(' ','_'); using (var oneKGenReader = new OneKGenReader(GZipUtilities.GetAppropriateReadStream(_inputFile), referenceProvider)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var writer = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.OneKgenTag, true, false, SaCommon.SchemaVersion, false)) { writer.Write(oneKGenReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/CreateTopMedDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.TOPMed; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.CreateTopMedDb { public static class Main { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input VCF file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "TopMed VCFfile", "--in") .CheckInputFilenameExists(_inputFile, "TopMed VCFfile", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}"; using (var topMedReader = new TopMedReader(GZipUtilities.GetAppropriateStreamReader(_inputFile), referenceProvider)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.TopMedTag, true, false, SaCommon.SchemaVersion, false)) { nsaWriter.Write(topMedReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/Custom/AllowedValues.cs ================================================ using System.Collections.Generic; using System.Linq; using ErrorHandling.Exceptions; namespace SAUtils.Custom { public static class AllowedValues { private const int MaxFilterLength = 20; private const int MaxIdentifierLength = 50; private const int MaxDescriptionLength = 100; private static readonly string[] EmptyValues = {".", ""}; private static readonly HashSet PredictionValues = new HashSet { "pathogenic", "p", "likely pathogenic", "lp", "vus", "likely benign", "lb", "benign", "b" }; public static void ValidatePredictionValue(string value, string line) { if (!IsEmptyValue(value) && !PredictionValues.Contains(value.ToLower())) throw new UserErrorException($"{value} is not a valid prediction value.\nInput line: {line}"); } public static void ValidateFilterValue(string value, string line) => CheckValueLength(value, line, MaxFilterLength); public static void ValidateIdentifierValue(string value, string line) => CheckValueLength(value, line, MaxIdentifierLength); public static void ValidateDescriptionValue(string value, string line) => CheckValueLength(value, line, MaxDescriptionLength); public static void ValidateScoreValue(string value, string line) { // empty (.) implies unknown score if (IsEmptyValue(value)) return; if (double.TryParse(value, out _)) return; var e = new UserErrorException( $"{value} is not a valid score value. Scores are expected to be numbers."); e.Data["Line"] = line; throw e; } public static bool IsEmptyValue(string value) => EmptyValues.Contains(value); private static void CheckValueLength(string value, string line, int maxLength) { if (!string.IsNullOrEmpty(value) && value.Length > maxLength) throw new UserErrorException($"\"{value}\" exceeds the allowed length for descriptions ({maxLength} characters).\nInput line:{line}"); } } } ================================================ FILE: SAUtils/Custom/CaUtilities.cs ================================================ using System; using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using Genome; using SAUtils.DataStructures; using SAUtils.Schema; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.Custom { public static class CaUtilities { public static NsaWriter GetNsaWriter(Stream nsaStream, Stream indexStream, VariantAnnotationsParser parser, string dataVersion, ISequenceProvider referenceProvider, out DataSourceVersion version, bool skipRefBaseValidation) { dataVersion = string.IsNullOrEmpty(parser.Version) ? dataVersion : parser.Version; version = new DataSourceVersion(parser.JsonTag, dataVersion, DateTime.Now.Ticks, parser.DataSourceDescription); return new NsaWriter( nsaStream, indexStream, version, referenceProvider, parser.JsonTag, parser.MatchByAllele, // match by allele parser.IsArray, // is array SaCommon.SchemaVersion, false, // is positional skipRefBaseValidation, // skip incorrect ref base true // throw error on conflicting entries ); } public static NsiWriter GetNsiWriter(Stream nsiStream, DataSourceVersion version, GenomeAssembly assembly, string jsonTag, ReportFor reportFor) => new NsiWriter(nsiStream, version, assembly, jsonTag, reportFor, SaCommon.SchemaVersion); public static NgaWriter GetNgaWriter(Stream ngaStream, GeneAnnotationsParser parser, string dataVersion) { dataVersion = string.IsNullOrEmpty(parser.Version) ? dataVersion : parser.Version; var version = new DataSourceVersion(parser.JsonTag, dataVersion, DateTime.Now.Ticks, parser.DataSourceDescription); return new NgaWriter(ngaStream, version, parser.JsonTag, SaCommon.SchemaVersion, false); } public static (string JsonTag, int NsaItemsCount, SaJsonSchema IntervalJsonSchema, List Intervals) WriteSmallVariants(VariantAnnotationsParser parser, NsaWriter nsaWriter, StreamWriter schemaWriter) { int nsaItemsCount = nsaWriter.Write(parser.GetItems()); schemaWriter.Write(parser.JsonSchema); var intervals = parser.GetCustomIntervals(); if (nsaItemsCount == 0 & intervals == null) throw new UserErrorException(GeneAnnotationsParser.NoValidEntriesErrorMessage); return (parser.JsonTag, nsaItemsCount, parser.IntervalJsonSchema, intervals); } public static string GetInputFileName(string inputFilePath) { int fileNameIndex = inputFilePath.LastIndexOf(Path.DirectorySeparatorChar); return fileNameIndex < 0 ? inputFilePath : inputFilePath.Substring(fileNameIndex + 1); } } } ================================================ FILE: SAUtils/Custom/CustomGene.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; using SAUtils.Schema; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.Custom { public sealed class CustomGene : ISuppGeneItem { public string GeneSymbol { get; } private readonly List _values; private readonly SaJsonSchema _jsonSchema; private readonly string _inputLine; public CustomGene(string geneSymbol, List values, SaJsonSchema jsonSchema, string inputLine) { GeneSymbol = geneSymbol; _values = values; _jsonSchema = jsonSchema; _inputLine = inputLine; } public string GetJsonString() { try { return JsonObject.OpenBrace + _jsonSchema.GetJsonString(_values) + JsonObject.CloseBrace; } catch (UserErrorException e) { throw new UserErrorException(e.Message + $"\nInput line: {_inputLine}"); } } } } ================================================ FILE: SAUtils/Custom/GeneAnnotationsParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using ErrorHandling; using ErrorHandling.Exceptions; using OptimizedCore; using SAUtils.GeneIdentifiers; using SAUtils.Schema; using VariantAnnotation.Interface.SA; using VariantAnnotation.SA; namespace SAUtils.Custom { public sealed class GeneAnnotationsParser : IDisposable { private readonly StreamReader _reader; private readonly Dictionary _entrezGeneIdToSymbol; private readonly Dictionary _ensemblIdToSymbol; public string JsonTag; public string Version; public string DataSourceDescription; private string[] _tags; internal CustomAnnotationCategories[] Categories; internal string[] Descriptions; internal SaJsonValueType[] ValueTypes; internal readonly List JsonKeys = new List(); public SaJsonSchema JsonSchema; private const int NumRequiredColumns = 2; private int _numAnnotationColumns; private Action[] _annotationValidators; private readonly List _unknownGenes = new List(); public const string NoValidEntriesErrorMessage = "The provided TSV has no valid custom annotation entries."; public const string UnknownGeneIdsErrorMessage = "The following gene IDs were not recognized in Nirvana:"; internal GeneAnnotationsParser(StreamReader reader, Dictionary entrezGeneIdToSymbol, Dictionary ensemblIdToSymbol) { _reader = reader; _entrezGeneIdToSymbol = entrezGeneIdToSymbol; _ensemblIdToSymbol = ensemblIdToSymbol; } public static GeneAnnotationsParser Create(StreamReader reader, Dictionary entrezGeneIdToSymbol, Dictionary ensemblIdToSymbol) { var parser = new GeneAnnotationsParser(reader, entrezGeneIdToSymbol, ensemblIdToSymbol); parser.ParseHeaderLines(); parser.InitiateSchema(); parser.AddHeaderAnnotation(); return parser; } internal void ParseHeaderLines() { string line; while ((line = _reader.ReadLine()) !=null) { if (line.StartsWith("#geneSymbol")) break; line = line.Trim(); (string key, string value) = line.OptimizedKeyValue(); switch (key) { case "#title": JsonTag = value; break; case "#version": Version = value; break; case "#description": DataSourceDescription = value; break; default: var e = new UserErrorException("Unexpected header tag observed"); e.Data[ExitCodeUtilities.Line] = line; throw e; } } _tags = ParserUtilities.ParseTags(line, "#geneSymbol", NumRequiredColumns); CheckTagsAndSetJsonKeys(); Categories = ParserUtilities.ParseCategories(_reader.ReadLine(), NumRequiredColumns, _numAnnotationColumns, _annotationValidators); Descriptions = ParserUtilities.ParseDescriptions(_reader.ReadLine(), NumRequiredColumns, _numAnnotationColumns); ValueTypes = ParserUtilities.ParseTypes(_reader.ReadLine(), NumRequiredColumns, _numAnnotationColumns); } private void InitiateSchema() { JsonSchema = SaJsonSchema.Create(new StringBuilder(), JsonTag, SaJsonValueType.Object, JsonKeys); } private void CheckTagsAndSetJsonKeys() { for (int i = NumRequiredColumns; i < _tags.Length; i++) { if (string.IsNullOrWhiteSpace(_tags[i])) throw new UserErrorException($"Please provide a name for column {i + 1} at the second row."); JsonKeys.Add(_tags[i]); } _numAnnotationColumns = _tags.Length - NumRequiredColumns; _annotationValidators = Enumerable.Repeat>((a, b) => { }, _numAnnotationColumns).ToArray(); } private void AddHeaderAnnotation() { for (var i = 0; i < _numAnnotationColumns; i++) { var annotation = SaJsonKeyAnnotation.CreateFromProperties(ValueTypes[i], Categories[i], Descriptions[i]); JsonSchema?.AddAnnotation(_tags[i + NumRequiredColumns], annotation); } } public Dictionary> GetItems(bool skipGeneIdValidation=false, StreamWriter logWriter = null) { var geneAnnotations = new Dictionary>(); using (_reader) { string line; while ((line = _reader.ReadLine()) != null) { if (string.IsNullOrWhiteSpace(line)) continue; AddItem(line, geneAnnotations, skipGeneIdValidation, logWriter); } } if (_unknownGenes.Count > 0 && geneAnnotations.Count == 0) throw new UserErrorException($"{UnknownGeneIdsErrorMessage} {string.Join(',',_unknownGenes)}. {NoValidEntriesErrorMessage}"); if (_unknownGenes.Count > 0) throw new UserErrorException($"{UnknownGeneIdsErrorMessage} {string.Join(',',_unknownGenes)}."); if (geneAnnotations.Count == 0) throw new UserErrorException(NoValidEntriesErrorMessage); return geneAnnotations; } private void AddItem(string line, Dictionary> geneAnnotations, bool skipGeneIdValidation, StreamWriter logWriter) { var splits = line.OptimizedSplit('\t'); if (splits.Length != _tags.Length) throw new UserErrorException($"Column number mismatch!! Header has {_tags.Length} columns but {line} contains {splits.Length}"); string geneId = splits[1]; var annotationValues = new string[_numAnnotationColumns]; var hasAnnotation = false; for (var i = 0; i < _numAnnotationColumns; i++) { string annotationValue = splits[i + NumRequiredColumns]; if (annotationValue != "" && annotationValue != ".") hasAnnotation = true; annotationValues[i] = annotationValue; _annotationValidators[i](annotationValues[i], line); } if (!hasAnnotation) throw new UserErrorException($"No annotation provided in line {line}"); string geneSymbol = GeneUtilities.GetGeneSymbolFromId(geneId, _entrezGeneIdToSymbol, _ensemblIdToSymbol); if (geneSymbol == null) { if (!skipGeneIdValidation) _unknownGenes.Add(geneId); logWriter?.WriteLine($"Skipping unrecognized gene ID {geneId}"); return; } if (geneAnnotations.ContainsKey(geneSymbol)) throw new UserErrorException($"Found the same gene {geneSymbol} in different lines. Current line is: {line}"); geneAnnotations[geneSymbol] = new List {new CustomGene(geneSymbol, annotationValues.Select(x => new[] {x}).ToList(), JsonSchema, line)}; } public IReadOnlyList GetUnknownGenes() => _unknownGenes.OrderBy(x=>x).ToList(); public void Dispose() => _reader?.Dispose(); } } ================================================ FILE: SAUtils/Custom/GeneMain.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using ErrorHandling.Exceptions; using IO; using SAUtils.GeneIdentifiers; using VariantAnnotation.SA; namespace SAUtils.Custom { public static class GeneMain { private static string _inputFile; private static string _universalGeneArchivePath; private static string _outputDirectory; private static bool _skipGeneValidation; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "uga|u=", "universal gene archive file path", v => _universalGeneArchivePath = v }, { "in|i=", "custom TSV file path", v => _inputFile = v }, { "skip-validation", "skips gene name validation", v => _skipGeneValidation = v != null }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_universalGeneArchivePath, "universal gene archive", "--uga") .CheckInputFilenameExists(_inputFile, "Custom gene annotation TSV", "--in") .CheckDirectoryExists(_outputDirectory, "output", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary gene annotation database from a custom input file", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var (entrezGeneIdToSymbol, ensemblGeneIdToSymbol) = GeneUtilities.ParseUniversalGeneArchive(null, _universalGeneArchivePath); string outputPrefix = GetOutputPrefix(_inputFile); string ngaFilePath = Path.Combine(_outputDirectory, outputPrefix + SaCommon.GeneFileSuffix); string ngaSchemaFilePath = ngaFilePath + SaCommon.JsonSchemaSuffix; using (var parser = GeneAnnotationsParser.Create(GZipUtilities.GetAppropriateStreamReader(_inputFile), entrezGeneIdToSymbol, ensemblGeneIdToSymbol)) using (var ngaStream = FileUtilities.GetCreateStream(ngaFilePath)) using (var ngaWriter = CaUtilities.GetNgaWriter(ngaStream, parser, CaUtilities.GetInputFileName(_inputFile))) using (var saJsonSchemaStream = FileUtilities.GetCreateStream(ngaSchemaFilePath)) using (var schemaWriter = new StreamWriter(saJsonSchemaStream)) { ngaWriter.Write(parser.GetItems(_skipGeneValidation)); if(parser.GetUnknownGenes().Count > 0) throw new UserErrorException($"{GeneAnnotationsParser.UnknownGeneIdsErrorMessage} {string.Join(',',parser.GetUnknownGenes())}."); schemaWriter.Write(parser.JsonSchema); } return ExitCodes.Success; } private static string GetOutputPrefix(string inputFilePath) { string fileName = GetInputFileName(inputFilePath); if (fileName.EndsWith(".tsv")) return fileName.Substring(0, fileName.Length - 4); return fileName.EndsWith(".tsv.gz") ? fileName.Substring(0, fileName.Length - 7) : fileName; } private static string GetInputFileName(string inputFilePath) { int fileNameIndex = inputFilePath.LastIndexOf(Path.DirectorySeparatorChar); return fileNameIndex < 0 ? inputFilePath : inputFilePath.Substring(fileNameIndex + 1); } } } ================================================ FILE: SAUtils/Custom/ParserUtilities.cs ================================================ using System; using ErrorHandling.Exceptions; using OptimizedCore; using SAUtils.Schema; using VariantAnnotation.Interface.SA; using VariantAnnotation.SA; namespace SAUtils.Custom { public static class ParserUtilities { public static (bool MatchByAllele, bool IsArray, SaJsonValueType PrimaryType, ReportFor reportFor) ParseMatchVariantsBy(string line) { line = line.Trim(); CheckPrefix(line, "#matchVariantsBy"); string firstCol = line.OptimizedSplit('\t')[0]; (_, string matchBy) = firstCol.OptimizedKeyValue(); bool matchByAllele; bool isArray; SaJsonValueType primaryType; ReportFor reportFor = ReportFor.AllVariants; switch (matchBy) { case null: throw new UserErrorException("Please provide the annotation reporting criteria in the format: #matchVariantsBy=allele."); case "allele": matchByAllele = true; isArray = false; primaryType = SaJsonValueType.Object; break; case "position": primaryType = SaJsonValueType.ObjectArray; matchByAllele = false; isArray = true; break; case "sv": primaryType = SaJsonValueType.ObjectArray; matchByAllele = false; isArray = true; reportFor = ReportFor.StructuralVariants; break; default: throw new UserErrorException("matchVariantsBy tag has to be \'allele\', \'sv\' or \'position\'"); } return (matchByAllele, isArray, primaryType, reportFor); } public static string[] ParseTags(string line, string prefix, int numRequiredCols) { CheckPrefix(line, prefix); var tags = line.OptimizedSplit('\t'); if (tags.Length < numRequiredCols) throw new UserErrorException($"At least {numRequiredCols} columns required. Please note that the columns should be separated by tab."); return tags; } public static CustomAnnotationCategories[] ParseCategories(string line, int numRequiredColumns, int numAnnotationColumns, Action[] annotationValidators) { CheckPrefix(line, "#categories"); var splits = line.OptimizedSplit('\t'); if (splits.Length != numRequiredColumns + numAnnotationColumns) throw new UserErrorException("#categories row must have the same number of columns as the header row with column names."); var categories = new CustomAnnotationCategories[numAnnotationColumns]; for (var i = 0; i < numAnnotationColumns; i++) { switch (splits[i + numRequiredColumns].ToLower()) { case "allelecount": categories[i] = CustomAnnotationCategories.AlleleCount; break; case "allelenumber": categories[i] = CustomAnnotationCategories.AlleleNumber; break; case "allelefrequency": categories[i] = CustomAnnotationCategories.AlleleFrequency; break; case "homozygouscount": categories[i] = CustomAnnotationCategories.HomozygousCount; break; case "prediction": categories[i] = CustomAnnotationCategories.Prediction; annotationValidators[i] = AllowedValues.ValidatePredictionValue; break; case "filter": categories[i] = CustomAnnotationCategories.Filter; annotationValidators[i] = AllowedValues.ValidateFilterValue; break; case "identifier": categories[i] = CustomAnnotationCategories.Identifier; annotationValidators[i] = AllowedValues.ValidateIdentifierValue; break; case "description": categories[i] = CustomAnnotationCategories.Description; annotationValidators[i] = AllowedValues.ValidateDescriptionValue; break; case "score": categories[i] = CustomAnnotationCategories.Score; annotationValidators[i] = AllowedValues.ValidateScoreValue; break; case ".": case "": categories[i] = CustomAnnotationCategories.Unknown; break; default: throw new UserErrorException($"Invalid category value: {splits[i + numRequiredColumns]}"); } } return categories; } public static string[] ParseDescriptions(string line, int numRequiredColumns, int numAnnotationColumns) { CheckPrefix(line,"#descriptions"); var splits = line.OptimizedSplit('\t'); if (splits.Length != numRequiredColumns + numAnnotationColumns) throw new UserErrorException("#descriptions row must have the same number of columns as the header row with column names"); var descriptions = new string[numAnnotationColumns]; for (var i = 0; i < numAnnotationColumns; i++) { if (splits[i + numRequiredColumns] == "." || splits[i + numRequiredColumns] == "") descriptions[i] = null; else descriptions[i] = splits[i + numRequiredColumns]; } return descriptions; } public static SaJsonValueType[] ParseTypes(string line, int numRequiredColumns, int numAnnotationColumns) { CheckPrefix(line, "#type"); var splits = line.OptimizedSplit('\t'); if (splits.Length != numRequiredColumns + numAnnotationColumns) throw new UserErrorException("#types row must have the same number of columns as the header row with column names"); var valueTypes = new SaJsonValueType[numAnnotationColumns]; for (var i = 0; i < numAnnotationColumns; i++) { switch (splits[i + numRequiredColumns].ToLower()) { case "bool": valueTypes[i] = SaJsonValueType.Bool; break; case "string": valueTypes[i] = SaJsonValueType.String; break; case "number": valueTypes[i] = SaJsonValueType.Number; break; default: throw new UserErrorException("Invalid value for type column. Valid values are bool, string and number."); } } return valueTypes; } // ReSharper disable once ParameterOnlyUsedForPreconditionCheck.Global internal static void CheckPrefix(string line, string prefix) { if (line != null && !line.StartsWith(prefix)) throw new UserErrorException($"Expected a line starting with {prefix}. Observed \n{line}"); } public static bool CheckJsonTagConflict(string value) { return value.Equals(SaCommon.DbsnpTag) || value.Equals(SaCommon.GlobalAlleleTag) || value.Equals(SaCommon.AncestralAlleleTag) || value.Equals(SaCommon.ClinGenTag) || value.Equals(SaCommon.ClinvarTag) || value.Equals(SaCommon.CosmicTag) || value.Equals(SaCommon.CosmicCnvTag) || value.Equals(SaCommon.DgvTag) || value.Equals(SaCommon.GnomadTag) || value.Equals(SaCommon.GnomadExomeTag) || value.Equals(SaCommon.MitoMapTag) || value.Equals(SaCommon.OmimTag) || value.Equals(SaCommon.OneKgenTag) || value.Equals(SaCommon.OnekSvTag) || value.Equals(SaCommon.PhylopTag) || value.Equals(SaCommon.RefMinorTag) || value.Equals(SaCommon.TopMedTag); } } } ================================================ FILE: SAUtils/Custom/VariantAnnotationsParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using ErrorHandling; using ErrorHandling.Exceptions; using Genome; using OptimizedCore; using SAUtils.DataStructures; using SAUtils.Schema; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.SA; using Variants; namespace SAUtils.Custom { public sealed class VariantAnnotationsParser : IDisposable { private readonly StreamReader _reader; public ISequenceProvider SequenceProvider; public string JsonTag; public GenomeAssembly Assembly; public string Version; public string DataSourceDescription; public bool MatchByAllele; public bool IsArray; public string[] Tags; internal CustomAnnotationCategories[] Categories; internal string[] Descriptions; internal SaJsonValueType[] ValueTypes; public ReportFor ReportFor; private int _numRequiredColumns; private int _numAnnotationColumns; private int _altColumnIndex = -1; private int _endColumnIndex = -1; private readonly HashSet _allowedGenomeAssemblies = new HashSet { GenomeAssembly.GRCh37, GenomeAssembly.GRCh38, GenomeAssembly.SARSCoV2 }; private readonly List _intervals; private (Chromosome Chromesome, int Position) _previousPosition = (null, 0); private Action[] _annotationValidators; private SaJsonValueType _primaryType; private readonly Dictionary _predefinedTypeAnnotation = new Dictionary { {"refAllele", SaJsonValueType.String}, {"altAllele", SaJsonValueType.String}, {"start", SaJsonValueType.Number}, {"end", SaJsonValueType.Number} }; internal readonly List JsonKeys = new List { "refAllele", "altAllele" }; internal readonly List IntervalJsonKeys = new List { "start", "end" }; public SaJsonSchema JsonSchema; public SaJsonSchema IntervalJsonSchema; internal VariantAnnotationsParser(StreamReader streamReader, ISequenceProvider sequenceProvider) { _reader = streamReader; SequenceProvider = sequenceProvider; _intervals = new List(); } public static VariantAnnotationsParser Create(StreamReader streamReader, ISequenceProvider sequenceProvider = null) { var parser = new VariantAnnotationsParser(streamReader, sequenceProvider); parser.ParseHeaderLines(); parser.InitiateSchema(); parser.AddPredefinedTypeAnnotation(); parser.AddHeaderAnnotation(); return parser; } internal void ParseHeaderLines() { var hasMatchByLine = false; string line; while ((line = _reader.ReadLine())!=null) { if (line.StartsWith("#CHROM")) break; line = line.Trim(); (string key, string value) = line.OptimizedKeyValue(); switch (key) { case "#title": JsonTag = value; break; case "#assembly": Assembly = GenomeAssemblyHelper.Convert(value); break; case "#matchVariantsBy": (MatchByAllele, IsArray, _primaryType, ReportFor) = ParserUtilities.ParseMatchVariantsBy(line); hasMatchByLine = true; break; case "#version": Version = value; break; case "#description": DataSourceDescription = value; break; default: var e = new UserErrorException("Unexpected header tag observed:"+value); e.Data[ExitCodeUtilities.Line] = line; throw e; } } CheckRequiredFields(hasMatchByLine); //The following lines have to appear in exact order Tags = ParserUtilities.ParseTags(line, "#CHROM", _numRequiredColumns); CheckTagsAndSetJsonKeys(); Categories = ParserUtilities.ParseCategories(_reader.ReadLine(), _numRequiredColumns, _numAnnotationColumns, _annotationValidators); Descriptions = ParserUtilities.ParseDescriptions(_reader.ReadLine(), _numRequiredColumns, _numAnnotationColumns); ValueTypes = ParserUtilities.ParseTypes(_reader.ReadLine(), _numRequiredColumns, _numAnnotationColumns); } private void CheckRequiredFields(bool hasMatchByLine) { if (string.IsNullOrEmpty(JsonTag)) throw new UserErrorException("Please provide the title in the format: #title=titleValue."); if (ParserUtilities.CheckJsonTagConflict(JsonTag)) throw new UserErrorException($"{JsonTag} is a reserved supplementary annotation tag in Nirvana. Please use a different value."); if (!_allowedGenomeAssemblies.Contains(Assembly)) throw new UserErrorException("Only GRCh37 and GRCh38 are accepted as genome assembly."); if (!hasMatchByLine) throw new UserErrorException( "Please provide the annotation reporting criteria in the format: #matchVariantsBy=allele."); } private void CheckTagsAndSetJsonKeys() { CheckPosAndRefColumns(); CheckAltAndEndColumns(); for (int i = _numRequiredColumns; i < Tags.Length; i++) { if (string.IsNullOrWhiteSpace(Tags[i])) throw new UserErrorException($"Please provide a name for column {i + 1} at the forth row."); JsonKeys.Add(Tags[i]); IntervalJsonKeys.Add(Tags[i]); } } internal void CheckPosAndRefColumns() { if (Tags[1] != "POS" || Tags[2] != "REF") throw new UserErrorException("The 2nd and 3rd columns must be POS and REF, respectively."); } internal void CheckAltAndEndColumns() { _numRequiredColumns = 4; switch (Tags[3]) { case "ALT": { _altColumnIndex = 3; if (Tags.Length > 4 && Tags[4] == "END") { _endColumnIndex = 4; _numRequiredColumns = 5; } break; } case "END": _endColumnIndex = 3; break; default: throw new UserErrorException("Please provide at least one of the ALT and END columns.The END column should come after the ALT column if both are present."); } _numAnnotationColumns = Tags.Length - _numRequiredColumns; _annotationValidators = Enumerable.Repeat>((a, b) => { }, _numAnnotationColumns).ToArray(); } public IEnumerable GetItems() { if (SequenceProvider == null) { throw new Exception("Sequence provider is null."); } using (_reader) { string line; while ((line = _reader.ReadLine()) != null) { if (string.IsNullOrWhiteSpace(line)) continue; var item = ExtractItems(line); if (item == null) continue; yield return item; } } } private void InitiateSchema() { if (_altColumnIndex != -1) JsonSchema = SaJsonSchema.Create(new StringBuilder(), JsonTag, _primaryType, JsonKeys); if (_endColumnIndex != -1) IntervalJsonSchema = SaJsonSchema.Create(new StringBuilder(), JsonTag, SaJsonValueType.ObjectArray, IntervalJsonKeys); } private void AddPredefinedTypeAnnotation() { foreach ((string jsonKey, var valueType) in _predefinedTypeAnnotation) { JsonSchema?.AddAnnotation(jsonKey, SaJsonKeyAnnotation.CreateFromProperties(valueType, 0, null)); IntervalJsonSchema?.AddAnnotation(jsonKey, SaJsonKeyAnnotation.CreateFromProperties(valueType, 0, null)); } } private void AddHeaderAnnotation() { for (var i = 0; i < _numAnnotationColumns; i++) { var annotation = SaJsonKeyAnnotation.CreateFromProperties(ValueTypes[i], Categories[i], Descriptions[i]); JsonSchema?.AddAnnotation(Tags[i + _numRequiredColumns], annotation); IntervalJsonSchema?.AddAnnotation(Tags[i + _numRequiredColumns], annotation); } } internal CustomItem ExtractItems(string line) { var splits = line.OptimizedSplit('\t'); if (splits.Length != Tags.Length) throw new UserErrorException($"Column number mismatch!! Header has {Tags.Length} columns but {line} contains {splits.Length}"); string chromosome = splits[0]; if (!SequenceProvider.RefNameToChromosome.TryGetValue(chromosome, out var chrom)) { Console.WriteLine($"Annotation on {chromosome} is skipped."); return null; } SequenceProvider.LoadChromosome(chrom); if (!int.TryParse(splits[1], out var position)) throw new UserErrorException($"POS is not an int number at: {line}."); CheckAnnotationSorted(chrom, position, line); string refAllele = splits[2].ToUpper(); var annotationValues = new string[_numAnnotationColumns]; for (var i = 0; i < _numAnnotationColumns; i++) { annotationValues[i] = splits[i + _numRequiredColumns]; _annotationValidators[i](annotationValues[i], line); } if (IsInterval(splits)) { if (!int.TryParse(splits[_endColumnIndex], out var end)) throw new UserErrorException($"END is not an integer.\nInput line: {line}."); //for symbolic alleles, position needs to increment to account for the padding base if (_altColumnIndex >=0 && IsSymbolicAllele(splits[_altColumnIndex])) position++; var jsonStringValues = new List { position.ToString(), splits[_endColumnIndex] }; jsonStringValues.AddRange(annotationValues); _intervals.Add(new CustomInterval(chrom, position, end, jsonStringValues.Select(x => new[] { x }).ToList(), IntervalJsonSchema, line)); return null; } string altAllele = splits[_altColumnIndex]; if (!IsValidAltAllele(altAllele)) throw new UserErrorException($"Invalid nucleotides in ALT column: {altAllele}.\nInput line: {line}"); (position, refAllele, altAllele) = VariantUtils.TrimAndLeftAlign(position, refAllele, altAllele, SequenceProvider.Sequence); return new CustomItem(chrom, position, refAllele, altAllele, annotationValues.Select(x => new[] { x }).ToArray(), JsonSchema, line); } private bool IsSymbolicAllele(string altAllele) { return altAllele.StartsWith('<') && altAllele.EndsWith('>'); } private bool IsInterval(string[] splits) => _endColumnIndex != -1 && !AllowedValues.IsEmptyValue(splits[_endColumnIndex]); private void CheckAnnotationSorted(Chromosome chrom, int position, string line) { if (chrom != _previousPosition.Chromesome) { _previousPosition = (chrom, position); } else { if (position < _previousPosition.Position) throw new UserErrorException($"Annotation is not sorted at {line}"); _previousPosition.Position = position; } } public List GetCustomIntervals() => _intervals.Count > 0 ? _intervals : null; internal static bool IsValidAltAllele(string sequence) { if (sequence.Contains('[') || sequence.Contains(']')) return true; var validNucleotides = new[] { 'a', 'c', 'g', 't', 'n' }; foreach (char nucleotide in sequence.ToLower()) { if (!validNucleotides.Contains(nucleotide)) return false; } return true; } public void Dispose() { _reader?.Dispose(); } } } ================================================ FILE: SAUtils/Custom/VariantMain.cs ================================================ using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.DataStructures; using SAUtils.Schema; using VariantAnnotation.Interface.SA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.Custom { public static class VariantMain { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; private static bool _skipRefBaseValidation; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "custom TSV file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v }, { "skip-ref", "skip ref base validation", v => _skipRefBaseValidation = v != null } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence", "--ref") .CheckInputFilenameExists(_inputFile, "Custom variant annotation TSV", "--in") .CheckDirectoryExists(_outputDirectory, "output", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary variant annotation database from a custom input file", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); List intervals; SaJsonSchema intervalJsonSchema; string jsonTag; DataSourceVersion version; string outputPrefix = GetOutputPrefix(_inputFile); string nsaFileName = Path.Combine(_outputDirectory, outputPrefix + SaCommon.SaFileSuffix); string nsaIndexFileName = nsaFileName + SaCommon.IndexSuffix; string nsaSchemaFileName = nsaFileName + SaCommon.JsonSchemaSuffix; ReportFor reportFor; var nsaItemCount = 0; using (var parser = VariantAnnotationsParser.Create(GZipUtilities.GetAppropriateStreamReader(_inputFile), referenceProvider)) using (var nsaStream = FileUtilities.GetCreateStream(nsaFileName)) using (var indexStream = FileUtilities.GetCreateStream(nsaIndexFileName)) using (var nsaWriter = CaUtilities.GetNsaWriter(nsaStream, indexStream, parser, CaUtilities.GetInputFileName(_inputFile),referenceProvider, out version, _skipRefBaseValidation)) using (var saJsonSchemaStream = FileUtilities.GetCreateStream(nsaSchemaFileName)) using (var schemaWriter = new StreamWriter(saJsonSchemaStream)) { (jsonTag, nsaItemCount, intervalJsonSchema, intervals) = CaUtilities.WriteSmallVariants(parser, nsaWriter, schemaWriter); reportFor = parser.ReportFor; if (intervals == null) return ExitCodes.Success; } if (nsaItemCount == 0) { File.Delete(nsaFileName); File.Delete(nsaIndexFileName); File.Delete(nsaSchemaFileName); } using (var nsiStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outputPrefix + SaCommon.IntervalFileSuffix))) using (var nsiWriter = CaUtilities.GetNsiWriter(nsiStream, version, referenceProvider.Assembly, jsonTag, reportFor)) using (var siJsonSchemaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outputPrefix + SaCommon.IntervalFileSuffix + SaCommon.JsonSchemaSuffix))) using (var schemaWriter = new StreamWriter(siJsonSchemaStream)) { nsiWriter.Write(intervals); schemaWriter.Write(intervalJsonSchema); } return ExitCodes.Success; } private static string GetOutputPrefix(string inputFilePath) { string fileName = CaUtilities.GetInputFileName(inputFilePath); if (fileName.EndsWith(".tsv")) return fileName.Substring(0, fileName.Length - 4); return fileName.EndsWith(".tsv.gz") ? fileName.Substring(0, fileName.Length - 7) : fileName; } } } ================================================ FILE: SAUtils/Dann/Create.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using SAUtils.InputFileParsers; using VariantAnnotation.GenericScore; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.Dann { public static class Create { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input DANN file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; var commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckInputFilenameExists(_inputFile, "input DANN file Path", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Create a supplementary database from DANN input file ", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var nucleotides = new[] {"A", "C", "G", "T"}; var dannParserSettings = new ParserSettings( new ColumnIndex(0, 2, 3, 4, 5, null), nucleotides, GenericScoreParser.MaxRepresentativeScores ); var dannWriterSettings = new WriterSettings( 1_000_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder(SaCommon.DannTag + SaCommon.Score, null), new SaItemValidator(true, false) ); DataSourceVersion version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); var outFileName = $"{version.Name}_{version.Version}"; using (var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference))) using (var streamReader = GZipUtilities.GetAppropriateStreamReader(_inputFile)) using (var dannParser = new GenericScoreParser(dannParserSettings, streamReader, referenceProvider.RefNameToChromosome)) using (var saStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.GsaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.GsaFileSuffix + SaCommon.IndexSuffix))) using (var saWriter = new ScoreFileWriter(dannWriterSettings, saStream, indexStream, version, referenceProvider, SaCommon.SchemaVersion, skipIncorrectRefEntries: true, leaveOpen: false)) { saWriter.Write(dannParser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/DataStructures/AlleleFrequencyItem.cs ================================================ using System; using Genome; using VariantAnnotation.Interface.SA; namespace SAUtils.DataStructures { public sealed class AlleleFrequencyItem:ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public readonly double AltFrequency; public AlleleFrequencyItem(Chromosome chromosome, int position, string refAllele, string altAllele, double altFrequency, string inputLine) { Chromosome = chromosome; Position = position; AltFrequency = altFrequency; RefAllele = refAllele; AltAllele = altAllele; InputLine = inputLine; } public string GetJsonString() { throw new NotImplementedException(); } public string InputLine { get; } } } ================================================ FILE: SAUtils/DataStructures/AncestralAlleleItem.cs ================================================ using Genome; using VariantAnnotation.Interface.SA; namespace SAUtils.DataStructures { public sealed class AncestralAlleleItem: ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public string InputLine { get; } public readonly string AncestralAllele; public AncestralAlleleItem(Chromosome chromosome, int position, string refAllele, string altAllele, string ancestralAllele, string inputLine) { Chromosome = chromosome; Position = position; RefAllele = refAllele; AltAllele = altAllele; AncestralAllele = ancestralAllele; InputLine = inputLine; } public string GetJsonString() { return $"\"{AncestralAllele}\""; } } } ================================================ FILE: SAUtils/DataStructures/ClinGenItem.cs ================================================ using System.Collections.Generic; using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; using Variants; namespace SAUtils.DataStructures { public enum ClinicalInterpretation { // ReSharper disable InconsistentNaming pathogenic = 5, likely_pathogenic = 4, benign = 3, likely_benign = 2, uncertain_significance = 1, unknown = 0 // ReSharper restore InconsistentNaming } public sealed class ClinGenItem:ISuppIntervalItem { public int Start { get; } public int End { get; } public Chromosome Chromosome { get; } private string Id { get; } private VariantType VariantType { get; } private ClinicalInterpretation ClinicalInterpretation { get; } private IEnumerable Phenotypes => _phenotypes; private readonly HashSet _phenotypes; private IEnumerable PhenotypeIds => _phenotypeIds; private readonly HashSet _phenotypeIds; private int ObservedGains { get; } private int ObservedLosses { get; } private bool Validated { get; } public ClinGenItem(string id, Chromosome chromosome, int start, int end, VariantType variantType, int observedGains, int observedLosses, ClinicalInterpretation clinicalInterpretation, bool validated, HashSet phenotypes = null, HashSet phenotypeIds = null) { Id = id; Chromosome = chromosome; Start = start; End = end; VariantType = variantType; ClinicalInterpretation = clinicalInterpretation; _phenotypes = phenotypes ?? new HashSet(); _phenotypeIds = phenotypeIds ?? new HashSet(); ObservedGains = observedGains; ObservedLosses = observedLosses; Validated = validated; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("chromosome", Chromosome.EnsemblName); jsonObject.AddIntValue("begin", Start); jsonObject.AddIntValue("end", End); jsonObject.AddStringValue("variantType", VariantType.ToString()); jsonObject.AddStringValue("id", Id); jsonObject.AddStringValue("clinicalInterpretation", GetClinicalDescription(ClinicalInterpretation)); jsonObject.AddStringValues("phenotypes", Phenotypes); jsonObject.AddStringValues("phenotypeIds", PhenotypeIds); if (ObservedGains>0) jsonObject.AddIntValue("observedGains", ObservedGains); if (ObservedLosses>0) jsonObject.AddIntValue("observedLosses", ObservedLosses); jsonObject.AddBoolValue("validated",Validated); return StringBuilderPool.GetStringAndReturn(sb); } private static string GetClinicalDescription(ClinicalInterpretation clinicalInterpretation) { // ReSharper disable once SwitchStatementMissingSomeCases switch (clinicalInterpretation) { case ClinicalInterpretation.uncertain_significance: return "uncertain significance"; case ClinicalInterpretation.likely_benign: return "likely benign"; case ClinicalInterpretation.likely_pathogenic: return "likely pathogenic"; case ClinicalInterpretation.unknown: return null; default: return clinicalInterpretation.ToString(); } } } } ================================================ FILE: SAUtils/DataStructures/ClinVarItem.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using Genome; using SAUtils.InputFileParsers.ClinVar; using SAUtils.Schema; using VariantAnnotation.Interface.SA; namespace SAUtils.DataStructures { public sealed class ClinVarItem : IClinVarSaItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public string InputLine { get; } public int Stop { get; } public string VariantType { get; } public string Id { get; } public string VariationId { get; set; } public IEnumerable AlleleOrigins { get; } public IEnumerable Phenotypes { get; } public IEnumerable Significances { get; } public ClinVarCommon.ReviewStatus ReviewStatus { get; } private string IsAlleleSpecific { get; } public IEnumerable MedGenIds { get; } public IEnumerable OmimIds { get; } public IEnumerable OrphanetIds { get; } public IEnumerable PubmedIds { get; } public long LastUpdatedDate { get; } public SaJsonSchema JsonSchema { get; } public ClinVarItem(Chromosome chromosome, int position, int stop, string refAllele, string altAllele, SaJsonSchema jsonSchema, IEnumerable alleleOrigins, string variantType, string id, string variationId, ClinVarCommon.ReviewStatus reviewStatus, IEnumerable medGenIds, IEnumerable omimIds, IEnumerable orphanetIds, IEnumerable phenotypes, IEnumerable significances, IEnumerable pubmedIds = null, long lastUpdatedDate = long.MinValue ) { Chromosome = chromosome; Position = position; Stop = stop; AlleleOrigins = alleleOrigins; AltAllele = altAllele; JsonSchema = jsonSchema; VariantType = variantType; Id = id; VariationId = variationId; MedGenIds = medGenIds; OmimIds = omimIds; OrphanetIds = orphanetIds; Phenotypes = phenotypes; RefAllele = refAllele; Significances = significances; PubmedIds = pubmedIds; LastUpdatedDate = lastUpdatedDate; IsAlleleSpecific = null; ReviewStatus = reviewStatus; } public string GetJsonString() { return JsonSchema.GetJsonString(GetValues()); } private List GetValues() { var values = new List { //the exact order of adding values has to be preserved. the order is dictated by the json schema new[] {Id}, new[] {VariationId}, new[] {ClinVarCommon.ReviewStatusStrings[ReviewStatus]}, AlleleOrigins?.ToArray(), new[] {ClinVarCommon.NormalizeAllele(RefAllele)}, new[] {ClinVarCommon.NormalizeAllele(AltAllele)}, Phenotypes?.ToArray(), MedGenIds?.ToArray(), OmimIds?.ToArray(), OrphanetIds?.ToArray(), Significances?.ToArray(), new[] {new DateTime(LastUpdatedDate).ToString("yyyy-MM-dd")}, PubmedIds?.OrderBy(x => x).Select(x => x.ToString()).ToArray() }; return values; } public int CompareTo(IClinVarSaItem other) { return Chromosome.Index != other.Chromosome.Index ? Chromosome.Index.CompareTo(other.Chromosome.Index) : Position.CompareTo(other.Position); } } } ================================================ FILE: SAUtils/DataStructures/ComputingUtilities.cs ================================================ using System; using VariantAnnotation.IO; namespace SAUtils.DataStructures { public static class ComputingUtilities { public static string ComputeFrequency(int? alleleNumber, int? alleleCount) { return alleleNumber != null && alleleNumber.Value > 0 && alleleCount != null ? ((double)alleleCount / alleleNumber.Value).ToString(JsonCommon.FrequencyRoundingFormat) : null; } public static int GetCoverage(double depth, double allAlleleNumber) { return (int) Math.Round(depth / allAlleleNumber, 0, MidpointRounding.AwayFromZero); } } } ================================================ FILE: SAUtils/DataStructures/ConservationItem.cs ================================================ using System; using Genome; namespace SAUtils.DataStructures { public sealed class PhylopItem { public Chromosome Chromosome { get; } public int Position { get; } public double Score { get; } public PhylopItem(Chromosome chromosome, int position, double score) { Chromosome = chromosome; Position = position; Score = Math.Round(score,1, MidpointRounding.AwayFromZero); } } } ================================================ FILE: SAUtils/DataStructures/CosmicItem.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.DataStructures { public sealed class CosmicItem : ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public string InputLine { get; } private string Id { get; } private string Gene { get; } private int? SampleCount { get; } public HashSet Studies { get; } public CosmicItem( Chromosome chromosome, int position, string id, string refAllele, string altAllele, string gene, HashSet studies, int? sampleCount) { Chromosome = chromosome; Position = position; Id = id; RefAllele = refAllele; AltAllele = altAllele; Gene = gene; Studies = studies; SampleCount = sampleCount; } public sealed class CosmicStudy : IEquatable { #region members public string Id { get; } public IEnumerable Histologies { get; } public IEnumerable Sites { get; } #endregion public CosmicStudy(string studyId, IEnumerable histologies, IEnumerable sites) { Id = studyId; Sites = sites; Histologies = histologies; } public bool Equals(CosmicStudy other) { if (other == null) return false; return Id.Equals(other.Id) && Histologies.SequenceEqual(other.Histologies) && Sites.SequenceEqual(other.Sites); } public override int GetHashCode() { var hashCode = Id?.GetHashCode() ?? 0; //hashCode ^= Histologies.GetHashCode() ^ Sites.GetHashCode(); return hashCode; } } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("id", Id); jsonObject.AddStringValue("refAllele", string.IsNullOrEmpty(RefAllele) ? "-" : RefAllele); jsonObject.AddStringValue("altAllele", SaUtilsCommon.ReverseSaReducedAllele(AltAllele)); jsonObject.AddStringValue("gene", Gene); jsonObject.AddIntValue("sampleCount", SampleCount); jsonObject.AddStringValue("cancerTypesAndCounts", GetJsonStringFromDict("cancerType",GetCancerTypeCounts()), false); jsonObject.AddStringValue("cancerSitesAndCounts", GetJsonStringFromDict("cancerSite",GetTissueCounts()), false); return StringBuilderPool.GetStringAndReturn(sb); } internal Dictionary GetTissueCounts() { if (Studies == null) return null; var tissueCounts = new Dictionary(); foreach (var study in Studies) { if (study.Sites == null) return null; foreach (var site in study.Sites) { if (tissueCounts.TryGetValue(site, out _)) { tissueCounts[site]++; } else tissueCounts[site] = 1; } } return tissueCounts; } internal Dictionary GetCancerTypeCounts() { if (Studies == null) return null; var cancerTypeCounts = new Dictionary(); foreach (var study in Studies) { if (study.Histologies == null) return null; foreach (var histology in study.Histologies) { if (cancerTypeCounts.TryGetValue(histology, out _)) { cancerTypeCounts[histology]++; } else cancerTypeCounts[histology] = 1; } } return cancerTypeCounts; } private static string GetJsonStringFromDict(string dataType, Dictionary dictionary) { if (dictionary == null) return null; var sb = StringBuilderPool.Get(); sb.Append(JsonObject.OpenBracket); bool isFirstItem = true; foreach (var kvp in dictionary) { if (!isFirstItem) sb.Append(JsonObject.Comma); sb.Append(JsonObject.OpenBrace); sb.Append($"\"{dataType}\":\"{kvp.Key}\","); sb.Append($"\"count\":{kvp.Value}"); sb.Append(JsonObject.CloseBrace); isFirstItem = false; } sb.Append(JsonObject.CloseBracket); return StringBuilderPool.GetStringAndReturn(sb); } } } ================================================ FILE: SAUtils/DataStructures/CounterDictionary.cs ================================================ using System.Collections.Generic; using System.Text; using VariantAnnotation.Interface.IO; using VariantAnnotation.IO; namespace SAUtils.DataStructures; public sealed class CounterDictionary : Dictionary, IJsonSerializer { public uint Total; public void Add(TKey key) { Total++; if (TryGetValue(key, out uint _)) { this[key]++; return; } this[key] = 1; } public void SerializeJson(StringBuilder sb) { var jo = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jo.AddUIntValue("count", Total); foreach ((TKey key, uint count) in this) { jo.AddUIntValue(key.ToString(), count); } sb.Append(JsonObject.CloseBrace); } } ================================================ FILE: SAUtils/DataStructures/CustomInterval.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; using Genome; using SAUtils.Schema; using VariantAnnotation.Interface.SA; using Variants; namespace SAUtils.DataStructures { public sealed class CustomInterval : ISuppIntervalItem { public Chromosome Chromosome { get; } public int Start { get; } public int End { get; } private VariantType VariantType { get; } private readonly List _values; private readonly SaJsonSchema _jsonSchema; private readonly string _inputLine; /// /// constructor /// public CustomInterval(Chromosome chromosome, int start, int end, List values, SaJsonSchema jsonSchema, string inputLine) { Chromosome = chromosome; Start = start; End = end; VariantType = VariantType.structural_alteration; _values = values; _jsonSchema = jsonSchema; _inputLine = inputLine; } public string GetJsonString() { try { return _jsonSchema.GetJsonString(_values); } catch (UserErrorException e) { throw new UserErrorException(e.Message + $"\nInput line: {_inputLine}"); } } } } ================================================ FILE: SAUtils/DataStructures/CustomItem.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; using Genome; using SAUtils.Schema; using VariantAnnotation.Interface.SA; using VariantAnnotation.Utilities; namespace SAUtils.DataStructures { public sealed class CustomItem : ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } private readonly string[][] _values; private readonly SaJsonSchema _jsonSchema; public CustomItem(Chromosome chromosome, int start, string refAllele, string altAllele, string[][] values, SaJsonSchema jsonSchema, string inputLine) { Chromosome = chromosome; Position = start; RefAllele = refAllele; AltAllele = altAllele; _values = values; _jsonSchema = jsonSchema; InputLine = inputLine; } public string GetJsonString() { var allValues = new List {new []{BaseFormatting.EmptyToDash(RefAllele)}, new []{BaseFormatting.EmptyToDash(AltAllele)} }; allValues.AddRange(_values); try { return _jsonSchema.GetJsonString(allValues); } catch (UserErrorException e) { throw new UserErrorException(e.Message + $"\nInput line: {InputLine}"); } } public string InputLine { get; } } } ================================================ FILE: SAUtils/DataStructures/DbSnpItem.cs ================================================ using Genome; using VariantAnnotation.Interface.SA; namespace SAUtils.DataStructures { public sealed class DbSnpItem: ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public string InputLine { get; } public long RsId { get; } public DbSnpItem(Chromosome chromosome, int position, long rsId, string refAllele, string alternateAllele, string inputLine) { Chromosome = chromosome; Position = position; RsId = rsId; RefAllele = refAllele; AltAllele = alternateAllele; InputLine = inputLine; } public string GetJsonString() { return $"\"rs{RsId}\""; } } } ================================================ FILE: SAUtils/DataStructures/DecipherItem.cs ================================================ using System.Text; using Genome; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.DataStructures { public sealed class DecipherItem : ISuppIntervalItem { private readonly int? _delNum; private readonly double? _delFreq; private readonly int? _dupNum; private readonly double? _dupFreq; private readonly int? _sampleSize; public Chromosome Chromosome { get; } public int Start { get; } public int End { get; } public DecipherItem(Chromosome chrom, int start, int end, int? delNum, double? delFreq, int? dupNum, double? dupFreq, int? sampleSize) { Chromosome = chrom; Start = start; End = end; _delNum = delNum; _delFreq = delFreq; _dupNum = dupNum; _dupFreq = dupFreq; _sampleSize = sampleSize; } public string GetJsonString() { var sb = new StringBuilder(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("chromosome", Chromosome.EnsemblName); jsonObject.AddIntValue("begin", Start); jsonObject.AddIntValue("end", End); jsonObject.AddIntValue("numDeletions", _delNum); jsonObject.AddDoubleValue("deletionFrequency", _delFreq, JsonCommon.FrequencyRoundingFormat); jsonObject.AddIntValue("numDuplications", _dupNum); jsonObject.AddDoubleValue("duplicationFrequency", _dupFreq, JsonCommon.FrequencyRoundingFormat); jsonObject.AddIntValue("sampleSize", _sampleSize); return sb.ToString(); } } } ================================================ FILE: SAUtils/DataStructures/DgvItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; using Variants; namespace SAUtils.DataStructures { public sealed class DgvItem : ISuppIntervalItem { public Chromosome Chromosome { get; } public int Start { get; } public int End { get; } private string Id { get; } private int ObservedGains { get; } private int ObservedLosses { get; } private int SampleSize { get; } private VariantType VariantType { get; } private double? VariantFreqAll { get; } public DgvItem(string id, Chromosome chromosome, int start, int end, int sampleSize, int observedGains, int observedLosses, VariantType variantType) { Id = id; Chromosome = chromosome; Start = start; End = end; SampleSize = sampleSize; ObservedGains = observedGains; ObservedLosses = observedLosses; VariantType = variantType; if (SampleSize == 0 || ObservedLosses + ObservedGains == 0) return; VariantFreqAll = (ObservedLosses + ObservedGains) / (double)SampleSize; VariantFreqAll = VariantFreqAll > 1.0 ? 1.0 : VariantFreqAll; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("chromosome", Chromosome.EnsemblName); jsonObject.AddIntValue("begin", Start); jsonObject.AddIntValue("end", End); jsonObject.AddStringValue("variantType", VariantType.ToString()); jsonObject.AddStringValue("id", Id); jsonObject.AddIntValue("sampleSize", SampleSize); if (ObservedGains != 0) jsonObject.AddIntValue("observedGains", ObservedGains); if (ObservedLosses != 0) jsonObject.AddIntValue("observedLosses", ObservedLosses); jsonObject.AddDoubleValue("variantFreqAll", VariantFreqAll, "0.#####"); return StringBuilderPool.GetStringAndReturn(sb); } public override bool Equals(object obj) { // If parameter is null return false. if (!(obj is DgvItem otherItem)) return false; // Return true if the fields match: return Equals(Chromosome, otherItem.Chromosome) && Start == otherItem.Start && End == otherItem.End && ObservedGains == otherItem.ObservedGains && SampleSize == otherItem.SampleSize && ObservedLosses == otherItem.ObservedLosses && string.Equals(Id, otherItem.Id) && Equals(VariantType, otherItem.VariantType) && Equals(VariantFreqAll, otherItem.VariantFreqAll); } public override int GetHashCode() { unchecked { var hashCode = Id?.GetHashCode() ?? 0; hashCode = (hashCode * 397) ^ (Chromosome?.GetHashCode() ?? 0); hashCode = (hashCode * 397) ^ Start.GetHashCode(); hashCode = (hashCode * 397) ^ End.GetHashCode(); hashCode = (hashCode * 397) ^ VariantType.GetHashCode(); hashCode = (hashCode * 397) ^ SampleSize.GetHashCode(); hashCode = (hashCode * 397) ^ ObservedGains.GetHashCode(); hashCode = (hashCode * 397) ^ ObservedLosses.GetHashCode(); hashCode = (hashCode * 397) ^ (VariantFreqAll?.GetHashCode() ?? 0); return hashCode; } } } } ================================================ FILE: SAUtils/DataStructures/GlobalMinorItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.DataStructures { public sealed class GlobalMinorItem:ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } private readonly string _allele; private readonly double _frequency; public GlobalMinorItem(Chromosome chromosome, int position, string allele, double frequency) { Chromosome = chromosome; Position = position; _allele = allele; _frequency = frequency; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("globalMinorAllele", _allele); jsonObject.AddDoubleValue("globalMinorAlleleFrequency", _frequency, "0.#######"); sb.Append(JsonObject.CloseBrace); return StringBuilderPool.GetStringAndReturn(sb); } public string InputLine { get; set; } } } ================================================ FILE: SAUtils/DataStructures/GmeItem.cs ================================================ using System.Text; using Genome; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.DataStructures { public sealed class GmeItem : ISupplementaryDataItem { private readonly int? _alleleCount; private readonly int? _alleleNum; private readonly double? _alleleFreq; private readonly bool _failedFilter; public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public GmeItem(Chromosome chrom, int position, string refAllele, string altAllele, int? alleleCount, int? alleleNum, double? alleleFreq, bool failedFilter) { Chromosome = chrom; Position = position; RefAllele = refAllele; AltAllele = altAllele; _alleleCount = alleleCount; _alleleNum = alleleNum; _alleleFreq = alleleFreq; _failedFilter = failedFilter; } public string GetJsonString() { var sb = new StringBuilder(); var jsonObject = new JsonObject(sb); jsonObject.AddIntValue("allAc", _alleleCount); jsonObject.AddIntValue("allAn", _alleleNum); jsonObject.AddDoubleValue("allAf", _alleleFreq); if (_failedFilter) jsonObject.AddBoolValue("failedFilter", true); return sb.ToString(); } public string InputLine { get; set; } } } ================================================ FILE: SAUtils/DataStructures/GnomadItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; // ReSharper disable NonReadonlyMemberInGetHashCode namespace SAUtils.DataStructures { public enum GnomadDataType : byte { Unknown, Genome, Exome } public sealed class GnomadItem : ISupplementaryDataItem { #region members public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public int? AllAlleleCount { get; private set; } public int? AfrAlleleCount { get; private set; } public int? AmrAlleleCount { get; private set; } public int? EasAlleleCount { get; private set; } public int? FinAlleleCount { get; private set; } public int? NfeAlleleCount { get; private set; } public int? OthAlleleCount { get; private set; } public int? AsjAlleleCount { get; private set; } public int? SasAlleleCount { get; private set; } public int? AllAlleleNumber { get; private set; } public int? AfrAlleleNumber { get; private set; } public int? AmrAlleleNumber { get; private set; } public int? EasAlleleNumber { get; private set; } public int? FinAlleleNumber { get; private set; } public int? NfeAlleleNumber { get; private set; } public int? OthAlleleNumber { get; private set; } public int? AsjAlleleNumber { get; private set; } public int? SasAlleleNumber { get; private set; } public int? AllHomCount { get; private set; } public int? AfrHomCount { get; private set; } public int? AmrHomCount { get; private set; } public int? EasHomCount { get; private set; } public int? FinHomCount { get; private set; } public int? NfeHomCount { get; private set; } public int? OthHomCount { get; private set; } public int? AsjHomCount { get; private set; } public int? SasHomCount { get; private set; } //male counts public int? MaleAlleleCount { get; private set; } public int? MaleAlleleNumber { get; private set; } public int? MaleHomCount { get; private set; } //female counts public int? FemaleAlleleCount { get; private set; } public int? FemaleAlleleNumber { get; private set; } public int? FemaleHomCount { get; private set; } //controls public int? ControlsAllAlleleCount { get; private set; } public int? ControlsAllAlleleNumber { get; private set; } public int? Depth { get; } public int? Coverage { get; } public bool HasFailedFilters { get; } public GnomadDataType DataType { get; } #endregion public GnomadItem(Chromosome chromosome, int position, string refAllele, string alternateAllele, int? depth, int? allAlleleNumber, int? afrAlleleNumber, int? amrAlleleNumber, int? easAlleleNumber, int? finAlleleNumber, int? nfeAlleleNumber, int? othAlleleNumber, int? asjAlleleNumber, int? sasAlleleNumber, int? maleAlleleNumber, int? femaleAlleleNumber, int? allAlleleCount, int? afrAlleleCount, int? amrAlleleCount, int? easAlleleCount, int? finAlleleCount, int? nfeAlleleCount, int? othAlleleCount, int? asjAlleleCount, int? sasAlleleCount, int? maleAlleleCount, int? femaleAlleleCount, int? allHomCount, int? afrHomCount, int? amrHomCount, int? easHomCount, int? finHomCount, int? nfeHomCount, int? othHomCount, int? asjHomCount, int? sasHomCount, int? maleHomCount, int? femaleHomCount, int? controlsAllAlleleNumber, int? controlsAllAlleleCount, bool hasFailedFilters, GnomadDataType dataType, string inputLine) { Chromosome = chromosome; Position = position; RefAllele = refAllele; AltAllele = alternateAllele; InputLine = inputLine; Depth = depth; if (depth != null && allAlleleNumber != null && allAlleleNumber.Value > 0) Coverage = ComputingUtilities.GetCoverage(depth.Value, allAlleleNumber.Value); AllAlleleNumber = allAlleleNumber; AfrAlleleNumber = afrAlleleNumber; AmrAlleleNumber = amrAlleleNumber; EasAlleleNumber = easAlleleNumber; FinAlleleNumber = finAlleleNumber; NfeAlleleNumber = nfeAlleleNumber; OthAlleleNumber = othAlleleNumber; AsjAlleleNumber = asjAlleleNumber; SasAlleleNumber = sasAlleleNumber; MaleAlleleNumber = maleAlleleNumber; FemaleAlleleNumber = femaleAlleleNumber; MaleHomCount = maleHomCount; AllAlleleCount = allAlleleCount; AfrAlleleCount = afrAlleleCount; AmrAlleleCount = amrAlleleCount; EasAlleleCount = easAlleleCount; FinAlleleCount = finAlleleCount; NfeAlleleCount = nfeAlleleCount; OthAlleleCount = othAlleleCount; AsjAlleleCount = asjAlleleCount; SasAlleleCount = sasAlleleCount; MaleAlleleCount = maleAlleleCount; FemaleAlleleCount = femaleAlleleCount; FemaleHomCount = femaleHomCount; AllHomCount = allHomCount; AfrHomCount = afrHomCount; AmrHomCount = amrHomCount; EasHomCount = easHomCount; FinHomCount = finHomCount; NfeHomCount = nfeHomCount; OthHomCount = othHomCount; AsjHomCount = asjHomCount; SasHomCount = sasHomCount; //controls ControlsAllAlleleNumber = controlsAllAlleleNumber; ControlsAllAlleleCount = controlsAllAlleleCount; HasFailedFilters = hasFailedFilters; DataType = dataType; RemoveAlleleNumberZero(); } private void RemoveAlleleNumberZero() { if (SaUtilsCommon.IsNumberNullOrZero(AllAlleleNumber)) { AllAlleleNumber = null; AllAlleleCount = null; AllHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(MaleAlleleNumber)) { MaleAlleleNumber = null; MaleAlleleCount = null; MaleHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(FemaleAlleleNumber)) { FemaleAlleleNumber = null; FemaleAlleleCount = null; FemaleHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(AfrAlleleNumber)) { AfrAlleleNumber = null; AfrAlleleCount = null; AfrHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(AmrAlleleNumber)) { AmrAlleleNumber = null; AmrAlleleCount = null; AmrHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(EasAlleleNumber)) { EasAlleleNumber = null; EasAlleleCount = null; EasHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(FinAlleleNumber)) { FinAlleleNumber = null; FinAlleleCount = null; FinHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(NfeAlleleNumber)) { NfeAlleleNumber = null; NfeAlleleCount = null; NfeHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(OthAlleleNumber)) { OthAlleleNumber = null; OthAlleleCount = null; OthHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(AsjAlleleNumber)) { AsjAlleleNumber = null; AsjAlleleCount = null; AsjHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(SasAlleleNumber)) { SasAlleleNumber = null; SasAlleleCount = null; SasHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(MaleAlleleNumber)) { MaleAlleleNumber = null; MaleAlleleCount = null; MaleHomCount = null; } if (SaUtilsCommon.IsNumberNullOrZero(FemaleAlleleNumber)) { FemaleAlleleNumber = null; FemaleAlleleCount = null; FemaleHomCount = null; } //controls if (SaUtilsCommon.IsNumberNullOrZero(ControlsAllAlleleNumber)) { ControlsAllAlleleNumber = null; ControlsAllAlleleCount = null; } } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddIntValue("coverage", Coverage); if (HasFailedFilters) jsonObject.AddBoolValue("failedFilter", true); jsonObject.AddStringValue("allAf", ComputingUtilities.ComputeFrequency(AllAlleleNumber, AllAlleleCount), false); jsonObject.AddIntValue("allAn", AllAlleleNumber); jsonObject.AddIntValue("allAc", AllAlleleCount); jsonObject.AddIntValue("allHc", AllHomCount); jsonObject.AddStringValue("afrAf", ComputingUtilities.ComputeFrequency(AfrAlleleNumber, AfrAlleleCount), false); jsonObject.AddIntValue("afrAn", AfrAlleleNumber); jsonObject.AddIntValue("afrAc", AfrAlleleCount); jsonObject.AddIntValue("afrHc", AfrHomCount); jsonObject.AddStringValue("amrAf", ComputingUtilities.ComputeFrequency(AmrAlleleNumber, AmrAlleleCount), false); jsonObject.AddIntValue("amrAn", AmrAlleleNumber); jsonObject.AddIntValue("amrAc", AmrAlleleCount); jsonObject.AddIntValue("amrHc", AmrHomCount); jsonObject.AddStringValue("easAf", ComputingUtilities.ComputeFrequency(EasAlleleNumber, EasAlleleCount), false); jsonObject.AddIntValue("easAn", EasAlleleNumber); jsonObject.AddIntValue("easAc", EasAlleleCount); jsonObject.AddIntValue("easHc", EasHomCount); jsonObject.AddStringValue("finAf", ComputingUtilities.ComputeFrequency(FinAlleleNumber, FinAlleleCount), false); jsonObject.AddIntValue("finAn", FinAlleleNumber); jsonObject.AddIntValue("finAc", FinAlleleCount); jsonObject.AddIntValue("finHc", FinHomCount); jsonObject.AddStringValue("nfeAf", ComputingUtilities.ComputeFrequency(NfeAlleleNumber, NfeAlleleCount), false); jsonObject.AddIntValue("nfeAn", NfeAlleleNumber); jsonObject.AddIntValue("nfeAc", NfeAlleleCount); jsonObject.AddIntValue("nfeHc", NfeHomCount); jsonObject.AddStringValue("asjAf", ComputingUtilities.ComputeFrequency(AsjAlleleNumber, AsjAlleleCount), false); jsonObject.AddIntValue("asjAn", AsjAlleleNumber); jsonObject.AddIntValue("asjAc", AsjAlleleCount); jsonObject.AddIntValue("asjHc", AsjHomCount); jsonObject.AddStringValue("sasAf", ComputingUtilities.ComputeFrequency(SasAlleleNumber, SasAlleleCount), false); jsonObject.AddIntValue("sasAn", SasAlleleNumber); jsonObject.AddIntValue("sasAc", SasAlleleCount); jsonObject.AddIntValue("sasHc", SasHomCount); jsonObject.AddStringValue("othAf", ComputingUtilities.ComputeFrequency(OthAlleleNumber, OthAlleleCount), false); jsonObject.AddIntValue("othAn", OthAlleleNumber); jsonObject.AddIntValue("othAc", OthAlleleCount); jsonObject.AddIntValue("othHc", OthHomCount); jsonObject.AddStringValue("maleAf", ComputingUtilities.ComputeFrequency(MaleAlleleNumber, MaleAlleleCount), false); jsonObject.AddIntValue("maleAn", MaleAlleleNumber); jsonObject.AddIntValue("maleAc", MaleAlleleCount); jsonObject.AddIntValue("maleHc", MaleHomCount); jsonObject.AddStringValue("femaleAf", ComputingUtilities.ComputeFrequency(FemaleAlleleNumber, FemaleAlleleCount), false); jsonObject.AddIntValue("femaleAn", FemaleAlleleNumber); jsonObject.AddIntValue("femaleAc", FemaleAlleleCount); jsonObject.AddIntValue("femaleHc", FemaleHomCount); //controls //jsonObject.AddIntValue("controlsCoverage", ControlsCoverage); jsonObject.AddStringValue("controlsAllAf", ComputingUtilities.ComputeFrequency(ControlsAllAlleleNumber, ControlsAllAlleleCount), false); jsonObject.AddIntValue("controlsAllAn", ControlsAllAlleleNumber); jsonObject.AddIntValue("controlsAllAc", ControlsAllAlleleCount); return StringBuilderPool.GetStringAndReturn(sb); } public string InputLine { get; } public static int CompareTo(GnomadItem item, GnomadItem other) { if (other == null) return -1; return item.Chromosome.Index == other.Chromosome.Index ? item.Position.CompareTo(other.Position) : item.Chromosome.Index.CompareTo(other.Chromosome.Index); } } } ================================================ FILE: SAUtils/DataStructures/GnomadSvItem.cs ================================================ using System.Text; using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; using Variants; namespace SAUtils.DataStructures; public sealed record GnomadSvItem(Chromosome Chromosome, string InputLine) : ISuppIntervalItem { public int Start { get; init; } public int End { get; init; } public bool HasFailedFilters { get; init; } public VariantType SvType { get; init; } public string VariantId { get; init; } public double? AllAlleleFrequency { get; init; } public double? AfrAlleleFrequency { get; init; } public double? AmrAlleleFrequency { get; init; } public double? EasAlleleFrequency { get; init; } public double? EurAlleleFrequency { get; init; } public double? OthAlleleFrequency { get; init; } public double? FemaleAlleleFrequency { get; init; } public double? MaleAlleleFrequency { get; init; } public int? AllAlleleCount { get; init; } public int? AfrAlleleCount { get; init; } public int? AmrAlleleCount { get; init; } public int? EasAlleleCount { get; init; } public int? EurAlleleCount { get; init; } public int? OthAlleleCount { get; init; } public int? FemaleAlleleCount { get; init; } public int? MaleAlleleCount { get; init; } public int? AllAlleleNumber { get; init; } public int? AfrAlleleNumber { get; init; } public int? AmrAlleleNumber { get; init; } public int? EasAlleleNumber { get; init; } public int? EurAlleleNumber { get; init; } public int? OthAlleleNumber { get; init; } public int? FemaleAlleleNumber { get; init; } public int? MaleAlleleNumber { get; init; } public int? AllHomCount { get; init; } public int? AfrHomCount { get; init; } public int? AmrHomCount { get; init; } public int? EasHomCount { get; init; } public int? EurHomCount { get; init; } public int? OthHomCount { get; init; } public int? FemaleHomCount { get; init; } public int? MaleHomCount { get; init; } public string GetJsonString() { int start = Start; int end = End; // swap bengin and end if variant is an insertion if (SvType == VariantType.insertion) { (start, end) = (end, start); } StringBuilder sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue(JsonCommon.Chromosome, Chromosome.EnsemblName); jsonObject.AddIntValue(JsonCommon.Begin, start); jsonObject.AddIntValue(JsonCommon.End, end); jsonObject.AddStringValue(JsonCommon.VariantId, VariantId); jsonObject.AddStringValue(JsonCommon.VariantType, SvType.ToString()); if (HasFailedFilters) jsonObject.AddBoolValue(JsonCommon.FailedFilter, true); jsonObject.AddDoubleValue(JsonCommon.AllAlleleFrequency, AllAlleleFrequency, JsonCommon.FrequencyRoundingFormat); jsonObject.AddDoubleValue(JsonCommon.AfrAlleleFrequency, AfrAlleleFrequency, JsonCommon.FrequencyRoundingFormat); jsonObject.AddDoubleValue(JsonCommon.AmrAlleleFrequency, AmrAlleleFrequency, JsonCommon.FrequencyRoundingFormat); jsonObject.AddDoubleValue(JsonCommon.EasAlleleFrequency, EasAlleleFrequency, JsonCommon.FrequencyRoundingFormat); jsonObject.AddDoubleValue(JsonCommon.EurAlleleFrequency, EurAlleleFrequency, JsonCommon.FrequencyRoundingFormat); jsonObject.AddDoubleValue(JsonCommon.OthAlleleFrequency, OthAlleleFrequency, JsonCommon.FrequencyRoundingFormat); jsonObject.AddDoubleValue(JsonCommon.FemaleAlleleFrequency, FemaleAlleleFrequency, JsonCommon.FrequencyRoundingFormat); jsonObject.AddDoubleValue(JsonCommon.MaleAlleleFrequency, MaleAlleleFrequency, JsonCommon.FrequencyRoundingFormat); jsonObject.AddIntValue(JsonCommon.AllAlleleCount, AllAlleleCount); jsonObject.AddIntValue(JsonCommon.AfrAlleleCount, AfrAlleleCount); jsonObject.AddIntValue(JsonCommon.AmrAlleleCount, AmrAlleleCount); jsonObject.AddIntValue(JsonCommon.EasAlleleCount, EasAlleleCount); jsonObject.AddIntValue(JsonCommon.EurAlleleCount, EurAlleleCount); jsonObject.AddIntValue(JsonCommon.OthAlleleCount, OthAlleleCount); jsonObject.AddIntValue(JsonCommon.FemaleAlleleCount, FemaleAlleleCount); jsonObject.AddIntValue(JsonCommon.MaleAlleleCount, MaleAlleleCount); jsonObject.AddIntValue(JsonCommon.AllAlleleNumber, AllAlleleNumber); jsonObject.AddIntValue(JsonCommon.AfrAlleleNumber, AfrAlleleNumber); jsonObject.AddIntValue(JsonCommon.AmrAlleleNumber, AmrAlleleNumber); jsonObject.AddIntValue(JsonCommon.EasAlleleNumber, EasAlleleNumber); jsonObject.AddIntValue(JsonCommon.EurAlleleNumber, EurAlleleNumber); jsonObject.AddIntValue(JsonCommon.OthAlleleNumber, OthAlleleNumber); jsonObject.AddIntValue(JsonCommon.FemaleAlleleNumber, FemaleAlleleNumber); jsonObject.AddIntValue(JsonCommon.MaleAlleleNumber, MaleAlleleNumber); jsonObject.AddIntValue(JsonCommon.AllHomCount, AllHomCount); jsonObject.AddIntValue(JsonCommon.AfrHomCount, AfrHomCount); jsonObject.AddIntValue(JsonCommon.AmrHomCount, AmrHomCount); jsonObject.AddIntValue(JsonCommon.EasHomCount, EasHomCount); jsonObject.AddIntValue(JsonCommon.EurHomCount, EurHomCount); jsonObject.AddIntValue(JsonCommon.OthHomCount, OthHomCount); jsonObject.AddIntValue(JsonCommon.FemaleHomCount, FemaleHomCount); jsonObject.AddIntValue(JsonCommon.MaleHomCount, MaleHomCount); return StringBuilderPool.GetStringAndReturn(sb); } } ================================================ FILE: SAUtils/DataStructures/KeyCounts.cs ================================================ using System.Collections.Generic; using System.Text; using VariantAnnotation.Interface.IO; using VariantAnnotation.IO; namespace SAUtils.DataStructures; public class KeyCounts: IJsonSerializer { public readonly Dictionary Counts; public KeyCounts(IEnumerable keys) { Counts = new (); foreach (var key in keys) { Counts[key] = 0; } } public void Increment(string key) { Counts[key]++; } public void SerializeJson(StringBuilder sb) { var jo = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); foreach (var (key, count) in Counts) { jo.AddIntValue(key, count); } sb.Append(JsonObject.CloseBrace); } } ================================================ FILE: SAUtils/DataStructures/MinHeap.cs ================================================ using System; using System.Collections.Generic; namespace SAUtils.DataStructures { public sealed class MinHeap { private readonly List _itemArray; private readonly Func _comparerFunc; public MinHeap(Func comparerFunc) { _itemArray = new List(); _comparerFunc = comparerFunc; } public void Add(T item) { _itemArray.Add(item); Heapify(); } private void Heapify() { var i = _itemArray.Count - 1; while (i > 0) { var j = i % 2 == 0 ? i / 2 - 1 : i / 2;//the index of the parent //if (_itemArray[i].CompareTo(_itemArray[j]) < 0) if (_comparerFunc(_itemArray[i], _itemArray[j]) < 0) SwapItems(_itemArray, i, j); i = j; } } public T ExtractMin() { var min = _itemArray[0]; // the last item form the array is brought to the root and pushed down to the appropriate position _itemArray[0] = _itemArray[_itemArray.Count - 1]; _itemArray.RemoveAt(_itemArray.Count - 1); for (var i = 0; i < _itemArray.Count / 2;) { var j = 2 * i + 1; if (j + 1 < _itemArray.Count && _comparerFunc(_itemArray[j], _itemArray[j + 1]) > 0) // both children are present j++; //A[2*i+2] is the smaller child if (_comparerFunc(_itemArray[i],_itemArray[j]) > 0) SwapItems(_itemArray, i, j); i = j; } return min; } private static void SwapItems(List list, int i, int j) { var temp = list[i]; list[i] = list[j]; list[j] = temp; } public T GetMin() { return _itemArray.Count == 0 ? default : _itemArray[0]; } public int Count() { return _itemArray.Count; } public override string ToString() { return string.Join(",", _itemArray); } } } ================================================ FILE: SAUtils/DataStructures/OmimItem.cs ================================================ using System.Collections.Generic; using System.Linq; using System.Text; using Newtonsoft.Json; using OptimizedCore; using SAUtils.Schema; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.DataStructures; public sealed class OmimItem : ISuppGeneItem { public string GeneSymbol { get; } private readonly string _geneName; private readonly string _description; private readonly int _mimNumber; public readonly List Phenotypes; public SaJsonSchema JsonSchema { get; } public OmimItem(string geneSymbol, string geneName, string description, int mimNumber, List phenotypes, SaJsonSchema jsonSchema) { GeneSymbol = geneSymbol; _geneName = geneName; _description = description; _mimNumber = mimNumber; Phenotypes = phenotypes; JsonSchema = jsonSchema; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); JsonSchema.TotalItems++; JsonSchema.CountKeyIfAdded(jsonObject.AddIntValue("mimNumber", _mimNumber), "mimNumber"); JsonSchema.CountKeyIfAdded( jsonObject.AddStringValue("geneName", string.IsNullOrEmpty(_geneName) ? null : JsonConvert.SerializeObject(_geneName), false), "geneName"); //Serialized string has the double quote at the beginning and the end JsonSchema.CountKeyIfAdded( jsonObject.AddStringValue("description", string.IsNullOrEmpty(_description) ? null : JsonConvert.SerializeObject(_description), false), "description"); if (Phenotypes.Count > 0) JsonSchema.CountKeyIfAdded(jsonObject.AddObjectValues("phenotypes", Phenotypes), "phenotypes"); sb.Append(JsonObject.CloseBrace); return StringBuilderPool.GetStringAndReturn(sb); } public sealed class Phenotype : IJsonSerializer { private readonly int _mimNumber; public readonly string _phenotype; private readonly string _description; public readonly Mapping Mapping; private readonly Comment[] _comments; public readonly HashSet Inheritance; private readonly SaJsonSchema _jsonSchema; public Phenotype(int mimNumber, string phenotype, string description, Mapping mapping, Comment[] comments, HashSet inheritance, SaJsonSchema schema) { _mimNumber = mimNumber; _phenotype = phenotype; _description = description; Mapping = mapping; _comments = comments; Inheritance = inheritance; _jsonSchema = schema; } public void SerializeJson(StringBuilder sb) { var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); _jsonSchema.TotalItems++; if (_mimNumber >= 100000) _jsonSchema.CountKeyIfAdded(jsonObject.AddIntValue("mimNumber", _mimNumber), "mimNumber"); _jsonSchema.CountKeyIfAdded(jsonObject.AddStringValue("phenotype", _phenotype), "phenotype"); _jsonSchema.CountKeyIfAdded( jsonObject.AddStringValue("description", string.IsNullOrEmpty(_description) ? null : JsonConvert.SerializeObject(_description), false), "description"); if (Mapping != Mapping.unknown) _jsonSchema.CountKeyIfAdded(jsonObject.AddStringValue("mapping", Mapping.ToString().Replace("_", " ")), "mapping"); if (Inheritance != null && Inheritance.Count > 0) _jsonSchema.CountKeyIfAdded(jsonObject.AddStringValues("inheritances", Inheritance), "inheritances"); if (_comments.Length > 0) _jsonSchema.CountKeyIfAdded(jsonObject.AddStringValues("comments", _comments.Select(x => x.ToString().Replace("_", " "))), "comments"); sb.Append(JsonObject.CloseBrace); } } public enum Mapping : byte { // ReSharper disable InconsistentNaming unknown, mapping_of_the_wildtype_gene, disease_phenotype_itself_was_mapped, molecular_basis_of_the_disorder_is_known, chromosome_deletion_or_duplication_syndrome // ReSharper restore InconsistentNaming } public enum Comment : byte { // ReSharper disable InconsistentNaming unknown, unconfirmed_or_possibly_spurious_mapping, nondiseases, contribute_to_susceptibility_to_multifactorial_disorders_or_to_susceptibility_to_infection // ReSharper restore InconsistentNaming } } ================================================ FILE: SAUtils/DataStructures/OneKGenItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.DataStructures { public sealed class OneKGenItem : ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } private string AncestralAllele { get; } private int? AllAlleleNumber { get; } private int? AfrAlleleNumber { get; } private int? AmrAlleleNumber { get; } private int? EurAlleleNumber { get; } private int? EasAlleleNumber { get; } private int? SasAlleleNumber { get; } private int? AllAlleleCount { get; } private int? AfrAlleleCount { get; } private int? AmrAlleleCount { get; } private int? EurAlleleCount { get; } private int? EasAlleleCount { get; } private int? SasAlleleCount { get; } public OneKGenItem(Chromosome chromosome, int position, string refAllele, string alternateAllele, string ancestralAllele, int? allAlleleCount, int? afrAlleleCount, int? amrAlleleCount, int? eurAlleleCount, int? easAlleleCount, int? sasAlleleCount, int? allAlleleNumber, int? afrAlleleNumber, int? amrAlleleNumber, int? eurAlleleNumber, int? easAlleleNumber, int? sasAlleleNumber ) { Chromosome = chromosome; Position = position; RefAllele = refAllele; AltAllele = alternateAllele; AncestralAllele = ancestralAllele; AllAlleleCount = allAlleleCount; AfrAlleleCount = afrAlleleCount; AmrAlleleCount = amrAlleleCount; EurAlleleCount = eurAlleleCount; EasAlleleCount = easAlleleCount; SasAlleleCount = sasAlleleCount; AllAlleleNumber = allAlleleNumber; AfrAlleleNumber = afrAlleleNumber; AmrAlleleNumber = amrAlleleNumber; EurAlleleNumber = eurAlleleNumber; EasAlleleNumber = easAlleleNumber; SasAlleleNumber = sasAlleleNumber; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("ancestralAllele", AncestralAllele); jsonObject.AddStringValue("allAf", ComputingUtilities.ComputeFrequency(AllAlleleNumber, AllAlleleCount), false); jsonObject.AddStringValue("afrAf", ComputingUtilities.ComputeFrequency(AfrAlleleNumber, AfrAlleleCount), false); jsonObject.AddStringValue("amrAf", ComputingUtilities.ComputeFrequency(AmrAlleleNumber, AmrAlleleCount), false); jsonObject.AddStringValue("easAf", ComputingUtilities.ComputeFrequency(EasAlleleNumber, EasAlleleCount), false); jsonObject.AddStringValue("eurAf", ComputingUtilities.ComputeFrequency(EurAlleleNumber, EurAlleleCount), false); jsonObject.AddStringValue("sasAf", ComputingUtilities.ComputeFrequency(SasAlleleNumber, SasAlleleCount), false); jsonObject.AddIntValue("allAn", AllAlleleNumber); jsonObject.AddIntValue("afrAn", AfrAlleleNumber); jsonObject.AddIntValue("amrAn", AmrAlleleNumber); jsonObject.AddIntValue("easAn", EasAlleleNumber); jsonObject.AddIntValue("eurAn", EurAlleleNumber); jsonObject.AddIntValue("sasAn", SasAlleleNumber); jsonObject.AddIntValue("allAc", AllAlleleCount); jsonObject.AddIntValue("afrAc", AfrAlleleCount); jsonObject.AddIntValue("amrAc", AmrAlleleCount); jsonObject.AddIntValue("easAc", EasAlleleCount); jsonObject.AddIntValue("eurAc", EurAlleleCount); jsonObject.AddIntValue("sasAc", SasAlleleCount); return StringBuilderPool.GetStringAndReturn(sb); } public string InputLine { get; set; } } } ================================================ FILE: SAUtils/DataStructures/OnekGenSvItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; using Variants; namespace SAUtils.DataStructures { public sealed class OnekGenSvItem: ISuppIntervalItem { public int Start { get; } public int End { get; } public Chromosome Chromosome { get; } private VariantType VariantType { get; } private readonly int? _allAlleleNumber; private readonly int? _allAlleleCount; private readonly double? _allAlleleFrequency; private readonly double? _afrAlleleFrequency; private readonly double? _amrAlleleFrequency; private readonly double? _easAlleleFrequency; private readonly double? _eurAlleleFrequency; private readonly double? _sasAlleleFrequency; public OnekGenSvItem(Chromosome chromosome, int start, int end, VariantType variantType, string id, int? allAlleleNumber, int? allAlleleCount, double? allAlleleFrequency, double? afrAlleleFrequency, double? amrAlleleFrequency, double? easAlleleFrequency, double? eurAlleleFrequency, double? sasAlleleFrequency) { Chromosome = chromosome; Start = start; End = end; VariantType = variantType; Id = id; _allAlleleNumber = allAlleleNumber; _allAlleleCount = allAlleleCount; _allAlleleFrequency = allAlleleFrequency; _afrAlleleFrequency = afrAlleleFrequency; _amrAlleleFrequency = amrAlleleFrequency; _easAlleleFrequency = easAlleleFrequency; _eurAlleleFrequency = eurAlleleFrequency; _sasAlleleFrequency = sasAlleleFrequency; } private string Id { get; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("chromosome", Chromosome.EnsemblName); jsonObject.AddIntValue("begin", Start); jsonObject.AddIntValue("end", End); jsonObject.AddStringValue("variantType", VariantType.ToString()); jsonObject.AddStringValue("id", Id); jsonObject.AddIntValue("allAn", _allAlleleNumber); jsonObject.AddIntValue("allAc", _allAlleleCount); jsonObject.AddDoubleValue("allAf", _allAlleleFrequency, "0.######"); jsonObject.AddDoubleValue("afrAf", _afrAlleleFrequency, "0.######"); jsonObject.AddDoubleValue("amrAf", _amrAlleleFrequency, "0.######"); jsonObject.AddDoubleValue("eurAf", _eurAlleleFrequency, "0.######"); jsonObject.AddDoubleValue("easAf", _easAlleleFrequency, "0.######"); jsonObject.AddDoubleValue("sasAf", _sasAlleleFrequency, "0.######"); return StringBuilderPool.GetStringAndReturn(sb); } } } ================================================ FILE: SAUtils/DataStructures/RefMinorItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.DataStructures { public sealed class RefMinorItem:ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public string GlobalMajor { get; } public RefMinorItem(Chromosome chromosome, int position, string globalMajor) { Chromosome = chromosome; Position = position; GlobalMajor = globalMajor; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddBoolValue("isReferenceMinor", true); return StringBuilderPool.GetStringAndReturn(sb); } public string InputLine { get; set; } } } ================================================ FILE: SAUtils/DataStructures/SuppDataUtilities.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using ErrorHandling.Exceptions; using SAUtils.PrimateAi; using VariantAnnotation.Interface.SA; using Variants; namespace SAUtils.DataStructures { public static class SuppDataUtilities { public static int CompareTo(ISupplementaryDataItem item, ISupplementaryDataItem other) { if (other == null) return -1; return item.Chromosome.Index == other.Chromosome.Index ? item.Position.CompareTo(other.Position) : item.Chromosome.Index.CompareTo(other.Chromosome.Index); } public static void Trim(this ISupplementaryDataItem saItem) { if (saItem.RefAllele == null || saItem.AltAllele == null || saItem.Position < 0) return; (int start, string refAllele, string altAllele) = BiDirectionalTrimmer.Trim(saItem.Position, saItem.RefAllele, saItem.AltAllele); saItem.Position = start; saItem.RefAllele = refAllele; saItem.AltAllele = altAllele; } public static int BinarySearch(List items, int value) where T:IComparable { var begin = 0; int end = items.Count - 1; while (begin <= end) { int index = begin + (end - begin >> 1); int ret = items[index].CompareTo(value); if (ret == 0) return index; if (ret < 0) begin = index + 1; else end = index - 1; } return ~begin; } public static List DeDuplicatePrimateAiItems(List saItems) { var maxScoreItems = new Dictionary(); foreach (var supplementaryDataItem in saItems) { var saItem = (PrimateAiItem) supplementaryDataItem; var refAlt = saItem.RefAllele + '>' + saItem.AltAllele; if (maxScoreItems.TryGetValue(refAlt, out var dupItem)) { var dupPrimateAiItem = (PrimateAiItem) dupItem; if (saItem.ScorePercentile >= dupPrimateAiItem.ScorePercentile) { maxScoreItems[refAlt] = saItem; } } else maxScoreItems.Add(refAlt, saItem); } return maxScoreItems.Values.ToList(); } public static List RemoveConflictingAlleles(List saItems, bool throwErrorOnConflicts) { var nonDuplicateSet = new Dictionary(); var conflictSet = new List(); foreach (var saItem in saItems) { var refAlt = saItem.RefAllele+'>'+saItem.AltAllele; if (nonDuplicateSet.TryGetValue(refAlt, out var dupItem)) { if (saItem.GetJsonString() != dupItem.GetJsonString()) { if(throwErrorOnConflicts) throw new UserErrorException($"Conflicting entries for items at {saItem.Chromosome.UcscName}:{saItem.Position} for alleles {saItem.RefAllele} > {saItem.AltAllele}"); conflictSet.Add(refAlt); } } else nonDuplicateSet.Add(refAlt, saItem); } var values = nonDuplicateSet.Values.ToList(); if (conflictSet.Count > 0) { values.RemoveAll(x => conflictSet.Contains(x.RefAllele + '>' + x.AltAllele)); } return values; } public static ISupplementaryDataItem GetPositionalAnnotation(List saItems) { // all items in the list are assumed to be objects of the same implementation var firstItem = saItems[0]; switch (firstItem) { case AlleleFrequencyItem _: return GetGlobalMinor(saItems); // if onekgen return Ancestral allele case AncestralAlleleItem _: return GetConsensus(saItems); } return null; } private static ISupplementaryDataItem GetConsensus(List saItems) { //check consistancy string ancestralAllele = null; foreach (var supplementaryDataItem in saItems) { var aaItem = (AncestralAlleleItem) supplementaryDataItem; //note: aaItem.AncestralAllele cannot be null at this point if (ancestralAllele == null) ancestralAllele = aaItem.AncestralAllele; if (ancestralAllele != aaItem.AncestralAllele) return null; } return ancestralAllele==null? null : saItems[0]; } private static ISupplementaryDataItem GetGlobalMinor(List saItems) { var alleleFreqDict = new Dictionary(); foreach (var supplementaryDataItem in saItems) { var frequencyItem = (AlleleFrequencyItem) supplementaryDataItem; if (!double.MinValue.Equals(frequencyItem.AltFrequency)) alleleFreqDict[frequencyItem.AltAllele] = frequencyItem.AltFrequency; } if (alleleFreqDict.Count == 0) return null; var firstItem = saItems[0]; string refAllele = firstItem.RefAllele; string globalMajorAllele = GetMostFrequentAllele(alleleFreqDict, refAllele); if (globalMajorAllele == null) return null; alleleFreqDict.Remove(globalMajorAllele); string globalMinorAllele = GetMostFrequentAllele(alleleFreqDict, refAllele, false); if (globalMinorAllele == null) return null; double frequency = alleleFreqDict[globalMinorAllele]; return new GlobalMinorItem(firstItem.Chromosome, firstItem.Position, globalMinorAllele, frequency); } public static string GetMostFrequentAllele(Dictionary alleleFreqDict, string refAllele, bool isRefPreferred = true) { if (alleleFreqDict.Count == 0) return null; // find all alleles that have max frequency. double maxFreq = alleleFreqDict.Values.Max(); if (Math.Abs(maxFreq - double.MinValue) < double.Epsilon) return null; var maxFreqAlleles = (from pair in alleleFreqDict where Math.Abs(pair.Value - maxFreq) < double.Epsilon select pair.Key).ToList(); // if there is only one with max frequency, return it if (maxFreqAlleles.Count == 1) return maxFreqAlleles[0]; // if ref is preferred (as in global major) it is returned if (isRefPreferred && maxFreqAlleles.Contains(refAllele)) return refAllele; // else refAllele is removed and the first of the remaining allele is returned (arbitrary selection) maxFreqAlleles.Remove(refAllele); return maxFreqAlleles[0]; } } } ================================================ FILE: SAUtils/DataStructures/TopMedItem.cs ================================================ using System.Text; using Genome; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.DataStructures { public sealed class TopMedItem : ISupplementaryDataItem { private readonly int? _alleleNum; private readonly int? _alleleCount; private readonly int? _homCount; private readonly bool _failedFilter; public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public TopMedItem(Chromosome chrom, int position, string refAllele, string altAllele, int? alleleNum, int? alleleCount, int? homCount, bool failedFilter) { Chromosome = chrom; Position = position; RefAllele = refAllele; AltAllele = altAllele; _alleleNum = alleleNum; _alleleCount = alleleCount; _homCount = homCount; _failedFilter = failedFilter; } public string GetJsonString() { var sb = new StringBuilder(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("allAf", ComputingUtilities.ComputeFrequency(_alleleNum, _alleleCount), false); jsonObject.AddIntValue("allAn", _alleleNum); jsonObject.AddIntValue("allAc", _alleleCount); jsonObject.AddIntValue("allHc", _homCount); if (_failedFilter) jsonObject.AddBoolValue("failedFilter", true); return sb.ToString(); } public string InputLine { get; set; } } } ================================================ FILE: SAUtils/DbSnpRemapper/ChromMapper.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Compression.Utilities; using Genome; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.DbSnpRemapper { internal sealed class ChromMapper { private readonly StreamReader _srcReader; private readonly StreamReader _destReader; private readonly Dictionary _writers; private readonly StreamWriter _leftoverWriter; private readonly ISequenceProvider _srcSequenceProvider; private readonly ISequenceProvider _desSequenceProvider; private int _leftoverCount; private readonly Dictionary<(long, int, string), List> _destinationVariants; private int _alleleMismatchCount; public ChromMapper(StreamReader srcReader, StreamReader destReader, StreamWriter leftoverWriter, ISequenceProvider srcSequenceProvider, ISequenceProvider desSequenceProvider) { _srcReader = srcReader; _destReader = destReader; _writers = new Dictionary(); _leftoverWriter = leftoverWriter; _srcSequenceProvider = srcSequenceProvider; _desSequenceProvider = desSequenceProvider; _destinationVariants = new Dictionary<(long, int, string), List>(); } public Dictionary Map() { using (_srcReader) using (_destReader) { //map all the destination rsIDs to their positions in destination string srcLine, destLine; //read to the first data line while ((srcLine = _srcReader.ReadLine()) != null) { if (!srcLine.OptimizedStartsWith('#')) break; } while ((destLine= _destReader.ReadLine()) != null) { if (!destLine.OptimizedStartsWith('#')) break; } // dictionary of leftover rsIds from previous chromosomes //var destRsidLocations = new Dictionary(); while (destLine != null && srcLine!=null) { _destinationVariants.Clear(); destLine = GetNextChromDestinations(destLine); srcLine = ProcessNextChromSource(srcLine); } } // these writers need to be kept open so that the leftover mapper can append to them Console.WriteLine($"Total leftover count:{_leftoverCount}"); return _writers; } private string ProcessNextChromSource(string line) { //extracting current chrom info from first line provided var currentChromName = line.Split('\t', 2)[VcfCommon.ChromIndex]; var currentChrom = ReferenceNameUtilities.GetChromosome(_srcSequenceProvider.RefNameToChromosome, currentChromName); _srcSequenceProvider.LoadChromosome(currentChrom); var leftoverCount=0; do { var splits = line.Split('\t', VcfCommon.InfoIndex); var chrom = splits[VcfCommon.ChromIndex]; if (chrom != currentChromName) break; var refAllele = splits[VcfCommon.RefIndex]; var altAlleles = splits[VcfCommon.AltIndex].Split(','); var position = int.Parse(splits[VcfCommon.PosIndex]); var rsIds = Utilities.GetRsids(splits[VcfCommon.IdIndex]); if (rsIds == null) continue; var processedVariants = altAlleles.Select(x => VariantUtils.TrimAndLeftAlign(position, refAllele, x, _srcSequenceProvider.Sequence)).ToArray(); var foundInDest = false; foreach (var (_, variantRef, variantAlt) in processedVariants) foreach (var rsId in rsIds) { if (! _destinationVariants.TryGetValue((rsId, variantRef.Length, variantAlt), out var targetPositions)) continue; targetPositions.ForEach(x => WriteRemappedEntry(chrom, x, variantRef, variantAlt, line)); //flipping the sign to indicate it has been mapped //_destinationVariants[rsId] = (-variant.position, variant.refAllele, variant.altAlleles); foundInDest = true; } if (foundInDest) continue; foreach (var (_, _, variantAlt) in processedVariants) foreach (var rsId in rsIds) _leftoverWriter.WriteLine(string.Join('#',rsId.ToString(), variantAlt, line)); leftoverCount++; } while ((line = _srcReader.ReadLine()) != null); Console.WriteLine($"Leftover count for {currentChromName}: {leftoverCount}"); //Console.WriteLine($"Number of entries discarded due to allele mismatch: {_alleleMismatchCount}"); _leftoverCount += leftoverCount; return line; } private string GetNextChromDestinations(string line) { //extracting current chrom info from first line provided var currentChromName = line.Split('\t', 2)[VcfCommon.ChromIndex]; Console.Write($"Getting destinations for chromosome:{currentChromName}..."); var currentChrom = ReferenceNameUtilities.GetChromosome(_desSequenceProvider.RefNameToChromosome, currentChromName); _desSequenceProvider.LoadChromosome(currentChrom); do { var splits = line.Split('\t', VcfCommon.InfoIndex); var chrom = splits[VcfCommon.ChromIndex]; if (chrom != currentChromName) break; var refAllele = splits[VcfCommon.RefIndex]; var altAlleles = splits[VcfCommon.AltIndex].Split(','); var position = int.Parse(splits[VcfCommon.PosIndex]); var rsIds = Utilities.GetRsids(splits[VcfCommon.IdIndex]); if (rsIds == null) continue; var processedVariants = altAlleles.Select(x => VariantUtils.TrimAndLeftAlign(position, refAllele, x, _desSequenceProvider.Sequence)).ToArray(); foreach (var (start, variantRef, variantAlt) in processedVariants) foreach (var rsId in rsIds) { if (!_destinationVariants.TryGetValue((rsId, variantRef.Length, variantAlt), out var variants)) { variants = new List(); _destinationVariants[(rsId, variantRef.Length, variantAlt)] = variants; } variants.Add(start); } } while ((line = _destReader.ReadLine()) != null); Console.WriteLine($"{_destinationVariants.Count} rsIds found."); return line; } private void WriteRemappedEntry(string chrom, int pos, string refAllele, string altAllele, string vcfLine) { if (!_writers.ContainsKey(chrom)) _writers[chrom] = GZipUtilities.GetStreamWriter(chrom+".vcf.gz"); var splits = vcfLine.Split('\t', 6); _writers[chrom].WriteLine(string.Join('\t', chrom, pos.ToString(), splits[2], refAllele, altAllele, splits[5])); } } } ================================================ FILE: SAUtils/DbSnpRemapper/DbSnpRemapperMain.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using Nirvana; using VariantAnnotation.Interface.Providers; namespace SAUtils.DbSnpRemapper { public static class DbSnpRemapperMain { private static string _srcMapFile; private static string _destMapFile; private static string _srcRefSequence; private static string _desRefSequence; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "src|s=", "VCF file with dbSNP ids and data to be remapped", v => _srcMapFile = v }, { "des|d=", "VCF file (with same chromosome order as src) with destination dbSNP mapping", v => _destMapFile = v }, { "sref=", "compressed reference sequence file for the source assembly", v => _srcRefSequence = v }, { "dref=", "compressed reference sequence file for the destination assembly", v => _desRefSequence = v } }; var commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_srcMapFile, "VCF file with dbSNP ids and data to be remapped", "--src") .CheckInputFilenameExists(_destMapFile, "VCF file with destination dbSNP mapping", "--des") .CheckInputFilenameExists(_srcRefSequence, "reference sequence for source genome assembly", "--sref") .CheckInputFilenameExists(_desRefSequence, "reference sequence for destination genome assembly", "--dref") .SkipBanner() .ShowHelpMenu("Reads provided supplementary data files and populates tsv files", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { const string tempLeftoverFilename = "LeftOvers.vcf.gz"; Dictionary writers; ISequenceProvider srcSequenceProvider = ProviderUtilities.GetSequenceProvider(_srcRefSequence); ISequenceProvider desSequenceProvider = ProviderUtilities.GetSequenceProvider(_desRefSequence); using (var srcReader = GZipUtilities.GetAppropriateStreamReader(_srcMapFile)) using (var destReader = GZipUtilities.GetAppropriateStreamReader(_destMapFile)) using (var leftoverWriter = GZipUtilities.GetStreamWriter(tempLeftoverFilename)) { var chromMapper = new ChromMapper(srcReader, destReader, leftoverWriter, srcSequenceProvider, desSequenceProvider); writers = chromMapper.Map(); } //now we will try to map the leftovers using (var destReader = GZipUtilities.GetAppropriateStreamReader(_destMapFile)) using (var leftoverReader = GZipUtilities.GetAppropriateStreamReader(tempLeftoverFilename)) { var leftOverMapper = new LeftoverMapper(leftoverReader, destReader, writers, desSequenceProvider); var leftoverCount = leftOverMapper.Map(); Console.WriteLine($"{leftoverCount} leftovers mapped!!"); } foreach (var writer in writers.Values) { writer.Dispose(); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/DbSnpRemapper/GenomicLocation.cs ================================================ namespace SAUtils.DbSnpRemapper { public struct GenomicLocation { public readonly string Chrom; public readonly int Position; public GenomicLocation(string chrom, int pos) { Chrom = chrom; Position = pos; } } } ================================================ FILE: SAUtils/DbSnpRemapper/LeftoverMapper.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Compression.Utilities; using Genome; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.DbSnpRemapper { public sealed class LeftoverMapper { private readonly StreamReader _leftoverReader; private readonly StreamReader _destReader; private readonly Dictionary _writers; private readonly ISequenceProvider _desSequenceProvider; public LeftoverMapper(StreamReader leftoverReader, StreamReader destReader, Dictionary writers, ISequenceProvider desSequenceProvider) { _leftoverReader = leftoverReader; _destReader = destReader; _writers = writers; _desSequenceProvider = desSequenceProvider; } public int Map() { // write out the relocated locations of the leftover rsIds whenever possible //reading in the leftover ids var leftoverIds = new HashSet<(long,string)>(); Console.Write("Loading leftover ids..."); string line; while ((line = _leftoverReader.ReadLine()) != null) { var splits = line.Split('#', 3); var id = long.Parse(splits[0]); var alt = splits[1]; leftoverIds.Add((id, alt)); } Console.WriteLine($"{leftoverIds.Count} found."); // stream through the dest file to find locations var leftoversWithDest = new Dictionary<(long, string), List>(); var currentChromName = ""; while ((line = _destReader.ReadLine()) != null) { if (line.OptimizedStartsWith('#')) continue; var splits = line.Split('\t', VcfCommon.InfoIndex); var chromName = splits[VcfCommon.ChromIndex]; if (chromName != currentChromName) { currentChromName = chromName; Console.WriteLine($"Getting destinations for chromosome:{currentChromName}..."); var currentChrom = ReferenceNameUtilities.GetChromosome(_desSequenceProvider.RefNameToChromosome, currentChromName); _desSequenceProvider.LoadChromosome(currentChrom); } var refAllele = splits[VcfCommon.RefIndex]; var altAlleles = splits[VcfCommon.AltIndex].Split(','); var position = int.Parse(splits[VcfCommon.PosIndex]); var rsIds = Utilities.GetRsids(splits[VcfCommon.IdIndex]); if (rsIds == null) continue; var processedVariants = altAlleles.Select(x => VariantUtils.TrimAndLeftAlign(position, refAllele, x, _desSequenceProvider.Sequence)).ToArray(); foreach (var (_, _, variantAlt) in processedVariants) foreach (var rsId in rsIds) { if (!leftoverIds.Contains((rsId, variantAlt))) continue; var pos = int.Parse(splits[VcfCommon.PosIndex]); if (!leftoversWithDest.TryGetValue((rsId, variantAlt), out var locations)) { locations = new List(); leftoversWithDest[(rsId, variantAlt)] = locations; } locations.Add(new GenomicLocation(chromName, pos)); } } WriteMappedLeftovers(leftoversWithDest); return leftoversWithDest.Count; } private void WriteMappedLeftovers(Dictionary<(long, string), List> leftoversWithDest) { //resetting the reader _leftoverReader.DiscardBufferedData(); _leftoverReader.BaseStream.Position = 0; string line; while ((line = _leftoverReader.ReadLine()) != null) { var splits = line.Split('#', 3); var id = long.Parse(splits[0]); var alt = splits[1]; if (! leftoversWithDest.ContainsKey((id, alt))) continue; AppendToChromFile(leftoversWithDest[(id, alt)], line); } } private void AppendToChromFile(List leftoverLocations, string line) { foreach (GenomicLocation location in leftoverLocations) { var chromName = location.Chrom; if (!chromName.StartsWith("chr")) chromName = "chr" + chromName; if (!_writers.ContainsKey(chromName)) { Console.WriteLine($"Warning!! {chromName} was not present in source but is in destination"); _writers.Add(chromName, GZipUtilities.GetStreamWriter(chromName +".vcf.gz")); } var splits = line.Split('\t', 3); _writers[chromName].WriteLine($"{chromName}\t{location.Position}\t{splits[2]}"); } } } } ================================================ FILE: SAUtils/DbSnpRemapper/Utilities.cs ================================================ using System.Linq; using OptimizedCore; namespace SAUtils.DbSnpRemapper { public static class Utilities { public static long[] GetRsids(string idField) { var ids = idField.OptimizedSplit(',') .Where(idStr => idStr.StartsWith("rs")) .Select(idStr => long.Parse(idStr.Substring(2))).ToArray(); return ids.Length == 0 ? null : ids; } } } ================================================ FILE: SAUtils/DegenerateBaseUtilities.cs ================================================ using System.Collections.Generic; namespace SAUtils { public static class DegenerateBaseUtilities { private static readonly Dictionary> DegenerateBaseNotation = new Dictionary> { {'B', new List{'C','G','T'}}, {'D', new List{'A','G','T'}}, {'H', new List{'A','C','T'}}, {'K', new List{'G','T'}}, {'M', new List{'A','C'}}, {'R', new List{'A','G'}}, {'S', new List{'C','G'}}, {'V', new List{'A','C','G'}}, {'W', new List{'A','T'}}, {'Y', new List{'C','T'}} }; public static List GetAllPossibleSequences(string sequenceWithDegenerateBases) { var sequences = new List(); GetSequences(sequenceWithDegenerateBases.ToUpper(), sequences, 0, ""); return sequences; } private static void GetSequences(string inputSequence, ICollection outputSequences, int index, string subSequence) { if (index == inputSequence.Length) { outputSequences.Add(subSequence); return; } MapBase(inputSequence[index]).ForEach(x => GetSequences(inputSequence, outputSequences, index + 1, subSequence + x)); } private static List MapBase(char inputBase) => DegenerateBaseNotation.ContainsKey(inputBase) ? DegenerateBaseNotation[inputBase] : new List {inputBase}; } } ================================================ FILE: SAUtils/ExtractCosmicSvs/CosmicCnvItem.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using VariantAnnotation.IO; using Variants; namespace SAUtils.ExtractCosmicSvs { public sealed class CosmicCnvItem { public readonly int CNVId; private readonly Chromosome _chromosome; private readonly VariantType _cnvType; private readonly int _copyNumber; private readonly int _studyId; private readonly Dictionary _cancerTypes; public int CancerTypeCount => _cancerTypes.Count; private readonly Dictionary _tissueTypes; public int TissueTypeCount => _tissueTypes.Count; public CosmicCnvItem(int cnvId, Chromosome chromosome, int start, int end, VariantType cnvType, int copyNumber, Dictionary cancerTypes, Dictionary tissueTypes, int studyId) { CNVId = cnvId; _chromosome = chromosome; Start = start; End = end; _cnvType = cnvType; _studyId = studyId; _copyNumber = copyNumber; _cancerTypes = cancerTypes; _tissueTypes = tissueTypes; } private int Start { get; } private int End { get; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddIntValue("id", CNVId); jsonObject.AddStringValue("variantType", _cnvType.ToString()); if (_copyNumber!=-1) jsonObject.AddIntValue("copyNumber", _copyNumber); jsonObject.AddStringValues("cancerTypes", GetJsonStrings(_cancerTypes), false); jsonObject.AddStringValues("tissueTypes", GetJsonStrings(_tissueTypes), false); return sb.ToString(); } private static IEnumerable GetJsonStrings(Dictionary dictionary) { foreach (var kvp in dictionary) { yield return $"{JsonObject.OpenBrace}\"{kvp.Key.Replace('_', ' ')}\":{kvp.Value}{JsonObject.CloseBrace}"; } } public void Merge(CosmicCnvItem other) { if (CNVId != other.CNVId || _cnvType != other._cnvType || _copyNumber!= other._copyNumber) throw new InvalidDataException("Attempting to merge different cosmic CNVs"); //avoid double counting if (_studyId != other._studyId) { MergeCounts(_cancerTypes, other._cancerTypes); MergeCounts(_tissueTypes, other._tissueTypes); } } private static void MergeCounts(Dictionary countDict1, Dictionary countDict2) { foreach (var kvp in countDict2) { if (!countDict1.TryAdd(kvp.Key, kvp.Value)) // this key already exist countDict1[kvp.Key] += kvp.Value; } } } } ================================================ FILE: SAUtils/ExtractCosmicSvs/CosmicCnvReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using IO; using OptimizedCore; using Variants; namespace SAUtils.ExtractCosmicSvs { public sealed class CosmicCnvReader:IDisposable { private readonly StreamReader _reader; private readonly Dictionary _refToChrom; private readonly GenomeAssembly _assembly; private int _idIndex = -1; private int _primarySiteIndex = -1; private int _siteSubtypeOneIndex = -1; private int _siteSubtypeTwoIndex = -1; private int _siteSubtypeThreeIndex = -1; private int _primaryHistologyIndex = -1; private int _histologySubtypeOneIndex = -1; private int _histologySubtypeTwoIndex = -1; private int _histologySubtypeThreeIndex = -1; private int _copyNumberIndex = -1; private int _cnvTypeIndex = -1; private int _assemblyIndex = -1; private int _chromStartStopIndex = -1; private int _studyIdIndex = -1; private static readonly char[] ChromosomeDelimiters = {':', '.'}; //CNV_ID ID_GENE gene_name ID_SAMPLE ID_TUMOUR Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 SAMPLE_NAME TOTAL_CN MINOR_ALLELE MUT_TYPE ID_STUDY GRCh Chromosome:G_Start..G_Stop public CosmicCnvReader(Stream cnvStream, Dictionary refNameToChorm, GenomeAssembly assembly) { _reader = FileUtilities.GetStreamReader(cnvStream); _refToChrom = refNameToChorm; _assembly = assembly; } public IEnumerable GetEntries() { var cnvDictionary = new Dictionary(); string line; var isFirstLine = true; while ((line = _reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (isFirstLine) { GetColumnIndices(line); isFirstLine = false; continue; } try { var cnvItem = ExtractCosmicCnv(line); if (cnvItem == null) continue; if (cnvDictionary.TryGetValue(cnvItem.CNVId, out var value)) value.Merge(cnvItem); else cnvDictionary[cnvItem.CNVId] = cnvItem; } catch (Exception e) { Console.WriteLine(e); Console.WriteLine(line); throw; } } Console.WriteLine($"Found {cnvDictionary.Count} unique cosmic cnvs"); return cnvDictionary.Values; } internal void GetColumnIndices(string headerLine) { //CNV_ID ID_GENE gene_name ID_SAMPLE ID_TUMOUR Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 SAMPLE_NAME TOTAL_CN MINOR_ALLELE MUT_TYPE ID_STUDY GRCh Chromosome:G_Start..G_Stop _idIndex = -1; _primarySiteIndex = -1; _siteSubtypeOneIndex = -1; _siteSubtypeTwoIndex = -1; _siteSubtypeThreeIndex = -1; _primaryHistologyIndex = -1; _histologySubtypeOneIndex = -1; _histologySubtypeTwoIndex = -1; _histologySubtypeThreeIndex = -1; _copyNumberIndex = -1; _cnvTypeIndex = -1; _assemblyIndex = -1; _chromStartStopIndex = -1; _studyIdIndex = -1; var columns = headerLine.OptimizedSplit('\t'); for (int i = 0; i < columns.Length; i++) { switch (columns[i]) { case "CNV_ID": _idIndex = i; break; case "Primary site": _primarySiteIndex = i; break; case "Site subtype 1": _siteSubtypeOneIndex = i; break; case "Site subtype 2": _siteSubtypeTwoIndex = i; break; case "Site subtype 3": _siteSubtypeThreeIndex = i; break; case "Primary histology": _primaryHistologyIndex = i; break; case "Histology subtype 1": _histologySubtypeOneIndex = i; break; case "Histology subtype 2": _histologySubtypeTwoIndex = i; break; case "Histology subtype 3": _histologySubtypeThreeIndex = i; break; case "TOTAL_CN": _copyNumberIndex = i; break; case "MUT_TYPE": _cnvTypeIndex = i; break; case "GRCh": _assemblyIndex = i; break; case "Chromosome:G_Start..G_Stop": _chromStartStopIndex = i; break; case "ID_STUDY": _studyIdIndex = i; break; } } if (_primarySiteIndex == -1 || _siteSubtypeThreeIndex == -1 || _siteSubtypeOneIndex == -1 || _siteSubtypeTwoIndex == -1) throw new InvalidDataException("Column for some site(s) could not be detected"); if (_primaryHistologyIndex == -1 || _histologySubtypeOneIndex == -1 || _histologySubtypeTwoIndex == -1 || _histologySubtypeThreeIndex == -1) throw new InvalidDataException("Column for some histology(ies) could not be detected"); if (_copyNumberIndex == -1 || _assemblyIndex == -1 || _chromStartStopIndex == -1 || _cnvTypeIndex == -1) throw new InvalidDataException("Column for some CNV details could not be detected"); if (_studyIdIndex == -1) throw new InvalidDataException("No study Id column detected"); } private CosmicCnvItem ExtractCosmicCnv(string line) { var splits = line.OptimizedSplit('\t'); if (splits.Length == 1) return null; var assembly = GenomeAssembly.Unknown; var assemblyString = splits[_assemblyIndex]; if (assemblyString == "37") assembly = GenomeAssembly.GRCh37; if (assemblyString == "38") assembly = GenomeAssembly.GRCh38; if (assembly != _assembly) return null; var cnvId = int.Parse(splits[_idIndex]); var studyId = int.Parse(splits[_studyIdIndex]); var cancerTypes = new Dictionary(); TryAddValue(cancerTypes, splits[_primaryHistologyIndex]); TryAddValue(cancerTypes, splits[_histologySubtypeOneIndex]); TryAddValue(cancerTypes, splits[_histologySubtypeTwoIndex]); TryAddValue(cancerTypes, splits[_histologySubtypeThreeIndex]); var tissueTypes = new Dictionary(); TryAddValue(tissueTypes, splits[_primarySiteIndex]); TryAddValue(tissueTypes, splits[_siteSubtypeOneIndex]); TryAddValue(tissueTypes, splits[_siteSubtypeTwoIndex]); TryAddValue(tissueTypes, splits[_siteSubtypeThreeIndex]); if (! int.TryParse(splits[_copyNumberIndex], out var copyNumber)) { copyNumber = -1; } var cnvType = VariantType.copy_number_variation; if (splits[_cnvTypeIndex] == "gain") cnvType = VariantType.copy_number_gain; if (splits[_cnvTypeIndex] == "loss") cnvType = VariantType.copy_number_loss; (string chrom, int start, int end) = GetChromStartStop(splits[_chromStartStopIndex]); return new CosmicCnvItem(cnvId, _refToChrom[chrom], start, end, cnvType, copyNumber, cancerTypes, tissueTypes, studyId); } private static (string, int, int) GetChromStartStop(string chromPos) { // 17:18358950..18464587 Chromosome:G_Start..G_Stop var splits = chromPos.Split(ChromosomeDelimiters); string chrom = splits[0]; if (chrom == "25") chrom = "MT"; return (chrom, int.Parse(splits[1]), int.Parse(splits[3])); } private static void TryAddValue(Dictionary cancerTypes, string type) { if (string.IsNullOrEmpty(type) || type == "NS") return; cancerTypes[type] = 1; // we don't care about overriding the old count since this is for one study. So counts should not add up } public void Dispose() => _reader?.Dispose(); } } ================================================ FILE: SAUtils/ExtractCosmicSvs/CosmicSvReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using VariantAnnotation.Providers; namespace SAUtils.ExtractCosmicSvs { public sealed class CosmicSvReader:IDisposable { private readonly Stream _cnvStream; private readonly Stream _breakendStream; private readonly DataSourceVersion _version; private readonly string _outputDirectory; private readonly GenomeAssembly _genomeAssembly; private readonly Dictionary _refNameToChorm; public CosmicSvReader(Stream cnvStream, Stream breakendStream, DataSourceVersion version, string outputDir, GenomeAssembly assembly, Dictionary refNameToChromosome) { _cnvStream = cnvStream; _breakendStream = breakendStream; _version = version; _outputDirectory = outputDir; _genomeAssembly = assembly; _refNameToChorm = refNameToChromosome; } //public void CreateTsv() //{ // var benchMark = new Benchmark(); // const string dataSource = "COSMIC"; // if (_cnvStream != null) // { // using (var writer = new IntervalTsvWriter(_outputDirectory, _version, // _genomeAssembly.ToString(), SaTsvCommon.CosmicSvSchemaVersion, DataSourceTags.CosmicCnvTag, ReportFor.StructuralVariants)) // using (var cnvReader = new CosmicCnvReader(_cnvStream, _refNameToChorm, _genomeAssembly)) // { // foreach (var cnvEntry in cnvReader.GetEntries()) // { // writer.AddEntry(cnvEntry.Chromosome.EnsemblName, cnvEntry.Start, cnvEntry.End, cnvEntry.GetJsonString()); // } // } // } // var timeSpan = Benchmark.ToHumanReadable(benchMark.GetElapsedTime()); // TsvWriterUtilities.WriteCompleteInfo(dataSource, _version.Version, timeSpan); //} public void Dispose() { _cnvStream?.Dispose(); _breakendStream?.Dispose(); } } } ================================================ FILE: SAUtils/ExtractCosmicSvs/ExtractCosmicSvsMain.cs ================================================ using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Providers; namespace SAUtils.ExtractCosmicSvs { public static class ExtractCosmicSvsMain { private static string _breakendTsv; private static string _cnvTsv; private static string _outputDir; private static string _compressedReference; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "brk|b=", "input TSV file with breakend data", v => _breakendTsv = v }, { "cnv|c=", "input TSV file with CNV data", v => _cnvTsv = v }, { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "out|o=", "output directory for intermediate TSV", v => _outputDir = v } }; var commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .HasRequiredParameter(_cnvTsv, "input TSV file with CNV data", "--cnv") .HasRequiredParameter(_outputDir, "output directory name", "--out") .CheckDirectoryExists(_outputDir, "output directory name", "--out") .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .SkipBanner() .ShowHelpMenu("Reads provided supplementary data files and populates tsv files", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var version = DataSourceVersionReader.GetSourceVersion(_cnvTsv+ ".version"); var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var cnvStream = _cnvTsv==null? null: GZipUtilities.GetAppropriateReadStream(_cnvTsv); var breakendStream = _breakendTsv == null ? null : GZipUtilities.GetAppropriateReadStream(_breakendTsv); using (new CosmicSvReader(cnvStream, breakendStream, version, _outputDir, referenceProvider.Assembly, referenceProvider.RefNameToChromosome)) { //cosmicSvExtractor.CreateTsv(); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/ExtractMiniSa/ExtractMiniSaMain.cs ================================================ using System; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; namespace SAUtils.ExtractMiniSa { internal static class ExtractMiniSaMain { #region members // filenames private static string _compressedReference; private static string _inputSuppAnnotPath; private static string _dataSourceName; private static int _begin; private static int _end; private static string _miniSaDirectory; #endregion /// /// executes the program /// private static ExitCodes ProgramExecution() { var extractor = new MiniSaExtractor(_compressedReference, _inputSuppAnnotPath, _begin, _end, _dataSourceName, _miniSaDirectory); var count = extractor.Extract(); Console.WriteLine("Extracted {0} supplementary annotations", count); return ExitCodes.Success; } public static ExitCodes Run(string command,string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input Nirvana Supplementary Annotations {file}", v => _inputSuppAnnotPath = v }, { "name|n=", "data source {name}", v => _dataSourceName = v }, { "begin|b=", "reference begin {position}", (int v) => _begin= v }, { "end|e=", "reference end {allele}", (int v) => _end= v }, { "out|o=", "output {directory}", v => _miniSaDirectory= v } }; var commandLineExample = $"{command} --in --out --begin --end --name "; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_inputSuppAnnotPath, "Nirvana supplementary annotations", "--in") .CheckInputFilenameExists(_compressedReference, "Compressed reference sequence file name", "--ref") .HasRequiredParameter(_miniSaDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Extracts mini supplementary annotations for the given range from Nirvana Supplementary Annotations files.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } } } ================================================ FILE: SAUtils/ExtractMiniSa/MiniSaExtractor.cs ================================================ using System; using System.Collections.Generic; using System.IO; namespace SAUtils.ExtractMiniSa { public sealed class MiniSaExtractor { #region members private readonly int _begin; private readonly int _end; private readonly string _saPath; private readonly string _miniSaPath; #endregion public MiniSaExtractor(string compressedRefFile, string saPath, int begin, int end, string datasourceName = null, string outputDir = null) { _begin = begin; _end = end; _saPath = saPath; //new ReferenceSequenceProvider(FileUtilities.GetReadStream(compressedRefFile)).RefNameToChromosome; //string referenceName = GetReferenceName(saPath, refChromDict); //_miniSaPath = GetMiniSaPath(referenceName, begin, end, datasourceName, outputDir); Console.WriteLine($"MiniSA output to: {_miniSaPath}"); } private static string GetMiniSaPath(string referenceName, int begin, int end, string dataSourceName, string outputDir) { string miniSaPath = dataSourceName == null ? $"{referenceName}_{begin}_{end}.nsa" : $"{referenceName}_{begin}_{end}_{dataSourceName}.nsa"; if (outputDir != null) miniSaPath = Path.Combine(outputDir, miniSaPath); return miniSaPath; } //private static string GetReferenceName(string saPath, IDictionary refChromDict) //{ // ISupplementaryAnnotationHeader header; // using (var stream = FileUtilities.GetReadStream(saPath)) // using (var reader = new ExtendedBinaryReader(stream)) // { // header = SaReader.GetHeader(reader); // } // return refChromDict[header.ReferenceSequenceName].UcscName; //} //private static SaWriter GetSaWriter(string saPath, ISupplementaryAnnotationHeader header, // List smallVariantIntervals, List svIntervals, // List allVariantIntervals,List<(int,string)> globalMajorAlleleInRefMinors) //{ // var stream = FileUtilities.GetCreateStream(saPath); // var idxStream = FileUtilities.GetCreateStream(saPath + ".idx"); // return new SaWriter(stream, idxStream, header, smallVariantIntervals, svIntervals, allVariantIntervals,globalMajorAlleleInRefMinors); //} //private static SaReader GetSaReader(string saPath) //{ // var stream = FileUtilities.GetReadStream(saPath); // var idxStream = FileUtilities.GetReadStream(saPath + ".idx"); // return new SaReader(stream, idxStream); //} public int Extract() { var count = 0; //using (var reader = GetSaReader(_saPath)) //{ // var smallVariantIntervals = GetIntervals("small variants", reader.SmallVariantIntervals); // var svIntervals = GetIntervals("SVs", reader.SvIntervals); // var allVariantIntervals = GetIntervals("all variants", reader.AllVariantIntervals); // var globalMajorAlleles = GetGlobaleMajorAlleleAndRefMinors(reader.GlobalMajorAlleleInRefMinors); // using (var writer = GetSaWriter(_miniSaPath, reader.Header, smallVariantIntervals, svIntervals, // allVariantIntervals,globalMajorAlleles)) // { // for (int position = _begin; position <= _end; position++) // { // var saPosition = reader.GetAnnotation(position); // if (saPosition == null) continue; // writer.Write(saPosition, position); // count++; // } // } //} return count; } private List<(int,string)> GetGlobaleMajorAlleleAndRefMinors(IEnumerable<(int Position, string)> readerGlobalMajorAlleleInRefMinors) { var overlappedRefMinors = new List<(int,string)>(); foreach (var refMinor in readerGlobalMajorAlleleInRefMinors) { if(refMinor.Position>=_begin && refMinor.Position<=_end) overlappedRefMinors.Add(refMinor); } return overlappedRefMinors; } //private List GetIntervals(string description, // IEnumerable> intervals) //{ // var miniIntervals = new List(); // var targetInterval = new Interval(_begin, _end); // var allIntervals = intervals; // if (allIntervals != null) // { // foreach (var interval in allIntervals) // { // if (targetInterval.Overlaps(interval.Begin, interval.End)) miniIntervals.Add(interval.Value); // } // } // Console.WriteLine($"Found {miniIntervals.Count} supplementary intervals for {description}."); // return miniIntervals; //} } } ================================================ FILE: SAUtils/ExtractMiniXml/ExtractMiniXmlMain.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; namespace SAUtils.ExtractMiniXml { public static class ExtractMiniXmlMain { private static string _inputXmlFile; private static string _accessions; private static string _outputDir; private static ExitCodes ProgramExecution() { var accessions = GetAccessions(_accessions); if (accessions.Any(x=>x.StartsWith("RCV"))) { var rcvExtractor = new RcvXmlExtractor(_inputXmlFile, accessions, _outputDir); rcvExtractor.Extract(); } if (accessions.Any(x=>x.StartsWith("VCV"))) { var vcvExtractor = new VcvXmlExtractor(_inputXmlFile, accessions, _outputDir); vcvExtractor.Extract(); } return ExitCodes.Success; } private static List GetAccessions(string accString) { var accessions = new List(); if (Directory.Exists(accString)) { foreach (var fileName in Directory.EnumerateFiles(accString)) { if(fileName.Contains("RCV") || fileName.Contains("VCV")) accessions.Add(Path.GetFileNameWithoutExtension(fileName)); } return accessions; } return accString.Split(',').ToList(); } public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "i|in=", "Input XML {file}", v => _inputXmlFile = v }, { "a|acc=", "accessions", v => _accessions = v }, { "o|out=", "Output {dir}", v => _outputDir = v } }; var commandLineExample = $"{command} --in --out --rcv "; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_inputXmlFile, "input XML file", "--in") .HasRequiredParameter(_outputDir, "output directory", "--out") .HasRequiredParameter(_accessions, "comma separated list of accessions or folder containing mini XML files to update", "--acc") .SkipBanner() .ShowHelpMenu("Extracts mini supplementary annotations for the given range from Nirvana Supplementary Annotations files.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } } } ================================================ FILE: SAUtils/ExtractMiniXml/RcvXmlExtractor.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Xml; using Compression.Utilities; using IO; namespace SAUtils.ExtractMiniXml { public sealed class RcvXmlExtractor { private readonly string _inputXmlFile; private readonly string _outputDir; private readonly List _rcvIds; private const string XmlHeader = ""+"\n"+ ""+"\n\n"; private const string XmlFooter = "\n\n"; public RcvXmlExtractor(string inputXmlFile, List rcvIds, string outputDir) { _inputXmlFile = inputXmlFile; _rcvIds = rcvIds; _outputDir = outputDir; } public void Extract() { using (var reader = GZipUtilities.GetAppropriateStreamReader(_inputXmlFile)) using (var xmlReader = XmlReader.Create(reader, new XmlReaderSettings { DtdProcessing = DtdProcessing.Prohibit, IgnoreWhitespace = true })) { var existVarSet = xmlReader.ReadToDescendant("ClinVarSet"); while (_rcvIds.Count > 0 && existVarSet) { var rcvContents = xmlReader.ReadOuterXml(); var rcv = DetectRcv(_rcvIds, rcvContents); if (rcv!=null) { var targetedContent =rcvContents; var outXmlFile = Path.Combine(_outputDir, rcv + ".xml"); WriteToFile(outXmlFile, targetedContent); } if(!xmlReader.IsStartElement("ClinVarSet")) existVarSet = xmlReader.ReadToNextSibling("ClinVarSet"); } } if (_rcvIds.Count > 0) { Console.WriteLine($"Failed to Find {string.Join(',',_rcvIds)}"); } } private static string DetectRcv(List rcvs, string rcvContents) { foreach (var rcv in rcvs) { if (rcvContents.Contains(rcv)) { rcvs.Remove(rcv); return rcv; } } return null; } private static void WriteToFile(string fileName, string targetedContent) { using (var writer = new StreamWriter(FileUtilities.GetCreateStream(fileName))) { writer.Write(XmlHeader); writer.Write(targetedContent); writer.Write(XmlFooter); Console.WriteLine($"Creating/ updating {fileName}"); } } } } ================================================ FILE: SAUtils/ExtractMiniXml/VcvXmlExtractor.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Xml; using Compression.Utilities; using IO; namespace SAUtils.ExtractMiniXml { public sealed class VcvXmlExtractor { private readonly string _inputXmlFile; private readonly string _outputDir; private readonly List _vcvIds; private const string VcvRecordTag = "VariationArchive"; private const string XmlHeader = "\n" + "\n"; private const string XmlFooter = "\n"; public VcvXmlExtractor(string inputXmlFile, List vcvIds, string outputDir) { _inputXmlFile = inputXmlFile; _vcvIds = vcvIds; _outputDir = outputDir; } public void Extract() { using (var reader = GZipUtilities.GetAppropriateStreamReader(_inputXmlFile)) using (var xmlReader = XmlReader.Create(reader, new XmlReaderSettings { DtdProcessing = DtdProcessing.Prohibit, IgnoreWhitespace = true })) { var existVarSet = xmlReader.ReadToDescendant(VcvRecordTag); while (_vcvIds.Count > 0 && existVarSet) { var contents = xmlReader.ReadOuterXml(); var rcv = DetectVcv(_vcvIds, contents); if (rcv !=null) { var targetedContent =contents; var outXmlFile = Path.Combine(_outputDir, rcv + ".xml"); WriteToFile(outXmlFile, targetedContent); } if(!xmlReader.IsStartElement(VcvRecordTag)) existVarSet = xmlReader.ReadToNextSibling(VcvRecordTag); } } if (_vcvIds.Count > 0) { Console.WriteLine($"Failed to Find {string.Join(',',_vcvIds)}"); } } private static void WriteToFile(string fileName, string targetedContent) { using (var writer = new StreamWriter(FileUtilities.GetCreateStream(fileName))) { writer.Write(XmlHeader); writer.Write(targetedContent); writer.Write(XmlFooter); Console.WriteLine($"Creating/ updating {fileName}"); } } private static string DetectVcv(List vcvs, string rcvContents) { foreach (var vcv in vcvs) { if (rcvContents.Contains(vcv)) { vcvs.Remove(vcv); return vcv; } } return null; } } } ================================================ FILE: SAUtils/FusionCatcher/CollectionType.cs ================================================ namespace SAUtils.FusionCatcher { public enum CollectionType : byte { Germline, Somatic, Relationships } } ================================================ FILE: SAUtils/FusionCatcher/CreateFusionCatcher.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using Genome; using IO; using VariantAnnotation.Caches; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.GeneFusions.SA; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.FusionCatcher { public static class CreateFusionCatcher { private static string _transcriptCache37Path; private static string _transcriptCache38Path; private static string _dataDirectory; private static string _reference38Path; private static string _outputDirectory; private static ExitCodes ProgramExecution() { var geneKeyToFusion = new Dictionary(); var knownGenes = new HashSet(); var oncoGenes = new HashSet(); Dictionary refIndexToChromosome = GetReferences(_reference38Path); AddGenes(_transcriptCache37Path, refIndexToChromosome, knownGenes, "GRCh37"); AddGenes(_transcriptCache38Path, refIndexToChromosome, knownGenes, "GRCh38"); DataSourceVersion version = CreateDataSourceVersion(Path.Combine(_dataDirectory, "version.txt")); // relationships FusionCatcherDataSource.Parse(GetStream("pairs_pseudogenes.txt"), GeneFusionSource.Pseudogene, CollectionType.Relationships, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("paralogs.txt"), GeneFusionSource.Paralog, CollectionType.Relationships, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("readthroughs.txt"), GeneFusionSource.Readthrough, CollectionType.Relationships, geneKeyToFusion, knownGenes); // oncogenes FusionCatcherOncogenes.Parse(GetStream("cancer_genes.txt"), "Bushman", oncoGenes, knownGenes); FusionCatcherOncogenes.Parse(GetStream("oncogenes_more.txt"), "ONGENE", oncoGenes, knownGenes); FusionCatcherOncogenes.Parse(GetStream("tumor_genes.txt"), "UniProt", oncoGenes, knownGenes); Console.WriteLine($"- found a total of {oncoGenes.Count:N0} oncogenes."); // germline fusions FusionCatcherDataSource.Parse(GetStream("1000genomes.txt"), GeneFusionSource.OneK_Genomes_Project, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("banned.txt"), GeneFusionSource.Healthy_strong_support, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("bodymap2.txt"), GeneFusionSource.Illumina_BodyMap2, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("cacg.txt"), GeneFusionSource.CACG, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("conjoing.txt"), GeneFusionSource.ConjoinG, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("cortex.txt"), GeneFusionSource.Healthy_prefrontal_cortex, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("dgd.txt"), GeneFusionSource.Duplicated_Genes_Database, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("gtex.txt"), GeneFusionSource.GTEx_healthy_tissues, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("healthy.txt"), GeneFusionSource.Healthy, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("hpa.txt"), GeneFusionSource.Human_Protein_Atlas, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("non-cancer_tissues.txt"), GeneFusionSource.Babiceanu_NonCancerTissues, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("non-tumor_cells.txt"), GeneFusionSource.NonTumorCellLines, CollectionType.Germline, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("tcga-normal.txt"), GeneFusionSource.TumorFusions_normal, CollectionType.Germline, geneKeyToFusion, knownGenes); // somatic fusions FusionCatcherDataSource.Parse(GetStream("18cancers.txt"), GeneFusionSource.Alaei_Mahabadi_18_Cancers, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("ccle.txt"), GeneFusionSource.CCLE, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("ccle2.txt"), GeneFusionSource.CCLE_Klign, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("ccle3.txt"), GeneFusionSource.CCLE_Vellichirammal, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("cgp.txt"), GeneFusionSource.Cancer_Genome_Project, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("chimerdb4kb.txt"), GeneFusionSource.ChimerKB_4, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("chimerdb4pub.txt"), GeneFusionSource.ChimerPub_4, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("chimerdb4seq.txt"), GeneFusionSource.ChimerSeq_4, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("cosmic.txt"), GeneFusionSource.COSMIC, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("gliomas.txt"), GeneFusionSource.Bao_gliomas, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("known.txt"), GeneFusionSource.Known, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("mitelman.txt"), GeneFusionSource.Mitelman_DB, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("oesophagus.txt"), GeneFusionSource.TCGA_oesophageal_carcinomas, CollectionType.Somatic, geneKeyToFusion, knownGenes); // FusionCatcherDataSource.Parse(GetStream("oncokb.txt"), GeneFusionSource.OncoKB, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("pancreases.txt"), GeneFusionSource.Bailey_pancreatic_cancers, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("pcawg.txt"), GeneFusionSource.PCAWG, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("prostate_cancer.txt"), GeneFusionSource.Robinson_prostate_cancers, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("tcga.txt"), GeneFusionSource.TCGA, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("tcga-cancer.txt"), GeneFusionSource.TumorFusions_tumor, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("tcga2.txt"), GeneFusionSource.TCGA_Gao, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("tcga3.txt"), GeneFusionSource.TCGA_Vellichirammal, CollectionType.Somatic, geneKeyToFusion, knownGenes); FusionCatcherDataSource.Parse(GetStream("ticdb.txt"), GeneFusionSource.TICdb, CollectionType.Somatic, geneKeyToFusion, knownGenes); (GeneFusionSourceCollection[] index, GeneFusionIndexEntry[] indexEntries) = IndexBuilder.Convert(geneKeyToFusion); Console.WriteLine($"- created {index.Length:N0} index entries."); uint[] oncogeneKeys = oncoGenes.OrderBy(x => x).ToArray(); WriteGeneFusions(_outputDirectory, oncogeneKeys, index, indexEntries, version); Console.WriteLine(); Console.WriteLine($"Total: {geneKeyToFusion.Count:N0} gene pairs in database."); return ExitCodes.Success; } private static Dictionary GetReferences(string referencePath) { Console.Write("- loading reference sequence... "); var sequenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(referencePath)); Console.WriteLine("finished."); return sequenceProvider.RefIndexToChromosome; } private static void AddGenes(string cachePath, Dictionary refIndexToChromosome, ISet knownGenes, string description) { Console.Write($"- loading known genes ({description})... "); int startCount = knownGenes.Count; using (var reader = new TranscriptCacheReader(FileUtilities.GetReadStream(cachePath))) { TranscriptCacheData cacheData = reader.Read(refIndexToChromosome); foreach (IGene gene in cacheData.Genes) { string ensemblId = gene.EnsemblId.WithoutVersion; if (string.IsNullOrEmpty(ensemblId)) continue; knownGenes.Add(ensemblId); } } int numAdded = knownGenes.Count - startCount; Console.WriteLine($"added {numAdded:N0} Ensembl gene IDs."); } private static void WriteGeneFusions(string outputDirectory, uint[] oncogeneKeys, GeneFusionSourceCollection[] index, // ReSharper disable once SuggestBaseTypeForParameter GeneFusionIndexEntry[] indexEntries, DataSourceVersion version) { Console.Write("- writing gene fusions SA file... "); string outputPath = Path.Combine(outputDirectory, $"FusionCatcher_{version.Version}{SaCommon.GeneFusionSourceSuffix}"); using var writer = new GeneFusionSourceWriter(FileUtilities.GetCreateStream(outputPath), "fusionCatcher", version); writer.Write(oncogeneKeys, index, indexEntries); Console.WriteLine("finished."); } private static DataSourceVersion CreateDataSourceVersion(string filePath) { var fi = new FileInfo(filePath); long releaseDateTicks = fi.CreationTime.Ticks; // const string description = using var reader = new StreamReader(FileUtilities.GetReadStream(filePath)); string line = reader.ReadLine(); if (line == null) throw new InvalidDataException("Could not extract the first line from version.txt"); int spacePos = line.LastIndexOf(' '); string version = line.Substring(spacePos + 1); return new DataSourceVersion("FusionCatcher", version, releaseDateTicks, "known germline and somatic gene fusions"); } private static Stream GetStream(string filename) => GZipUtilities.GetAppropriateReadStream(Path.Combine(_dataDirectory, filename)); public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "cache37=", "transcript cache {path} for GRCh37", v => _transcriptCache37Path = v }, { "cache38=", "transcript cache {path} for GRCh38", v => _transcriptCache38Path = v }, { "in|i=", "FusionCatcher data {directory}", v => _dataDirectory = v }, { "out|o=", "output {directory}", v => _outputDirectory = v }, { "ref|r=", "input reference sequence {path} for GRCh38", v => _reference38Path = v } }; var commandLineExample = $"{command} [options]"; ExitCodes exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_reference38Path, "reference sequence (GRCh38)", "--ref") .CheckInputFilenameExists(_transcriptCache37Path, "transcript cache (GRCh37)", "--cache37") .CheckInputFilenameExists(_transcriptCache38Path, "transcript cache (GRCh38)", "--cache38") .CheckDirectoryExists(_dataDirectory, "FusionCatcher data directory", "--in") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with FusionCatcher annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } } } ================================================ FILE: SAUtils/FusionCatcher/FusionCatcherDataSource.cs ================================================ using System; using System.Collections.Generic; using System.IO; using VariantAnnotation.GeneFusions.SA; using VariantAnnotation.GeneFusions.Utilities; namespace SAUtils.FusionCatcher { public static class FusionCatcherDataSource { public static void Parse(Stream stream, GeneFusionSource source, CollectionType collectionType, Dictionary geneKeyToFusion, HashSet knownEnsemblGenes) { Console.Write($"- parsing {source}... "); using var reader = new StreamReader(stream); var numGeneFusionsAdded = 0; while (true) { string line = reader.ReadLine(); if (line == null) break; string[] cols = line.Split('\t'); if (cols.Length != 2) throw new InvalidDataException($"Expected 2 columns in the FusionCatcher file, but found {cols.Length}"); string gene = cols[0]; string gene2 = cols[1]; bool hasGene = knownEnsemblGenes.Contains(gene); bool hasGene2 = knownEnsemblGenes.Contains(gene2); if (!hasGene || !hasGene2) continue; ulong fusionKey = GeneFusionKey.Create(GeneFusionKey.CreateGeneKey(cols[0]), GeneFusionKey.CreateGeneKey(cols[1])); if (!geneKeyToFusion.TryGetValue(fusionKey, out GeneFusionSourceBuilder geneFusion)) { geneFusion = new GeneFusionSourceBuilder(); geneKeyToFusion[fusionKey] = geneFusion; } switch (collectionType) { case CollectionType.Germline: geneFusion.GermlineSources.Add(source); break; case CollectionType.Somatic: geneFusion.SomaticSources.Add(source); break; case CollectionType.Relationships: switch (source) { case GeneFusionSource.Pseudogene: geneFusion.IsPseudogenePair = true; break; case GeneFusionSource.Paralog: geneFusion.IsParalogPair = true; break; case GeneFusionSource.Readthrough: geneFusion.IsReadthrough = true; break; default: throw new NotSupportedException($"Found an unsupported relationship: {source}"); } break; default: throw new NotSupportedException($"Found an unsupported gene fusion collection type: {collectionType}"); } numGeneFusionsAdded++; } Console.WriteLine($"added {numGeneFusionsAdded:N0} gene fusions."); } } } ================================================ FILE: SAUtils/FusionCatcher/FusionCatcherOncogenes.cs ================================================ using System; using System.Collections.Generic; using System.IO; using VariantAnnotation.GeneFusions.Utilities; namespace SAUtils.FusionCatcher { public static class FusionCatcherOncogenes { public static void Parse(Stream stream, string description, HashSet oncoGenes, HashSet knownEnsemblGenes) { Console.Write($"- parsing {description} oncogenes... "); using var reader = new StreamReader(stream); var numOncogenesAdded = 0; while (true) { string line = reader.ReadLine(); if (line == null) break; string[] cols = line.Split('\t'); if (cols.Length != 1) throw new InvalidDataException($"Expected 1 column in the FusionCatcher file, but found {cols.Length}"); string gene = cols[0]; bool hasGene = knownEnsemblGenes.Contains(gene); if (!hasGene) continue; uint geneKey = GeneFusionKey.CreateGeneKey(gene); oncoGenes.Add(geneKey); numOncogenesAdded++; } Console.WriteLine($"added {numOncogenesAdded:N0} oncogenes."); } } } ================================================ FILE: SAUtils/FusionCatcher/GeneFusionSourceBuilder.cs ================================================ using System.Collections.Generic; using VariantAnnotation.GeneFusions.SA; namespace SAUtils.FusionCatcher { public sealed class GeneFusionSourceBuilder { public bool IsPseudogenePair; public bool IsParalogPair; public bool IsReadthrough; public readonly List GermlineSources = new(); public readonly List SomaticSources = new(); public GeneFusionSourceCollection Create() { GeneFusionSource[] germlineSources = GermlineSources.Count > 0 ? GermlineSources.ToArray() : null; GeneFusionSource[] somaticSources = SomaticSources.Count > 0 ? SomaticSources.ToArray() : null; return new GeneFusionSourceCollection(IsPseudogenePair, IsParalogPair, IsReadthrough, germlineSources, somaticSources); } } } ================================================ FILE: SAUtils/FusionCatcher/GeneFusionSourceWriter.cs ================================================ using System; using System.IO; using System.Text; using Compression.Utilities; using IO; using IO.v2; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.GeneFusions.SA; using VariantAnnotation.Interface.Providers; namespace SAUtils.FusionCatcher { public sealed class GeneFusionSourceWriter : IDisposable { private readonly ExtendedBinaryWriter _writer; public GeneFusionSourceWriter(Stream stream, string jsonKey, IDataSourceVersion version, bool leaveOpen = false) { _writer = new ExtendedBinaryWriter(stream, Encoding.UTF8, leaveOpen); WriteHeader(); _writer.Write(jsonKey); version.Write(_writer); } private void WriteHeader() { var header = new Header(FileType.FusionCatcher, GeneFusionSourceReader.SupportedFileFormatVersion); header.Write(_writer); } public void Write(uint[] oncogeneKeys, GeneFusionSourceCollection[] index, GeneFusionIndexEntry[] indexEntries) { using var ms = new MemoryStream(); using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { writer.WriteOpt(oncogeneKeys.Length); foreach (uint geneKey in oncogeneKeys) writer.WriteOpt(geneKey); writer.WriteOpt(index.Length); foreach (GeneFusionSourceCollection sourceCollection in index) sourceCollection.Write(writer); writer.WriteOpt(indexEntries.Length); foreach (GeneFusionIndexEntry indexEntry in indexEntries) indexEntry.Write(writer); } byte[] bytes = ms.ToArray(); _writer.WriteCompressedByteArray(bytes, bytes.Length); } public void Dispose() => _writer.Dispose(); } } ================================================ FILE: SAUtils/FusionCatcher/IndexBuilder.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.GeneFusions.SA; namespace SAUtils.FusionCatcher { public static class IndexBuilder { public static (GeneFusionSourceCollection[] Index, GeneFusionIndexEntry[] IndexEntries) Convert(Dictionary geneKeyToSourceBuilder) { Dictionary geneKeyToSourceCollection = GetSourceCollection(geneKeyToSourceBuilder); (GeneFusionSourceCollection[] index, Dictionary sourceCollectionToIndex) = BuildIndex(geneKeyToSourceCollection.Values); GeneFusionIndexEntry[] indexEntries = BuildIndexEntries(geneKeyToSourceCollection, sourceCollectionToIndex); return (index, indexEntries); } private static GeneFusionIndexEntry[] BuildIndexEntries(Dictionary geneKeyToSourceCollection, IReadOnlyDictionary sourceCollectionToIndex) { var indexEntries = new GeneFusionIndexEntry[geneKeyToSourceCollection.Count]; var currentIndex = 0; foreach ((ulong geneKey, GeneFusionSourceCollection sourceCollection) in geneKeyToSourceCollection.OrderBy(x => x.Key)) { if (!sourceCollectionToIndex.TryGetValue(sourceCollection, out ushort index)) throw new InvalidDataException($"Unable to find the gene fusion source collection for gene key: {geneKey}"); indexEntries[currentIndex++] = new GeneFusionIndexEntry(geneKey, index); } return indexEntries; } private static (GeneFusionSourceCollection[] Index, Dictionary SourceCollectionToIndex) BuildIndex( Dictionary.ValueCollection sourceCollections) { var collectionToHits = new Dictionary(); foreach (GeneFusionSourceCollection sourceCollection in sourceCollections) { if (collectionToHits.TryGetValue(sourceCollection, out BuilderMetadata metadata)) { metadata.NumHits++; } else { collectionToHits[sourceCollection] = new BuilderMetadata {NumHits = 1, SourceCollection = sourceCollection}; } } // we want to order these in descending popularity BuilderMetadata[] sortedIndex = collectionToHits.Values.OrderByDescending(x => x.NumHits).ToArray(); var index = new GeneFusionSourceCollection[sortedIndex.Length]; var sourceCollectionToIndex = new Dictionary(); for (var i = 0; i < sortedIndex.Length; i++) { GeneFusionSourceCollection sourceCollection = sortedIndex[i].SourceCollection; index[i] = sourceCollection; sourceCollectionToIndex[sourceCollection] = (ushort) i; } return (index, sourceCollectionToIndex); } private static Dictionary GetSourceCollection( Dictionary geneKeyToSourceBuilder) { var geneKeyToSourceCollection = new Dictionary(geneKeyToSourceBuilder.Count); foreach ((ulong geneKey, GeneFusionSourceBuilder builder) in geneKeyToSourceBuilder) { GeneFusionSourceCollection sourceCollection = builder.Create(); geneKeyToSourceCollection[geneKey] = sourceCollection; } return geneKeyToSourceCollection; } private sealed class BuilderMetadata { public int NumHits; public GeneFusionSourceCollection SourceCollection; } } } ================================================ FILE: SAUtils/GERP/GerpMain.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using SAUtils.InputFileParsers; using VariantAnnotation.GenericScore; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.GERP { public class GerpMain { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "GERP wiggle or TSV file", "--in") .CheckInputFilenameExists(_inputFile, "GERP wiggle or TSV file", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("create Ancestral allele database from 1000Genomes data", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); DataSourceVersion version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); var outFileName = $"{version.Name}_{version.Version}"; var nucleotides = new[] {"N"}; var wigColumnIndex = new ColumnIndex(0, 2, null, null, 3, null); var tsvColumnIndex = new ColumnIndex(0, 1, null, null, 2, null); bool isWig = _inputFile.EndsWith("wig.gz"); var parserSettings = new ParserSettings( isWig ? wigColumnIndex : tsvColumnIndex, nucleotides, GenericScoreParser.NonConflictingScore ); var writerSettings = new WriterSettings( 1_000_000, nucleotides, true, EncoderType.Generic, new GenericScoreEncoder(), new ScoreJsonEncoder(SaCommon.GerpTag + SaCommon.Score, null), new SaItemValidator(null, null) ); using (var streamReader = new StreamReader(GZipUtilities.GetAppropriateReadStream(_inputFile))) using (var parser = new GenericScoreParser(parserSettings, streamReader, referenceProvider.RefNameToChromosome)) using (var saStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.GsaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.GsaFileSuffix + SaCommon.IndexSuffix))) using (var saWriter = new ScoreFileWriter(writerSettings, saStream, indexStream, version, referenceProvider, SaCommon.SchemaVersion, skipIncorrectRefEntries: true, leaveOpen: false)) { saWriter.Write(parser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/GeneIdentifiers/GeneSymbolUpdater.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using System.Text; using Newtonsoft.Json.Linq; using OptimizedCore; using VariantAnnotation.IO; namespace SAUtils.GeneIdentifiers { public sealed class GeneSymbolUpdater { private readonly Dictionary _entrezGeneIdToSymbol; private readonly Dictionary _ensemblGeneIdToSymbol; private readonly HashSet _geneSymbols; private readonly Dictionary _updatedGeneSymbols; private int _numGenesWhereBothIdsAreNull; private int _numGeneSymbolsUpToDate; private int _numGeneSymbolsUpdated; private int _numGeneSymbolsNotInCache; private int _numResolvedGeneSymbolConflicts; private int _numUnresolvedGeneSymbolConflicts; public GeneSymbolUpdater(Dictionary entrezGeneIdToSymbol, Dictionary ensemblGeneIdToSymbol) { _entrezGeneIdToSymbol = entrezGeneIdToSymbol; _ensemblGeneIdToSymbol = ensemblGeneIdToSymbol; _geneSymbols = new HashSet(); _updatedGeneSymbols = new Dictionary(); } public string UpdateGeneSymbol(string oldGeneSymbol, string ensemblGeneId, string entrezGeneId) { if (ensemblGeneId == null && entrezGeneId == null) { _numGenesWhereBothIdsAreNull++; return null; } var ensemblSymbol = GetSymbol(ensemblGeneId, _ensemblGeneIdToSymbol); var entrezGeneSymbol = GetSymbol(entrezGeneId, _entrezGeneIdToSymbol); _geneSymbols.Clear(); if (ensemblSymbol != null) _geneSymbols.Add(ensemblSymbol); if (entrezGeneSymbol != null) _geneSymbols.Add(entrezGeneSymbol); if (_geneSymbols.Count == 0) { _numGeneSymbolsNotInCache++; return oldGeneSymbol; } var newGeneSymbol = _geneSymbols.First(); if (_geneSymbols.Count > 1) { newGeneSymbol = ResolveGeneSymbolConflict(oldGeneSymbol, ensemblSymbol, entrezGeneSymbol); if (newGeneSymbol == null) { Console.WriteLine($"Unable to resolve gene symbol conflict for {oldGeneSymbol}: Ensembl: [{ensemblGeneId}]: {ensemblSymbol}, Entrez Gene: [{entrezGeneId}]: {entrezGeneSymbol}"); _numUnresolvedGeneSymbolConflicts++; return null; } _numResolvedGeneSymbolConflicts++; } if (newGeneSymbol == oldGeneSymbol) _numGeneSymbolsUpToDate++; else { _updatedGeneSymbols[oldGeneSymbol] = newGeneSymbol; _numGeneSymbolsUpdated++; } return newGeneSymbol; } private static string ResolveGeneSymbolConflict(string oldGeneSymbol, string ensemblSymbol, string entrezGeneSymbol) { var symbolCounts = new Dictionary(); AddSymbol(symbolCounts, oldGeneSymbol); AddSymbol(symbolCounts, ensemblSymbol); AddSymbol(symbolCounts, entrezGeneSymbol); var mostFrequentSymbol = symbolCounts.OrderByDescending(x => x.Value).First(); if (mostFrequentSymbol.Value == 1) { //Console.WriteLine($"Found unique gene symbols when trying to resolve the gene symbol conflict. Entrez Gene {entrezGeneSymbol}"); return null; } return mostFrequentSymbol.Key; } private static void AddSymbol(Dictionary symbolCounts, string geneSymbol) { if (symbolCounts.TryGetValue(geneSymbol, out int counts)) symbolCounts[geneSymbol] = counts + 1; else symbolCounts[geneSymbol] = 1; } private static string GetSymbol(string geneId, IReadOnlyDictionary geneIdToSymbol) { if (geneId == null) return null; return geneIdToSymbol.TryGetValue(geneId, out var symbol) ? symbol : null; } public void DisplayStatistics() { Console.ForegroundColor = ConsoleColor.Yellow; Console.WriteLine("Gene Symbol Update Statistics"); Console.ResetColor(); Console.WriteLine("============================================"); StringBuilder sb = StringBuilderPool.Get(); var jo = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jo.AddIntValue("NumGeneSymbolsUpToDate", _numGeneSymbolsUpToDate); jo.AddIntValue("NumGeneSymbolsUpdated", _numGeneSymbolsUpdated); jo.AddIntValue("NumGenesWhereBothIdsAreNull", _numGenesWhereBothIdsAreNull); jo.AddIntValue("NumGeneSymbolsNotInCache", _numGeneSymbolsNotInCache); jo.AddIntValue("NumResolvedGeneSymbolConflicts", _numResolvedGeneSymbolConflicts); jo.AddIntValue("NumUnresolvedGeneSymbolConflicts", _numUnresolvedGeneSymbolConflicts); sb.Append(JsonObject.CloseBrace); Console.WriteLine(JObject.Parse(sb.ToString())); //pretty printing json } } } ================================================ FILE: SAUtils/GeneIdentifiers/GeneUtilities.cs ================================================ using System; using System.Collections.Generic; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.IO; using Compression.Utilities; using Genome; using ReferenceSequence.Utilities; namespace SAUtils.GeneIdentifiers { public static class GeneUtilities { public static string GetGeneSymbolFromId(string geneId, Dictionary entrezGeneIdToSymbol, Dictionary ensemblIdToSymbol) { string geneSymbol; if (geneId.StartsWith("ENSG")) return ensemblIdToSymbol.TryGetValue(geneId, out geneSymbol) ? geneSymbol : null; return entrezGeneIdToSymbol.TryGetValue(geneId, out geneSymbol) ? geneSymbol : null; } public static (Dictionary EntrezGeneIdToSymbol, Dictionary EnsemblIdToSymbol) ParseUniversalGeneArchive(string inputReferencePath, string universalGeneArchivePath) { Dictionary refNameToChromosome; if (inputReferencePath == null) refNameToChromosome = null; else (_, refNameToChromosome, _) = SequenceHelper.GetDictionaries(inputReferencePath); UgaGene[] genes; using (var reader = new UgaGeneReader(GZipUtilities.GetAppropriateReadStream(universalGeneArchivePath), refNameToChromosome)) { genes = reader.GetGenes(); } var entrezGeneIdToSymbol = genes.GetGeneIdToSymbol(x => x.EntrezGeneId); var ensemblIdToSymbol = genes.GetGeneIdToSymbol(x => x.EnsemblId); return (entrezGeneIdToSymbol, ensemblIdToSymbol); } private static Dictionary GetGeneIdToSymbol(this IEnumerable genes, Func geneIdFunc) { var dict = new Dictionary(); foreach (var gene in genes) { var key = geneIdFunc(gene); var symbol = gene.Symbol; if (string.IsNullOrEmpty(key) || string.IsNullOrEmpty(symbol)) continue; dict[key] = symbol; } return dict; } } } ================================================ FILE: SAUtils/GenericScore/GenericScoreParser/GenericScoreItem.cs ================================================ using System; using Genome; using VariantAnnotation.Interface.SA; namespace SAUtils.GenericScore.GenericScoreParser { public sealed class GenericScoreItem : ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public readonly double Score; public GenericScoreItem(Chromosome chromosome, int position, string refAllele, string altAllele, double score) { Chromosome = chromosome; Position = position; RefAllele = refAllele; AltAllele = altAllele; Score = score; } [Obsolete] public string GetJsonString() => $"\"score\":{Score}"; public string InputLine { get; } } } ================================================ FILE: SAUtils/GenericScore/GenericScoreParser/GenericScoreParser.cs ================================================ using System; using System.Collections.Generic; using System.Globalization; using System.IO; using ErrorHandling.Exceptions; using Genome; using OptimizedCore; namespace SAUtils.GenericScore.GenericScoreParser { public sealed class GenericScoreParser : IDisposable { private readonly ParserSettings _parserSettings; private readonly StreamReader _reader; private readonly Dictionary _refNameToChromosome; private readonly Dictionary _representativeScores; private readonly Action> _updateRepresentativeScores; public GenericScoreParser( ParserSettings parserSettings, StreamReader reader, Dictionary refNameToChromosome ) { _reader = reader; _refNameToChromosome = refNameToChromosome; _parserSettings = parserSettings; _representativeScores = new Dictionary(); foreach (string allele in _parserSettings.PossibleAlleles) { _representativeScores[allele] = double.NaN; } _updateRepresentativeScores = _parserSettings.ConflictResolutionFunction; } public IEnumerable GetItems() { string line; int currentPosition = -1; Chromosome currentChromosome = null; string refAllele = null; ColumnIndex columnIndex = _parserSettings.ColumnIndex; while ((line = _reader.ReadLine()) != null) { if (line.StartsWith("#")) continue; string[] fields = line.OptimizedSplit('\t'); if (!_refNameToChromosome.TryGetValue(fields[columnIndex.Chromosome], out var chromosome)) continue; int position = int.Parse(fields[columnIndex.Position]); if (chromosome != currentChromosome || position != currentPosition) { foreach (GenericScoreItem scoreItem in GetItemsAtOnePosition(currentChromosome, currentPosition, refAllele)) yield return scoreItem; } currentChromosome = chromosome; currentPosition = position; // add null checks for alleles refAllele = columnIndex.RefAllele == null ? null : fields[columnIndex.RefAllele.Value]; string altAllele = columnIndex.AltAllele == null ? null : fields[columnIndex.AltAllele.Value]; // set saItem.AltAllele to 'N' if positional if (_parserSettings.IsPositional) altAllele = "N"; if (double.TryParse(fields[columnIndex.Score], NumberStyles.Number | NumberStyles.AllowExponent, CultureInfo.InvariantCulture, out double score)) { _updateRepresentativeScores(altAllele, score, _representativeScores); } } foreach (var scoreItem in GetItemsAtOnePosition(currentChromosome, currentPosition, refAllele)) yield return scoreItem; } private IEnumerable GetItemsAtOnePosition(Chromosome currentChromosome, int currentPosition, string refAllele) { if (currentChromosome == null) yield break; foreach (string altAllele in _parserSettings.PossibleAlleles) { double score = _representativeScores[altAllele]; if (double.IsNaN(score)) continue; yield return new GenericScoreItem(currentChromosome, currentPosition, refAllele, altAllele, score); _representativeScores[altAllele] = double.NaN; } } public static void MaxRepresentativeScores(string altAllele, double score, Dictionary highestScores) { if (double.IsNaN(highestScores[altAllele]) || highestScores[altAllele] < score) highestScores[altAllele] = score; } public static void NonConflictingScore(string altAllele, double score, Dictionary highestScores) { if (!double.IsNaN(highestScores[altAllele])) throw new UserErrorException("Multiple scores oberved."); highestScores[altAllele] = score; } public static void MinRepresentativeScores(string altAllele, double score, Dictionary highestScores) { if (double.IsNaN(highestScores[altAllele]) || highestScores[altAllele] > score) highestScores[altAllele] = score; } public void Dispose() { _reader?.Dispose(); } } } ================================================ FILE: SAUtils/GenericScore/GenericScoreParser/SaItemValidator.cs ================================================ using System.IO; using System.Linq; using VariantAnnotation.Interface.Providers; namespace SAUtils.GenericScore.GenericScoreParser { public sealed class SaItemValidator { private readonly bool? _strictSnvCheck; private readonly bool? _strictReferenceCheck; /// /// Performs checks on each saItem, will throw exception if strict checking is enabled, otherwise returns true/false /// Setting strict checking to null disables all checks and true is always returned /// /// Set to null to disable, if true, then exception will be thrown /// Set to null to disable, if true, then exception will be thrown public SaItemValidator(bool? strictSnvCheck, bool? strictReferenceCheck) { _strictSnvCheck = strictSnvCheck; _strictReferenceCheck = strictReferenceCheck; } public bool Validate(GenericScoreItem saItem, ISequenceProvider refProvider) { return CheckSnv(saItem) && CheckReference(saItem, refProvider); } private bool CheckReference(GenericScoreItem saItem, ISequenceProvider refProvider) { if (_strictReferenceCheck == null) return true; bool hasParRegions = CheckParRegion(saItem, refProvider); string refSequence = refProvider.Sequence.Substring(saItem.Position - 1, saItem.RefAllele.Length); if (string.IsNullOrEmpty(saItem.RefAllele) || saItem.RefAllele == refSequence || hasParRegions) return true; if (_strictReferenceCheck == false) return false; throw new InvalidDataException( $"The provided reference allele {saItem.RefAllele} at {saItem.Chromosome.UcscName}:{saItem.Position} is different from {refSequence} in the reference genome sequence." + $"\nInput Line:\n {saItem.InputLine}"); } private bool CheckParRegion(GenericScoreItem saItem, ISequenceProvider refProvider) { return RegionUtilities.OverlapsParRegion(saItem, refProvider.Assembly) && !string.IsNullOrEmpty(saItem.RefAllele) && saItem.RefAllele.All(x => x is 'N' or 'n'); } private bool CheckSnv(GenericScoreItem saItem) { if (_strictSnvCheck == null) return true; if (saItem.RefAllele.Length == 1 && saItem.AltAllele.Length == 1) return true; if (_strictSnvCheck == false) return false; throw new InvalidDataException($"Only SNV is expected in the input file. Exception found: {saItem.Chromosome}:{saItem.Position}"); } } } ================================================ FILE: SAUtils/GenericScore/ParserSettings.cs ================================================ using System; using System.Collections.Generic; namespace SAUtils.GenericScore { public sealed class ParserSettings { public readonly ColumnIndex ColumnIndex; public readonly string[] PossibleAlleles; public bool IsPositional => ColumnIndex.AltAllele == null; public readonly Action> ConflictResolutionFunction; public ParserSettings( ColumnIndex columnIndex, string[] possibleAlleles, Action> conflictResolutionFunction ) { ColumnIndex = columnIndex; PossibleAlleles = possibleAlleles; ConflictResolutionFunction = conflictResolutionFunction; } } public sealed class ColumnIndex { public readonly ushort Chromosome; public readonly ushort Position; public readonly ushort? RefAllele; public readonly ushort? AltAllele; public readonly ushort Score; public readonly ushort Others; public ColumnIndex( ushort chromosome, ushort position, ushort? refAllele, ushort? altAllele, ushort score, ushort? others ) { Chromosome = chromosome; Position = position; RefAllele = refAllele; AltAllele = altAllele; Score = score; Others = others ?? ushort.MaxValue; } } } ================================================ FILE: SAUtils/GenericScore/ScoreFileWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Compression.Algorithms; using ErrorHandling.Exceptions; using IO; using IO.v2; using SAUtils.GenericScore.GenericScoreParser; using VariantAnnotation.GenericScore; using VariantAnnotation.Interface.Providers; using VariantAnnotation.SA; namespace SAUtils.GenericScore { public sealed class ScoreFileWriter : IDisposable { private readonly ExtendedBinaryWriter _writer; private readonly ExtendedBinaryWriter _indexWriter; private readonly ScoreIndex _index; private readonly ScoreBlock _block; private readonly WriterSettings _writerSettings; private readonly ISequenceProvider _refProvider; private readonly bool _leaveOpen; private readonly bool _skipIncorrectRefEntries; public ScoreFileWriter( WriterSettings writerSettings, Stream stream, Stream indexStream, IDataSourceVersion version, ISequenceProvider refProvider, int schemaVersion, bool skipIncorrectRefEntries = true, bool leaveOpen = false) { _leaveOpen = leaveOpen; _skipIncorrectRefEntries = skipIncorrectRefEntries; _refProvider = refProvider; _writerSettings = writerSettings; var readerSettings = new ReaderSettings( _writerSettings.IsPositional, _writerSettings.EncoderType, _writerSettings.ScoreEncoder, _writerSettings.ScoreJsonEncoder, _writerSettings.Nucleotides, _writerSettings.BlockLength ); _writer = new ExtendedBinaryWriter(stream, System.Text.Encoding.Default, _leaveOpen); _indexWriter = new ExtendedBinaryWriter(indexStream, System.Text.Encoding.Default, _leaveOpen); _index = new ScoreIndex( _indexWriter, readerSettings, _refProvider.Assembly, version, schemaVersion, _writerSettings.IndexHeader, _writerSettings.FilePairId ); _block = new ScoreBlock( new Zstandard(), _index.GetBlockLength() ); } private long FilePosition => _writer.BaseStream.Position; private void WriteHeader() { _writerSettings.Header.Write(_writer); _writer.WriteOpt(_writerSettings.FilePairId); _writer.Write(SaCommon.GuardInt); } public void Write(IEnumerable saItems) { WriteHeader(); uint nucleotideSize = _index.GetNucleotideCount() * _writerSettings.ScoreEncoder.BytesRequired; var nucleotideArray = new byte[nucleotideSize]; Array.Fill(nucleotideArray, byte.MaxValue); var chromosomeIndex = ushort.MaxValue; int chromosomeStartingPosition = -1; int previousPosition = -1; uint blockNumber = 0; uint localBlockIndex = 0; foreach (GenericScoreItem saItem in saItems) { if (chromosomeStartingPosition < 0 && previousPosition < 0) { (chromosomeIndex, chromosomeStartingPosition) = AddNewChromosome(saItem); previousPosition = chromosomeStartingPosition; } if (!_writerSettings.SaItemValidator.Validate(saItem, _refProvider)) { _index.TrackUnmatchedReferencePositions(); continue; } int previousBlockNumber = _index.GetLastBlockNumber(chromosomeIndex); int position = saItem.Position; byte[] encodedScore = _writerSettings.ScoreEncoder.EncodeToBytes(saItem.Score); // Still on the same chromosome and postion, hence just fill the nucleotide array only if (chromosomeIndex == saItem.Chromosome.Index && position == previousPosition) { // Write 4 {A,C,T,G} score values to nucleotide array AddEncodedScoreToNucleotideArray(nucleotideArray, saItem.AltAllele, encodedScore); continue; } (blockNumber, localBlockIndex) = PositionToBlockLocation(previousPosition, chromosomeStartingPosition); // Handle empty blocks by skipping them and adding them to index if (blockNumber - previousBlockNumber > 1) { // Finalize previous memory buffer before writing empty blocks (creats an additional block) WriteToDiskAndUpdateIndex(chromosomeIndex); // write blockNumber - previousBlockNumber - 2 blank blocks and write them to disk int blankBlockCount = (int) blockNumber - previousBlockNumber - 2; WriteBlankBlocks(chromosomeIndex, blankBlockCount); } // Add nucleotide array to memory at appropriate index _block.Add(localBlockIndex, nucleotideArray, nucleotideSize); // writeout if memory buffer is full if (_block.IsFull()) { WriteToDiskAndUpdateIndex(chromosomeIndex); } Array.Fill(nucleotideArray, byte.MaxValue); AddEncodedScoreToNucleotideArray(nucleotideArray, saItem.AltAllele, encodedScore); // A new chromosome if (chromosomeIndex != saItem.Chromosome.Index) { WriteToDiskAndUpdateIndex(chromosomeIndex); (chromosomeIndex, chromosomeStartingPosition) = AddNewChromosome(saItem); } previousPosition = position; } // Writeout the partial block at the end (_, localBlockIndex) = PositionToBlockLocation(previousPosition, chromosomeStartingPosition); _block.Add(localBlockIndex, nucleotideArray, nucleotideSize); WriteToDiskAndUpdateIndex(chromosomeIndex); _writer.Write(Header.NirvanaFooter); //Write Index to disk _index.Write(); } private void AddEncodedScoreToNucleotideArray(byte[] nucleotideArray, string allele, byte[] encodedScore) { ushort? nucleotidePosition = _index.GetNucleotidePosition(allele); if (nucleotidePosition == null) return; Array.Copy( encodedScore, 0, nucleotideArray, (ushort) nucleotidePosition, encodedScore.Length ); } private (ushort chromosomeIndex, int chromosomeStartingPosition) AddNewChromosome(GenericScoreItem saItem) { ushort chromosomeIndex = saItem.Chromosome.Index; int chromosomeStartingPosition = saItem.Position; _refProvider.LoadChromosome(saItem.Chromosome); _index.AddChromosomeBlock(chromosomeIndex, chromosomeStartingPosition); return (chromosomeIndex, chromosomeStartingPosition); } private void WriteBlankBlocks(ushort chromosomeIndex, int blankBlockCount) { for (var i = 0; i < blankBlockCount; i++) { AddBlockToIndex(chromosomeIndex, -1, 0, 0); } } /// /// Write the memory buffer to disk, /// Add the block to index /// Clear out the memory buffer /// /// private void WriteToDiskAndUpdateIndex(ushort chromosomeIndex) { long filePosition = FilePosition; (uint uncompressedSize, int compressedSize) = _block.Write(_writer); AddBlockToIndex(chromosomeIndex, filePosition, compressedSize, uncompressedSize); } private void AddBlockToIndex(ushort chromosomeIndex, long fileStartingPosition, int compressedSize, uint uncompressedSize) { _index.Add(chromosomeIndex, fileStartingPosition, compressedSize, uncompressedSize); } private (uint blockNumber, uint localBlockIndex) PositionToBlockLocation(int position, int startingPosition) { // Position is less than start position if (position < startingPosition) throw new UserErrorException("The Positions are not in order"); return ((uint blockNumber, uint localBlockIndex)) _index.PositionToBlockLocation(position, startingPosition); } public void Dispose() { if (_leaveOpen) return; _writer?.Dispose(); _indexWriter?.Dispose(); } } } ================================================ FILE: SAUtils/GenericScore/WriterSettings.cs ================================================ using System; using IO.v2; using SAUtils.GenericScore.GenericScoreParser; using VariantAnnotation.GenericScore; namespace SAUtils.GenericScore { public sealed class WriterSettings { public readonly Header Header = new(FileType.GsaWriter, 1); public readonly Header IndexHeader = new(FileType.GsaIndex, 1); public readonly int FilePairId = new Random().Next(1_000_000, int.MaxValue); public readonly bool IsPositional; public readonly EncoderType EncoderType; public readonly IScoreEncoder ScoreEncoder; public readonly SaItemValidator SaItemValidator; public readonly string[] Nucleotides; public readonly int BlockLength; public readonly ScoreJsonEncoder ScoreJsonEncoder; public WriterSettings( int blockLength, string[] nucleotides, bool isPositional, EncoderType encoderType, IScoreEncoder scoreEncoder, ScoreJsonEncoder scoreJsonEncoder, SaItemValidator saItemValidator ) { BlockLength = blockLength; Nucleotides = nucleotides; IsPositional = isPositional; EncoderType = encoderType; ScoreEncoder = scoreEncoder; ScoreJsonEncoder = scoreJsonEncoder; SaItemValidator = saItemValidator; } } } ================================================ FILE: SAUtils/GnomadGeneScores/GnomadGeneItem.cs ================================================ using System; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.GnomadGeneScores { public sealed class GnomadGeneItem : ISuppGeneItem, IComparable { public string GeneSymbol { get; } private readonly double? _pLI; private readonly double? _pRec; private readonly double? _pNull; private readonly double? _synZ; private readonly double? _misZ; private readonly double? _loeuf; public GnomadGeneItem(string gene, double? pLi, double? pRec, double? pNull, double? synZ, double? misZ, double? loeuf) { GeneSymbol = gene; _pLI = pLi; _pRec = pRec; _pNull = pNull; _synZ = synZ; _misZ = misZ; _loeuf = loeuf; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddDoubleValue("pLi", _pLI, "0.00e0"); jsonObject.AddDoubleValue("pRec", _pRec, "0.00e0"); jsonObject.AddDoubleValue("pNull", _pNull, "0.00e0"); jsonObject.AddDoubleValue("synZ", _synZ, "0.00e0"); jsonObject.AddDoubleValue("misZ", _misZ, "0.00e0"); jsonObject.AddDoubleValue("loeuf", _loeuf, "0.00e0"); sb.Append(JsonObject.CloseBrace); return StringBuilderPool.GetStringAndReturn(sb); } public int CompareTo(GnomadGeneItem other) { if (_loeuf == other._loeuf) { //pick entry with lowest pLI value if (_pLI == other._pLI) { //pick the entry with the max absolute value of synZ + misZ var abs1 = Math.Abs(_synZ ?? 0 + _misZ ?? 0); var abs2 = Math.Abs(other._synZ ?? 0 + other._misZ ?? 0); return abs2.CompareTo(abs1);// inverse compare since we want the greater value to be taken } if (_pLI == null) return 1; if (other._pLI == null) return -1; return _pLI.Value.CompareTo(other._pLI.Value); } if (_loeuf == null) return 1; if (other._loeuf == null) return -1; return _loeuf.Value.CompareTo(other._loeuf.Value); } } } ================================================ FILE: SAUtils/GnomadGeneScores/GnomadGeneParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using OptimizedCore; using VariantAnnotation.Interface.SA; namespace SAUtils.GnomadGeneScores { public sealed class GnomadGeneParser : IDisposable { private readonly StreamReader _reader; private const string GeneTag = "gene"; private const string GeneIdTag = "gene_id"; private const string PliTag = "pLI"; private const string PrecTag = "pRec"; private const string PnullTag = "pNull"; private const string SynZTag = "syn_z"; private const string MisZTag = "mis_z"; private const string LoeufTag = "oe_lof_upper"; private int _geneIndex = -1; private int _geneIdIndex = -1; private int _pliIndex = -1; private int _precIndex = -1; private int _pnullIndex = -1; private int _synZIndex = -1; private int _misZIndex = -1; private int _loeufIndex = -1; private readonly Dictionary _geneIdToSymbols; public GnomadGeneParser(StreamReader reader, Dictionary geneIdToSymbols) { _reader = reader; _geneIdToSymbols = geneIdToSymbols; } public void Dispose() { _reader?.Dispose(); } public Dictionary> GetItems() { bool isFirstLine = true; var geneAnnotations = new Dictionary>(); string line; while ((line = _reader.ReadLine()) != null) { if (isFirstLine) { if (!GetColumnIndices(line)) return null; isFirstLine = false; } else { var geneAnnotation = GetGeneAndScores(line); if(geneAnnotation == null) continue; if (geneAnnotations.TryAdd(geneAnnotation.GeneSymbol, new List {geneAnnotation})) continue; var existingEntry = (GnomadGeneItem) geneAnnotations[geneAnnotation.GeneSymbol][0]; var newEntry = (GnomadGeneItem) geneAnnotation; // in case of a conflict we keep the item with minimal loeuf if (existingEntry.CompareTo(newEntry) > 0) geneAnnotations[geneAnnotation.GeneSymbol][0] = geneAnnotation; } } return geneAnnotations; } private ISuppGeneItem GetGeneAndScores(string line) { var cols = line.OptimizedSplit('\t'); var geneId = cols[_geneIdIndex]; if (!_geneIdToSymbols.TryGetValue(geneId, out var gene)) { gene = cols[_geneIndex]; Console.WriteLine($"GeneId to symbol not found in cache for: {geneId}, using provided name in file: {gene}"); } var pLi = GetScore(cols[_pliIndex]); var pRec = GetScore(cols[_precIndex]); var pNull = GetScore(cols[_pnullIndex]); var synZ = GetScore(cols[_synZIndex]); var misZ = GetScore(cols[_misZIndex]); var loeuf = GetScore(cols[_loeufIndex]); return new GnomadGeneItem(gene, pLi, pRec, pNull, synZ, misZ, loeuf); } private static double? GetScore(string score) { if (score == "NA" || score == "NaN") return null; return double.Parse(score); } private bool GetColumnIndices(string line) { var cols = line.OptimizedSplit('\t'); _geneIndex = Array.IndexOf(cols, GeneTag); _geneIdIndex = Array.IndexOf(cols, GeneIdTag); _pliIndex = Array.IndexOf(cols, PliTag); _pnullIndex = Array.IndexOf(cols, PnullTag); _precIndex = Array.IndexOf(cols, PrecTag); _synZIndex = Array.IndexOf(cols, SynZTag); _misZIndex = Array.IndexOf(cols, MisZTag); _loeufIndex = Array.IndexOf(cols, LoeufTag); if (_geneIdIndex < 0) { Console.WriteLine("gene column not found"); return false; } if (_pliIndex < 0) { Console.WriteLine("pLI column not found"); return false; } if (_precIndex < 0) { Console.WriteLine("pRec column not found"); return false; } if (_pnullIndex < 0) { Console.WriteLine("pNull column not found"); return false; } if (_synZIndex < 0) { Console.WriteLine("synZ column not found"); return false; } if (_misZIndex < 0) { Console.WriteLine("misZ column not found"); return false; } if (_loeufIndex < 0) { Console.WriteLine("loeuf column not found"); return false; } return true; } } } ================================================ FILE: SAUtils/GnomadGeneScores/GnomadGenesMain.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.IO.Caches; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.GnomadGeneScores { public static class GnomadGenesMain { private static string _outputDirectory; private static string _inputFile; private static string _cachePrefix; private static string _referenceSequncePath; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "cache|c=", "Cache prefix", v => _cachePrefix = v }, { "ref|r=", "Reference sequence path", v => _referenceSequncePath = v }, { "in|i=", "input tsv file", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .HasRequiredParameter(_cachePrefix, "transcript cache prefix", "--cache") .CheckInputFilenameExists(CacheConstants.TranscriptPath(_cachePrefix), "transcript cache prefix", "--cache") .HasRequiredParameter(_referenceSequncePath, "reference sequence path", "--ref") .CheckInputFilenameExists(_referenceSequncePath, "reference sequence path", "--ref") .CheckInputFilenameExists(_inputFile, "input TSV file", "--in") .SkipBanner() .ShowHelpMenu("Creates a gene annotation database from gnomAD data", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { Dictionary geneIdToSymbols; using (var cacheStream = FileUtilities.GetReadStream(CacheConstants.TranscriptPath(_cachePrefix))) using (var transcriptCacheReader = new TranscriptCacheReader(cacheStream)) using (var refProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_referenceSequncePath))) { geneIdToSymbols = LoadGenesFromCache(refProvider, transcriptCacheReader); Console.WriteLine($"Loaded {geneIdToSymbols.Count} gene symbols from cache."); } var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); var outFileName = $"{version.Name}_{version.Version}"; using (var gnomadGeneParser = new GnomadGeneParser(GZipUtilities.GetAppropriateStreamReader(_inputFile), geneIdToSymbols)) using (var stream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.GeneFileSuffix))) using (var ngaWriter = new NgaWriter(stream, version, SaCommon.GnomadGeneScoreTag, SaCommon.SchemaVersion, false)) { ngaWriter.Write(gnomadGeneParser.GetItems()); } return ExitCodes.Success; } private static Dictionary LoadGenesFromCache(ReferenceSequenceProvider refProvider, TranscriptCacheReader cacheReader) { var transcriptData = cacheReader.Read(refProvider.RefIndexToChromosome); var geneIdToSymbols = new Dictionary(transcriptData.Genes.Length); foreach (var gene in transcriptData.Genes) { var geneId = gene.EnsemblId.WithoutVersion; //if(geneId == "ENSG00000272962" || geneId == "ENSG00000198743") // Console.WriteLine("bug"); if (string.IsNullOrEmpty(geneId)) continue; if (! geneIdToSymbols.TryAdd(geneId, gene.Symbol)) { if(geneIdToSymbols[geneId] != gene.Symbol) throw new DataMisalignedException($"Multiple symbols found for {geneId}"); } } return geneIdToSymbols; } } } ================================================ FILE: SAUtils/InputFileParsers/AncestralAlleleReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.InputFileParsers { public sealed class AncestralAlleleReader:IDisposable { private readonly StreamReader _streamReader; private readonly Dictionary _refNameDictionary; private readonly ISequenceProvider _sequenceProvider; private string _ancestralAllele; public AncestralAlleleReader(StreamReader streamReader, ISequenceProvider sequenceProvider) { _streamReader = streamReader; _sequenceProvider = sequenceProvider; _refNameDictionary = sequenceProvider.RefNameToChromosome; } private void Clear() { _ancestralAllele = null; } public IEnumerable GetItems() { using (_streamReader) { string line; while ((line = _streamReader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.OptimizedStartsWith('#')) continue; var itemsList = ExtractItems(line); if (itemsList == null) continue; foreach (var aaItem in itemsList) { yield return aaItem; } } } } private List ExtractItems(string vcfLine) { var splitLine = vcfLine.Split(new[] { '\t' }, 9);// we don't care about the many fields after info field if (splitLine.Length < 8) return null; Clear(); var chromosomeName = splitLine[VcfCommon.ChromIndex]; if (!_refNameDictionary.ContainsKey(chromosomeName)) return null; var chromosome = _refNameDictionary[chromosomeName]; var position = int.Parse(splitLine[VcfCommon.PosIndex]);//we have to get it from RSPOS in info var refAllele = splitLine[VcfCommon.RefIndex]; var altAlleles = splitLine[VcfCommon.AltIndex].OptimizedSplit(','); var infoFields = splitLine[VcfCommon.InfoIndex]; // parses the info fields and extract frequencies, ancestral allele, allele counts, etc. var hasSymbolicAllele = altAlleles.Any(x => x.OptimizedStartsWith('<') && x.OptimizedEndsWith('>')); if (hasSymbolicAllele) return null; // ReSharper disable once ConditionIsAlwaysTrueOrFalse ParseInfoField(infoFields); var ancestralAlleleItems = new List(); foreach (string altAllele in altAlleles) { var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(position, refAllele, altAllele, _sequenceProvider.Sequence); ancestralAlleleItems.Add(new AncestralAlleleItem(chromosome, shiftedPos, shiftedRef, shiftedAlt, _ancestralAllele, vcfLine)); } return ancestralAlleleItems; } private void ParseInfoField(string infoFields) { if (infoFields == "" || infoFields == ".") return; var infoItems = infoFields.OptimizedSplit(';'); foreach (string infoItem in infoItems) { (string key, string value) = infoItem.OptimizedKeyValue(); if (key != "AA") continue; _ancestralAllele = GetAncestralAllele(value); break; } } private static string GetAncestralAllele(string value) { if (value == "" || value == ".") return null; var ancestralAllele = value.OptimizedSplit('|')[0]; if (string.IsNullOrEmpty(ancestralAllele)) return null; return ancestralAllele.All(IsNucleotide) ? ancestralAllele : null; } private static bool IsNucleotide(char c) { c = char.ToUpper(c); return c == 'A' || c == 'C' || c == 'G' || c == 'T' || c == 'N'; } public void Dispose() { _streamReader?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/ClinGen/ClinGenReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using SAUtils.DataStructures; using Variants; namespace SAUtils.InputFileParsers.ClinGen { public sealed class ClinGenReader : IDisposable { #region members private readonly StreamReader _reader; private readonly Dictionary _refNameDict; #endregion public ClinGenReader(StreamReader reader, Dictionary refNameDict) { _reader = reader; _refNameDict = refNameDict; } public IEnumerable GetItems() { using (var reader = _reader) { string line; while ((line = reader.ReadLine()) != null) { if (IsClinGenHeader(line)) continue; var cols = line.OptimizedSplit('\t'); string id = cols[0]; string ucscChrom = cols[1]; if(!_refNameDict.ContainsKey(ucscChrom)) continue; var chrom = _refNameDict[ucscChrom]; int start = int.Parse(cols[2]); int end = int.Parse(cols[3]); int observedGains = int.Parse(cols[4]); int observedLosses = int.Parse(cols[5]); var variantType = GetVariantType(cols[6]); var clinInterpretation = GetClinInterpretation(cols[7]); bool validated = cols[8].Equals("True"); var phenotypes = cols[9] == "" ? null : new HashSet(cols[9].OptimizedSplit(',')); var phenotypeIds = cols[10] == "" ? null : new HashSet(cols[10].OptimizedSplit(',')); var currentItem = new ClinGenItem(id, chrom, start, end, variantType, observedGains, observedLosses, clinInterpretation, validated, phenotypes, phenotypeIds); yield return currentItem; } } } private static VariantType GetVariantType(string variantTypeDescription) { switch (variantTypeDescription) { case "copy_number_gain": return VariantType.copy_number_gain; case "copy_number_loss": return VariantType.copy_number_loss; case "copy_number_variation": return VariantType.copy_number_variation; default: return VariantType.unknown; } } private static ClinicalInterpretation GetClinInterpretation(string s) { switch (s) { case "pathogenic": return ClinicalInterpretation.pathogenic; case "benign": return ClinicalInterpretation.benign; case "likely_pathogenic": return ClinicalInterpretation.likely_pathogenic; case "likely_benign": return ClinicalInterpretation.likely_benign; case "uncertain_significance": return ClinicalInterpretation.uncertain_significance; default: return ClinicalInterpretation.unknown; } } private static bool IsClinGenHeader(string line) { return line.OptimizedStartsWith('#'); } public void Dispose() { _reader?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/ClinVar/ClinVarCommon.cs ================================================ using System.Collections.Generic; using System.Linq; namespace SAUtils.InputFileParsers.ClinVar { public static class ClinVarCommon { public static string NormalizeAllele(string allele) { if (string.IsNullOrEmpty(allele)) return "-"; return allele == "N" ? null : allele; } public static readonly HashSet ValidPathogenicity = new HashSet { "uncertain significance", "not provided", "benign", "likely benign", "likely pathogenic", "pathogenic", "drug response", "histocompatibility", "association", "risk factor", "protective", "affects", "conflicting data from submitters", "other", "association not found", "confers sensitivity", "no interpretation for the single variant",// observed in VCV XML only "conflicting interpretations of pathogenicity", // observed in VCV XML only "established risk allele", // observed in VCV XML only "likely risk allele" // observed in VCV XML only }; public enum ReviewStatus { // ReSharper disable InconsistentNaming no_assertion, no_criteria, single_submitter, multiple_submitters, multiple_submitters_no_conflict, conflicting_interpretations, expert_panel, practice_guideline, no_interpretation_single // ReSharper restore InconsistentNaming } public static readonly Dictionary ReviewStatusNameMapping = new Dictionary { ["no_assertion"] = ReviewStatus.no_assertion, ["no_criteria"] = ReviewStatus.no_criteria, ["guideline"] = ReviewStatus.practice_guideline, ["single"] = ReviewStatus.single_submitter, ["mult"] = ReviewStatus.multiple_submitters, ["conf"] = ReviewStatus.conflicting_interpretations, ["exp"] = ReviewStatus.expert_panel, // the following are the long forms found in XML ["no assertion provided"] = ReviewStatus.no_assertion, ["no assertion criteria provided"] = ReviewStatus.no_criteria, ["practice guideline"] = ReviewStatus.practice_guideline, ["criteria provided, conflicting interpretations"] = ReviewStatus.conflicting_interpretations, ["reviewed by expert panel"] = ReviewStatus.expert_panel, ["classified by multiple submitters"] = ReviewStatus.multiple_submitters, ["criteria provided, multiple submitters, no conflicts"] = ReviewStatus.multiple_submitters_no_conflict, ["criteria provided, single submitter"] = ReviewStatus.single_submitter, ["no interpretation for the single variant"] = ReviewStatus.no_interpretation_single }; public static readonly Dictionary ReviewStatusStrings = new Dictionary { [ReviewStatus.no_criteria] = "no assertion criteria provided", [ReviewStatus.no_assertion] = "no assertion provided", [ReviewStatus.expert_panel] = "reviewed by expert panel", [ReviewStatus.single_submitter] = "criteria provided, single submitter", [ReviewStatus.practice_guideline] = "practice guideline", [ReviewStatus.multiple_submitters] = "classified by multiple submitters", [ReviewStatus.conflicting_interpretations] = "criteria provided, conflicting interpretations", [ReviewStatus.multiple_submitters_no_conflict] = "criteria provided, multiple submitters, no conflicts", [ReviewStatus.no_interpretation_single] = "no interpretation for the single variant" }; public static string[] GetSignificances(string description, string explanation) { if(string.IsNullOrEmpty(explanation)) return description?.ToLower().Split('/', ',', ';').Select(x=>x.Trim()).ToArray(); //Pathogenic(1);Uncertain significance(1) var significances =new List(); foreach (var significance in explanation.ToLower().Split('/',';')) { var openParenthesisIndex = significance.IndexOf('('); significances.Add(openParenthesisIndex < 0 ? significance.Trim() : significance.Substring(0, openParenthesisIndex).Trim()); } return significances.ToArray(); } } } ================================================ FILE: SAUtils/InputFileParsers/ClinVar/ClinVarParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Xml; using System.Xml.Linq; using Genome; using Newtonsoft.Json.Linq; using SAUtils.CreateClinvarDb; using SAUtils.DataStructures; using SAUtils.Schema; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using Variants; using Vcf.VariantCreator; namespace SAUtils.InputFileParsers.ClinVar { public sealed class ClinVarParser :IDisposable { #region members private const string RefAssertionTag = "ReferenceClinVarAssertion"; private const string ClinVarAssertionTag = "ClinVarAssertion"; private const string ReviewStatusTag = "ReviewStatus"; private const string DescriptionTag = "Description"; private const string ExplanationTag = "Explanation"; private const int MaxVariantLength = 1000; private readonly Dictionary _iupacBases = new Dictionary { ['R'] = new[] { 'A', 'G' }, ['Y'] = new[] { 'C', 'T' }, ['S'] = new[] { 'G', 'C' }, ['W'] = new[] { 'A', 'T' }, ['K'] = new[] { 'G', 'T' }, ['M'] = new[] { 'A', 'C' }, ['B'] = new[] { 'C', 'G', 'T' }, ['D'] = new[] { 'A', 'G', 'T' }, ['H'] = new[] { 'A', 'C', 'T' }, ['V'] = new[] { 'A', 'C', 'G' } }; private readonly Stream _rcvStream; private readonly Stream _vcvStream; private readonly ISequenceProvider _sequenceProvider; private readonly Dictionary _refChromDict; private string _lastClinvarAccession; #endregion #region clinVarItem fields private readonly List _variantList= new List(); private HashSet _alleleOrigins; private string _reviewStatus; private string _id; private HashSet _prefPhenotypes; private HashSet _altPhenotypes; private string[] _significances; private HashSet _medGenIDs; private HashSet _omimIDs; private HashSet _allilicOmimIDs; private HashSet _orphanetIDs; private HashSet _pubMedIds = new HashSet(); private long _lastUpdatedDate; private List _vcvItems; public SaJsonSchema JsonSchema { get; } = ClinVarSchema.Get(); #endregion private void ClearClinvarFields() { _variantList.Clear(); _reviewStatus = null; _alleleOrigins = new HashSet(); _significances = null; _prefPhenotypes = new HashSet(); _altPhenotypes = new HashSet(); _id = null; _medGenIDs = new HashSet(); _omimIDs = new HashSet(); _allilicOmimIDs = new HashSet(); _orphanetIDs = new HashSet(); _pubMedIds = new HashSet();//we need a new pubmed hash since otherwise, pubmedid hashes of different items interfere. _lastUpdatedDate = long.MinValue; } // constructor public ClinVarParser(Stream rcvStream, Stream vcvStream, ISequenceProvider sequenceProvider) { _rcvStream = rcvStream; _vcvStream = vcvStream; _sequenceProvider = sequenceProvider; _refChromDict = sequenceProvider?.RefNameToChromosome; } private const string ClinVarSetTag = "ClinVarSet"; public IEnumerable GetItems() { _vcvItems = GetVariationRecords(); Console.WriteLine($"Found {_vcvItems.Count} VCV records"); var unknownVcvs = new HashSet(); var vcvSaItems = new HashSet(); var rcvItems = GetRcvItems(); foreach (var clinVarItem in rcvItems) { var vcvId = int.Parse(clinVarItem.VariationId); var vcvIndex = SuppDataUtilities.BinarySearch(_vcvItems, vcvId); if (vcvIndex < 0) { Console.WriteLine($"Unknown vcv id:{vcvId} found in {clinVarItem.Id}"); unknownVcvs.Add(vcvId); //remove the VariationId clinVarItem.VariationId = null; continue; } var vcvItem = _vcvItems[vcvIndex]; vcvSaItems.Add(new VcvSaItem(clinVarItem.Chromosome, clinVarItem.Position, clinVarItem.RefAllele, clinVarItem.AltAllele, vcvItem.Accession, vcvItem.Version, vcvItem.LastUpdatedDate, vcvItem.ReviewStatus, vcvItem.Significances)); clinVarItem.VariationId = $"{vcvItem.Accession}.{vcvItem.Version}"; } var allItems = new List(rcvItems); allItems.AddRange(vcvSaItems); allItems.Sort(); ReportStatistics(allItems); Console.WriteLine($"{unknownVcvs.Count} unknown VCVs found in RCVs."); Console.WriteLine($"{string.Join(',', unknownVcvs)}"); return allItems; } private void ReportStatistics(List items) { Console.WriteLine($"{_invalidRefAlleleCount} entries were skipped due to invalid ref allele."); Console.WriteLine($"{_aluCount} ALU entries found."); Console.WriteLine($"{_microsatelliteCount} Microsatellite entries found."); Console.WriteLine($"{_variationCount} Variation entries found."); var stats = new ClinVarStats(); stats.GetClinvarSaItemsStats(items); var jo = JObject.Parse(stats.ToString()); Console.WriteLine(jo);//pretty printing json } public List GetRcvItems() { var clinVarItems = new List(); using (var reader = new StreamReader(_rcvStream)) using (var xmlReader = XmlReader.Create(reader, new XmlReaderSettings { DtdProcessing = DtdProcessing.Prohibit, IgnoreWhitespace = true})) { //skipping the top level element to go down to its elements xmlReader.ReadToDescendant(ClinVarSetTag); do { var subTreeReader = xmlReader.ReadSubtree(); var xElement = XElement.Load(subTreeReader); List extractedItems; try { extractedItems = ExtractClinVarItems(xElement); } catch (Exception ) { Console.WriteLine($"Last clinVar accession observed {_lastClinvarAccession}"); throw; } if (extractedItems == null) continue; clinVarItems.AddRange(extractedItems); } while (xmlReader.ReadToNextSibling(ClinVarSetTag)); } clinVarItems.Sort(); var validItems = GetValidItems(clinVarItems); return validItems.Distinct().ToList(); } private List GetVariationRecords() { using var reader = new ClinVarVariationReader(_vcvStream); var items= new List(reader.GetItems()); items.Sort(); return items; } private int _invalidRefAlleleCount = 0; private List GetValidItems(List clinVarItems) { var shiftedItems = new List(); foreach (var item in clinVarItems) { _sequenceProvider.LoadChromosome(item.Chromosome); if (!ValidateRefAllele(item)) { _invalidRefAlleleCount++; continue; } string refAllele= item.RefAllele, altAllele= item.AltAllele; if (string.IsNullOrEmpty(item.RefAllele) && item.VariantType == "Deletion") refAllele = GetReferenceAllele(item, _sequenceProvider.Sequence); if (string.IsNullOrEmpty(item.RefAllele) && item.VariantType == "Indel" && !string.IsNullOrEmpty(item.AltAllele)) refAllele = GetReferenceAllele(item, _sequenceProvider.Sequence); if (string.IsNullOrEmpty(item.AltAllele) && item.VariantType == "Duplication") altAllele = GetAltAllele(item, _sequenceProvider.Sequence); if (string.IsNullOrEmpty(refAllele) && string.IsNullOrEmpty(altAllele)) continue; int start; (start, refAllele, altAllele) = VariantUtils.TrimAndLeftAlign(item.Position, refAllele, altAllele, _sequenceProvider.Sequence); shiftedItems.Add(new ClinVarItem(item.Chromosome, start, item.Stop, refAllele, altAllele, item.JsonSchema, item.AlleleOrigins, item.VariantType, item.Id,item.VariationId, item.ReviewStatus, item.MedGenIds, item.OmimIds, item.OrphanetIds, item.Phenotypes, item.Significances, item.PubmedIds, item.LastUpdatedDate)); } shiftedItems.Sort(); return shiftedItems; } private List ExtractClinVarItems(XElement xElement) { ClearClinvarFields(); if (xElement == null || xElement.IsEmpty) return null; ParseAssertions(xElement); var clinvarList = new List(); foreach (var variant in _variantList) { if (IsInvalidVariant(variant)) continue; var extendedOmimIds = GetOmimIds(variant); var reviewStatEnum = ClinVarCommon.ReviewStatus.no_assertion; if (ClinVarCommon.ReviewStatusNameMapping.ContainsKey(_reviewStatus)) reviewStatEnum = ClinVarCommon.ReviewStatusNameMapping[_reviewStatus]; clinvarList.Add( new ClinVarItem(variant.Chromosome, variant.Start, variant.Stop, variant.RefAllele??"",// alleles cannot be null variant.AltAllele??"", JsonSchema, _alleleOrigins.Count > 0 ? _alleleOrigins : null, variant.VariantType, _id, variant.VariantId, reviewStatEnum, _medGenIDs.Count > 0 ? _medGenIDs : null, extendedOmimIds.Count > 0 ? extendedOmimIds : null, _orphanetIDs.Count > 0 ? _orphanetIDs : null, _prefPhenotypes.Count > 0 ? _prefPhenotypes : _altPhenotypes, _significances, _pubMedIds.Count > 0 ? _pubMedIds.OrderBy(x => x) : null, _lastUpdatedDate)); } return clinvarList.Count > 0 ? clinvarList: null; } private HashSet GetOmimIds(ClinvarVariant variant) { var extendedOmimIds = new HashSet(_omimIDs); foreach (var omimId in variant.AllelicOmimIds) { extendedOmimIds.Add(omimId); } return extendedOmimIds; } private void ParseAssertions(XElement xElement) { foreach (var element in xElement.Elements(RefAssertionTag)) ParseRefClinVarAssertion(element); foreach (var element in xElement.Elements(ClinVarAssertionTag)) ParseClinvarAssertion(element); } private int _aluCount = 0; private int _microsatelliteCount = 0; private int _variationCount = 0; private bool IsInvalidVariant(ClinvarVariant variant) { switch (variant.VariantType) { case "ALU": _aluCount++; break; case "Microsatellite": _microsatelliteCount++; break; case "Variation": _variationCount++; break; } if (variant.VariantType == "ALU") return true; return variant.Chromosome == null || (variant.VariantType == "Microsatellite" || variant.VariantType == "Variation" ) && string.IsNullOrEmpty(variant.AltAllele); } private bool ValidateRefAllele(ClinVarItem clinvarVariant) { if (string.IsNullOrEmpty(clinvarVariant.RefAllele) || clinvarVariant.RefAllele == "-") return true; string refAllele = clinvarVariant.RefAllele; if (string.IsNullOrEmpty(refAllele)) return true; int refLength = clinvarVariant.Stop - clinvarVariant.Position + 1; return refLength == refAllele.Length && _sequenceProvider.Sequence.Validate(clinvarVariant.Position, clinvarVariant.Stop, refAllele); } private static string GetReferenceAllele(ClinVarItem variant, ISequence compressedSequence) { return variant == null ? null : compressedSequence.Substring(variant.Position - 1, variant.Stop - variant.Position + 1); } private static string GetAltAllele(ClinVarItem variant, ISequence compressedSequence) { return variant == null ? null : compressedSequence.Substring(variant.Position - 1, variant.Stop - variant.Position + 1); } internal static long ParseDate(string s) { if (string.IsNullOrEmpty(s) || s == "-") return long.MinValue; //Jun 29, 2010 return DateTime.Parse(s).Ticks; } private const string UpdateDateTag = "DateLastUpdated"; private const string AccessionTag = "Acc"; private const string VersionTag = "Version"; private const string ClinVarAccessionTag = "ClinVarAccession"; private const string ClinicalSignificanceTag = "ClinicalSignificance"; private const string MeasureSetTag = "MeasureSet"; private const string TraitSetTag = "TraitSet"; private const string ObservedInTag = "ObservedIn"; private const string SampleTag = "Sample"; private void ParseRefClinVarAssertion(XElement xElement) { if (xElement==null || xElement.IsEmpty) return; // _lastUpdatedDate = ParseDate(xElement.Attribute(UpdateDateTag)?.Value); _lastClinvarAccession = xElement.Element(ClinVarAccessionTag)?.Attribute(AccessionTag)?.Value; _id = _lastClinvarAccession + "." + xElement.Element(ClinVarAccessionTag)?.Attribute(VersionTag)?.Value; GetClinicalSignificance(xElement.Element(ClinicalSignificanceTag)); ParseGenotypeSet(xElement.Element(GenotypeSetTag)); ParseMeasureSet(xElement.Element(MeasureSetTag)); ParseTraitSet(xElement.Element(TraitSetTag)); } private const string CitationTag = "Citation"; private const string OriginTag = "Origin"; private void ParseClinvarAssertion(XElement xElement) { if (xElement == null || xElement.IsEmpty) return; foreach (var element in xElement.Descendants(CitationTag)) ParseCitation(element); foreach (var element in xElement.Elements(ObservedInTag)) ParseObservedIn(element); } private void ParseObservedIn(XElement xElement) { var samples = xElement?.Elements(SampleTag); if (samples == null) return; foreach (var sample in samples) { foreach (var origin in sample.Elements(OriginTag)) _alleleOrigins.Add(origin.Value); } } private const string TraitTag = "Trait"; private void ParseTraitSet(XElement xElement) { if (xElement == null || xElement.IsEmpty) return; foreach (var element in xElement.Elements(TraitTag)) ParseTrait(element); } private const string XrefTag = "XRef"; private const string NameTag = "Name"; private void ParseTrait(XElement xElement) { if (xElement == null || xElement.IsEmpty) return; foreach (var element in xElement.Elements(XrefTag)) ParseXref(element); foreach (var element in xElement.Elements(NameTag)) ParsePnenotype(element); } private const string ElementValueTag = "ElementValue"; private void ParsePnenotype(XElement xElement) { if (xElement == null || xElement.IsEmpty) return; ParsePhenotypeElementValue(xElement.Element(ElementValueTag)); } private const string TypeTag = "Type"; private void ParsePhenotypeElementValue(XElement xElement) { var phenotype = xElement.Attribute(TypeTag); if (phenotype == null) return; if (phenotype.Value == "Preferred") { _prefPhenotypes.Add(xElement.Value); } else if (phenotype.Value == "Alternate") { _altPhenotypes.Add(xElement.Value); } } private const string DbTag = "DB"; private const string IdTag = "ID"; private void ParseXref(XElement xElement) { var db = xElement.Attribute(DbTag); if (db == null) return; string id = xElement.Attribute(IdTag)?.Value.Trim(' '); // Trimming is necessary here, don't turn it off. switch (db.Value) { case "MedGen": _medGenIDs.Add(id); break; case "Orphanet": _orphanetIDs.Add(id); break; case "OMIM": var type = xElement.Attribute(TypeTag); if (type !=null) if (type.Value == "Allelic variant" ) _allilicOmimIDs.Add(TrimOmimId(id)); else _omimIDs.Add(TrimOmimId(id)); break; } } private static string TrimOmimId(string id) { return id.TrimStart('P','S'); } private const string SourceTag = "Source"; private const string PubmedIdTag = "PubMed"; private void ParseCitation(XElement xElement) { if (xElement == null || xElement.IsEmpty) return; foreach (var element in xElement.Elements(IdTag)) { var source = element.Attribute(SourceTag); if (source == null) continue; if (source.Value != PubmedIdTag) continue; string pubmedId = element.Value.Split('.', ',')[0]; //pubmed ids with more than 8 digits are bad if (long.TryParse(pubmedId, out long l) && l <= 99_999_999)//pubmed ids with more than 8 digits are bad _pubMedIds.Add(l); //else Console.WriteLine($"WARNING:unexpected pubmedID {pubmedId}."); } } private const string MeasureTag = "Measure"; private const string GenotypeSetTag = "GenotypeSet"; private void ParseGenotypeSet(XElement xElement) { if (xElement == null || xElement.IsEmpty) return; foreach (var measureSet in xElement.Elements(MeasureSetTag)) { ParseMeasureSet(measureSet); } } private void ParseMeasureSet(XElement xElement) { if (xElement == null || xElement.IsEmpty) return; var variantId = xElement.Attribute(IdTag) == null ? null : xElement.Attribute(IdTag)?.Value; foreach (var element in xElement.Elements(MeasureTag)) { ParseMeasure(element, variantId); } } private const string SeqLocationTag = "SequenceLocation"; private void ParseMeasure(XElement xElement, string variantId) { if (xElement == null || xElement.IsEmpty) return; _allilicOmimIDs.Clear(); //the variant type is available in the attributes var varType = xElement.Attribute(TypeTag)?.Value; var variantList = new List(); foreach (var element in xElement.Elements(XrefTag)) ParseXref(element); foreach (var element in xElement.Elements(SeqLocationTag)) { var variant = GetClinvarVariant(element, _sequenceProvider.Assembly, _refChromDict, variantId); if (variant == null) continue; variant.VariantType = varType; if (varType == "Microsatellite") UpdateVariantType(variant); if (variant.AltAllele == "Alu") variant.VariantType = "ALU"; if (variant.AltAllele != null && variant.AltAllele.Length == 1 && _iupacBases.ContainsKey(variant.AltAllele[0])) AddIupacVariants(variant, variantList); else variantList.Add(variant); } if (_allilicOmimIDs.Count != 0) { foreach (var variant in variantList) { variant.AllelicOmimIds.AddRange(_allilicOmimIDs); } } _variantList.AddRange(variantList); } private static void UpdateVariantType(ClinvarVariant variant) { var refAllele = variant.RefAllele; var altAllele = variant.AltAllele; if (refAllele == null || altAllele == null) return; var variantType = SmallVariantCreator.GetVariantType(refAllele, altAllele); switch (variantType) { case VariantType.deletion: variant.VariantType = "Deletion"; break; case VariantType.insertion: variant.VariantType = "Insertion"; break; case VariantType.indel: variant.VariantType = "Indel"; break; case VariantType.duplication: variant.VariantType = "Duplication"; break; case VariantType.SNV: variant.VariantType = "SNV"; break; case VariantType.MNV: variant.VariantType = "MNV"; break; } } private void AddIupacVariants(ClinvarVariant variant, List variantList) { foreach (char altAllele in _iupacBases[variant.AltAllele[0]]) { variantList.Add(new ClinvarVariant(variant.Chromosome,variant.Start, variant.Stop,variant.VariantId, variant.RefAllele, altAllele.ToString())); } } private const string ChrTag = "Chr"; private const string StopTag = "display_stop"; private const string StartTag = "display_start"; private const string AssemblyTag = "Assembly"; private const string RefAlleleTag = "referenceAllele"; private const string AltAlleleTag = "alternateAllele"; private const string VcfPositionTag = "positionVCF"; private const string VcfRefAlleleTag = "referenceAlleleVCF"; private const string VcfAltAlleleTag = "alternateAlleleVCF"; private static ClinvarVariant GetClinvarVariant(XElement xElement, GenomeAssembly genomeAssembly, Dictionary refChromDict, string variantId) { if (xElement == null ) return null; // if (genomeAssembly.ToString()!= xElement.Attribute(AssemblyTag)?.Value && genomeAssembly != GenomeAssembly.Unknown) return null; var chromosome = refChromDict.ContainsKey(xElement.Attribute(ChrTag)?.Value) ? refChromDict[xElement.Attribute(ChrTag)?.Value] : null; int start = Convert.ToInt32(xElement.Attribute(StartTag)?.Value); int stop = Convert.ToInt32(xElement.Attribute(StopTag)?.Value); string refAllele = xElement.Attribute(RefAlleleTag)?.Value; string altAllele = xElement.Attribute(AltAlleleTag)?.Value; //check if VCV values are present int vcfPosition = Convert.ToInt32(xElement.Attribute(VcfPositionTag)?.Value); string vcfRefAllele = xElement.Attribute(VcfRefAlleleTag)?.Value; string vcfAltAllele = xElement.Attribute(VcfAltAlleleTag)?.Value; if (vcfRefAllele != null) { start = vcfPosition; refAllele = vcfRefAllele; altAllele = vcfAltAllele; stop = start + refAllele.Length - 1; } if (stop - start + 1 > MaxVariantLength) return null; AdjustVariant(ref start, ref refAllele, ref altAllele); return new ClinvarVariant(chromosome, start, stop, variantId, refAllele, altAllele); } private static void AdjustVariant(ref int start, ref string referenceAllele, ref string altAllele) { if (referenceAllele == "-") { referenceAllele = ""; start++; } if (altAllele == "-") altAllele = ""; } private void GetClinicalSignificance(XElement xElement) { if (xElement == null || xElement.IsEmpty) return; _reviewStatus = xElement.Element(ReviewStatusTag)?.Value; var description = xElement.Element(DescriptionTag)?.Value; var explanation = xElement.Element(ExplanationTag)?.Value; _significances = ClinVarCommon.GetSignificances(description, explanation); ValidateSignificance(_significances); } private void ValidateSignificance(string[] significances) { foreach (var significance in significances) { if (!ClinVarCommon.ValidPathogenicity.Contains(significance)) throw new InvalidDataException($"Invalid pathogenicity found in {_id}. Observed: {significance}"); } } public void Dispose() { _rcvStream?.Dispose(); _sequenceProvider?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/ClinVar/ClinVarSchema.cs ================================================ using System.Collections.Generic; using System.Linq; using System.Text; using SAUtils.Schema; using VariantAnnotation.SA; namespace SAUtils.InputFileParsers.ClinVar { public static class ClinVarSchema { private static readonly SaJsonValueType PrimaryValueType = SaJsonValueType.ObjectArray; private static readonly string[] JsonKeys = { "id", "variationId", "reviewStatus", "alleleOrigins", "refAllele", "altAllele", "phenotypes", "medGenIds", "omimIds", "orphanetIds", "significance", "lastUpdatedDate", "pubMedIds", "isAlleleSpecific" }; private static readonly List ValueTypes = new List { SaJsonValueType.String, SaJsonValueType.String, SaJsonValueType.String, SaJsonValueType.StringArray, SaJsonValueType.String, SaJsonValueType.String, SaJsonValueType.StringArray, SaJsonValueType.StringArray, SaJsonValueType.StringArray, SaJsonValueType.StringArray, SaJsonValueType.StringArray, SaJsonValueType.String, SaJsonValueType.StringArray, SaJsonValueType.Bool }; public static SaJsonSchema Get() { var jsonSchema = SaJsonSchema.Create(new StringBuilder(), SaCommon.ClinvarTag, PrimaryValueType, JsonKeys); jsonSchema.SetNonSaKeys(new []{"isAlleleSpecific"}); foreach((string key, var valueType) in JsonKeys.Zip(ValueTypes, (a, b) => (a, b))) jsonSchema.AddAnnotation(key, SaJsonKeyAnnotation.CreateFromProperties(valueType, 0, null)); return jsonSchema; } } } ================================================ FILE: SAUtils/InputFileParsers/ClinVar/ClinVarVariationReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Xml; using System.Xml.Linq; using IO; namespace SAUtils.InputFileParsers.ClinVar { public sealed class ClinVarVariationReader : IDisposable { private const string VcvRecordTag = "VariationArchive"; private const string AccessionTag = "Accession"; private const string VersionTag = "Version"; private const string DateTag = "DateLastUpdated"; private const string ReviewStatusTag = "ReviewStatus"; private const string InterpretedRecordTag = "InterpretedRecord"; private const string InterpretationsTag = "Interpretations"; private const string InterpretationTag = "Interpretation"; private const string IncludedRecordTag = "IncludedRecord"; private const string DescriptionTag = "Description"; private const string ExplanationTag = "Explanation"; private const string TypeTag = "Type"; private readonly Stream _readStream; public ClinVarVariationReader(Stream readStream) { _readStream = readStream; } public IEnumerable GetItems() { using (var reader = FileUtilities.GetStreamReader(_readStream)) using (var xmlReader = XmlReader.Create(reader, new XmlReaderSettings { DtdProcessing = DtdProcessing.Prohibit, IgnoreWhitespace = true})) { xmlReader.ReadToDescendant(VcvRecordTag); do { var subTreeReader = xmlReader.ReadSubtree(); var xElement = XElement.Load(subTreeReader); var item = ExtractVariantRecord(xElement); if (item == null) continue; yield return item; } while (xmlReader.ReadToNextSibling(VcvRecordTag)); } } private static VcvItem ExtractVariantRecord(XElement xElement) { if (xElement == null || xElement.IsEmpty) return null; var accession = xElement.Attribute(AccessionTag)?.Value; var version = xElement.Attribute(VersionTag)?.Value; var dateString = xElement.Attribute(DateTag)?.Value; var date = ClinVarParser.ParseDate(dateString); var interpretationRecord = xElement.Element(InterpretedRecordTag); var includedRecord = xElement.Element(IncludedRecordTag); //expecting one of the two to be non-null if (!((interpretationRecord == null || interpretationRecord.IsEmpty) ^ (includedRecord == null || includedRecord.IsEmpty))) { throw new DataMisalignedException("Only one of interpretation/included records should be present for "+ accession); } if (interpretationRecord != null && !interpretationRecord.IsEmpty) { var interpretedSignificances = GetSignificances(interpretationRecord.Element(InterpretationsTag)); var interpretedReviewStatusString = interpretationRecord.Element(ReviewStatusTag)?.Value; if(interpretedReviewStatusString ==null) throw new MissingFieldException($"No review status provided for {accession}.{version}"); var interpretedReviewStatus = ClinVarCommon.ReviewStatusNameMapping[interpretedReviewStatusString]; return new VcvItem(accession, version, date, interpretedReviewStatus, interpretedSignificances); } var includedSignificances = GetSignificances(includedRecord.Element(InterpretationsTag)); var includedReviewStatusString = includedRecord.Element(ReviewStatusTag)?.Value; if(includedReviewStatusString ==null) throw new MissingFieldException($"No review status provided for {accession}.{version}"); var reviewStatus = ClinVarCommon.ReviewStatusNameMapping[includedReviewStatusString]; return new VcvItem(accession, version, date, reviewStatus, includedSignificances); } private static List GetSignificances(XElement interpretations) { if (interpretations == null || interpretations.IsEmpty) return null; var significanceList = new List(); foreach (var interpretation in interpretations.Elements(InterpretationTag)) { var type = interpretation.Attribute(TypeTag)?.Value; if(type==null || type != "Clinical significance") continue; var description = interpretation.Element(DescriptionTag)?.Value.ToLower(); var explanation = interpretation.Element(ExplanationTag)?.Value.ToLower(); if(description == null && explanation == null) continue; var significances = ClinVarCommon.GetSignificances(description, explanation); foreach (var significance in significances) { if (!ClinVarCommon.ValidPathogenicity.Contains(significance)) throw new InvalidDataException($"Invalid clinical significance found. Observed: {significance}"); significanceList.Add(significance); } } return significanceList; } public void Dispose() { _readStream?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/ClinVar/ClinvarVariant.cs ================================================ using System.Collections.Generic; using Genome; namespace SAUtils.InputFileParsers.ClinVar { public sealed class ClinvarVariant { public readonly Chromosome Chromosome; public int Start { get; } public readonly int Stop; public readonly string RefAllele; public readonly string AltAllele; public string VariantType; public readonly List AllelicOmimIds; public readonly string VariantId; public ClinvarVariant(Chromosome chr, int start, int stop, string variantId, string refAllele, string altAllele, List allilicOmimIds =null) { Chromosome = chr; Start = start; Stop = stop; VariantId = variantId; RefAllele = refAllele; AltAllele = altAllele; AllelicOmimIds = allilicOmimIds ?? new List(); } } } ================================================ FILE: SAUtils/InputFileParsers/ClinVar/IClinVarSaItem.cs ================================================ using System; using System.Collections.Generic; using VariantAnnotation.Interface.SA; namespace SAUtils.InputFileParsers.ClinVar; public interface IClinVarSaItem: ISupplementaryDataItem, IComparable { string Id { get; } IEnumerable Significances { get; } ClinVarCommon.ReviewStatus ReviewStatus { get; } } ================================================ FILE: SAUtils/InputFileParsers/ClinVar/VariantAligner.cs ================================================ using System.IO; using Genome; using SAUtils.MitoMap; using Variants; namespace SAUtils.InputFileParsers.ClinVar { public sealed class VariantAligner { private readonly ISequence _compressedSequence; private const int MaxRotationRange = 500; /// /// constructor /// public VariantAligner(ISequence compressedSequence) { _compressedSequence = compressedSequence; } /// /// Left aligns the variant using base rotation /// /// Tuple of new position, ref and alt allele public (int RefPosition, string RefAllele, string AltAllele) LeftAlign(int refPosition, string refAllele, string altAllele, bool isCircularGenome = false) { var trimmedAllele = BiDirectionalTrimmer.Trim(refPosition, refAllele, altAllele); var trimmedPos = trimmedAllele.Start; var trimmedRefAllele = trimmedAllele.RefAllele; var trimmedAltAllele = trimmedAllele.AltAllele; // alignment only makes sense for insertion and deletion if (!(trimmedAltAllele.Length == 0 || trimmedRefAllele.Length == 0)) return (refPosition, refAllele, altAllele); var upstreamSeq = GetUpstreamSeq(trimmedPos, MaxRotationRange, isCircularGenome); if (upstreamSeq == null) throw new InvalidDataException("Reference sequence not set, please check that it is loaded"); // compressed seq is 0 based var combinedSeq = upstreamSeq; int repeatLength; int i; if (trimmedRefAllele.Length > trimmedAltAllele.Length) { // deletion combinedSeq += trimmedRefAllele; repeatLength = trimmedRefAllele.Length; for (i = combinedSeq.Length - 1; i >= repeatLength; i--, trimmedPos--) { if (combinedSeq[i] != combinedSeq[i - repeatLength]) break; } var newRefAllele = combinedSeq.Substring(i + 1 - repeatLength, repeatLength); return (trimmedPos, newRefAllele, ""); //alt is empty for deletion } //insertion combinedSeq += trimmedAltAllele; repeatLength = trimmedAltAllele.Length; for (i = combinedSeq.Length - 1; i >= repeatLength; i--, trimmedPos--) { if (combinedSeq[i] != combinedSeq[i - repeatLength]) break; } var newAltAllele = combinedSeq.Substring(i + 1 - repeatLength, repeatLength); return (trimmedPos, "", newAltAllele); } private string GetUpstreamSeq(int position, int length, bool isCircularGenome) { if (isCircularGenome) { var circularGenome = new CircularGenomeModel(_compressedSequence); var interval = (position - length, position -1); return circularGenome.ExtractIntervalSequence(interval); } var adjustedLength = length < position ? length : position - 1; return _compressedSequence.Substring(position - 1 - adjustedLength, adjustedLength); } } } ================================================ FILE: SAUtils/InputFileParsers/ClinVar/VcvItem.cs ================================================ using System; using System.Collections.Generic; using OptimizedCore; using VariantAnnotation.IO; namespace SAUtils.InputFileParsers.ClinVar { public sealed class VcvItem : IComparable, IComparable { public readonly int VariantId; public readonly string Accession; public readonly string Version; public readonly DateTime LastUpdatedDate; public readonly ClinVarCommon.ReviewStatus ReviewStatus; public readonly IEnumerable Significances; public VcvItem(string accession, string version, long updatedDateTicks, ClinVarCommon.ReviewStatus reviewStatus, IEnumerable significances) { Accession = accession; Version = version; LastUpdatedDate = new DateTime(updatedDateTicks); ReviewStatus = reviewStatus; Significances = significances; VariantId = int.Parse(accession.Substring(3)); } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("id", $"{Accession}.{Version}"); jsonObject.AddStringValue("reviewStatus", ClinVarCommon.ReviewStatusStrings[ReviewStatus]); jsonObject.AddStringValue("lastUpdatedDate", LastUpdatedDate.ToString("yyyy-MM-dd")); jsonObject.AddStringValues("significance", Significances); return StringBuilderPool.GetStringAndReturn(sb); } public int CompareTo(int vcvId) { return VariantId.CompareTo(vcvId); } public int CompareTo(VcvItem other) { if (ReferenceEquals(this, other)) return 0; if (ReferenceEquals(null, other)) return 1; return VariantId.CompareTo(other.VariantId); } } } ================================================ FILE: SAUtils/InputFileParsers/ClinVar/VcvSaItem.cs ================================================ using System; using System.Collections.Generic; using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.InputFileParsers.ClinVar { public sealed class VcvSaItem: IClinVarSaItem, IEquatable { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } private readonly string _accession; private readonly string _version; private readonly DateTime _lastUpdatedDate; public ClinVarCommon.ReviewStatus ReviewStatus { get; } public IEnumerable Significances { get; } public string Id => $"{_accession}.{_version}"; public VcvSaItem(Chromosome chromosome, int position, string refAllele, string altAllele, string accession, string version, DateTime lastUpdatedDate, ClinVarCommon.ReviewStatus reviewStatus, IEnumerable significances) { Chromosome = chromosome; Position = position; RefAllele = refAllele; AltAllele = altAllele; _accession = accession; _version = version; _lastUpdatedDate = lastUpdatedDate; ReviewStatus = reviewStatus; Significances = significances; } public string GetJsonString() { var sb= StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("id", $"{_accession}.{_version}"); jsonObject.AddStringValue("reviewStatus", ClinVarCommon.ReviewStatusStrings[ReviewStatus]); jsonObject.AddStringValues("significance", Significances); jsonObject.AddStringValue("refAllele", ClinVarCommon.NormalizeAllele(RefAllele)); jsonObject.AddStringValue("altAllele", ClinVarCommon.NormalizeAllele(AltAllele)); jsonObject.AddStringValue("lastUpdatedDate", _lastUpdatedDate.ToString("yyyy-MM-dd")); return StringBuilderPool.GetStringAndReturn(sb); } public string InputLine { get; set; } public int CompareTo(IClinVarSaItem other) { return Chromosome.Index != other.Chromosome.Index ? Chromosome.Index.CompareTo(other.Chromosome.Index) : Position.CompareTo(other.Position); } public bool Equals(VcvSaItem other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; return _accession == other._accession && _version == other._version; } public override int GetHashCode() { return HashCode.Combine(_accession, _version); } } } ================================================ FILE: SAUtils/InputFileParsers/Cosmic/MergedCosmicReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Compression.Utilities; using Genome; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.InputFileParsers.Cosmic { public sealed class MergedCosmicReader { private readonly StreamReader _vcfFileReader; private readonly StreamReader _tsvFileReader; private string _geneName; private int? _sampleCount; private int _mutationIdIndex = -1; private int _primarySiteIndex = -1; private int _primaryHistologyIndex = -1; private int _studyIdIndex = -1; private const string StudyIdTag = "ID_STUDY"; private readonly Dictionary _refChromDict; private readonly ISequenceProvider _sequenceProvider; private readonly Dictionary> _studies; public MergedCosmicReader(string vcfFile, string tsvFile, ISequenceProvider sequenceProvider) { _vcfFileReader = GZipUtilities.GetAppropriateStreamReader(vcfFile); _tsvFileReader = GZipUtilities.GetAppropriateStreamReader(tsvFile); _sequenceProvider = sequenceProvider; _refChromDict = _sequenceProvider.RefNameToChromosome; _studies = new Dictionary>(); } public IEnumerable GetItems() { // taking up all studies in to the dictionary using (_tsvFileReader) { string line; while ((line = _tsvFileReader.ReadLine()) != null) { if (IsHeaderLine(line)) GetColumnIndexes(line); // the first line is supposed to be a the header line else AddCosmicStudy(line); } } using (_vcfFileReader) { string line; while ((line = _vcfFileReader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.OptimizedStartsWith('#')) continue; var cosmicItems = ExtractCosmicItems(line); if (cosmicItems == null) continue; foreach (var cosmicItem in cosmicItems) { yield return cosmicItem; } } } } private void AddCosmicStudy(string line) { var columns = line.OptimizedSplit('\t'); string mutationId = columns[_mutationIdIndex]; string studyId = columns[_studyIdIndex]; var sites = GetSites(columns); var histologies = GetHistologies(columns); if (string.IsNullOrEmpty(mutationId)) return; var study = new CosmicItem.CosmicStudy(studyId, histologies, sites); if (_studies.TryGetValue(mutationId, out var studySet)) studySet.Add(study); else _studies[mutationId] = new HashSet { study }; } private List GetHistologies(string[] columns) { var histologies = new HashSet(); var primaryHistology = columns[_primaryHistologyIndex].Replace('_', ' '); TryAddValue(primaryHistology, histologies); return histologies.ToList(); } private List GetSites(string[] columns) { var sites = new HashSet(); var primarySite = columns[_primarySiteIndex].Replace('_', ' '); TryAddValue(primarySite, sites); return sites.ToList(); } private static void TryAddValue(string value, ISet sites) { if (!string.IsNullOrEmpty(value) && value != "NS") sites.Add(value); } private static bool IsHeaderLine(string line) => line.Contains(StudyIdTag); private void GetColumnIndexes(string headerLine) { //Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample source Tumour origin Age _mutationIdIndex = -1; _studyIdIndex = -1; _primarySiteIndex = -1; _primaryHistologyIndex = -1; var columns = headerLine.OptimizedSplit('\t'); for (int i = 0; i < columns.Length; i++) { switch (columns[i]) { case "Mutation ID": _mutationIdIndex = i; break; case StudyIdTag: _studyIdIndex = i; break; case "Primary site": _primarySiteIndex = i; break; case "Primary histology": _primaryHistologyIndex = i; break; } } if (_mutationIdIndex == -1) throw new InvalidDataException("Column for mutation Id could not be detected"); if (_studyIdIndex == -1) throw new InvalidDataException("Column for study Id could not be detected"); if (_primarySiteIndex == -1) throw new InvalidDataException("Column for primary site could not be detected"); if (_primaryHistologyIndex == -1) throw new InvalidDataException("Column for primary histology could not be detected"); } private const int MaxVariantLength= 1000; internal List ExtractCosmicItems(string vcfLine) { var splitLine = vcfLine.Split(new[] { '\t' }, 8); //skipping large variants if (splitLine[VcfCommon.RefIndex].Length > MaxVariantLength || splitLine[VcfCommon.AltIndex].Length > MaxVariantLength) return null; string chromosomeName = splitLine[VcfCommon.ChromIndex]; if (!_refChromDict.ContainsKey(chromosomeName)) return null; var chromosome = _refChromDict[chromosomeName]; int position = int.Parse(splitLine[VcfCommon.PosIndex]); string cosmicId = splitLine[VcfCommon.IdIndex]; string refAllele = splitLine[VcfCommon.RefIndex]; var altAlleles = splitLine[VcfCommon.AltIndex].OptimizedSplit(','); string infoField = splitLine[VcfCommon.InfoIndex]; Clear(); ParseInfoField(infoField); var cosmicItems = new List(); foreach (string altAllele in altAlleles) { var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(position, refAllele, altAllele, _sequenceProvider.Sequence); cosmicItems.Add(_studies.TryGetValue(cosmicId, out var studies) ? new CosmicItem(chromosome, shiftedPos, cosmicId, shiftedRef, shiftedAlt, _geneName, studies, _sampleCount) : new CosmicItem(chromosome, shiftedPos, cosmicId, shiftedRef, shiftedAlt, _geneName, null, _sampleCount)); } return cosmicItems; } private void Clear() { _geneName = null; _sampleCount = null; } private void ParseInfoField(string infoFields) { if (infoFields == "" || infoFields == ".") return; var infoItems = infoFields.OptimizedSplit(';'); foreach (string infoItem in infoItems) { if (string.IsNullOrEmpty(infoItem)) continue; (string key, string value) = infoItem.OptimizedKeyValue(); // sanity check if (value != null) SetInfoField(key, value); } } private void SetInfoField(string vcfId, string value) { switch (vcfId) { case "GENE": _geneName = value; break; case "CNT": _sampleCount = Convert.ToInt32(value); break; } } } } ================================================ FILE: SAUtils/InputFileParsers/DGV/DgvReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using SAUtils.DataStructures; using SAUtils.ParseUtils; namespace SAUtils.InputFileParsers.DGV { public sealed class DgvReader: IDisposable { #region members private readonly StreamReader _reader; private readonly Dictionary _refChromDict; #endregion // constructor public DgvReader(StreamReader reader, Dictionary refChromDict) { _reader = reader; _refChromDict = refChromDict; } /// /// returns a ClinVar object given the vcf line /// public static DgvItem ExtractDgvItem(string line, Dictionary refChromDict) { var cols = line.OptimizedSplit('\t'); if (cols.Length < 8) return null; string id = cols[0]; string chromosomeName = cols[1]; if (!refChromDict.ContainsKey(chromosomeName)) return null; var chromosome = refChromDict[chromosomeName]; int start = int.Parse(cols[2]); int end = int.Parse(cols[3]); string variantType = cols[4]; string variantSubType = cols[5]; int sampleSize = int.Parse(cols[14]); int observedGains = cols[15] == "" ? 0 : int.Parse(cols[15]); int observedLosses = cols[16] == "" ? 0 : int.Parse(cols[16]); var seqAltType = SaParseUtilities.GetSequenceAlterationType(variantType, variantSubType); return new DgvItem(id, chromosome, start, end, sampleSize, observedGains, observedLosses, seqAltType); } public IEnumerable GetItems() { using (var reader = _reader) { while (true) { // grab the next line string line = reader.ReadLine(); if (line == null) break; // skip header and empty lines if (string.IsNullOrWhiteSpace(line) || IsDgvHeader(line)) continue; var dgvItem = ExtractDgvItem(line, _refChromDict); if (dgvItem == null) continue; yield return dgvItem; } } } private static bool IsDgvHeader(string line) { return line.StartsWith("variantaccession"); } public void Dispose() { _reader?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/DataSourceVersionReader.cs ================================================ using System; using System.IO; using IO; using OptimizedCore; using VariantAnnotation.Providers; namespace SAUtils.InputFileParsers { /// /// /// reads data version from a file that is expected to be found alongside each supplementary data file /// public sealed class DataSourceVersionReader : IDisposable { #region members private readonly StreamReader _reader; #endregion public void Dispose() { _reader.Dispose(); } /// /// constructor /// public DataSourceVersionReader(Stream fileStream) { _reader = new StreamReader(fileStream); } public static DataSourceVersion GetSourceVersion(string versionFileName) { if (!versionFileName.EndsWith(".version")) versionFileName += ".version"; if (!File.Exists(versionFileName)) { throw new FileNotFoundException(versionFileName); } var fileStream = FileUtilities.GetReadStream(versionFileName); return GetSourceVersion(fileStream); } private static DataSourceVersion GetSourceVersion(Stream versionFileStream) { using (var versionReader = new DataSourceVersionReader(versionFileStream)) { var version = versionReader.GetVersion(); return version; } } public DataSourceVersion GetVersion() { // NAME = dbSNP // VERSION = 147 // DATE = 2016-04-08 // DESCRIPTION = string line, name = null, version = null, date = null, description = null; while ((line = _reader.ReadLine()) != null) { (string key, string value) = line.OptimizedKeyValue(); if (key == null || value == null) continue; switch (key) { case "NAME": name = value; break; case "VERSION": version = value; break; case "DATE": date = value; break; case "DESCRIPTION": description = value; break; } } if (date == null) { date = DateTime.Now.ToString("yyyy-MM-dd"); Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine($"WARNING: Date was missing from the data source. Using {date} instead."); Console.ResetColor(); } return new DataSourceVersion(name, version, DateTime.Parse(date).Ticks, description); } } } ================================================ FILE: SAUtils/InputFileParsers/DbSnp/DbSnpReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using IO; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.InputFileParsers.DbSnp { public sealed class DbSnpReader : IDisposable { // Key in VCF info field of the allele frequencies subfield. private readonly Stream _stream; private readonly ISequenceProvider _sequenceProvider; public DbSnpReader(Stream stream, ISequenceProvider sequenceProvider) { _stream = stream; _sequenceProvider = sequenceProvider; } public IEnumerable GetItems() { using (var reader = FileUtilities.GetStreamReader(_stream)) { string line; while ((line = reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.OptimizedStartsWith('#')) continue; foreach (var dbSnpItem in ExtractItem(line)) { yield return dbSnpItem; } } } } /// /// Extracts a dbSNP item from the specified VCF line. /// /// /// public IEnumerable ExtractItem(string vcfLine) { var splitLine = vcfLine.Split('\t',6); if (splitLine.Length < 5) yield break; var chromosomeName = splitLine[VcfCommon.ChromIndex]; if (!_sequenceProvider.RefNameToChromosome.ContainsKey(chromosomeName)) yield break; var chromosome = _sequenceProvider.RefNameToChromosome[chromosomeName]; var position = int.Parse(splitLine[VcfCommon.PosIndex]); var dbSnpId = Convert.ToInt64(splitLine[VcfCommon.IdIndex].Substring(2)); var refAllele = splitLine[VcfCommon.RefIndex]; var altAlleles = splitLine[VcfCommon.AltIndex].OptimizedSplit(','); foreach (var altAllele in altAlleles) { var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(position, refAllele, altAllele, _sequenceProvider.Sequence); yield return new DbSnpItem(chromosome, shiftedPos, dbSnpId, shiftedRef, shiftedAlt, vcfLine); } } public void Dispose() { _stream?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/DbSnp/GlobalMinorReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using IO; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.Interface.IO; namespace SAUtils.InputFileParsers.DbSnp { public sealed class GlobalMinorReader { // Key in VCF info field of the allele frequencies subfield. private readonly Stream _stream; private readonly Dictionary _refChromDict; public GlobalMinorReader(Stream stream, Dictionary refChromDict) { _stream = stream; _refChromDict = refChromDict; } public IEnumerable GetItems() { using (var reader = FileUtilities.GetStreamReader(_stream)) { string line; while ((line = reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.OptimizedStartsWith('#')) continue; var items = ExtractItem(line); if (items == null || items.Count == 0) continue; foreach (var item in items) { yield return item; } } } } /// /// Extracts a dbSNP item from the specified VCF line. /// /// /// private List ExtractItem(string vcfline) { var splitLine = vcfline.OptimizedSplit('\t'); if (splitLine.Length < 8) return null; var chromosomeName = splitLine[VcfCommon.ChromIndex]; if (!_refChromDict.ContainsKey(chromosomeName)) return null; var chromosome = _refChromDict[chromosomeName]; var position = int.Parse(splitLine[VcfCommon.PosIndex]); var refAllele = splitLine[VcfCommon.RefIndex]; var altAlleles = splitLine[VcfCommon.AltIndex].OptimizedSplit(','); var infoField = splitLine[VcfCommon.InfoIndex]; var alleleFrequencies = GetAlleleFrequencies(infoField, refAllele, altAlleles); var frequencyItems = new List(); foreach ((string allele, double frequency) in alleleFrequencies) { frequencyItems.Add(new AlleleFrequencyItem(chromosome, position, refAllele, allele, frequency, vcfline)); } return frequencyItems; } private static Dictionary GetAlleleFrequencies(string infoField, string refAllele, string[] altAlleles) { var freqDict = new Dictionary { [refAllele] = double.MinValue }; foreach (var altAllele in altAlleles) { freqDict[altAllele] = double.MinValue; } if (infoField.Trim() == ".") return freqDict; // for now we also want to disregard anything other than SNVs var allSnv = refAllele.Length == 1 && altAlleles.All(altAllele => altAllele.Length == 1); if (!allSnv) return freqDict; // return if there are no freq information if (!infoField.Contains("CAF=")) return freqDict; foreach (var info in infoField.OptimizedSplit(';')) { if (!info.StartsWith("CAF=")) continue; var alleleFrequencies = info.OptimizedKeyValue().Value.OptimizedSplit(','); freqDict[refAllele] = GetFrequency(alleleFrequencies[0]); for (int i = 1; i < alleleFrequencies.Length; i++) freqDict[altAlleles[i - 1]] = GetFrequency(alleleFrequencies[i]); break; } return freqDict; } private static double GetFrequency(string alleleFrequency) { return alleleFrequency == "." || alleleFrequency == "0" ? double.MinValue : Convert.ToDouble(alleleFrequency); } } } ================================================ FILE: SAUtils/InputFileParsers/Decipher/DecipherParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using SAUtils.DataStructures; namespace SAUtils.InputFileParsers.Decipher { public sealed class DecipherParser : IDisposable { private const int ChromIndex = 1; private const int StartIndex = 2; private const int EndIndex = 3; private const int DelNumIndex = 4; private const int DelFreqIndex = 5; private const int DupNumIndex = 7; private const int DupFreqIndex = 8; private const int SampleSizeIndex = 14; private readonly StreamReader _reader; private readonly IDictionary _refNameDict; private int? _delNum; private double? _delFreq; private int? _dupNum; private double? _dupFreq; private int? _sampleSize; public DecipherParser(StreamReader reader, IDictionary refNameDict) { _reader = reader; _refNameDict = refNameDict; } public IEnumerable GetItems() { using (_reader) { string line; while ((line = _reader.ReadLine()) != null) { if (string.IsNullOrWhiteSpace(line) || line.OptimizedStartsWith('#')) continue; // #population_cnv_id chr start end deletion_observations deletion_frequency deletion_standard_error duplication_observations duplication_frequency duplication_standard_error observations frequency standard_error type sample_size study var splitLine = line.OptimizedSplit('\t'); string chromosomeName = splitLine[ChromIndex]; if(!_refNameDict.ContainsKey(chromosomeName)) continue; var chrom = _refNameDict[chromosomeName]; int start = int.Parse(splitLine[StartIndex]); int end = int.Parse(splitLine[EndIndex]); _delNum = int.Parse(splitLine[DelNumIndex]); _delFreq = double.Parse(splitLine[DelFreqIndex]); _dupNum = int.Parse(splitLine[DupNumIndex]); _dupFreq = double.Parse(splitLine[DupFreqIndex]); _sampleSize = int.Parse(splitLine[SampleSizeIndex]); var decipherItem = new DecipherItem(chrom, start, end, _delNum, _delFreq, _dupNum, _dupFreq, _sampleSize); yield return decipherItem; } } } public void Dispose() { _reader?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/Gme/GmeParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.InputFileParsers.Gme { public sealed class GmeParser : IDisposable { private readonly StreamReader _reader; private readonly Dictionary _refChromDict; private readonly ISequenceProvider _sequenceProvider; private int? _alleleCount; private int? _alleleNum; private double? _alleleFreq; public GmeParser(StreamReader streamReader, ISequenceProvider sequenceProvider) { _reader = streamReader; _sequenceProvider = sequenceProvider; _refChromDict = sequenceProvider.RefNameToChromosome; } private void Clear() { _alleleCount = null; _alleleNum = null; _alleleFreq = null; } public IEnumerable GetItems() { using (_reader) { string line; while ((line = _reader.ReadLine()) != null) { // file has been modified to 7 columns // #chrom pos ref alt filter GME_AC GME_AF if (string.IsNullOrWhiteSpace(line) || line.OptimizedStartsWith('#')) continue; var cols = line.OptimizedSplit('\t'); string ucscChrom = cols[0]; if(!_refChromDict.ContainsKey(ucscChrom)) continue; var chrom = _refChromDict[ucscChrom]; int position = int.Parse(cols[1]); var refAllele = cols[2]; var altAllele = cols[3]; var filters = cols[4]; var gmeAc = cols[5].OptimizedSplit(','); _alleleFreq = double.Parse(cols[6]); var failedFilter = !(filters.Equals("PASS") || filters.Equals(".")); var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(position, refAllele, altAllele, _sequenceProvider.Sequence); _alleleCount = Convert.ToInt32(gmeAc[0]); _alleleNum = Convert.ToInt32(gmeAc[0]) + Convert.ToInt32(gmeAc[1]); var gemItem = new GmeItem(chrom, shiftedPos, shiftedRef, shiftedAlt, _alleleCount, _alleleNum, _alleleFreq, failedFilter); yield return gemItem; } } } public void Dispose() => _reader?.Dispose(); } } ================================================ FILE: SAUtils/InputFileParsers/OneKGen/OneKGenReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using IO; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.InputFileParsers.OneKGen { public sealed class OneKGenReader :IDisposable { private readonly Stream _stream; private readonly Dictionary _refNameDictionary; private readonly ISequenceProvider _sequenceProvider; private string _ancestralAllele; private int? _allAlleleNumber; private int? _afrAlleleNumber; private int? _amrAlleleNumber; private int? _eurAlleleNumber; private int? _easAlleleNumber; private int? _sasAlleleNumber; private int[] _allAlleleCounts; private int[] _afrAlleleCounts; private int[] _amrAlleleCounts; private int[] _eurAlleleCounts; private int[] _easAlleleCounts; private int[] _sasAlleleCounts; // empty constructor for onekg reader for unit tests. public OneKGenReader(Stream stream, ISequenceProvider sequenceProvider) { _stream = stream; _sequenceProvider = sequenceProvider; _refNameDictionary = sequenceProvider.RefNameToChromosome; } private void Clear() { _ancestralAllele = null; _allAlleleNumber = null; _afrAlleleNumber = null; _amrAlleleNumber = null; _eurAlleleNumber = null; _easAlleleNumber = null; _sasAlleleNumber = null; _allAlleleCounts = null; _afrAlleleCounts = null; _amrAlleleCounts = null; _eurAlleleCounts = null; _easAlleleCounts = null; _sasAlleleCounts = null; // SV fields } public IEnumerable GetItems() { using (var reader = FileUtilities.GetStreamReader(_stream)) { string line; while ((line = reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.OptimizedStartsWith('#')) continue; foreach (var oneKGenItem in ExtractItems(line)) { yield return oneKGenItem; } } } } internal IEnumerable ExtractItems(string vcfLine) { var splitLine = vcfLine.OptimizedSplit('\t');// we don't care about the many fields after info field if (splitLine.Length < 8) yield break; Clear(); var chromosomeName = splitLine[VcfCommon.ChromIndex]; if (!_refNameDictionary.ContainsKey(chromosomeName)) yield break; var chromosome = _refNameDictionary[chromosomeName]; var position = int.Parse(splitLine[VcfCommon.PosIndex]);//we have to get it from RSPOS in info var refAllele = splitLine[VcfCommon.RefIndex]; var altAlleles = splitLine[VcfCommon.AltIndex].OptimizedSplit(','); var infoFields = splitLine[VcfCommon.InfoIndex]; // parses the info fields and extract frequencies, ancestral allele, allele counts, etc. var hasSymbolicAllele = altAlleles.Any(x => x.OptimizedStartsWith('<') && x.OptimizedEndsWith('>')); if (hasSymbolicAllele) yield break; // ReSharper disable once ConditionIsAlwaysTrueOrFalse ParseInfoField(infoFields, hasSymbolicAllele); for (var i = 0; i < altAlleles.Length; i++) { var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(position, refAllele, altAlleles[i], _sequenceProvider.Sequence); yield return new OneKGenItem( chromosome, shiftedPos, shiftedRef, shiftedAlt, _ancestralAllele, GetAlleleCount(_allAlleleCounts, i), GetAlleleCount(_afrAlleleCounts,i), GetAlleleCount(_amrAlleleCounts,i), GetAlleleCount(_eurAlleleCounts,i), GetAlleleCount(_easAlleleCounts,i), GetAlleleCount(_sasAlleleCounts, i), _allAlleleNumber, _afrAlleleNumber, _amrAlleleNumber, _eurAlleleNumber, _easAlleleNumber, _sasAlleleNumber ); } } private static int? GetAlleleCount(int[] alleleCounts, int i) { if (alleleCounts == null) return null; if (i >= alleleCounts.Length) return null; return alleleCounts[i]; } private void ParseInfoField(string infoFields, bool hasSymbolicAllele) { if (infoFields == "" || infoFields == ".") return; var infoItems = infoFields.OptimizedSplit(';'); foreach (string infoItem in infoItems) { (string key, string value) = infoItem.OptimizedKeyValue(); // sanity check if (value != null) SetInfoField(key, value, hasSymbolicAllele); } } private void SetInfoField(string vcfAfId, string value, bool hasSymbolicAllele) { switch (vcfAfId) { case "AA": _ancestralAllele = GetAncestralAllele(value); break; // the following are for SVs case "SVTYPE": if (hasSymbolicAllele) { } break; case "END": if (hasSymbolicAllele) { } break; case "CIEND": case "CIPOS": break; case "AN": _allAlleleNumber = Convert.ToInt32(value); break; case "AFR_AN": _afrAlleleNumber = Convert.ToInt32(value); break; case "AMR_AN": _amrAlleleNumber = Convert.ToInt32(value); break; case "EUR_AN": _eurAlleleNumber = Convert.ToInt32(value); break; case "EAS_AN": _easAlleleNumber = Convert.ToInt32(value); break; case "SAS_AN": _sasAlleleNumber = Convert.ToInt32(value); break; case "AC": _allAlleleCounts = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AMR_AC": _amrAlleleCounts = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AFR_AC": _afrAlleleCounts = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "EUR_AC": _eurAlleleCounts = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "EAS_AC": _easAlleleCounts = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "SAS_AC": _sasAlleleCounts = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; } } private static string GetAncestralAllele(string value) { if (value == "" || value == ".") return null; var ancestralAllele = value.OptimizedSplit('|')[0]; if (string.IsNullOrEmpty(ancestralAllele)) return null; return ancestralAllele.All(IsNucleotide) ? ancestralAllele : null; } private static bool IsNucleotide(char c) { c = char.ToUpper(c); return c == 'A' || c == 'C' || c == 'G' || c == 'T' || c == 'N'; } public void Dispose() { _stream?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/OneKGen/RefMinorReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.InputFileParsers.OneKGen { public sealed class RefMinorReader:IDisposable { private readonly StreamReader _reader; private readonly Dictionary _refNameDictionary; private readonly ISequenceProvider _sequenceProvider; private int? _allAlleleNumber; private int[] _allAlleleCounts; public RefMinorReader(StreamReader reader, ISequenceProvider sequenceProvider) { _reader = reader; _sequenceProvider = sequenceProvider; _refNameDictionary = sequenceProvider.RefNameToChromosome; } private void Clear() { _allAlleleNumber = null; _allAlleleCounts = null; } public IEnumerable GetItems() { using (var reader = _reader) { string line; while ((line = reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.OptimizedStartsWith('#')) continue; var items = ExtractItems(line); if (items == null) continue; foreach (var item in items) { yield return item; } } } } private List ExtractItems(string vcfLine) { var splitLine = vcfLine.Split(new[] { '\t' }, 9);// we don't care about the many fields after info field if (splitLine.Length < 8) return null; Clear(); var chromosomeName = splitLine[VcfCommon.ChromIndex]; if (!_refNameDictionary.ContainsKey(chromosomeName)) return null; var chromosome = _refNameDictionary[chromosomeName]; var position = int.Parse(splitLine[VcfCommon.PosIndex]);//we have to get it from RSPOS in info var refAllele = splitLine[VcfCommon.RefIndex]; var altAlleles = splitLine[VcfCommon.AltIndex].OptimizedSplit(','); var infoFields = splitLine[VcfCommon.InfoIndex]; // parses the info fields and extract frequencies, ancestral allele, allele counts, etc. ParseInfoField(infoFields); if (_allAlleleNumber == null) return null; var items = new List(); for (var i = 0; i < altAlleles.Length; i++) { var alleleCount = GetAlleleCount(_allAlleleCounts, i); if (alleleCount == null || alleleCount==0) continue; var frequency = 1.0* alleleCount.Value/ _allAlleleNumber.Value ; var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(position, refAllele, altAlleles[i], _sequenceProvider.Sequence); items.Add(new AlleleFrequencyItem(chromosome, shiftedPos,shiftedRef, shiftedAlt, frequency, vcfLine)); } return items.Count>0? items: null; } private static int? GetAlleleCount(int[] alleleCounts, int i) { if (alleleCounts == null) return null; if (i >= alleleCounts.Length) return null; return alleleCounts[i]; } private void ParseInfoField(string infoFields) { if (infoFields == "" || infoFields == ".") return; var infoItems = infoFields.OptimizedSplit(';'); foreach (string infoItem in infoItems) { (string key, string value) = infoItem.OptimizedKeyValue(); switch (key) { case "AN": _allAlleleNumber = Convert.ToInt32(value); break; case "AC": _allAlleleCounts = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; } } } public void Dispose() { _reader?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/OneKGen/oneKGenSvReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using OptimizedCore; using SAUtils.DataStructures; using SAUtils.ParseUtils; namespace SAUtils.InputFileParsers.OneKGen { public sealed class OneKGenSvReader:IDisposable { private const int ChromIndex = 0; private const int StartIndex = 1; private const int EndIndex = 2; private const int IdIndex = 3; private const int AltIndex = 4; private const int InfoIndex = 5; private readonly StreamReader _reader; private readonly Dictionary _refNameDict; private string _svType; private int? _allAlleleNumber; private int? _allAlleleCount; private double? _allAlleleFrequency; private double? _afrAlleleFrequency; private double? _amrAlleleFrequency; private double? _eurAlleleFrequency; private double? _easAlleleFrequency; private double? _sasAlleleFrequency; public OneKGenSvReader(StreamReader reader, Dictionary refNameDict) { _reader = reader; _refNameDict = refNameDict; } public IEnumerable GetItems() { string line; while ((line = _reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.OptimizedStartsWith('#')) continue; var oneKSvGenItem = ExtractOneKGenSvItem(line); if (oneKSvGenItem == null) continue; yield return oneKSvGenItem; } } private void Clear() { _allAlleleNumber = null; _allAlleleFrequency = null; _afrAlleleFrequency = null; _amrAlleleFrequency = null; _eurAlleleFrequency = null; _easAlleleFrequency = null; _sasAlleleFrequency = null; _svType = null; } private OnekGenSvItem ExtractOneKGenSvItem(string line) { var splitLine = line.OptimizedSplit('\t'); string altAllele = splitLine[AltIndex]; if (altAllele.StartsWith(" id != ".")); } private void ParseInfoField(string infoFields) { if (infoFields == "" || infoFields == ".") return; var infoItems = infoFields.OptimizedSplit(';'); foreach (string infoItem in infoItems) { (string key, string value) = infoItem.OptimizedKeyValue(); // sanity check if (value != null) SetInfoField(key, value); } } //1 668630 esv3584976 G 100 PASS AC=64;AF=0.0127796;AN=5008;CIEND=-150,150;CIPOS=-150,150;CS=DUP_delly;END=850204;NS=2504;SVTYPE=DUP;IMPRECISE;DP=22135;EAS_AF=0.0595;AMR_AF=0;AFR_AF=0.0015;EUR_AF=0.001;SAS_AF=0.001;VT=SV;EX_TARGET private void SetInfoField(string vcfAfId, string value) { switch (vcfAfId) { case "SVTYPE": _svType = value;// for SVs there is only one value in SVTYPE break; case "AN": _allAlleleNumber = Convert.ToInt32(value); break; case "AC": _allAlleleCount = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).Sum(); break; case "AF": _allAlleleFrequency = value.OptimizedSplit(',').Select(Convert.ToDouble).Sum(); break; case "AMR_AF": _amrAlleleFrequency = value.OptimizedSplit(',').Select(Convert.ToDouble).Sum(); break; case "AFR_AF": _afrAlleleFrequency = value.OptimizedSplit(',').Select(Convert.ToDouble).Sum(); break; case "EUR_AF": _eurAlleleFrequency = value.OptimizedSplit(',').Select(Convert.ToDouble).Sum(); break; case "EAS_AF": _easAlleleFrequency = value.OptimizedSplit(',').Select(Convert.ToDouble).Sum(); break; case "SAS_AF": _sasAlleleFrequency = value.OptimizedSplit(',').Select(Convert.ToDouble).Sum(); break; } } public void Dispose() { _reader?.Dispose(); } } } ================================================ FILE: SAUtils/InputFileParsers/SequenceExtensions.cs ================================================ using Genome; namespace SAUtils.InputFileParsers { public static class SequenceExtensions { public static bool Validate( this ISequence referenceSequence, int start, int end, string testSequence) { var expSequence = referenceSequence.Substring(start - 1, end - start + 1); return testSequence == expSequence; } } } ================================================ FILE: SAUtils/InputFileParsers/TOPMed/TopMedReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.InputFileParsers.TOPMed { public sealed class TopMedReader : IDisposable { private readonly StreamReader _reader; private readonly Dictionary _refChromDict; private readonly ISequenceProvider _sequenceProvider; private int? _alleleNum; private int? _alleleCount; private int? _homCount; public TopMedReader(StreamReader streamReader, ISequenceProvider sequenceProvider) { _reader = streamReader; _sequenceProvider = sequenceProvider; _refChromDict = sequenceProvider.RefNameToChromosome; } private void Clear() { _alleleNum = null; _alleleCount = null; _homCount = null; } public IEnumerable GetItems() { using (_reader) { string line; while ((line = _reader.ReadLine()) != null) { if (string.IsNullOrWhiteSpace(line) || line.OptimizedStartsWith('#')) continue; var topMedItem = ExtractItems(line); if (topMedItem == null) continue; yield return topMedItem; } } } private TopMedItem ExtractItems(string vcfLine) { if (vcfLine == null) return null; var splitLine = vcfLine.OptimizedSplit('\t'); if (splitLine.Length < 8) return null; Clear(); // chr1 10169 TOPMed_freeze_5?chr1:10,169 T C 255 SVM VRT=1;NS=62784;AN=125568;AC=20;AF=0.000159276;Het=20;Hom=0 NA:FRQ 125568:0.000159276 var chromosome = splitLine[VcfCommon.ChromIndex]; if (!_refChromDict.ContainsKey(chromosome)) return null; var chrom = _refChromDict[chromosome]; var position = int.Parse(splitLine[VcfCommon.PosIndex]);//we have to get it from RSPOS in info var refAllele = splitLine[VcfCommon.RefIndex]; var altAllele = splitLine[VcfCommon.AltIndex]; var filters = splitLine[VcfCommon.FilterIndex]; var infoFields = splitLine[VcfCommon.InfoIndex]; if (altAllele.Contains(",")) { Console.WriteLine(vcfLine); throw new InvalidDataException("het site found!!"); } var failedFilter = !(filters.Equals("PASS") || filters.Equals(".")); ParseInfoField(infoFields); if (_alleleNum == 0) return null; var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(position, refAllele, altAllele, _sequenceProvider.Sequence); return new TopMedItem(chrom, shiftedPos, shiftedRef, shiftedAlt, _alleleNum, _alleleCount, _homCount, failedFilter); } private void ParseInfoField(string infoFields) { if (infoFields == "" || infoFields == ".") return; var infoItems = infoFields.OptimizedSplit(';'); foreach (string infoItem in infoItems) { (string key, string value) = infoItem.OptimizedKeyValue(); // sanity check if (value != null) SetInfoField(key, value); } } private void SetInfoField(string vcfId, string value) { // VRT=1;NS=62784;AN=125568;AC=20;AF=0.000159276;Het=20;Hom=0 switch (vcfId) { case "AN": _alleleNum = Convert.ToInt32(value); break; case "AC": _alleleCount = Convert.ToInt32(value); break; case "Hom": _homCount = Convert.ToInt32(value); break; } } public void Dispose() => _reader?.Dispose(); } } ================================================ FILE: SAUtils/MakeAaDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.MakeAaDb { public static class Main { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input VCF file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "OneK Gen VCFfile", "--in") .CheckInputFilenameExists(_inputFile, "OneK Gen VCFfile", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("create Ancestral allele database from 1000Genomes data", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}_ancestralAlleles".Replace(' ','_'); using (var ancestralAlleleReader = new AncestralAlleleReader(GZipUtilities.GetAppropriateStreamReader(_inputFile), referenceProvider)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var writer = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.AncestralAlleleTag, true, false, SaCommon.SchemaVersion, true)) { writer.Write(ancestralAlleleReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/MakeClinGenDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.ClinGen; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.MakeClinGenDb { public static class Main { private static string _inputFileName; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "ClinGen VCFfile", v => _inputFileName = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFileName, "ClinGen VCFfile", "--in") .CheckInputFilenameExists(_inputFileName, "ClinGen VCFfile", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with ClinVar annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFileName + ".version"); string outFileName = $"{version.Name}_{version.Version}"; using (var clinGenReader = new ClinGenReader(GZipUtilities.GetAppropriateStreamReader(_inputFileName), referenceProvider.RefNameToChromosome)) using (var nsiStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.IntervalFileSuffix))) using (var nsiWriter = new NsiWriter(nsiStream, version, referenceProvider.Assembly, SaCommon.ClinGenTag, ReportFor.StructuralVariants, SaCommon.SchemaVersion)) { nsiWriter.Write(clinGenReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/MitoHeteroplasmy/MitoHeteroplasmyDb.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; namespace SAUtils.MitoHeteroplasmy { public static class MitoHeteroplasmyDb { private static string _inputFile; private static string _outputDirectory; private const string OutFileName = "MitoHeteroplasmy.tsv"; private const string HeaderLine = "#POS\tREF\tALT\tVRFs\tAlleleDepths"; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "in|i=", "input BED file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_inputFile, "Mitochondrial Heteroplasmy BED file", "--in") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a TSV file with mitochondrial heteroplasmy information", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { using var mitoHeteroplasmyParser = new MitoHeteroplasmyParser(GZipUtilities.GetAppropriateReadStream(_inputFile)); using var tsvStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, OutFileName)); using var tsvWriter = new StreamWriter(tsvStream); tsvWriter.WriteLine(HeaderLine); foreach(var line in mitoHeteroplasmyParser.GetOutputLines()) tsvWriter.WriteLine(line); return ExitCodes.Success; } } } ================================================ FILE: SAUtils/MitoHeteroplasmy/MitoHeteroplasmyParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using IO; using Newtonsoft.Json; using OptimizedCore; namespace SAUtils.MitoHeteroplasmy { public sealed class MitoHeteroplasmyParser : IDisposable { private readonly Stream _stream; public MitoHeteroplasmyParser(Stream stream) { _stream = stream; } public void Dispose() { _stream?.Dispose(); } public IEnumerable GetOutputLines() { using var reader = FileUtilities.GetStreamReader(_stream); string line; while ((line = reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments, headers if (line.OptimizedStartsWith('#')) continue; foreach (string item in ExtractItems(line)) yield return item; } } //MT 5 6 {"C:A":{"ad":[1],"allele_type":"alt","vrf":[0.006329113924050633],"vrf_stats":{"kurtosis":241.00408163265314,"max":0.0063291139240506328,"mean":2.5728105382319646e-05,"min":0.0,"nobs":246,"skewness":15.588588185998534,"stdev":0.00040352956522996095,"variance":1.6283611001468132e-07}}} private static IEnumerable ExtractItems(string line) { var splits = line.Split('\t'); if (splits.Length < 4) yield break; var position = int.Parse(splits[1]) + 1; // since this is a bed file var info = splits[3]; var stats = DeserializeStats(info); foreach ((string refAllele, string altAllele, AlleleStats alleleStats) in GetAlleleStats(stats)) { (string formattedVrfs, string alleleDepths) = MergeAndSortByVrf(alleleStats); yield return string.Join('\t', position, refAllele, altAllele, formattedVrfs, alleleDepths); } } private static (string formattedVrfs, string alleleDepths) MergeAndSortByVrf(AlleleStats alleleStats) { var vrfToAd = new Dictionary(); foreach ((string vrf, int ad) in alleleStats.vrf.Select(x => x.ToString("0.###")) .Zip(alleleStats.ad, (a, b) => (a, b))) { if (vrfToAd.ContainsKey(vrf)) vrfToAd[vrf] += ad; else vrfToAd[vrf] = ad; } var formattedVrfs = new string[vrfToAd.Count]; var alleleDepths = new int[vrfToAd.Count]; var i = 0; foreach (var vrf in vrfToAd.Keys.OrderBy(x => double.Parse(x))) { formattedVrfs[i] = vrf; alleleDepths[i] = vrfToAd[vrf]; i++; } return (string.Join(',',formattedVrfs), string.Join(',', alleleDepths)); } private static IEnumerable<(string, string, AlleleStats)> GetAlleleStats(PositionStats stats) { if (stats.A_C != null) yield return ("A", "C", stats.A_C); if (stats.A_G != null) yield return ("A", "G", stats.A_G); if (stats.A_T != null) yield return ("A", "T", stats.A_T); if (stats.C_A != null) yield return ("C", "A", stats.C_A); if (stats.C_G != null) yield return ("C", "G", stats.C_G); if (stats.C_T != null) yield return ("C", "T", stats.C_T); if (stats.G_A != null) yield return ("G", "A", stats.G_A); if (stats.G_C != null) yield return ("G", "C", stats.G_C); if (stats.G_T != null) yield return ("G", "T", stats.G_T); if (stats.T_A != null) yield return ("T", "A", stats.T_A); if (stats.T_C != null) yield return ("T", "C", stats.T_C); if (stats.T_G != null) yield return ("T", "G", stats.T_G); } public static PositionStats DeserializeStats(string s) { var charArray = s.ToCharArray(); for (var i = 0; i < charArray.Length - 3; i++) { if (IsNucleotide(charArray[i]) && charArray[i + 1] == ':' && IsNucleotide(charArray[i + 2])) charArray[i + 1] = '_'; } return JsonConvert.DeserializeObject(new string(charArray)); } private static bool IsNucleotide(char c) { return c == 'A' || c == 'C' || c == 'G' || c == 'T'; } } } ================================================ FILE: SAUtils/MitoHeteroplasmy/StatClasses.cs ================================================ namespace SAUtils.MitoHeteroplasmy { //{ //"ad": [1], //"allele_type": "alt", //"vrf": [0.004273504273504274], //"vrf_stats": { // "kurtosis": 241.00408163265314, // "max": 0.0042735042735042739, // "mean": 1.7371968591480788e-05, // "min": 0.0, // "nobs": 246, // "skewness": 15.588588185998535, // "stdev": 0.00027246868079629845, // "variance": 7.4239182014875175e-08 //} //} public sealed class PositionStats { public AlleleStats A_C; public AlleleStats A_G; public AlleleStats A_T; public AlleleStats C_A; public AlleleStats C_G; public AlleleStats C_T; public AlleleStats G_C; public AlleleStats G_A; public AlleleStats G_T; public AlleleStats T_C; public AlleleStats T_G; public AlleleStats T_A; } public class AlleleStats { public int[] ad; public double[] vrf; } } ================================================ FILE: SAUtils/MitoMap/CircularGenomeModel.cs ================================================ using System.Collections.Generic; using Genome; namespace SAUtils.MitoMap { public sealed class CircularGenomeModel { private readonly int _genomeLength; private readonly ISequence _compressedSequence; public CircularGenomeModel(ISequence compressedSequence) { _compressedSequence = compressedSequence; _genomeLength = compressedSequence.Length; } // convert linear pseudogenome position back to the circular genome position private (int, int) PseudoToCircular((int, int) interval) => (GetCircularPosition(interval.Item1), GetCircularPosition(interval.Item2)); private int GetCircularPosition(int posi) => (posi - 1) % _genomeLength + 1; // translate the genomic interval that may overlap with the origin of the genome, no matter on circular genome or linear pseudo genome, into interval(s) not crossing the origin private List<(int, int)> SplitInterval((int, int) interval) { var (circularStart, circularEnd) = PseudoToCircular(interval); var intervalList = new List<(int, int)>(); if (circularEnd >= circularStart) intervalList.Add((circularStart, circularEnd)); else { intervalList.Add((circularStart, _genomeLength)); intervalList.Add((1, circularEnd)); } return intervalList; } public string ExtractIntervalSequence((int, int) interval) { var subSequence = ""; SplitInterval(interval).ForEach(x => subSequence += _compressedSequence.Substring(x.Item1 - 1, x.Item2 - x.Item1 + 1)); return subSequence; } } } ================================================ FILE: SAUtils/MitoMap/MitoMapDatabaseUtilities.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using OptimizedCore; namespace SAUtils.MitoMap { internal static class MitoMapDatabaseUtilities { private const string ReferenceQueryPrefix = "COPY mitomap.reference ("; public static MitoMapInputDb Create(string mitoMapDatabase) { var internalReferenceIdToPubmedId = new Dictionary(); using (var stream = new FileStream(mitoMapDatabase, FileMode.Open)) using(var gzStream = new GZipStream(stream, CompressionMode.Decompress)) using (var reader = new StreamReader(gzStream)) { string line; MitoMapTable currentTable = 0; while ((line = reader.ReadLine()) != null) { if (line == "\\.") { currentTable = 0; continue; } switch (currentTable) { case 0: currentTable = TryGetTable(line); continue; case MitoMapTable.Reference: ProcessReferenceInfo(line, internalReferenceIdToPubmedId); break; default: throw new ArgumentOutOfRangeException(); } } } return new MitoMapInputDb(internalReferenceIdToPubmedId); } private static void ProcessReferenceInfo(string line, Dictionary internalReferenceIdToPubmedId) { var fields = line.OptimizedSplit('\t'); if (fields.Length != 14) throw new InvalidDataException($"Invalid reference table record: {line}"); internalReferenceIdToPubmedId[fields[0]] = fields[13]; } private static MitoMapTable TryGetTable(string line) { return line.StartsWith(ReferenceQueryPrefix) ? MitoMapTable.Reference : 0; } } public enum MitoMapTable { Reference = 1 } } ================================================ FILE: SAUtils/MitoMap/MitoMapInputDb.cs ================================================ using System.Collections.Generic; namespace SAUtils.MitoMap { public struct MitoMapInputDb { public Dictionary InternalReferenceIdToPubmedId { get; } public MitoMapInputDb(Dictionary internalReferenceIdToPubmedId) { InternalReferenceIdToPubmedId = internalReferenceIdToPubmedId; } } } ================================================ FILE: SAUtils/MitoMap/MitoMapItem.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using OptimizedCore; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.MitoMap { public static class MitoMapDataTypes { public const string MitoMapMutationsCodingControl = "MutationsCodingControl"; public const string MitoMapMutationsRNA = "MutationsRNA"; public const string MitoMapPolymorphismsCoding = "PolymorphismsCoding"; public const string MitoMapPolymorphismsControl = "PolymorphismsControl"; public const string MitoMapDeletionsSingle = "DeletionsSingle"; public const string MitoMapInsertionsSimple = "InsertionsSimple"; } public static class MitoDLoop { public const int Start = 16024; // ReSharper disable once UnusedMember.Global public const int End = 576; } public static class MitomapParsingParameters { public const int LargeDeletionCutoff = 100; } public sealed class MitoMapItem : ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } private readonly List _diseases; private readonly bool? _homoplasmy; private readonly bool? _heteroplasmy; private readonly string _status; private readonly string _clinicalSignificance; private readonly string _scorePercentile; private readonly int _numGenBankFullLengthSeqs; private readonly List _pubMedIds; public MitoMapItem(Chromosome chromosome, int posi, string refAllele, string altAllele, List diseases, bool? homoplasmy, bool? heteroplasmy, string status, string clinicalSignificance, string scorePercentile, ISequenceProvider sequenceProvider, int numGenBankFullLengthSeqs, List pubMedIds) { Chromosome = chromosome; Position = posi; if (sequenceProvider == null) { RefAllele = refAllele; AltAllele = altAllele; } else { (Position, RefAllele, AltAllele) = TryAddPaddingBase(refAllele, altAllele, Position, sequenceProvider); } _diseases = diseases; _homoplasmy = homoplasmy; _heteroplasmy = heteroplasmy; _status = status; _clinicalSignificance = clinicalSignificance; _scorePercentile = scorePercentile; _numGenBankFullLengthSeqs = numGenBankFullLengthSeqs; _pubMedIds = pubMedIds; } private static (int, string, string) TryAddPaddingBase(string refAllele, string altAllele, int position, ISequenceProvider sequenceProvider) { // insertion if (IsEmptyOrDash(refAllele)) return AddPaddingBase(altAllele, true, position, sequenceProvider); // deletion return IsEmptyOrDash(altAllele) ? AddPaddingBase(refAllele, false, position, sequenceProvider) : (position, refAllele, altAllele); } private static (int, string, string) AddPaddingBase(string allele, bool isInsertion, int position, ISequenceProvider sequenceProvider) { string paddingBase = sequenceProvider.Sequence.Substring(position - 2, 1); return isInsertion ? (position - 1, paddingBase, paddingBase + allele) : (position - 1, paddingBase + allele, paddingBase); } private static bool IsEmptyOrDash(string allele) => string.IsNullOrEmpty(allele) || allele == "-"; public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); if (string.IsNullOrEmpty(RefAllele)) RefAllele = "-"; if (string.IsNullOrEmpty(AltAllele)) AltAllele = "-"; jsonObject.AddStringValue("refAllele", RefAllele); jsonObject.AddStringValue("altAllele", AltAllele); jsonObject.AddStringValues("diseases", _diseases?.Distinct()); if (_homoplasmy.HasValue) jsonObject.AddBoolValue("hasHomoplasmy", _homoplasmy.Value, true); if (_heteroplasmy.HasValue) jsonObject.AddBoolValue("hasHeteroplasmy", _heteroplasmy.Value, true); jsonObject.AddStringValue("status", _status); jsonObject.AddStringValue("clinicalSignificance", _clinicalSignificance); jsonObject.AddStringValue("scorePercentile", _scorePercentile, false); jsonObject.AddIntValue("numGenBankFullLengthSeqs", _numGenBankFullLengthSeqs); jsonObject.AddStringValues("pubMedIds", _pubMedIds); return StringBuilderPool.GetStringAndReturn(sb); } public string InputLine { get; set; } public static Dictionary<(string, string), MitoMapItem> AggregatedMutationsSamePosition(IEnumerable mitoMapMutItems) { var aggregatedMutations = new Dictionary<(string, string), MitoMapItem>(); foreach (var mitoMapMutItem in mitoMapMutItems) { var mutation = (mitoMapMutItem.RefAllele, mitoMapMutItem.AltAllele); if (aggregatedMutations.ContainsKey(mutation)) { var mergedItem = Merge(aggregatedMutations[mutation], mitoMapMutItem); if (mergedItem == null) continue; aggregatedMutations[mutation] = mergedItem; } else aggregatedMutations[mutation] = mitoMapMutItem; } return aggregatedMutations; } private static MitoMapItem Merge(MitoMapItem mitoMapItem1, MitoMapItem mitoMapItem2) { if (HasConflictValue(mitoMapItem1.Chromosome, mitoMapItem2.Chromosome) || HasConflictValue(mitoMapItem1.Position, mitoMapItem2.Position) || HasConflictValue(mitoMapItem1.RefAllele, mitoMapItem2.RefAllele) || HasConflictValue(mitoMapItem1.AltAllele, mitoMapItem2.AltAllele) || HasConflictValue(mitoMapItem1._homoplasmy, mitoMapItem2._homoplasmy) || HasConflictValue(mitoMapItem1._heteroplasmy, mitoMapItem2._heteroplasmy) || HasConflictValue(mitoMapItem1._status, mitoMapItem2._status) || HasConflictValue(mitoMapItem1._clinicalSignificance, mitoMapItem2._clinicalSignificance) || HasConflictValue(mitoMapItem1._scorePercentile, mitoMapItem2._scorePercentile)) { throw new InvalidDataException($"Conflict found at {mitoMapItem1.Position} when updating MITOMAP record: first record: {mitoMapItem1.GetJsonString()}; second record: {mitoMapItem2.GetJsonString()} "); } var homoplasmy = mitoMapItem1._homoplasmy ?? mitoMapItem2._homoplasmy; var heteroplasmy = mitoMapItem1._heteroplasmy ?? mitoMapItem2._heteroplasmy; string alleleInfo = $"{mitoMapItem1.Position} (Ref: {mitoMapItem1.RefAllele}, Alt: {mitoMapItem1.AltAllele})"; var diseases = MergeCollections(mitoMapItem1._diseases, mitoMapItem2._diseases, alleleInfo).ToList(); var pubMedIds = MergeCollections(mitoMapItem1._pubMedIds, mitoMapItem2._pubMedIds, alleleInfo).ToList(); var status = mitoMapItem1._status ?? mitoMapItem2._status; var clinicalSignificance = mitoMapItem1._clinicalSignificance ?? mitoMapItem2._clinicalSignificance; var scorePercentile = mitoMapItem1._scorePercentile ?? mitoMapItem2._scorePercentile; var numFullLengthSequences = Math.Max(mitoMapItem1._numGenBankFullLengthSeqs, mitoMapItem2._numGenBankFullLengthSeqs); return new MitoMapItem(mitoMapItem1.Chromosome, mitoMapItem1.Position, mitoMapItem1.RefAllele, mitoMapItem1.AltAllele, diseases, homoplasmy, heteroplasmy, status, clinicalSignificance, scorePercentile, null, numFullLengthSequences, pubMedIds); } private static IEnumerable MergeCollections(ICollection collection1, ICollection collection2, string alleleInfo) { if (IsNullOrEmpty(collection1) || IsNullOrEmpty(collection2)) return (collection1?.Count ?? -1) > 0 ? collection1 : collection2 ?? Enumerable.Empty(); Console.WriteLine($"Merge data at {alleleInfo}: {string.Join(",", collection1)} and {string.Join(",", collection2)}"); return collection1.Concat(collection2).Distinct(); } private static bool HasConflictValue(T originalValue, T newValue) { bool hasConflict = !IsNullOrEmpty(originalValue) && !IsNullOrEmpty(newValue) && !originalValue.Equals(newValue); if (hasConflict) Console.WriteLine($"Conflict found: {originalValue}, {newValue}"); return hasConflict; } private static bool IsNullOrEmpty(T value) { if (typeof(T) == typeof(string)) return string.IsNullOrEmpty(value as string); return value == null || value.Equals(default(T)); } } } ================================================ FILE: SAUtils/MitoMap/MitoMapSvItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; using Variants; namespace SAUtils.MitoMap { public sealed class MitoMapSvItem : ISuppIntervalItem { public int Start { get; } public int End { get; } public Chromosome Chromosome { get; } private VariantType VariantType { get; } public MitoMapSvItem(Chromosome chromosome, int start, int end, VariantType variantType) { Chromosome = chromosome; Start = start; End = end; VariantType = variantType; } public string GetJsonString() { var sb= StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); // data section jsonObject.AddStringValue("chromosome", Chromosome.EnsemblName); jsonObject.AddIntValue("begin", Start); jsonObject.AddIntValue("end", End); jsonObject.AddStringValue("variantType", VariantType.ToString()); return StringBuilderPool.GetStringAndReturn(sb); } } } ================================================ FILE: SAUtils/MitoMap/MitoMapSvReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text.RegularExpressions; using ErrorHandling.Exceptions; using Genome; using IO; using OptimizedCore; using SAUtils.InputFileParsers.ClinVar; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.MitoMap { public sealed class MitoMapSvReader { private readonly FileInfo _mitoMapFileInfo; private readonly string _dataType; private readonly ISequenceProvider _sequenceProvider; private readonly VariantAligner _variantAligner; private readonly Chromosome _chromosome; private readonly HashSet _mitoMapSvDataTypes = new HashSet { MitoMapDataTypes.MitoMapDeletionsSingle, MitoMapDataTypes.MitoMapInsertionsSimple }; public MitoMapSvReader(FileInfo mitoMapFileInfo, ISequenceProvider sequenceProvider) { _mitoMapFileInfo = mitoMapFileInfo; _dataType = GetDataType(); _sequenceProvider = sequenceProvider; _chromosome = sequenceProvider?.RefNameToChromosome["chrM"] ; _variantAligner = new VariantAligner(sequenceProvider?.Sequence); } private string GetDataType() { string dataType = _mitoMapFileInfo.Name.Replace(".html", null, StringComparison.Ordinal); if (!_mitoMapSvDataTypes.Contains(dataType)) throw new InvalidFileFormatException($"Unexpected data file: {_mitoMapFileInfo.Name}"); return dataType; } private IEnumerable GetMitoMapSvItems() { bool isDataLine = false; using (var reader = FileUtilities.GetStreamReader(FileUtilities.GetReadStream(_mitoMapFileInfo.FullName))) { string line; while ((line = reader.ReadLine()) != null) { line = line.Trim(); if (!isDataLine) { if (line == "\"data\":[") isDataLine = true; continue; } // last item if (line.OptimizedStartsWith('[') && line.EndsWith("]],", StringComparison.Ordinal)) isDataLine = false; foreach (var supplementaryIntervalItem in ParseLine(line)) { yield return supplementaryIntervalItem; } } } } internal List ParseLine(string line) { // line validation if (!(line.OptimizedStartsWith('[') && line.EndsWith("],", StringComparison.Ordinal))) throw new InvalidFileFormatException($"Data line doesn't start with \"[\" or end with \"],\": {line}"); var info = line.TrimEnd(',').TrimEnd(']').Trim('[', ']').Split("\",\"").Select(x => x.Trim('"')).ToList(); return _dataType == MitoMapDataTypes.MitoMapInsertionsSimple ? ExtractSvItemFromSimpleInsertions(info) : ExtractSvItemFromDeletionsSingle(info); } private List ExtractSvItemFromDeletionsSingle(List info) { var junctions = info[0].OptimizedSplit(':').Select(int.Parse).ToList(); var start = junctions[0] + 1; var end = junctions[1] - 1; if (end < start) throw new ArgumentOutOfRangeException($"Deletions with end position smaller than start position: start: {start}, end: {end}"); var calculatedSize = end - start + 1; var size = int.Parse(info[1].Substring(1)); if (size <= MitomapParsingParameters.LargeDeletionCutoff) return new List(); if (calculatedSize != size) Console.WriteLine($"Incorrect size of deleted region: size of {start}-{end} should be {calculatedSize}, provided size is {size}. Provided size is used."); var refSequence = _sequenceProvider.Sequence.Substring(start - 1, size); var newStart = _variantAligner.LeftAlign(start, refSequence, "").Item1; if (start != newStart) Console.WriteLine($"Deletion of {size} bps. Original start start position: {start}; new position after left-alignment {newStart}."); var mitoMapSvItem = new MitoMapSvItem(_chromosome, newStart, newStart + size - 1, VariantType.deletion); return new List { mitoMapSvItem }; } // extract large insertions from this file private List ExtractSvItemFromSimpleInsertions(IReadOnlyList info) { var mitoMapSvItems = new List(); var altAlleleInfo = info[2]; var dLoopPattern = new Regex(@"(?^\d+)-(?(\d+)) D-Loop region"); var dLoopMatch = dLoopPattern.Match(altAlleleInfo); // not a large insertion if (!dLoopMatch.Success) return mitoMapSvItems; var genomeStart = MitoDLoop.Start + int.Parse(dLoopMatch.Groups["start"].Value) - 1; var genomeEnd = MitoDLoop.Start + int.Parse(dLoopMatch.Groups["end"].Value) - 1; if (genomeEnd < genomeStart) throw new ArgumentOutOfRangeException($"Duplication with end position smaller than start position: start: {genomeStart}, end: {genomeEnd}"); var size = genomeEnd - genomeStart + 1; var refSequence = _sequenceProvider.Sequence.Substring(genomeStart - 1, size); var leftAlignResults = _variantAligner.LeftAlign(genomeStart, refSequence, refSequence + refSequence); // duplication var newStart = leftAlignResults.Item1; if (genomeStart != newStart) Console.WriteLine($"Duplication of {size} bps. Original start start position: {genomeStart}; new position after left-alignment {newStart}."); var mitoMapSvItem = new MitoMapSvItem(_chromosome, newStart, newStart + size - 1, VariantType.duplication); mitoMapSvItems.Add(mitoMapSvItem); return mitoMapSvItems; } public static IEnumerable GetSortedItems(IEnumerable mitoMapSvReaders) => mitoMapSvReaders.SelectMany(x => x.GetMitoMapSvItems()).OrderBy(x => x.Start); } } ================================================ FILE: SAUtils/MitoMap/MitoMapVariantReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text.RegularExpressions; using ErrorHandling.Exceptions; using Genome; using IO; using OptimizedCore; using SAUtils.DataStructures; using SAUtils.InputFileParsers.ClinVar; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; namespace SAUtils.MitoMap { public sealed class MitoMapVariantReader { private readonly FileInfo _mitoMapFileInfo; private const string DelSymbol = ""; private readonly string _dataType; private readonly ReferenceSequenceProvider _sequenceProvider; private readonly VariantAligner _variantAligner; private readonly Chromosome _chromosome; private readonly MitoMapInputDb _mitoMapInputDb; private static readonly Dictionary MitoMapMutationColumnDefinitions = new Dictionary { {MitoMapDataTypes.MitoMapMutationsCodingControl, new[] {0, 2, 3, 6, 7, 8, -1, 10, 11}}, {MitoMapDataTypes.MitoMapMutationsRNA, new[] {0, 2, 3, 5, 6, 7, 8, 10, 11}}, {MitoMapDataTypes.MitoMapPolymorphismsCoding, new[] {0, -1, 2, -1, -1, -1, -1, 7, 8}}, {MitoMapDataTypes.MitoMapPolymorphismsControl, new[] {0, -1, 2, -1, -1, -1, -1, 4, 5}}, {MitoMapDataTypes.MitoMapInsertionsSimple, new int[0]}, {MitoMapDataTypes.MitoMapDeletionsSingle, new int[0]} }; private static readonly Dictionary<(string, int), string> ClinicalSignificances = new Dictionary<(string, int), string> { {("up", 3), "confirmed pathogenic"}, {("up", 2), "likely pathogenic"}, {("up", 1), "possibly pathogenic"}, {("down", 1), "possibly benign"}, {("down", 2), "likely benign"} }; private static readonly Dictionary SymbolToBools = new Dictionary { {"+", true}, {"-", false} }; private static readonly HashSet MitoMapDelSymbolSet = new HashSet { ":", "del", "d" }; private static readonly HashSet IgnoredStatus = new HashSet { "See 7471insC", "Reported (alt loc)" }; public MitoMapVariantReader(FileInfo mitoMapFileInfo, MitoMapInputDb mitoMapInputDb, ReferenceSequenceProvider sequenceProvider) { _mitoMapFileInfo = mitoMapFileInfo; _mitoMapInputDb = mitoMapInputDb; _dataType = GetDataType(); _sequenceProvider = sequenceProvider; _chromosome = sequenceProvider.RefNameToChromosome["chrM"]; _variantAligner = new VariantAligner(sequenceProvider.Sequence); } private string GetDataType() { var dataType = _mitoMapFileInfo.Name.Replace(".html", ""); if (!MitoMapMutationColumnDefinitions.ContainsKey(dataType)) throw new InvalidDataException($"Unexpected data file: {_mitoMapFileInfo.Name}"); return dataType; } private IEnumerable GetMitoMapItems() { Console.WriteLine($"Processing {_dataType} file"); bool isDataLine = false; using (var reader = FileUtilities.GetStreamReader(FileUtilities.GetReadStream(_mitoMapFileInfo.FullName))) { string line; while ((line = reader.ReadLine()) != null) { line = line.Trim(); if (!isDataLine) { if (line == "\"data\":[") isDataLine = true; continue; } // last item if (line.OptimizedStartsWith('[') && line.EndsWith("]],")) isDataLine = false; foreach (var mitoMapMutItem in ParseLine(line, _dataType, _sequenceProvider, _variantAligner, _chromosome, _mitoMapInputDb)) { if (!string.IsNullOrEmpty(mitoMapMutItem.RefAllele) || !string.IsNullOrEmpty(mitoMapMutItem.AltAllele)) yield return mitoMapMutItem; } } } } internal static List ParseLine(string line, string dataType, ISequenceProvider sequenceProvider, VariantAligner variantAligner, Chromosome chromosome, MitoMapInputDb mitoMapInputDb) { // line validation if (!(line.OptimizedStartsWith('[') && line.EndsWith("],"))) throw new InvalidFileFormatException($"Data line doesn't start with \"[\" or end with \"],\": {line}"); /* example lines ["582","MT-TF","Mitochondrial myopathy","T582C","tRNA Phe","-","+","Reported","72.90% ","0","2"], ["583","MT-TF","MELAS / MM & EXIT","G583A","tRNA Phe","-","+","Cfrm","93.10% ","0","3"], */ var info = line.TrimEnd(',').TrimEnd(']').Trim('[', ']').Split("\",\"").Select(x => x.Trim('"')).ToList(); switch (dataType) { case MitoMapDataTypes.MitoMapInsertionsSimple: return ExtractVariantItemFromInsertionsSimple(info, sequenceProvider, variantAligner, chromosome, mitoMapInputDb); case MitoMapDataTypes.MitoMapDeletionsSingle: return ExtractVariantItemFromDeletionsSingle(info, sequenceProvider, variantAligner, chromosome, mitoMapInputDb); } return ExtractVariantItem(info, dataType, sequenceProvider, variantAligner, chromosome, mitoMapInputDb); } private static List ExtractVariantItemFromDeletionsSingle(List info, ISequenceProvider sequenceProvider, VariantAligner variantAligner, Chromosome chromosome, MitoMapInputDb mitoMapInputDb) { var junctions = info[0].OptimizedSplit(':').Select(int.Parse).ToList(); var start = junctions[0] + 1; var end = junctions[1] - 1; if (end < start) throw new ArgumentOutOfRangeException($"Deletions with end position smaller than start position: start: {start}, end: {end}"); var calculatedSize = end - start + 1; var size = int.Parse(info[1].Substring(1)); if (size > MitomapParsingParameters.LargeDeletionCutoff) return new List(); if (calculatedSize != size) Console.WriteLine($"Incorrect size of deleted region: size of {start}-{end} should be {calculatedSize}, provided size is {size}. Provided size is used."); var refSequence = sequenceProvider.Sequence.Substring(start - 1, size); var leftAlignResults = GetLeftAlignedVariant(start, refSequence, "", variantAligner); var pubMedIds = ParsingUtilities.GetPubMedIds(info[4], mitoMapInputDb); var mitoMapItem = new MitoMapItem(chromosome, leftAlignResults.RefPosition, leftAlignResults.RefAllele, "-", null, null, null, "", "", "", sequenceProvider, default, pubMedIds); return new List { mitoMapItem }; } // extract small variant from this file private static List ExtractVariantItemFromInsertionsSimple(List info, ISequenceProvider sequenceProvider, VariantAligner variantAligner, Chromosome chromosome, MitoMapInputDb mitoMapInputDb) { var altAlleleInfo = info[2]; var dLoopPattern = new Regex(@"(?^\d+)-(?(\d+)) D-Loop region"); var dLoopMatch = dLoopPattern.Match(altAlleleInfo); // not a small variant if (dLoopMatch.Success) { return new List(); } string altAllele; var additionalRepeatPattern = new Regex(@"additional \[(?[ACTGN]+)\] "); var additionalRepeatMatch = additionalRepeatPattern.Match(altAlleleInfo); if (additionalRepeatMatch.Success) altAllele = additionalRepeatMatch.Groups["repeat"].Value; // expect a string of allele sequence then else { if (altAlleleInfo.Contains(" ")) throw new InvalidDataException($"Cannot parse {altAlleleInfo}"); altAllele = altAlleleInfo; } var firstNumberPattern = new Regex(@"(?^\d+)"); var firstNumberMatch = firstNumberPattern.Match(info[3]); if (!firstNumberMatch.Success) throw new InvalidDataException($"Failed to extract variant position from {info[3]}"); var position = int.Parse(firstNumberMatch.Groups["firstNumber"].Value); var leftAlgnResults = GetLeftAlignedVariant(position, "", altAllele, variantAligner); // insertion var pubMedIds = ParsingUtilities.GetPubMedIds(info[6], mitoMapInputDb); return new List{new MitoMapItem(chromosome, leftAlgnResults.RefPosition, "-", leftAlgnResults.AltAllele, null, null, null, "", "", "", sequenceProvider, default, pubMedIds) }; } private static List ExtractVariantItem(List info, string dataType, ISequenceProvider sequenceProvider, VariantAligner variantAligner, Chromosome chromosome, MitoMapInputDb mitoMapInputDb) { int[] fields = MitoMapMutationColumnDefinitions[dataType]; List mitoMapVarItems = new List(); int position = int.Parse(info[fields[0]]); var mitomapDiseaseString = GetDiseaseInfo(info, fields[1]); if (DescribedAsDuplicatedRecord(mitomapDiseaseString)) return mitoMapVarItems; var diseases = string.IsNullOrEmpty(mitomapDiseaseString) ? null : new List {mitomapDiseaseString}; var (refAllele, rawAltAllele, extractedPosition) = GetRefAltAlleles(info[fields[2]], sequenceProvider); if (extractedPosition.HasValue && position != extractedPosition) Console.WriteLine($"Inconsistant positions found: annotated position: {position}; allele {info[fields[2]]}"); if (string.IsNullOrEmpty(refAllele) && string.IsNullOrEmpty(rawAltAllele)) { Console.WriteLine($"No reference and alternative alleles could be extracted: {position}; allele {info[fields[2]]}"); return mitoMapVarItems; } if (MitoMapDelSymbolSet.Contains(rawAltAllele)) rawAltAllele = DelSymbol; var homoplasmy = GetPlasmy(info, fields[3]); var heteroplasmy = GetPlasmy(info, fields[4]); string status = GetStatus(info, fields); (string scorePercentile, string clinicalSignificance) = GetFunctionalInfo(info, fields[6]); int numFullLengthSeqs = GetNumFullLengthSequences(info[fields[7]], dataType); var pubMedIds = ParsingUtilities.GetPubMedIds(info[fields[8]], mitoMapInputDb); if (!string.IsNullOrEmpty(rawAltAllele)) { foreach (var altAllele in GetAltAlleles(rawAltAllele)) { var thisLeftAlignResults = GetLeftAlignedVariant(position, refAllele, altAllele, variantAligner); mitoMapVarItems.Add(new MitoMapItem(chromosome, thisLeftAlignResults.RefPosition, thisLeftAlignResults.RefAllele, thisLeftAlignResults.AltAllele, diseases, homoplasmy,heteroplasmy, status, clinicalSignificance, scorePercentile, sequenceProvider, numFullLengthSeqs, pubMedIds)); } if (mitoMapVarItems.Count > 1) Console.WriteLine($"Multiple Alternative Allele Sequences {info[fields[2]]} at {position}"); return mitoMapVarItems; } var leftAlignResults = GetLeftAlignedVariant(position, refAllele, rawAltAllele, variantAligner); mitoMapVarItems.Add(new MitoMapItem(chromosome, leftAlignResults.RefPosition, leftAlignResults.RefAllele, leftAlignResults.AltAllele, diseases, homoplasmy, heteroplasmy, status, clinicalSignificance, scorePercentile, sequenceProvider, numFullLengthSeqs, pubMedIds)); return mitoMapVarItems; } private static string GetStatus(List info, int[] fields) { string status = fields[5] == -1 ? null : info[fields[5]]; return IgnoredStatus.Contains(status) ? null : status; } internal static int GetNumFullLengthSequences(string field, string dataType) { if (!field?.OptimizedStartsWith('<') ?? true) return 0; int leadingCharIndex = -1; int trailingCharIndex = -1; switch (dataType) { case MitoMapDataTypes.MitoMapMutationsRNA: case MitoMapDataTypes.MitoMapMutationsCodingControl: leadingCharIndex = field.IndexOf('>'); trailingCharIndex = field.IndexOf(" (", StringComparison.Ordinal); break; case MitoMapDataTypes.MitoMapPolymorphismsCoding: leadingCharIndex = field.IndexOf('>'); trailingCharIndex = field.IndexOf(" info, int fields) { if (fields == -1 || !SymbolToBools.TryGetValue(info[fields], out bool b)) return null; return b; } // there may be multiple alt alleles concatenated by ";" internal static IEnumerable GetAltAlleles(string rawAltAllele) => rawAltAllele.OptimizedSplit(';').Select(DegenerateBaseUtilities.GetAllPossibleSequences).SelectMany(x => x); private static bool DescribedAsDuplicatedRecord(string mitomapDiseaseString) { if (string.IsNullOrEmpty(mitomapDiseaseString)) return false; var altNotationPattern1 = new Regex("alternate notation$"); var altNotationMatch = altNotationPattern1.Match(mitomapDiseaseString); if (!altNotationMatch.Success) return false; Console.WriteLine($"Alternate notation found: {mitomapDiseaseString}. This record is skipped."); return true; } private static string GetDiseaseInfo(List info, int fieldIndex) { if (fieldIndex == -1) return null; string diseaseString = info[fieldIndex]; if (string.IsNullOrEmpty(diseaseString)) return diseaseString; var regexPattern = new Regex(@"(?.+)$"); var match = regexPattern.Match(diseaseString); return match.Success ? match.Groups["disease"].Value : diseaseString; } private static (string, string) GetFunctionalInfo(List info, int fieldIndex) { if (fieldIndex == -1) return (null, null); string functionInfoString = info[fieldIndex]; // 93.10% var regexPattern = new Regex(@"(?[0-9.]+)% (?.+)$"); var match = regexPattern.Match(functionInfoString); var clineSignificance = GetClinicalSignificance(match.Groups["significanceString"].Value); return (match.Groups["scoreString"].Value, clineSignificance); } private static string GetClinicalSignificance(string significanceString) { // < i class='fa fa-arrow-up' style='color:red' aria-hidden='true'> // filter out the symbol for frequency alert var arrows = significanceString.Split(@"", StringSplitOptions.RemoveEmptyEntries).Where(x => !x.Contains("fa-asterisk")).ToList(); var nArrows = arrows.Count; if (nArrows == 0) return null; var arrowType = arrows[0].Contains("fa-arrow-up") ? "up" : "down"; return ClinicalSignificances[(arrowType, nArrows)]; } private static (string RefAllele, string RawAltAllele, int? ExtractedPosition) GetRefAltAlleles(string alleleString, ISequenceProvider sequenceProvider) { var results = Evaluate_C123T(alleleString); if (results.Success) return (results.RefAllele, results.RawAltAllele, results.ExtractedPosition); results = Evaluate_16021_16022del(alleleString, sequenceProvider); if (results.Success) return (results.RefAllele, results.RawAltAllele, results.ExtractedPosition); results = Evaluate_8042del2(alleleString, sequenceProvider); if (results.Success) return (results.RefAllele, results.RawAltAllele, results.ExtractedPosition); results = Evaluate_C9537insC(alleleString); if (results.Success) return (results.RefAllele, results.RawAltAllele, results.ExtractedPosition); results = Evaluate_3902_3908invACCTTGC(alleleString, sequenceProvider); if (results.Success) return (results.RefAllele, results.RawAltAllele, results.ExtractedPosition); results = Evaluate_A_C_or_CC(alleleString); if (results.Success) return (results.RefAllele, results.RawAltAllele, results.ExtractedPosition); results = Evaluate_C_C_2_8(alleleString); if (results.Success) return (results.RefAllele, results.RawAltAllele, results.ExtractedPosition); results = Evaluate_8042delAT(alleleString, sequenceProvider); return results.Success ? (results.RefAllele, results.RawAltAllele, results.ExtractedPosition) : (null, null, null); } // 8042delAT private static (bool Success, string RefAllele, string RawAltAllele, int? ExtractedPosition) Evaluate_8042delAT(string alleleString, ISequenceProvider sequenceProvider) { var regex = new Regex(@"(?^\d+)del(?[ACGTacgtNn]+)"); var match = regex.Match(alleleString); if (!match.Success) return (false, null, null, null); var extractedPosition = int.Parse(match.Groups["position"].Value); string deletedSeq = match.Groups["del"].Value; string deletedReferenceSeq = GetRefAllelesFromReference(sequenceProvider, extractedPosition, deletedSeq.Length); if (deletedSeq != deletedReferenceSeq) { throw new InvalidDataException($"Deleted sequence at {extractedPosition}: annoation is {deletedSeq}, reference sequence is {deletedReferenceSeq}"); } return (true, deletedReferenceSeq, "-", extractedPosition); } // C-C(2-8) private static (bool Success, string RefAllele, string RawAltAllele, int? ExtractedPosition) Evaluate_C_C_2_8(string alleleString) { var regex = new Regex(@"(?[ACGTacgtNn])[_|-](?[ACGTacgtNn])\((?\d+)-(?\d+)\)"); var match = regex.Match(alleleString); if (!match.Success) return (false, null, null, null); var altBase = char.Parse(match.Groups["alt"].Value); int minRepeat = int.Parse(match.Groups["min"].Value); int maxRepeat = int.Parse(match.Groups["max"].Value); var altAlleleSequences = new List(); for (int i = minRepeat; i <= maxRepeat; i++) { altAlleleSequences.Add(new string(altBase, i)); } return (true, match.Groups["ref"].Value, string.Join(";", altAlleleSequences), null); } //A-Cor CC private static (bool Success, string RefAllele, string RawAltAllele, int? ExtractedPosition) Evaluate_A_C_or_CC(string alleleString) { var regex = new Regex(@"(?[ACGTacgtNn]+)[_|-](?[ACGTacgtNn]+) ?or ?(?[ACGTacgtNn]+)"); var match = regex.Match(alleleString); if (!match.Success) return (false, null, null, null); var altAllele = match.Groups["alt1"].Value + ";" + match.Groups["alt2"].Value; return (true, match.Groups["ref"].Value, altAllele, null); } // 3902_3908invACCTTGC private static (bool Success, string RefAllele, string RawAltAllele, int? ExtractedPosition) Evaluate_3902_3908invACCTTGC(string alleleString, ISequenceProvider sequenceProvider) { var regex = new Regex(@"(?^\d+)[_|-](?\d+)inv(?[ACGTacgtNn]+)"); var match = regex.Match(alleleString); if (!match.Success) return (false, null, null, null); var start = int.Parse(match.Groups["start"].Value); var end = int.Parse(match.Groups["end"].Value); var refSequence = GetRefAllelesFromReference(sequenceProvider, start, end - start + 1); if (refSequence != match.Groups["seq"].Value) throw new InvalidDataException($"Inconsistent sequences: reference {refSequence}, annotation {match.Groups["seq"].Value}"); return (true, refSequence, ReverseSequence(refSequence), start); } // C9537insC private static (bool Success, string RefAllele, string RawAltAllele, int? ExtractedPosition) Evaluate_C9537insC(string alleleString) { var regex = new Regex(@"(?[ACGTacgtNn])(?\d+)ins(?[ACGTacgtNn]+)"); var match = regex.Match(alleleString); if (!match.Success) return (false, null, null, null); var extractedPosition = int.Parse(match.Groups["position"].Value); var refAllele = match.Groups["ref"].Value; var altAllele = refAllele + match.Groups["extra"].Value; return (true, refAllele, altAllele, extractedPosition); } // 8042del2 private static (bool Success, string RefAllele, string RawAltAllele, int? ExtractedPosition) Evaluate_8042del2(string alleleString, ISequenceProvider sequenceProvider) { var regex = new Regex(@"(?^\d+)del(?\d+)"); var match = regex.Match(alleleString); if (!match.Success) return (false, null, null, null); var extractedPosition = int.Parse(match.Groups["position"].Value); return (true, GetRefAllelesFromReference(sequenceProvider, extractedPosition, int.Parse(match.Groups["length"].Value)), "-", extractedPosition); } // 16021_16022del private static (bool Success, string RefAllele, string RawAltAllele, int? ExtractedPosition) Evaluate_16021_16022del(string alleleString, ISequenceProvider sequenceProvider) { var regex = new Regex(@"(?^\d+)[_|-](?\d+)del"); var match = regex.Match(alleleString); if (!match.Success) return (false, null, null, null); var start = int.Parse(match.Groups["start"].Value); var end = int.Parse(match.Groups["end"].Value); return (true, GetRefAllelesFromReference(sequenceProvider, start, end - start + 1), "-", start); } // C123T, A-del or A123del private static (bool Success, string RefAllele, string RawAltAllele, int? ExtractedPosition) Evaluate_C123T(string alleleString) { var regex = new Regex(@"(?^[ACGTacgtNn]+)(?(\d+|-))(?([ACGTBDHKMRSVWYNacgtbdhkmrsvwyn]+|:|del[ACGTacgtNn]*|d)$)"); var match = regex.Match(alleleString); if (!match.Success) return (false, null, null, null); int? extractedPosition = null; if (match.Groups["position"].Value != "-") extractedPosition = int.Parse(match.Groups["position"].Value); return (true, match.Groups["ref"].Value, match.Groups["alt"].Value, extractedPosition); } private static string GetRefAllelesFromReference(ISequenceProvider sequenceProvider, int start, int length) => sequenceProvider.Sequence.Substring(start - 1, length); private static string ReverseSequence(string sequence) { var reversedNucleotide = new char[sequence.Length]; var i = sequence.Length - 1; foreach (var nucleotide in sequence) { reversedNucleotide[i] = nucleotide; i--; } return new string(reversedNucleotide); } public static IEnumerable GetMergeAndSortedItems(IEnumerable mitoMapMutationReaders) { var items = mitoMapMutationReaders.SelectMany(x => x.GetMitoMapItems()).ToList(); items.ForEach(x => x.Trim()); return items.ToLookup(x => x.Position).Select(x => MitoMapItem.AggregatedMutationsSamePosition(x.Select(i => i)).Values) .SelectMany(x => x).OrderBy(x => x.Position); } private static (int RefPosition, string RefAllele, string AltAllele) GetLeftAlignedVariant(int position, string refAllele, string altAllele, VariantAligner variantAligner) { if (refAllele == null || altAllele == null) return (position, refAllele, altAllele); if (refAllele == "-") refAllele = ""; if (altAllele == "-") altAllele = ""; var leftAlgnResults = variantAligner.LeftAlign(position, refAllele, altAllele); var newPosition = leftAlgnResults.RefPosition; var newRefAllele = leftAlgnResults.RefAllele; var newAltAllele = leftAlgnResults.AltAllele; if (position == newPosition) return leftAlgnResults; if (newRefAllele == "") // insertion Console.WriteLine( $"Insertion of {altAllele}. Original start position: {position}; new position after left-alignment {newPosition}; new altAllele {newAltAllele}"); else if (newAltAllele == "") // deletion Console.WriteLine($"Deletion of {newRefAllele.Length} bps. Original start start position: {position}; new position after left-alignment {newPosition}."); else { throw new InvalidDataException($"{position}:{refAllele}:{altAllele} becomes {newPosition}:{newRefAllele}:{newAltAllele} after left alignment. Left-alignment should be only performed for deletions and insertions"); } return leftAlgnResults; } } } ================================================ FILE: SAUtils/MitoMap/ParsingUtilities.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; namespace SAUtils.MitoMap { public static class ParsingUtilities { private const string EmptyString = "\\N"; public static List GetPubMedIds(string field, MitoMapInputDb mitoMapInputDb) { if (field == "0") return default; var internalIds = ExtractInternalIds(field); var pubMedIds = new List(); foreach (string internalId in internalIds) { if (mitoMapInputDb.InternalReferenceIdToPubmedId.TryGetValue(internalId, out string pubMedId)) { if (pubMedId != EmptyString) pubMedIds.Add(pubMedId); } else throw new InvalidDataException($"Can't find PubMedID corresponding to internal reference ID {internalId} when parsing {field}"); } return pubMedIds.Distinct().ToList(); } public static string[] ExtractInternalIds(string field) { //"?refs=4,140,189,91687,91737&title=" const string leadingString = "refs="; const string trailingString = "&title="; var leadingStringIndex = field.IndexOf(leadingString, StringComparison.Ordinal); var trailingStringIndex = field.IndexOf(trailingString, StringComparison.Ordinal); var startIndex = leadingStringIndex + leadingString.Length; var idStringLength = trailingStringIndex - startIndex; if (leadingStringIndex == -1 || trailingStringIndex == -1 || idStringLength == 0) throw new InvalidDataException($"Failed to extract reference IDs from {field}"); return field.Substring(startIndex, idStringLength).Split(','); } } } ================================================ FILE: SAUtils/MitoMap/SmallVarDb.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.MitoMap { public static class SmallVarDb { private static string _compressedReference; private static string _outputDirectory; private static readonly List MitoMapFileNames = new List(); private static string _mitoMapDatabase; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "MITOMAP small variants HTML file", v => MitoMapFileNames.Add(v) }, { "database|d=", "MITOMAP database", v => _mitoMapDatabase = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckEachFilenameExists(MitoMapFileNames, "MITOMAP small variants HTML file", "--in") .CheckInputFilenameExists(_mitoMapDatabase, "output directory", "--database") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with MITOMAP small variants annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var rootDirectory = new FileInfo(MitoMapFileNames[0]).Directory; if (rootDirectory == null) return ExitCodes.PathNotFound; var version = DataSourceVersionReader.GetSourceVersion(Path.Combine(rootDirectory.ToString(), "mitoMapVar")); var sequenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var chrom = sequenceProvider.RefNameToChromosome["chrM"]; sequenceProvider.LoadChromosome(chrom); MitoMapInputDb mitoMapInputDb = MitoMapDatabaseUtilities.Create(_mitoMapDatabase); var mitoMapVarReaders = MitoMapFileNames.Select(mitoMapFileName => new MitoMapVariantReader(new FileInfo(mitoMapFileName), mitoMapInputDb, sequenceProvider)).ToList(); var mergedMitoMapVarItems = MitoMapVariantReader.GetMergeAndSortedItems(mitoMapVarReaders); string outFileName = $"{version.Name}_{version.Version}"; using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, sequenceProvider, SaCommon.MitoMapTag, false, true, SaCommon.SchemaVersion, false)) { nsaWriter.Write(mergedMitoMapVarItems); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/MitoMap/StructVarDb.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using Genome; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.MitoMap { public static class StructVarDb { private static string _compressedReference; private static string _outputDirectory; private static readonly List MitoMapFileNames = new List(); public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "MITOMAP structural variants HTML file", v => MitoMapFileNames.Add(v) }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckEachFilenameExists(MitoMapFileNames, "MITOMAP structural variants HTML file", "--in") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with MITOMAP structural variants annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var rootDirectory = new FileInfo(MitoMapFileNames[0]).Directory; if (rootDirectory == null) return ExitCodes.PathNotFound; var version = DataSourceVersionReader.GetSourceVersion(Path.Combine(rootDirectory.ToString(), "mitoMapSv")); var sequenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var chrom = sequenceProvider.RefNameToChromosome["chrM"]; sequenceProvider.LoadChromosome(chrom); var mitoMapSvReaders = MitoMapFileNames.Select(mitoMapFileName => new MitoMapSvReader(new FileInfo(mitoMapFileName), sequenceProvider)).ToList(); var sortedMitoMapVarItems = MitoMapSvReader.GetSortedItems(mitoMapSvReaders); string outFileName = $"{version.Name}_{version.Version}"; using (var nsiStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.IntervalFileSuffix))) using(var nsiWriter = new NsiWriter(nsiStream, version, GenomeAssembly.rCRS, SaCommon.MitoMapTag, ReportFor.StructuralVariants, SaCommon.SchemaVersion)) { nsiWriter.Write(sortedMitoMapVarItems); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/NgaWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Text; using Compression.Algorithms; using Compression.FileHandling; using IO; using VariantAnnotation.Interface.SA; using VariantAnnotation.SA; namespace SAUtils { public sealed class NgaWriter : IDisposable { private readonly ExtendedBinaryWriter _writer; public NgaWriter(Stream stream, ISerializable version, string jsonKey, ushort schemaVersion, bool isArray, bool leaveOpen = false) { WriteHeader(stream, version, jsonKey, schemaVersion, isArray); var blockStream = new BlockStream(new Zstandard(), stream, CompressionMode.Compress); _writer = new ExtendedBinaryWriter(blockStream, Encoding.UTF8, leaveOpen); } private static void WriteHeader(Stream stream, ISerializable version, string jsonKey, ushort schemaVersion, bool isArray) { using (var writer = new ExtendedBinaryWriter(stream, Encoding.UTF8, true)) { writer.Write(SaCommon.NgaIdentifier); version.Write(writer); writer.Write(jsonKey); writer.Write(isArray); writer.Write(schemaVersion); writer.Write(SaCommon.GuardInt); } } public void Dispose() => _writer.Dispose(); public int Write(Dictionary> geneToEntries) { _writer.WriteOpt(geneToEntries.Count); var count = 0; foreach ((string geneSymbol, List entries) in geneToEntries) { _writer.WriteOptAscii(geneSymbol); _writer.WriteOpt(entries.Count); foreach (ISuppGeneItem geneItem in entries) { count++; _writer.Write(geneItem.GetJsonString()); } } return count; } } } ================================================ FILE: SAUtils/NsaConcatenator/ConcatUtilities.cs ================================================ using IO; using System; using System.Collections.Generic; using System.Linq; using Genome; using VariantAnnotation.Interface.Providers; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.NsaConcatenator { public static class ConcatUtilities { private static (IDataSourceVersion version, string jsonKey, bool matchByAllele, bool isArray, bool isPositional, GenomeAssembly assembly) GetIndexFields(List nsaReaders) { var version = nsaReaders[0].Version; var jsonKey = nsaReaders[0].JsonKey; var matchByAllele = nsaReaders[0].MatchByAllele; var isArray = nsaReaders[0].IsArray; var isPositional = nsaReaders[0].IsPositional; var assembly = nsaReaders[0].Assembly; var versionComparer = new DataSourceVersionComparer(); for (var i = 1; i < nsaReaders.Count; i++) { if (!versionComparer.Equals(version, nsaReaders[i].Version) || jsonKey != nsaReaders[i].JsonKey || matchByAllele != nsaReaders[i].MatchByAllele || isArray != nsaReaders[i].IsArray || isPositional != nsaReaders[i].IsPositional || assembly != nsaReaders[i].Assembly ) return (null, null, false, false, false, GenomeAssembly.Unknown); } return (version, jsonKey, matchByAllele, isArray, isPositional, assembly); } private static NsaReader GetNsaReader(ushort chromIndex, List nsaReaders) { if (nsaReaders == null) return null; var hasDataArray = nsaReaders.Select(x => x.HasDataBlocks(chromIndex)).ToArray(); var count = hasDataArray.Count(x => x); if (count > 1) throw new DataMisalignedException("Only one of the NSA files should have data for a given chromosome."); for (var i = 0; i < hasDataArray.Length; i++) { if (hasDataArray[i] == false) continue; return nsaReaders[i]; } return null; } public static void ConcatenateNsaFiles(IEnumerable filePaths, string outFilePrefix) { if(filePaths == null || !filePaths.Any()) return; var nsaReaders = new List(); foreach (var fileName in filePaths) { nsaReaders.Add(new NsaReader(FileUtilities.GetReadStream(fileName), FileUtilities.GetReadStream(fileName + SaCommon.IndexSuffix))); } Console.WriteLine($"Merging {nsaReaders.Count} NSA files..."); var (version, jsonKey, matchByAllele, isArray, isPositional, assembly) = GetIndexFields(nsaReaders); using (var nsaStream = FileUtilities.GetCreateStream(outFilePrefix + SaCommon.SaFileSuffix)) using (var indexStream = FileUtilities.GetCreateStream(outFilePrefix + SaCommon.SaFileSuffix + SaCommon.IndexSuffix)) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, null, jsonKey, matchByAllele, isArray, SaCommon.SchemaVersion, isPositional, true, false, SaCommon.DefaultBlockSize, assembly)) { var chromIndices = GetChromIndices(nsaReaders); foreach (var chromIndex in chromIndices) { Console.WriteLine($"Working on chromosome index: {chromIndex}"); nsaWriter.Write(chromIndex, GetNsaReader(chromIndex, nsaReaders)); } } } private static IEnumerable GetChromIndices(List nsaReaders) { var indices = new List(); if (nsaReaders == null) return indices; foreach (var reader in nsaReaders) { indices.AddRange(reader.ChromosomeIndices); } return indices.Distinct(); } } } ================================================ FILE: SAUtils/NsaConcatenator/NsaConcatenator.cs ================================================ using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using System; using System.IO; using VariantAnnotation.SA; namespace SAUtils.NsaConcatenator { public static class NsaConcatenator { private static string _inputDir; private static string _outFileStub; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "dir|d=", "input directory containing NSA (and index) files to be merged", v => _inputDir = v }, { "out|o=", "output NSA file stub", v => _outFileStub = v } }; var commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckDirectoryExists(_inputDir, "input directory containing NSA files", "--in") .HasRequiredParameter(_outFileStub, "output NSA file stub", "--out") .SkipBanner() .ShowHelpMenu("Concatenate multiple (non-overlapping) NSA files from the same data source into one", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { Console.WriteLine($"Concatenating NSA files from {_inputDir}"); ConcatUtilities.ConcatenateNsaFiles(Directory.GetFiles(_inputDir, $"*{SaCommon.SaFileSuffix}"), _outFileStub); return ExitCodes.Success; } } } ================================================ FILE: SAUtils/NsaIndexUpdater/UpdateIndex.cs ================================================ using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.NSA; namespace SAUtils.NsaIndexUpdater { public static class UpdateIndex { private static string _inputIndexFile; private static string _outputIndexFile; private static string _versionFile; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ind|i=", "input NSA index file path", v => _inputIndexFile = v }, { "ver|r=", "version file path", v => _versionFile = v }, { "out|o=", "output index file path", v => _outputIndexFile= v } }; var commandLineExample = $"{command} --ind --out --ver "; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_inputIndexFile, "input NSA index file path", "--ind") .HasRequiredParameter(_outputIndexFile, "output index file path", "--out") .CheckInputFilenameExists(_versionFile, "version file path", "--ver") .SkipBanner() .ShowHelpMenu("Extracts mini supplementary annotations for the given range from Nirvana Supplementary Annotations files.", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { using (var indexStream = FileUtilities.GetReadStream(_inputIndexFile)) using (var outStream = FileUtilities.GetCreateStream(_outputIndexFile)) using (var extWriter = new ExtendedBinaryWriter(outStream)) { var version = DataSourceVersionReader.GetSourceVersion(_versionFile); var oldIndex = new NsaIndex(indexStream); var newIndex = new NsaIndex(extWriter, oldIndex.Assembly, version, oldIndex.JsonKey, oldIndex.MatchByAllele, oldIndex.IsArray, oldIndex.SchemaVersion, oldIndex.IsPositional); newIndex.Write(oldIndex.GetBlocks()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/NsaWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using CommandLine.Utilities; using Compression.Algorithms; using ErrorHandling.Exceptions; using Genome; using IO; using SAUtils.DataStructures; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.SA; using Variants; namespace SAUtils { public sealed class NsaWriter : IDisposable { private readonly ExtendedBinaryWriter _writer; private readonly ExtendedBinaryWriter _indexWriter; private readonly Stream _stream; private readonly Stream _indexStream; private readonly byte[] _memBuffer; private readonly MemoryStream _memStream; private readonly ExtendedBinaryWriter _memWriter; private readonly NsaBlock _block; private readonly NsaIndex _index; private readonly bool _isPositional; private readonly bool _skipIncorrectRefEntries; private readonly bool _throwErrorOnConflicts; private readonly ISequenceProvider _refProvider; private readonly bool _leaveOpen; private int _count; private HashSet _completedChromosomes = new HashSet(); public NsaWriter(Stream nsaStream, Stream indexStream, IDataSourceVersion version, ISequenceProvider refProvider, string jsonKey, bool matchByAllele, bool isArray, int schemaVersion, bool isPositional, bool skipIncorrectRefEntries= true, bool throwErrorOnConflicts = false, int blockSize = SaCommon.DefaultBlockSize, GenomeAssembly assembly= GenomeAssembly.Unknown, bool leaveOpen=false) { _stream = nsaStream; _indexStream = indexStream; _writer = new ExtendedBinaryWriter(_stream,System.Text.Encoding.Default, leaveOpen); _indexWriter = new ExtendedBinaryWriter(_indexStream,System.Text.Encoding.Default, leaveOpen); _isPositional = isPositional; _skipIncorrectRefEntries = skipIncorrectRefEntries; _throwErrorOnConflicts = throwErrorOnConflicts; _refProvider = refProvider; _leaveOpen = leaveOpen; assembly = _refProvider?.Assembly ?? assembly; _block = new NsaBlock(new Zstandard(), blockSize); _index = new NsaIndex(_indexWriter, assembly, version, jsonKey, matchByAllele, isArray, schemaVersion, isPositional); _memBuffer = new byte[blockSize]; _memStream = new MemoryStream(_memBuffer); _memWriter = new ExtendedBinaryWriter(_memStream); } internal void Write(ushort chromIndex, NsaReader nsaReader) { if (nsaReader == null) return; var dataBlocks = nsaReader.GetCompressedBlocks(chromIndex); var indexBlocks = nsaReader.GetIndexBlocks(chromIndex); var i = 0;//index of the index Blocks //cannot convert the dataBlocks into a list since that may take up GBs of memory (proportional to the nas file size) foreach (var dataBlock in dataBlocks) { if (i > indexBlocks.Count) throw new IndexOutOfRangeException("Nsa Index have less blocks than the Nsa file. They have to be the same."); var oldIndexBlock = indexBlocks[i]; _index.Add(chromIndex, oldIndexBlock.Start, oldIndexBlock.End, _writer.BaseStream.Position, oldIndexBlock.Length); dataBlock.WriteCompressedBytes(_writer); i++; } if (i < indexBlocks.Count) throw new IndexOutOfRangeException("Nsa Index have more blocks than the Nsa file. They have to be the same."); } public int Write(IEnumerable saItems) { var itemsMinHeap = new MinHeap(SuppDataUtilities.CompareTo); var chromIndex = ushort.MaxValue; var currentEnsemblName = ""; _count = 0; var benchmark = new Benchmark(); foreach (var saItem in saItems) { if (chromIndex != saItem.Chromosome.Index) { if (chromIndex != ushort.MaxValue) { _completedChromosomes.Add(chromIndex); // this chrom is done //flushing out the remaining items in buffer WriteUptoPosition(itemsMinHeap, int.MaxValue); Flush(chromIndex); Console.WriteLine($"Chromosome {currentEnsemblName} completed in {Benchmark.ToHumanReadable(benchmark.GetElapsedTime())}"); benchmark.Reset(); } chromIndex = saItem.Chromosome.Index; currentEnsemblName = saItem.Chromosome.EnsemblName; _refProvider.LoadChromosome(saItem.Chromosome); } if (_completedChromosomes.Contains(saItem.Chromosome.Index)) { throw new UserErrorException( $"The input file is not sorted by chromosomes. {saItem.Chromosome.UcscName} is observed in multiple segments." + $"\nInput Line:\n{saItem.InputLine}"); } // the items come in sorted order of the pre-trimmed position. // So when writing out, we have to make sure that we do not write past this position. // Once a position has been seen in the stream, we can safely write all positions before that. var writeToPos = saItem.Position; // if variant is in par region, we allow N's in ref if (RegionUtilities.OverlapsParRegion(saItem, _refProvider.Assembly) && !string.IsNullOrEmpty(saItem.RefAllele) && saItem.RefAllele.All(x=> x=='N' || x=='n')) { itemsMinHeap.Add(saItem); // in order to allow room for left shifted variants, we hold off on removing them from the heap WriteUptoPosition(itemsMinHeap, writeToPos - VariantUtils.MaxUpstreamLength); continue; } string refSequence = _refProvider.Sequence.Substring(saItem.Position - 1, saItem.RefAllele.Length); if (!string.IsNullOrEmpty(saItem.RefAllele) && saItem.RefAllele != refSequence) { if (_skipIncorrectRefEntries) continue; throw new UserErrorException($"The provided reference allele {saItem.RefAllele} at {saItem.Chromosome.UcscName}:{saItem.Position} is different from {refSequence} in the reference genome sequence." + $"\nInput Line:\n {saItem.InputLine}"); } itemsMinHeap.Add(saItem); // in order to allow room for left shifted variants, we hold off on removing them from the heap WriteUptoPosition(itemsMinHeap, writeToPos- VariantUtils.MaxUpstreamLength); } //flushing out the remaining items in buffer WriteUptoPosition(itemsMinHeap, int.MaxValue); Flush(chromIndex); Console.WriteLine($"Chromosome {currentEnsemblName} completed in {Benchmark.ToHumanReadable(benchmark.GetElapsedTime())}"); Console.WriteLine($"Maximum bp shifted for any variant:{VariantUtils.MaxShiftLength}"); return _count; } private void WriteUptoPosition(MinHeap itemsHeap, int position) { if (position < 1) return; if (itemsHeap.Count() == 0) return; var bufferMin = itemsHeap.GetMin(); while (bufferMin.Position < position) { var itemsAtMinPosition = new List(); while (itemsHeap.Count() > 0 && SuppDataUtilities.CompareTo(bufferMin, itemsHeap.GetMin()) == 0) itemsAtMinPosition.Add(itemsHeap.ExtractMin()); if (itemsAtMinPosition.Count > 0) { _count += itemsAtMinPosition.Count; WritePosition(itemsAtMinPosition); } if (itemsHeap.Count() == 0) break; bufferMin = itemsHeap.GetMin(); } } private void WritePosition(List items) { int position = items[0].Position; _memStream.Position = 0; if (_isPositional) { var positionalItem = SuppDataUtilities.GetPositionalAnnotation(items); if (positionalItem == null) return; _memWriter.Write(positionalItem.GetJsonString()); } else { // any data source that is reported by allele and is not an array (e.g. allele frequencies) need this filtering step if (_index.MatchByAllele && !_index.IsArray) items = SuppDataUtilities.RemoveConflictingAlleles(items, _throwErrorOnConflicts); if (_index.JsonKey == SaCommon.PrimateAiTag) items = SuppDataUtilities.DeDuplicatePrimateAiItems(items); _memWriter.WriteOpt(items.Count); foreach (ISupplementaryDataItem saItem in items) { _memWriter.WriteOptAscii(saItem.RefAllele); _memWriter.WriteOptAscii(saItem.AltAllele); _memWriter.Write(saItem.GetJsonString()); } } int numBytes = (int)_memStream.Position; if (!_block.HasSpace(numBytes)) Flush(items[0].Chromosome.Index); _block.Add(_memBuffer, numBytes, position); } private void Flush(ushort chromIndex) { if (_block.BlockOffset == 0) return; long fileOffset = _stream.Position; (int firstPosition, int lastPosition, int numBytes) = _block.Write(_writer); _block.Clear(); _index.Add(chromIndex, firstPosition, lastPosition, fileOffset, numBytes); } public void Dispose() { _index.Write(); if (!_leaveOpen) { _writer?.Dispose(); _indexWriter?.Dispose(); _stream?.Dispose(); _indexStream?.Dispose(); _block?.Dispose(); } _memWriter?.Dispose(); _memStream?.Dispose(); } } } ================================================ FILE: SAUtils/Omim/Downloader.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using SAUtils.GeneIdentifiers; using static System.Environment; namespace SAUtils.Omim { public static class Downloader { private static string _apiKey; private static string _universalGeneArchivePath; private static string _outputDirectory; private static string _inputReferencePath; private const string OmimApiKeyEnvironmentVariableName = "OmimApiKey"; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "uga|u=", "universal gene archive {path}", v => _universalGeneArchivePath = v }, { "ref|r=", "input reference {filename}", v => _inputReferencePath = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .CheckInputFilenameExists(_universalGeneArchivePath, "universal gene archive", "--uga") .SkipBanner() .ShowHelpMenu("Download the OMIM gene annotation data", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { _apiKey = GetEnvironmentVariable(OmimApiKeyEnvironmentVariableName); if (_apiKey == null) throw new InvalidDataException("Please set the OMIM API key as the environment variable \"OmimApiKey\"."); var (entrezGeneIdToSymbol, ensemblGeneIdToSymbol) = GeneUtilities.ParseUniversalGeneArchive(_inputReferencePath, _universalGeneArchivePath); var geneSymbolUpdater = new GeneSymbolUpdater(entrezGeneIdToSymbol, ensemblGeneIdToSymbol); using (var omimQuery = new OmimQuery(_apiKey, _outputDirectory)) { omimQuery.GenerateMimToGeneSymbolFile(geneSymbolUpdater); omimQuery.GenerateJsonResponse(); } OmimVersion.WriteToFile(OmimQuery.JsonResponseFile, _outputDirectory); geneSymbolUpdater.DisplayStatistics(); return ExitCodes.Success; } } } ================================================ FILE: SAUtils/Omim/EntryApiResponse/EntryResponse.cs ================================================ namespace SAUtils.Omim.EntryApiResponse { // ReSharper disable InconsistentNaming public sealed class EntryRoot { public RootItem omim; } // ReSharper disable ClassNeverInstantiated.Global public class RootItem { public string version; public Entry[] entryList; } public class Entry { public EntryItem entry; } public class EntryItem { public char prefix; public int mimNumber; public string status; public TextSection[] textSectionList; public GeneMap geneMap; } public class TextSection { public TextSectionItem textSection; } public class TextSectionItem { public string textSectionName; public string textSectionTitle; public string textSectionContent; } // ReSharper restore ClassNeverInstantiated.Global // ReSharper restore InconsistentNaming } ================================================ FILE: SAUtils/Omim/EntryApiResponse/GeneMap.cs ================================================ namespace SAUtils.Omim.EntryApiResponse { // ReSharper disable InconsistentNaming // ReSharper disable ClassNeverInstantiated.Global public class GeneMap { public string geneName; public int mimNumber; public PhenotypeMap[] phenotypeMapList; } public class PhenotypeMap { public PhenotypeMapItem phenotypeMap; } public class PhenotypeMapItem { public int phenotypeMimNumber; public string phenotype; public int phenotypeMappingKey; public string phenotypeInheritance; } // ReSharper restore ClassNeverInstantiated.Global // ReSharper restore InconsistentNaming } ================================================ FILE: SAUtils/Omim/Main.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using IO; using Newtonsoft.Json.Linq; using SAUtils.DataStructures; using VariantAnnotation.Interface.SA; using VariantAnnotation.SA; namespace SAUtils.Omim { public static class Main { private static string _mimToGeneFile; private static string _omimJsonFile; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "m2g|m=", "MimToGeneSymbol tsv file", v => _mimToGeneFile = v }, { "json|j=", "OMIM entry json file", v => _omimJsonFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .CheckInputFilenameExists(_mimToGeneFile, "MimToGeneSymbol tsv file", "--m2g") .CheckInputFilenameExists(_omimJsonFile, "OMIM entry json file", "--json") .SkipBanner() .ShowHelpMenu("Creates a gene annotation database from OMIM data", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var omimSchema = OmimSchema.Get(); var omimParser = new OmimParser(_mimToGeneFile, _omimJsonFile, omimSchema); var version = omimParser.GetVersion(); string outFileName = $"{version.Name}_{version.Version}"; using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.GeneFileSuffix))) using (var ngaWriter = new NgaWriter(nsaStream, version, SaCommon.OmimTag, SaCommon.SchemaVersion, true)) using (var saJsonSchemaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.GeneFileSuffix + SaCommon.JsonSchemaSuffix))) using (var schemaWriter = new StreamWriter(saJsonSchemaStream)) { IEnumerable omimItems = omimParser.GetItems(); Dictionary> geneToItems = OmimUtilities.GetGeneToOmimEntriesAndSchema(omimItems); ngaWriter.Write(geneToItems); schemaWriter.Write(omimSchema); JObject jo = JObject.Parse(omimParser.OmimStats.ToString()); Console.WriteLine(jo); //pretty printing json } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/Omim/OmimParser.cs ================================================ using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using Newtonsoft.Json; using OptimizedCore; using SAUtils.DataStructures; using SAUtils.InputFileParsers; using SAUtils.Omim.EntryApiResponse; using SAUtils.Schema; using VariantAnnotation.Providers; namespace SAUtils.Omim; public sealed class OmimParser { private readonly string _mimToGeneSymbolFile; private readonly string _omimJsonFile; private readonly SaJsonSchema _jsonSchema; private const string CurrentOmimJsonVersion = "1.0"; public readonly OmimStatistics OmimStats = new(); public OmimParser(string mimToGeneSymbolFile, string omimJsonFile, SaJsonSchema jsonSchema) { _mimToGeneSymbolFile = mimToGeneSymbolFile; _omimJsonFile = omimJsonFile; _jsonSchema = jsonSchema; } public DataSourceVersion GetVersion() => DataSourceVersionReader.GetSourceVersion(_omimJsonFile); public IEnumerable GetItems() { Dictionary mimToGeneSymbol = GetMimNumberToGeneSymbol(); EntryRoot entryRoot = GetEntryRootObject(); Dictionary phenotypeDescriptions = GetPhenotypeDescriptions(entryRoot); foreach (OmimItem omimItem in GetOmimItems(entryRoot, mimToGeneSymbol, phenotypeDescriptions)) { OmimStats.Add(omimItem); yield return omimItem; } } private static Dictionary GetPhenotypeDescriptions(EntryRoot entryRoot) { Dictionary phenotypeToDescription = new Dictionary(); foreach (var entry in entryRoot.omim.entryList) { var item = entry.entry; // gene only item if (item.prefix == '*') continue; var description = OmimUtilities.ExtractAndProcessItemDescription(item); if (string.IsNullOrEmpty(description)) continue; phenotypeToDescription[item.mimNumber] = description; } return phenotypeToDescription; } private Dictionary GetMimNumberToGeneSymbol() { var mimNumberToGeneSymbol = new Dictionary(); using (var stream = new FileStream(_mimToGeneSymbolFile, FileMode.Open)) using (var reader = new StreamReader(stream)) { string line; //title line reader.ReadLine(); while ((line = reader.ReadLine()) != null) { var fields = line.OptimizedSplit('\t'); mimNumberToGeneSymbol[int.Parse(fields[0])] = fields[1]; } } return mimNumberToGeneSymbol; } private EntryRoot GetEntryRootObject() { using var fileStream = new FileStream(_omimJsonFile, FileMode.Open); using var uncompressedStream = new GZipStream(fileStream, CompressionMode.Decompress); using var streamReader = new StreamReader(uncompressedStream); var entryQueryResponse = JsonConvert.DeserializeObject(streamReader.ReadToEnd()); if (entryQueryResponse.omim.version != CurrentOmimJsonVersion) throw new InvalidDataException($"An unknown version of OMIM JSON schema has been used: version {entryQueryResponse.omim.version}. The latest known version is {CurrentOmimJsonVersion}"); return entryQueryResponse; } private IEnumerable GetOmimItems(EntryRoot entryRoot, Dictionary mimToGeneSymbol, Dictionary phenotypeDescriptions) { foreach (var entry in entryRoot.omim.entryList) { var item = entry.entry; var mimNumber = item.mimNumber; //skip if not a supported gene symbol if (!mimToGeneSymbol.TryGetValue(mimNumber, out var geneSymbol)) continue; string description = OmimUtilities.ExtractAndProcessItemDescription(item); string geneName = item.geneMap?.geneName; var phenotypes = item.geneMap?.phenotypeMapList?.Select(x => OmimUtilities.GetPhenotype(x, phenotypeDescriptions, _jsonSchema.GetSubSchema("phenotypes"))) .ToList() ?? new List(); yield return new OmimItem(geneSymbol, geneName, description, mimNumber, phenotypes, _jsonSchema); } } } ================================================ FILE: SAUtils/Omim/OmimPhenotypeSchema.cs ================================================ using System.Linq; using System.Text; using SAUtils.Schema; namespace SAUtils.Omim { public static class OmimPhenotypeSchema { private static readonly SaJsonValueType PrimaryValueType = SaJsonValueType.ObjectArray; private static readonly (string JsonKey, SaJsonValueType ValueType)[] SchemaDescription = { ("mimNumber", SaJsonValueType.Number), ("phenotype", SaJsonValueType.String), ("description", SaJsonValueType.String), ("mapping", SaJsonValueType.String), ("inheritances", SaJsonValueType.StringArray), ("comments", SaJsonValueType.String) }; public static SaJsonSchema Get() { var jsonSchema = SaJsonSchema.Create(new StringBuilder(), null, PrimaryValueType, SchemaDescription.Select(x => x.JsonKey)); jsonSchema.SetNonSaKeys(new[] { "isAlleleSpecific" }); foreach ((string key, var valueType) in SchemaDescription) jsonSchema.AddAnnotation(key, SaJsonKeyAnnotation.CreateFromProperties(valueType, 0, null)); return jsonSchema; } } } ================================================ FILE: SAUtils/Omim/OmimQuery.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Net.Http; using System.Text.RegularExpressions; using OptimizedCore; using SAUtils.GeneIdentifiers; namespace SAUtils.Omim { public sealed class OmimQuery : IDisposable { private readonly HttpClient _httpClient; private readonly FileStream _mimToSymbolStream; private readonly FileStream _jsonResponseStream; private string _jsonPrefix; private readonly string _mimTitlesUrl; private const string Mim2GeneUrl = "https://omim.org/static/omim/data/mim2gene.txt"; private const string MimTitlesFileName = "mimTitles.txt"; private const string OmimApiUrl = "https://api.omim.org/api/"; private const string OmimDownloadBaseUrl = "https://data.omim.org/downloads/"; private const string EntryHandler = "entry"; private const int EntryQueryLimit = 20; private const string ReturnDataFormat = "json"; private const string MimToSymbolFile = "MimToGeneSymbol.tsv"; public const string JsonResponseFile = "MimEntries.json.gz"; private const string JsonPrefixPattern = @"^{""omim"": { \n""version"": ""\d+\.\d+\"",\n""entryList"": \[ \n"; private const string JsonTextEnding = "] \n} }"; public OmimQuery(string apiKey, string outputDirectory) { _httpClient = new HttpClient(); _httpClient.DefaultRequestHeaders.Add("ApiKey", apiKey); if (string.IsNullOrEmpty(outputDirectory)) return; _mimTitlesUrl = GetMimTitlesUrl(apiKey); _mimToSymbolStream = new FileStream(Path.Combine(outputDirectory, MimToSymbolFile), FileMode.Create); _jsonResponseStream = new FileStream(Path.Combine(outputDirectory, JsonResponseFile), FileMode.Create); } private static string GetMimTitlesUrl(string apiKey) => $"{OmimDownloadBaseUrl}{apiKey}/{MimTitlesFileName}"; private List GetMimsToDownload() { var mims = new List(); using (var response = _httpClient.GetAsync(_mimTitlesUrl).Result) using (var reader = new StreamReader(response.Content.ReadAsStreamAsync().Result)) { string line; while ((line = reader.ReadLine()) != null) { //Caret (^) Entry has been removed from the database or moved to another entry if (line.OptimizedStartsWith('#') || line.StartsWith("Caret")) continue; var fields = line.Split('\t', 3); mims.Add(fields[1]); } } return mims; } public void GenerateMimToGeneSymbolFile(GeneSymbolUpdater geneSymbolUpdater) { using StreamWriter writer = new StreamWriter(_mimToSymbolStream); using var response = _httpClient.GetAsync(Mim2GeneUrl).Result; using var reader = new StreamReader(response.Content.ReadAsStreamAsync().Result); writer.WriteLine("#MIM number\tGene symbol"); string line; while ((line = reader.ReadLine()) != null) { if (line.OptimizedStartsWith('#')) continue; var fields = line.OptimizedSplit('\t'); var geneSymbol = fields[3]; if (string.IsNullOrEmpty(geneSymbol)) continue; var mimNumber = fields[0]; var entrezGeneId = fields[2]; var ensemblGeneId = fields[4]; var updatedGeneSymbol = geneSymbolUpdater.UpdateGeneSymbol(geneSymbol, ensemblGeneId, entrezGeneId); if (string.IsNullOrEmpty(updatedGeneSymbol)) continue; writer.WriteLine($"{mimNumber}\t{updatedGeneSymbol}"); } } public void GenerateJsonResponse() { var i = 0; var mimNumbers = GetMimsToDownload(); var needComma = false; using Stream gzStream = new GZipStream(_jsonResponseStream, CompressionMode.Compress); using StreamWriter writer = new StreamWriter(gzStream); while (i < mimNumbers.Count) { var endMimNumberIndex = Math.Min(i + EntryQueryLimit - 1, mimNumbers.Count - 1); var mimNumberString = GetMimNumbersString(mimNumbers, i, endMimNumberIndex); var queryUrl = GetApiQueryUrl(OmimApiUrl, EntryHandler, ("mimNumber", mimNumberString), ("include", "text:description"), ("include", "externalLinks"), ("include", "geneMap"), ("format", ReturnDataFormat)); using (var response = _httpClient.GetAsync(queryUrl).Result) { string responseContent = response.Content.ReadAsStringAsync().Result; string entries = SetPrefixAndGetEntriesString(responseContent); if (i == 0) writer.Write(_jsonPrefix); if (needComma) writer.Write(','); writer.Write(entries); needComma = true; } i = endMimNumberIndex + 1; } writer.WriteLine(JsonTextEnding); } private string SetPrefixAndGetEntriesString(string responseContent) { if (string.IsNullOrEmpty(_jsonPrefix)) { var prefixMatch = Regex.Match(responseContent, JsonPrefixPattern); if (!prefixMatch.Success) throw new InvalidDataException( $"Cannot find expected content at the beginning of the response from OMIM server. The response starts with \"{responseContent.Substring(0, JsonPrefixPattern.Length)}\""); _jsonPrefix = prefixMatch.Value; } int entriesStringLength = responseContent.Length - _jsonPrefix.Length - JsonTextEnding.Length; return responseContent.Substring(_jsonPrefix.Length, entriesStringLength); } private static string GetMimNumbersString(List allMimNumbers, int startIndex, int endIndex) { var sb = StringBuilderPool.Get(); var needComma = false; for (int i = startIndex; i <= endIndex; i++) { if (needComma) sb.Append(','); sb.Append(allMimNumbers[i]); needComma = true; } return StringBuilderPool.GetStringAndReturn(sb); } private static string GetApiQueryUrl(string baseAddress, string handler, params (string, string)[] keyValueTuples) { var sb = StringBuilderPool.Get(); sb.Append(baseAddress); sb.Append(handler); sb.Append('?'); var needAmpersand = false; foreach ((string key, string value) in keyValueTuples) { if (needAmpersand) sb.Append('&'); sb.Append(key); sb.Append('='); sb.Append(value); needAmpersand = true; } return StringBuilderPool.GetStringAndReturn(sb); } public void Dispose() { _httpClient?.Dispose(); _mimToSymbolStream?.Dispose(); _jsonResponseStream?.Dispose(); } } } ================================================ FILE: SAUtils/Omim/OmimSchema.cs ================================================ using System.Linq; using System.Text; using SAUtils.Schema; using VariantAnnotation.SA; namespace SAUtils.Omim { public static class OmimSchema { private static readonly SaJsonValueType PrimaryValueType = SaJsonValueType.ObjectArray; private static readonly (string JsonKey, SaJsonValueType ValueType, SaJsonSchema subSchema)[] SchemaDescription = { ("mimNumber", SaJsonValueType.Number, null), ("geneName", SaJsonValueType.String, null), ("description", SaJsonValueType.String, null), ("phenotypes", null, OmimPhenotypeSchema.Get()) }; public static SaJsonSchema Get() { var jsonSchema = SaJsonSchema.Create(new StringBuilder(), SaCommon.OmimTag, PrimaryValueType, SchemaDescription.Select(x => x.JsonKey)); jsonSchema.SetNonSaKeys(new[] { "isAlleleSpecific" }); foreach ((string key, var valueType, var subSchema) in SchemaDescription) { var keyAnnotation = valueType == null ? SaJsonKeyAnnotation.CreateFromSubSchema(subSchema) : SaJsonKeyAnnotation.CreateFromProperties(valueType, 0, null); jsonSchema.AddAnnotation(key, keyAnnotation); } return jsonSchema; } } } ================================================ FILE: SAUtils/Omim/OmimStatistics.cs ================================================ using System.Text; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.IO; namespace SAUtils.Omim; public class OmimStatistics { public uint TotalItems = 0; public uint TotalPhenotypes = 0; public CounterDictionary PhenotypeMappings = new(); public CounterDictionary PhenotypeInheritances = new(); public void Add(OmimItem omimItem) { TotalItems++; foreach (OmimItem.Phenotype phenotype in omimItem.Phenotypes) { TotalPhenotypes++; PhenotypeMappings.Add(phenotype.Mapping.ToString()); foreach (string inheritance in phenotype.Inheritance) { PhenotypeInheritances.Add(inheritance); } } } public override string ToString() { StringBuilder sb = StringBuilderPool.Get(); var jo = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jo.AddUIntValue("totalItems", TotalItems); jo.AddUIntValue("totalPhenotypes", TotalPhenotypes); jo.AddObjectValue("mappings", PhenotypeMappings); jo.AddObjectValue("inheritances", PhenotypeInheritances); sb.Append(JsonObject.CloseBrace); return StringBuilderPool.GetStringAndReturn(sb); } } ================================================ FILE: SAUtils/Omim/OmimUtilities.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using System.Text.RegularExpressions; using OptimizedCore; using SAUtils.DataStructures; using SAUtils.Omim.EntryApiResponse; using SAUtils.Schema; using VariantAnnotation.Interface.SA; namespace SAUtils.Omim { public static class OmimUtilities { public static OmimItem.Phenotype GetPhenotype(PhenotypeMap phenotypeMap, Dictionary phenotypeDescriptions, SaJsonSchema jsonSchema) { var phenotypeItem = phenotypeMap.phenotypeMap; var mimNumber = phenotypeItem.phenotypeMimNumber; phenotypeDescriptions.TryGetValue(mimNumber, out var description); var (phenotype, comments) = ExtractPhenotypeAndComments(phenotypeItem.phenotype); return new OmimItem.Phenotype(mimNumber, phenotype, description, (OmimItem.Mapping)phenotypeItem.phenotypeMappingKey, comments, ExtractInheritances(phenotypeItem.phenotypeInheritance), jsonSchema); } private static HashSet ExtractInheritances(string inheritance) { var inheritances = new HashSet(); if (String.IsNullOrEmpty(inheritance)) return inheritances; foreach (string content in inheritance.OptimizedSplit(';')) { string trimmedContent = content.Trim(' '); inheritances.Add(trimmedContent); } return inheritances; } internal static (string Phenotype, OmimItem.Comment[] Comments) ExtractPhenotypeAndComments(string phenotypeString) { phenotypeString = phenotypeString.Trim(' ').Trim(',').Replace(@"\\'", "'", StringComparison.Ordinal); string phenotype = Regex.Replace(phenotypeString,@" \(\d\) ", " "); var comments = phenotypeString.Select(GetComment) .Where(x => x != OmimItem.Comment.unknown) .ToArray(); return (phenotype, comments); } private static OmimItem.Comment GetComment(char symbol) { return symbol switch { '?' => OmimItem.Comment.unconfirmed_or_possibly_spurious_mapping, '[' => OmimItem.Comment.nondiseases, '{' => OmimItem.Comment .contribute_to_susceptibility_to_multifactorial_disorders_or_to_susceptibility_to_infection, _ => OmimItem.Comment.unknown }; } public static Dictionary> GetGeneToOmimEntriesAndSchema(IEnumerable omimItems) { var geneToOmimEntries = new Dictionary>(); SaJsonSchema jsonSchema = null; foreach (var item in omimItems) { if (jsonSchema == null) jsonSchema = item.JsonSchema; if (item.GeneSymbol == null) continue; if (geneToOmimEntries.TryGetValue(item.GeneSymbol, out var mimList)) { mimList.Add(item); } else { geneToOmimEntries[item.GeneSymbol] = new List { item }; } } return geneToOmimEntries; } // remove links enclosed by parentheses with only numbers, e.g. ({12345}) public static string RemoveLinks(this string text) => text == null ? null : Regex.Replace(Regex.Replace(Regex.Replace(text, @"((and|see|;|(e\.g\.)?,) )*{\d+(\.\d+)?}", ""), @" ?\((\ |/)*\)", ""), @"{([\d,]+:)?(.+?)}", "$2"); public static string RemoveFormatControl(this string text) => text == null ? null : Regex.Replace(text, "", ""); public static string ExtractAndProcessItemDescription(EntryItem item) { const string sectionName = "description"; return item.textSectionList?.FirstOrDefault(x => x.textSection.textSectionName == sectionName)? .textSection.textSectionContent.RemoveLinks().RemoveFormatControl(); } } } ================================================ FILE: SAUtils/Omim/OmimVersion.cs ================================================ using System; using System.IO; namespace SAUtils.Omim { public static class OmimVersion { private const string Name = "OMIM"; private const string Description = "An Online Catalog of Human Genes and Genetic Disorders"; private const string VersionFileSuffix = ".version"; public static void WriteToFile(string outputPrefix, string outputDirectory) { using (var stream = new FileStream(Path.Combine(outputDirectory, outputPrefix + VersionFileSuffix), FileMode.Create)) using (var writer = new StreamWriter(stream)) { var currentDate = DateTime.Today; writer.WriteLine($"NAME={Name}"); writer.WriteLine($"VERSION={currentDate:yyyyMMdd}"); writer.WriteLine($"DATE={currentDate:yyyy-MM-dd}"); writer.WriteLine($"DESCRIPTION={Description}"); } } } } ================================================ FILE: SAUtils/OneKGenSvDb/Create.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.OneKGen; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.OneKGenSvDb { public static class Create { private static string _inputFileName; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "OneKGenSv BED file", v => _inputFileName = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckInputFilenameExists(_inputFileName, "OneKGenSv BED file", "--in") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with 1000 Genome structural variant annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFileName + ".version"); string outFileName = $"{version.Name}_{version.Version}".Replace(' ','_'); using(var reader = GZipUtilities.GetAppropriateStreamReader(_inputFileName)) using(var oneKGenSvReader = new OneKGenSvReader(reader, referenceProvider.RefNameToChromosome)) using(var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.IntervalFileSuffix))) using(var nsiWriter = new NsiWriter(nsaStream, version, referenceProvider.Assembly, SaCommon.OnekSvTag, ReportFor.StructuralVariants, SaCommon.SchemaVersion)) { nsiWriter.Write(oneKGenSvReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/OneKGenSvDb/VcfToBed.cs ================================================ using System.IO; using System.IO.Compression; using System.Linq; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using OptimizedCore; using VariantAnnotation.Interface.IO; namespace SAUtils.OneKGenSvDb { public static class VcfToBed { private static string _inputFileName; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "in|i=", "OneKGenSv VCF file", v => _inputFileName = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_inputFileName, "OneKGenSv VCF file", "--in") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Convert the VCF file into BED-like format", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { string outFileName = Path.GetFileName(_inputFileName).Replace("vcf", "bed"); using (var reader = GZipUtilities.GetAppropriateStreamReader(_inputFileName)) using (var outputStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName))) using (var outputGzipStream = new GZipStream(outputStream, CompressionMode.Compress)) using (var writer = new StreamWriter(outputGzipStream)) { string line; while ((line = reader.ReadLine()) != null) { var fields = line.OptimizedSplit('\t', VcfCommon.InfoIndex + 2); if (fields.Length <= VcfCommon.InfoIndex) continue; string infoFields = fields[VcfCommon.InfoIndex]; string svEnd = GetSvEndString(infoFields); if (svEnd == null) continue; // Because 1K Genome SV has a padding base, the POS should add one to get the 1-based start position of the interval // However, the start position need to minus one to become the 0-based start position in a BED file // So the POS value can be used directly in the BED file. writer.WriteLine(string.Join('\t', fields[VcfCommon.ChromIndex], fields[VcfCommon.PosIndex], svEnd, fields[VcfCommon.IdIndex], fields[VcfCommon.AltIndex], infoFields)); } } return ExitCodes.Success; } private static string GetSvEndString(string infoFields) { if (infoFields == "" || infoFields == ".") return null; string endInfo = infoFields.OptimizedSplit(';').FirstOrDefault(x => x.StartsWith("END=")); return string.IsNullOrEmpty(endInfo) ? null : endInfo.Substring(4); } } } ================================================ FILE: SAUtils/ParseUtils/SAParseUtilities.cs ================================================ using Variants; namespace SAUtils.ParseUtils { public static class SaParseUtilities { public static VariantType GetSequenceAlterationType(string dgvType, string dgvSubType) { var sequenceAlterationType = VariantType.unknown; if (dgvType == "CNV") { switch (dgvSubType) { case "deletion": sequenceAlterationType = VariantType.copy_number_loss; break; case "duplication": sequenceAlterationType = VariantType.copy_number_gain; break; case "gain": sequenceAlterationType = VariantType.copy_number_gain; break; case "gain+loss": sequenceAlterationType = VariantType.copy_number_variation; break; case "loss": sequenceAlterationType = VariantType.copy_number_loss; break; case "insertion": sequenceAlterationType = VariantType.insertion; break; case "mobile element insertion": sequenceAlterationType = VariantType.mobile_element_insertion; break; case "novel sequence insertion": sequenceAlterationType = VariantType.novel_sequence_insertion; break; case "tandem duplication": sequenceAlterationType = VariantType.tandem_duplication; break; default: sequenceAlterationType = VariantType.unknown; break; } } else if (dgvType == "OTHER") { switch (dgvSubType) { case "complex": sequenceAlterationType = VariantType.complex_structural_alteration; break; case "inversion": sequenceAlterationType = VariantType.inversion; break; case "sequence alteration": sequenceAlterationType = VariantType.structural_alteration; break; default: sequenceAlterationType = VariantType.unknown; break; } } return sequenceAlterationType; } public static VariantType GetSequenceAlteration(string svType) { VariantType sequenceAlterationType; switch (svType) { case "DEL": sequenceAlterationType = VariantType.copy_number_loss; break; case "DUP": sequenceAlterationType = VariantType.copy_number_gain; break; case "CNV": sequenceAlterationType = VariantType.copy_number_variation; break; case "INS": sequenceAlterationType = VariantType.insertion; break; case "ALU": sequenceAlterationType = VariantType.mobile_element_insertion; break; case "LINE1": sequenceAlterationType = VariantType.mobile_element_insertion; break; case "SVA": sequenceAlterationType = VariantType.mobile_element_insertion; break; case "INV": sequenceAlterationType = VariantType.inversion; break; default: sequenceAlterationType = VariantType.unknown; break; } return sequenceAlterationType; } public static int? Add(int? x, int? y) { if (x == null && y == null) return null; if (x == null) return y; if (y == null) return x; return x + y; } } } ================================================ FILE: SAUtils/ParseUtils/SplitLine.cs ================================================ using System.Globalization; using OptimizedCore; namespace SAUtils.ParseUtils; public sealed class SplitLine { private readonly string[] _splitLine; public SplitLine(in string inputLine, in char delimiter) { _splitLine = inputLine.OptimizedSplit(delimiter); } public string GetString(in int index) { return _splitLine[index]; } public int? ParseInteger(in int index) { return ParseInteger(_splitLine[index]); } public double? ParseDouble(in int index) { return ParseDouble(_splitLine[index]); } public static int? ParseInteger(string valueString) { return int.TryParse( valueString, NumberStyles.Integer | NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out int temp ) ? temp : null; } public static double? ParseDouble(string valueString) { return double.TryParse(valueString, out double temp) ? temp : null; } } ================================================ FILE: SAUtils/ParseUtils/TsvIndices.cs ================================================ namespace SAUtils.ParseUtils; public struct TsvIndices { public ushort Chromosome = ushort.MaxValue; public ushort Start = ushort.MaxValue; public ushort End = ushort.MaxValue; public ushort VariantId = ushort.MaxValue; public ushort SvType = ushort.MaxValue; public ushort Filters = ushort.MaxValue; public ushort AllAlleleCount = ushort.MaxValue; public ushort AfrAlleleCount = ushort.MaxValue; public ushort AmrAlleleCount = ushort.MaxValue; public ushort EasAlleleCount = ushort.MaxValue; public ushort EurAlleleCount = ushort.MaxValue; public ushort OthAlleleCount = ushort.MaxValue; public ushort FemaleAlleleCount = ushort.MaxValue; public ushort MaleAlleleCount = ushort.MaxValue; public ushort AllAlleleFrequency = ushort.MaxValue; public ushort AfrAlleleFrequency = ushort.MaxValue; public ushort AmrAlleleFrequency = ushort.MaxValue; public ushort EasAlleleFrequency = ushort.MaxValue; public ushort EurAlleleFrequency = ushort.MaxValue; public ushort OthAlleleFrequency = ushort.MaxValue; public ushort FemaleAlleleFrequency = ushort.MaxValue; public ushort MaleAlleleFrequency = ushort.MaxValue; public ushort AllAlleleNumber = ushort.MaxValue; public ushort AfrAlleleNumber = ushort.MaxValue; public ushort AmrAlleleNumber = ushort.MaxValue; public ushort EasAlleleNumber = ushort.MaxValue; public ushort EurAlleleNumber = ushort.MaxValue; public ushort OthAlleleNumber = ushort.MaxValue; public ushort FemaleAlleleNumber = ushort.MaxValue; public ushort MaleAlleleNumber = ushort.MaxValue; public ushort AllHomCount = ushort.MaxValue; public ushort AfrHomCount = ushort.MaxValue; public ushort AmrHomCount = ushort.MaxValue; public ushort EasHomCount = ushort.MaxValue; public ushort EurHomCount = ushort.MaxValue; public ushort OthHomCount = ushort.MaxValue; public ushort FemaleHomCount = ushort.MaxValue; public ushort MaleHomCount = ushort.MaxValue; public TsvIndices() { } } ================================================ FILE: SAUtils/PhyloP/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.PhyloP { public static class Main { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input WifFix file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "PhyloP WigFix file", "--in") .CheckInputFilenameExists(_inputFile, "PhyloP WigFix file", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("create Ancestral allele database from 1000Genomes data", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}"; using (var phylopParser = new PhylopParser(GZipUtilities.GetAppropriateReadStream(_inputFile), referenceProvider.Assembly, referenceProvider.RefNameToChromosome)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.PhylopFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.PhylopFileSuffix + SaCommon.IndexSuffix))) using (var writer = new NpdWriter(nsaStream, indexStream, version, referenceProvider.Assembly, SaCommon.PhylopTag, SaCommon.SchemaVersion)) { writer.Write(phylopParser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/PhyloP/NpdWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Utilities; using Compression.Algorithms; using Genome; using IO; using SAUtils.DataStructures; using VariantAnnotation.PhyloP; using VariantAnnotation.Providers; namespace SAUtils.PhyloP { public sealed class NpdWriter:IDisposable { private readonly ExtendedBinaryWriter _writer; private readonly byte[] _scores; private readonly byte[] _compressedScores; private readonly MemoryStream _memStream; private readonly ExtendedBinaryWriter _memWriter; private readonly Zstandard _zstd; private readonly Dictionary _scoreMap; private byte _nextScoreCode = 1; //0 is reserved to indicate no score private readonly NpdIndex _index; public NpdWriter(Stream dbStream, Stream indexStream, DataSourceVersion version, GenomeAssembly assembly, string jsonKey, int schemaVersion) { _writer = new ExtendedBinaryWriter( dbStream); _index = new NpdIndex(indexStream, assembly, version, jsonKey, schemaVersion); _scoreMap = new Dictionary(byte.MaxValue); _scores = new byte[NpdIndex.MaxChromLength]; _memStream = new MemoryStream(_scores); _memWriter = new ExtendedBinaryWriter(_memStream); _zstd = new Zstandard(); _compressedScores = new byte[_zstd.GetCompressedBufferBounds(_scores.Length)]; } private ushort _chromIndex = ushort.MaxValue; private string _chromName = ""; public void Write(IEnumerable items) { var benchmark = new Benchmark(); int lastPosition = 0; foreach (PhylopItem item in items) { if (item.Chromosome.Index != _chromIndex) { //flush out old chrom if (_chromIndex != ushort.MaxValue) { WriteCompressed(lastPosition); Console.WriteLine($"Chromosome {_chromName} completed in {Benchmark.ToHumanReadable(benchmark.GetElapsedTime())}"); benchmark.Reset(); } _chromIndex = item.Chromosome.Index; _chromName = item.Chromosome.EnsemblName; } if (! _scoreMap.TryGetValue(item.Score, out byte _)) { _scoreMap.Add(item.Score, _nextScoreCode++); if (_nextScoreCode==byte.MaxValue) throw new ArgumentOutOfRangeException($"No of distinct scores exceeded expected value of {_nextScoreCode}!!"); } _memStream.Position = item.Position - 1; _memWriter.Write(_scoreMap[item.Score]); lastPosition = item.Position; } //closing the last chromosome WriteCompressed(lastPosition); Console.WriteLine($"Chromosome {_chromName} completed in {Benchmark.ToHumanReadable(benchmark.GetElapsedTime())}"); benchmark.Reset(); Console.WriteLine($"\nNumber of distinct scores oberved:{_scoreMap.Count}"); _index.Write(_scoreMap); } private void WriteCompressed(int lastPosition) { var startLocation = _writer.BaseStream.Position; int compressSize = _zstd.Compress(_scores, lastPosition, _compressedScores, _compressedScores.Length); _writer.Write(_compressedScores, 0, compressSize); _index.Add(_chromIndex, startLocation, compressSize); Array.Clear(_scores, 0, _scores.Length); _memStream.Position = 0;//reset the stream } public void Dispose() { _writer?.Dispose(); _memStream?.Dispose(); _memWriter?.Dispose(); } } } ================================================ FILE: SAUtils/PhyloP/PhylopParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using IO; using OptimizedCore; using SAUtils.DataStructures; namespace SAUtils.PhyloP { public sealed class PhylopParser : IDisposable { private readonly Stream _stream; private readonly GenomeAssembly _assembly; private readonly Dictionary _refChromDict; public PhylopParser(Stream stream, GenomeAssembly assembly, Dictionary refChromDict) { _stream = stream; _assembly = assembly; _refChromDict = refChromDict; } public IEnumerable GetItems() { using (var reader = FileUtilities.GetStreamReader(_stream)) { Chromosome chrom = null; int position = 0; int step = 0; string line; while ((line = reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; if (double.TryParse(line, out double score)) { // the chrom is unrecognized, so we skip if (chrom ==null || chrom.Index==ushort.MaxValue) continue; // since phylop used hg19, we skip entries for chrM if (_assembly == GenomeAssembly.GRCh37 && chrom.UcscName == "chrM") continue; // this is a phylop score yield return new PhylopItem(chrom, position, score); position += step; } else { (chrom, position, step) = StartNewInterval(line); } } } } private (Chromosome chrom, int position, int step) StartNewInterval(string line) { var words = line.Split(); string chromName = words[1].OptimizedKeyValue().Value; var chrom = _refChromDict.TryGetValue(chromName, out var chromosome)? chromosome: Chromosome.GetEmptyChromosome(chromName); if (chrom.Index == ushort.MaxValue) return (chrom, 0, 0); int position = int.Parse(words[2].OptimizedKeyValue().Value); int step = short.Parse(words[3].OptimizedKeyValue().Value); return (chrom, position, step); } public void Dispose() { _stream?.Dispose(); } } } ================================================ FILE: SAUtils/PrimateAi/PrimateAiDb.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Caches; using VariantAnnotation.IO.Caches; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.PrimateAi { public static class PrimateAiDb { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; private static string _transcriptCachePrefix; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input VCF file path", v => _inputFile = v }, { "cache|c=", "Transcript cache prefix", v => _transcriptCachePrefix = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "PrimateAI VCF file", "--in") .HasRequiredParameter(_transcriptCachePrefix, "transcript cache file", "--cache") .CheckInputFilenameExists(_inputFile, "PrimateAI VCF file", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}"; TranscriptCacheData transcriptData; using (var transcriptCacheReader = new TranscriptCacheReader(FileUtilities.GetReadStream(CacheConstants.TranscriptPath(_transcriptCachePrefix)))) { transcriptData = transcriptCacheReader.Read(referenceProvider.RefIndexToChromosome); } var (entrezToHgnc, ensemblToHgnc) = PrimateAiUtilities.GetIdToSymbols(transcriptData); using (var primateAiParser = new PrimateAiParser(GZipUtilities.GetAppropriateReadStream(_inputFile),referenceProvider, entrezToHgnc, ensemblToHgnc)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.PrimateAiTag, true, true, SaCommon.SchemaVersion, false)) { nsaWriter.Write(primateAiParser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/PrimateAi/PrimateAiItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.PrimateAi { public sealed class PrimateAiItem : ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } private string Hgnc { get; } public double ScorePercentile { get; } public PrimateAiItem(Chromosome chromosome, int position, string refAllele, string altAllele, string hgnc, double percentile) { Chromosome = chromosome; Position = position; RefAllele = refAllele; AltAllele = altAllele; Hgnc = hgnc; ScorePercentile = percentile; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("hgnc", Hgnc); jsonObject.AddDoubleValue("scorePercentile", ScorePercentile, "0.##"); return StringBuilderPool.GetStringAndReturn(sb); } public string InputLine { get; set; } } } ================================================ FILE: SAUtils/PrimateAi/PrimateAiParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using IO; using OptimizedCore; using VariantAnnotation.Interface.Providers; namespace SAUtils.PrimateAi { public sealed class PrimateAiParser : IDisposable { private readonly Stream _stream; private readonly ISequenceProvider _referenceProvider; private readonly Dictionary _entrezToHgnc; private readonly Dictionary _ensemblToHgnc; public PrimateAiParser(Stream stream, ISequenceProvider referenceProvider, Dictionary entrezToHgnc, Dictionary ensemblToHgnc) { _stream = stream; _entrezToHgnc = entrezToHgnc; _ensemblToHgnc = ensemblToHgnc; _referenceProvider = referenceProvider; } public IEnumerable GetItems() { using (var reader = FileUtilities.GetStreamReader(_stream)) { string line; while ((line = reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.OptimizedStartsWith('#')) continue; var item = ExtractItem(line); if (item == null) continue; yield return item; } } Console.WriteLine($"Number of entries:{_count}. Entries without hgnc:{_nullGeneCount} ({100.0*_nullGeneCount/_count} %)"); } //#CHROM POS REF ALT GeneId ScorePercentile //1 69094 G A 79501 0.79 private int _nullGeneCount; private int _count; private PrimateAiItem ExtractItem(string line) { var splits = line.Split('\t'); var chromosomeName = splits[0]; if (!_referenceProvider.RefNameToChromosome.ContainsKey(chromosomeName)) return null; var chromosome = _referenceProvider.RefNameToChromosome[chromosomeName]; var position = int.Parse(splits[1]); var refAllele = splits[2]; var altAllele = splits[3]; var geneId = splits[4]; var percentile = double.Parse(splits[5]); string hgnc=null; if (_entrezToHgnc.ContainsKey(geneId)) hgnc = _entrezToHgnc[geneId]; if (_ensemblToHgnc.ContainsKey(geneId)) hgnc = _ensemblToHgnc[geneId]; if (string.IsNullOrEmpty(hgnc)) { _nullGeneCount++; return null; } _count++; return new PrimateAiItem(chromosome, position, refAllele, altAllele, hgnc, percentile); } public void Dispose() { _stream?.Dispose(); _referenceProvider?.Dispose(); } } } ================================================ FILE: SAUtils/PrimateAi/PrimateAiUtilities.cs ================================================ using System; using System.Collections.Generic; using VariantAnnotation.Caches; namespace SAUtils.PrimateAi { public static class PrimateAiUtilities { public static (Dictionary entrezToHgnc, Dictionary ensemblToHgnc) GetIdToSymbols(TranscriptCacheData transcriptData) { var entrezToHgnc = new Dictionary(); var ensemblToHgnc = new Dictionary(); foreach (var gene in transcriptData.Genes) { if(gene.EntrezGeneId.WithoutVersion == "649330") Console.WriteLine("bug"); if(! string.IsNullOrEmpty(gene.EntrezGeneId.WithoutVersion)) entrezToHgnc[gene.EntrezGeneId.WithoutVersion] = gene.Symbol; if (!string.IsNullOrEmpty(gene.EnsemblId.WithoutVersion)) ensemblToHgnc[gene.EnsemblId.WithoutVersion] = gene.Symbol; } return (entrezToHgnc, ensemblToHgnc); } } } ================================================ FILE: SAUtils/ProcessSpliceNetTsv/PredictionFilter.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using Compression.Utilities; using Intervals; namespace SAUtils.ProcessSpliceNetTsv { public static class PredictionFilter { private const int GffChrColumn = 0; private const int GffFeatureColumn = 2; private const int GffStartColumn = 3; private const int GffEndColumn = 4; private const int NumChrs = 25; private const int PredChrColumn = 0; private const int PredPosColumn = 1; private static readonly int[] PredScoreColumns = { 6, 8, 10, 12 }; private const double FreqCutoff = 0.05; private const int IntronBoundaryDistanceCutoff = 15; public static void Filter(string intputTsv, string gffFile1, string gffFile2, string outputTsv) { var intronFlankingRegions = GetIntronFlankingRegions(gffFile1, gffFile2); using (var resultsReader = GZipUtilities.GetAppropriateStreamReader(intputTsv)) using (var resultsWriter = GZipUtilities.GetStreamWriter(outputTsv)) { long lineCount = 0; string line; while ((line = resultsReader.ReadLine()) != null) { var info = line.TrimEnd().Split('\t'); ushort chrIndex = GetChrIndex(info[PredChrColumn]); int pos = int.Parse(info[PredPosColumn]); if (intronFlankingRegions.OverlapsAny(chrIndex, pos, pos) || AnyScorePassTheCutoff(info, PredScoreColumns, FreqCutoff)) { resultsWriter.WriteLine(line); } lineCount++; if (lineCount % 1_000_000 == 0) { Console.WriteLine($"Processed {lineCount} lines. Current position: {info[PredChrColumn]}:{info[PredPosColumn]}"); } } } } private static bool AnyScorePassTheCutoff(string[] columns, int[] scoreColumnIndices, double scoreCutoff) { foreach (int columnIndex in scoreColumnIndices) { if (double.Parse(columns[columnIndex]) >= scoreCutoff) return true; } return false; } private static IntervalForest GetIntronFlankingRegions(string gffFile1, string gffFile2) { var flankingRegions = new IntervalArray[NumChrs]; var flankingRegionStarts1 = GetIntronFlankingRegionStarts(gffFile1); var flankingRegionStarts2 = GetIntronFlankingRegionStarts(gffFile2); for (var i = 0; i < NumChrs; i++) { var allStartsThisChr = new HashSet(flankingRegionStarts1[i]); allStartsThisChr.UnionWith(flankingRegionStarts2[i]); var intervals = GetIntervals(allStartsThisChr, IntronBoundaryDistanceCutoff * 2); flankingRegions[i] = new IntervalArray(intervals.ToArray()); } return new IntervalForest(flankingRegions); } private static IEnumerable> GetIntervals(IEnumerable starts, int size) => starts.Select(x => new Interval(x, x + size - 1, 0)); private static HashSet[] GetIntronFlankingRegionStarts(string gffFile) { var flankingRegionStarts = new HashSet[NumChrs]; for (var i = 0; i < NumChrs; i++) flankingRegionStarts[i] = new HashSet(); using (var gffReader = GZipUtilities.GetAppropriateStreamReader(gffFile)) { string line; var previousChrIndex = ushort.MaxValue; var exonBoundaries = new List(); var flankingRegionStartsthisChr = new HashSet(); while ((line = gffReader.ReadLine()) != null) { var info = line.Split('\t'); if (info[GffFeatureColumn] == "gene") { ushort chrIndex = GetChrIndex(info[GffChrColumn]); if (previousChrIndex != ushort.MaxValue && chrIndex != previousChrIndex) { ProcessBufferedBoundaries(exonBoundaries, flankingRegionStartsthisChr); flankingRegionStarts[previousChrIndex] = flankingRegionStartsthisChr; flankingRegionStartsthisChr = new HashSet(); } previousChrIndex = chrIndex; } else if (info[GffFeatureColumn] == "transcript") { ProcessBufferedBoundaries(exonBoundaries, flankingRegionStartsthisChr); exonBoundaries = new List(); } else if (info[GffFeatureColumn] == "exon") { int start = int.Parse(info[GffStartColumn]); int end = int.Parse(info[GffEndColumn]); exonBoundaries.Add(new Interval(start, end)); } } if (previousChrIndex != ushort.MaxValue) { ProcessBufferedBoundaries(exonBoundaries, flankingRegionStartsthisChr); flankingRegionStarts[previousChrIndex] = flankingRegionStartsthisChr; } } return flankingRegionStarts; } private static void ProcessBufferedBoundaries(List exonBoundaries, HashSet flankingRegionStartsthisChr) { for (var i = 1; i < exonBoundaries.Count; i++) { // Donor site for intron i flankingRegionStartsthisChr.Add(exonBoundaries[i - 1].End - IntronBoundaryDistanceCutoff + 1); // Acceptor site for intron i flankingRegionStartsthisChr.Add(exonBoundaries[i].Start - IntronBoundaryDistanceCutoff); } } private static ushort GetChrIndex(string chrName) { if (chrName.StartsWith("chr")) chrName = chrName.Substring(3); if (ushort.TryParse(chrName, out ushort chrNum)) { return (ushort)(chrNum - 1); } switch (chrName) { case "X": return 22; case "Y": return 23; case "M": case "MT": return 24; default: return ushort.MaxValue; } } } } ================================================ FILE: SAUtils/ProcessSpliceNetTsv/SpliceNetPredictionFilterMain.cs ================================================ using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; namespace SAUtils.ProcessSpliceNetTsv { public static class SpliceNetPredictionFilterMain { private static string _spliceNetResultsFile; private static string _filteredResultsFile; private static string _gffFile1; private static string _gffFile2; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "in|i=", "SpliceNet prediction results", v => _spliceNetResultsFile = v }, { "gff1|g1=", "Gene structure file 1", v => _gffFile1 = v }, { "gff2|g2=", "Gene structures file 2", v => _gffFile2 = v }, { "out|o=", "Filtered SpliceNet results", v => _filteredResultsFile = v } }; var commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_spliceNetResultsFile, "SpliceNet prediction results", "--in") .CheckInputFilenameExists(_gffFile1, "Gene structures file 1", "--gff1") .CheckInputFilenameExists(_gffFile2, "Gene structures file 2", "--gff2") .SkipBanner() .ShowHelpMenu("Filter SpliceNet results based on predicted scores and variant location", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { PredictionFilter.Filter(_spliceNetResultsFile, _gffFile1, _gffFile2, _filteredResultsFile); return ExitCodes.Success; } } } ================================================ FILE: SAUtils/RefMinorDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.OneKGen; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.RefMinorDb { public static class Main { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input VCF file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFile, "OneK Gen VCFfile", "--in") .CheckInputFilenameExists(_inputFile, "OneK Gen VCFfile", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}_{SaCommon.RefMinorTag}".Replace(' ','_'); using (var refMinorReader = new RefMinorReader(GZipUtilities.GetAppropriateStreamReader(_inputFile), referenceProvider)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.RefMinorFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.RefMinorFileSuffix + SaCommon.IndexSuffix))) using (var writer = new RefMinorDbWriter(new ExtendedBinaryWriter(nsaStream), new ExtendedBinaryWriter(indexStream), version, referenceProvider, SaCommon.SchemaVersion)) { writer.Write(refMinorReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/RefMinorDb/RefMinorDbWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Utilities; using IO; using SAUtils.DataStructures; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.RefMinorDb { public sealed class RefMinorDbWriter:IDisposable { private readonly ExtendedBinaryWriter _writer; private readonly Stream _stream; private readonly ISequenceProvider _refProvider; private readonly RefMinorIndex _refMinorIndex; public RefMinorDbWriter(ExtendedBinaryWriter writer, ExtendedBinaryWriter indexWriter, DataSourceVersion version, ISequenceProvider refProvider, int schemaVersion) { _stream = writer.BaseStream; _writer = writer; _refProvider = refProvider; _refMinorIndex = new RefMinorIndex(indexWriter, _refProvider.Assembly, version, schemaVersion); } public void Write(IEnumerable saItems) { var itemsMinHeap = new MinHeap(SuppDataUtilities.CompareTo); var chromIndex = ushort.MaxValue; var currentEnsemblName = ""; var benchmark = new Benchmark(); foreach (var saItem in saItems) { if (chromIndex != saItem.Chromosome.Index) { if (chromIndex != ushort.MaxValue) { //flushing out the remaining items in buffer WriteUptoPosition(itemsMinHeap, int.MaxValue); Console.WriteLine($"Chromosome {currentEnsemblName} completed in {Benchmark.ToHumanReadable(benchmark.GetElapsedTime())}"); benchmark.Reset(); } chromIndex = saItem.Chromosome.Index; currentEnsemblName = saItem.Chromosome.EnsemblName; _refProvider.LoadChromosome(saItem.Chromosome); } if (saItem.RefAllele != _refProvider.Sequence.Substring(saItem.Position-1, saItem.RefAllele.Length)) continue; //the items come in sorted order of the pre-trimmed position. //So when writing out, we have to make sure that we do not write past this position. //Once a position has been seen in the stream, we can safely write all positions before that. var writeToPos = saItem.Position; saItem.Trim(); itemsMinHeap.Add(saItem); WriteUptoPosition(itemsMinHeap, writeToPos); } //flushing out the remaining items in buffer WriteUptoPosition(itemsMinHeap, int.MaxValue); Console.WriteLine($"Chromosome {currentEnsemblName} completed in {Benchmark.ToHumanReadable(benchmark.GetElapsedTime())}"); _refMinorIndex.Write(_stream.Position); } private void WriteUptoPosition(MinHeap itemsHeap, int position) { if (itemsHeap.Count() == 0) return; var bufferMin = itemsHeap.GetMin(); while (bufferMin.Position < position) { var itemsAtMinPosition = new List(); while (itemsHeap.Count() > 0 && SuppDataUtilities.CompareTo(bufferMin, itemsHeap.GetMin()) == 0) itemsAtMinPosition.Add(itemsHeap.ExtractMin()); WritePosition(itemsAtMinPosition); if (itemsHeap.Count() == 0) break; bufferMin = itemsHeap.GetMin(); } } private void WritePosition(List saItems) { var refMinorItem = (RefMinorItem)GetRefMinorItem(saItems); if (refMinorItem == null) return; _refMinorIndex.Add(refMinorItem.Chromosome.Index, _stream.Position); _writer.WriteOpt(refMinorItem.Position); _writer.WriteOptAscii(refMinorItem.GlobalMajor); } private static ISupplementaryDataItem GetRefMinorItem(List saItems) { var totalAltAlleleFreq = 0.0; var alleleFrequencies = new Dictionary(); string refAllele = null; foreach (var supplementaryDataItem in saItems) { var item = (AlleleFrequencyItem) supplementaryDataItem; if (!IsSnv(item.RefAllele) || !IsSnv(item.AltAllele)) continue; refAllele = item.RefAllele; totalAltAlleleFreq += item.AltFrequency; alleleFrequencies[item.AltAllele] = item.AltFrequency; } var isRefMinor = totalAltAlleleFreq >= SaCommon.RefMinorThreshold; if (!isRefMinor) return null; string globalMajor = SuppDataUtilities.GetMostFrequentAllele(alleleFrequencies, refAllele); return new RefMinorItem(saItems[0].Chromosome, saItems[0].Position, globalMajor); } private static bool IsSnv(string allele) { if (allele.Length != 1) return false; allele = allele.ToUpper(); return allele == "A" || allele == "C" || allele == "G" || allele == "T"; } public void Dispose() { _writer?.Dispose(); _stream?.Dispose(); } } } ================================================ FILE: SAUtils/RegionUtilities.cs ================================================ using System; using Genome; using Intervals; using VariantAnnotation.Interface.SA; namespace SAUtils { public static class RegionUtilities { private static readonly IInterval Grch37Par1 = new Interval(10_001, 2_649_520); private static readonly IInterval Grch37Par2 = new Interval(59_034_050, 59_363_566); private static readonly IInterval Grch38Par1 = new Interval(10_001, 2_781_479); private static readonly IInterval Grch38Par2 = new Interval(56_887_903, 57_217_415); public static bool OverlapsParRegion(ISupplementaryDataItem variant, GenomeAssembly assembly) { if (variant.Chromosome.UcscName != "chrY") return false; var start = variant.Position; var end = variant.Position + Math.Max(variant.AltAllele.Length, variant.RefAllele.Length); switch (assembly) { case GenomeAssembly.hg19: case GenomeAssembly.GRCh37: return Grch37Par1.Overlaps(start, end) || Grch37Par2.Overlaps(start, end); case GenomeAssembly.GRCh38: return Grch38Par1.Overlaps(start, end) || Grch38Par2.Overlaps(start, end); default: return false; } } } } ================================================ FILE: SAUtils/Revel/Create.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using SAUtils.InputFileParsers; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.Revel { public static class Create { private static string _inputFile; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "input REVEL file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; var commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckInputFilenameExists(_inputFile, "input REVEL file Path", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Create a supplementary database from REVEL input file ", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var nucleotides = new[] {"A", "C", "G", "T"}; var revelParserSettings = new ParserSettings( new ColumnIndex(0, 1, 2, 3, 6, null), nucleotides, GenericScoreParser.MaxRepresentativeScores ); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); var outFileName = $"{version.Name}_{version.Version}"; using (var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference))) using (var streamReader = GZipUtilities.GetAppropriateStreamReader(_inputFile)) using (var revelParser = new GenericScoreParser(revelParserSettings, streamReader, referenceProvider.RefNameToChromosome)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.RevelTag, true, false, SaCommon.SchemaVersion, false)) { nsaWriter.Write(revelParser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/SAUtils.cs ================================================ using System.Collections.Generic; using CommandLine.Builders; using ErrorHandling; using SAUtils.ClinGen; using SAUtils.CosmicGeneFusions; using SAUtils.CreateClinvarDb; using SAUtils.DbSnpRemapper; using SAUtils.ExtractCosmicSvs; using SAUtils.ExtractMiniSa; using SAUtils.ExtractMiniXml; using SAUtils.FusionCatcher; using SAUtils.GERP; using SAUtils.gnomAD; using SAUtils.GnomadGeneScores; using SAUtils.MitoHeteroplasmy; using SAUtils.MitoMap; using SAUtils.NsaIndexUpdater; using SAUtils.PrimateAi; using SAUtils.ProcessSpliceNetTsv; using SAUtils.SpliceAi; using VariantAnnotation.Interface; namespace SAUtils { public static class SaUtils { public static int Main(string[] args) { var ops = new Dictionary { ["AaCon"] = new("create AA conservation database", AAConservation.AaConservationMain.Run), ["ancestralAllele"] = new("create Ancestral allele database from 1000Genomes data", MakeAaDb.Main.Run), ["ClinGen"] = new("create ClinGen database", MakeClinGenDb.Main.Run), ["clinvar"] = new("create ClinVar database", ClinVarMain.Run), ["concat"] = new("merge multiple NSA files for the same data source having non-overlapping regions", NsaConcatenator.NsaConcatenator.Run), ["Cosmic"] = new("create COSMIC database", CreateCosmicDb.Main.Run), ["CosmicSv"] = new("create COSMIC SV database", ExtractCosmicSvsMain.Run), ["CosmicFusion"] = new("create COSMIC gene fusion database", CreateCosmicGeneFusions.Run), ["CustomGene"] = new("create custom gene annotation database", Custom.GeneMain.Run), ["CustomVar"] = new("create custom variant annotation database", Custom.VariantMain.Run), ["Dann"] = new("create DANN database", Dann.Create.Run), ["Dbsnp"] = new("create dbSNP database", CreateDbsnpDb.Main.Run), ["Dgv"] = new("create DGV database", makeDgvDb.Main.Run), ["DiseaseValidity"] = new("create disease validity database", GeneDiseaseValidity.Run), ["DosageMapRegions"] = new("create dosage map regions", DosageMapRegions.Run), ["DosageSensitivity"] = new("create dosage sensitivity database", DosageSensitivity.Run), ["DownloadOmim"] = new("download OMIM database", Omim.Downloader.Run), ["ExtractMiniSA"] = new("extracts mini SA", ExtractMiniSaMain.Run), ["ExtractMiniXml"] = new("extracts mini XML (ClinVar)", ExtractMiniXmlMain.Run), ["FilterSpliceNetTsv"] = new("filter SpliceNet predictions", SpliceNetPredictionFilterMain.Run), ["FusionCatcher"] = new("create FusionCatcher database", CreateFusionCatcher.Run), ["Gerp"] = new("create GERP conservation database", GerpMain.Run), ["GlobalMinor"] = new("create global minor allele database", CreateGlobalAllelesDb.Main.Run), ["Gnomad"] = new("create gnomAD database", GnomadSnvMain.Run), ["Gnomad-lcr"] = new("create gnomAD low complexity region database", LcrRegionsMain.Run), ["GnomadGeneScores"] = new("create gnomAD gene scores database", GnomadGenesMain.Run), ["GnomadSV"] = new("create gnomAD structural variant database", GnomadSvMain.Run), ["Index"] = new("edit an index file", UpdateIndex.Run), ["MitoHet"] = new("create mitochondrial Heteroplasmy database", MitoHeteroplasmyDb.Run), ["MitomapSvDb"] = new("create MITOMAP structural variants database", StructVarDb.Run), ["MitomapVarDb"] = new("create MITOMAP small variants database", SmallVarDb.Run), ["Omim"] = new("create OMIM database", Omim.Main.Run), ["OneKGen"] = new("create 1000 Genome small variants database", CreateOneKgDb.Main.Run), ["OneKGenSv"] = new("create 1000 Genomes structural variants database", OneKGenSvDb.Create.Run), ["OneKGenSvVcfToBed"] = new("convert 1000 Genomes structural variants VCF file into a BED-like file", OneKGenSvDb.VcfToBed.Run), ["PhyloP"] = new("create PhyloP database", PhyloP.Main.Run), ["PrimateAi"] = new("create PrimateAI database", PrimateAiDb.Run), ["RefMinor"] = new("create Reference Minor database from 1000 Genome ", RefMinorDb.Main.Run), ["RemapWithDbsnp"] = new("remap a VCF file given source and destination rsID mappings", DbSnpRemapperMain.Run), ["Revel"] = new("create REVEL database", Revel.Create.Run), ["SpliceAi"] = new("create SpliceAI database", SpliceAiDb.Run), ["TopMed"] = new("create TOPMed database", CreateTopMedDb.Main.Run), ["Gme"] = new("create GME Variome database", CreateGmeDb.Main.Run), ["Decipher"] = new("create Decipher database", CreateDecipherDb.Main.Run) }; ExitCodes exitCode = new TopLevelAppBuilder(args, ops) .Parse() .ShowBanner(Constants.Authors) .ShowHelpMenu("Utilities focused on supplementary annotation") .ShowErrors() .Execute(); return (int) exitCode; } } } ================================================ FILE: SAUtils/SAUtils.csproj ================================================  Exe net6.0 ..\bin\$(Configuration) ================================================ FILE: SAUtils/SaUtilsCommon.cs ================================================ using System.Linq; namespace SAUtils { public static class SaUtilsCommon { public static bool IsNumberNullOrZero(int? item) { return item is null or 0; } /// /// Returns a regular alternate allele when a provided with one have SA format. /// In case of long insertions or InsDel, where the saAltAllele contains an MD5 hash, the hash is returned. /// /// supplementary annotation alternate allele /// The way the calling function wants to represent an empty allele /// regular alternate allele public static string ReverseSaReducedAllele(string saAltAllele, string emptyAllele = "-") { if (saAltAllele == null) return null; if (saAltAllele.All(char.IsDigit)) return emptyAllele; // this was a deletion int firstBaseIndex; for (firstBaseIndex = 0; firstBaseIndex < saAltAllele.Length; firstBaseIndex++) { if (saAltAllele[firstBaseIndex] != 'i' && saAltAllele[firstBaseIndex] != '<' && !char.IsDigit(saAltAllele[firstBaseIndex])) break; } if (saAltAllele.Substring(firstBaseIndex) == "") return emptyAllele; return firstBaseIndex > 0 && firstBaseIndex < saAltAllele.Length ? saAltAllele.Substring(firstBaseIndex) : saAltAllele; } public static bool HasFailedFilters(string filters) { return !(filters.Equals("PASS") || filters.Equals(".")); } } } ================================================ FILE: SAUtils/Schema/SaJsonKeyAnnotation.cs ================================================ using VariantAnnotation.SA; namespace SAUtils.Schema { public sealed class SaJsonKeyAnnotation { public SaJsonKeyProperties Properties; public SaJsonSchema Schema; private SaJsonKeyAnnotation() { } public static SaJsonKeyAnnotation CreateFromProperties(SaJsonValueType valueType, CustomAnnotationCategories category, string description) { return new SaJsonKeyAnnotation {Properties = new SaJsonKeyProperties(valueType, category, description)}; } public static SaJsonKeyAnnotation CreateFromSubSchema(SaJsonSchema schema) { return new SaJsonKeyAnnotation { Schema = schema}; } } } ================================================ FILE: SAUtils/Schema/SaJsonKeyProperties.cs ================================================ using VariantAnnotation.SA; namespace SAUtils.Schema { public sealed class SaJsonKeyProperties { public readonly SaJsonValueType ValueType; public readonly CustomAnnotationCategories Category; public readonly string Description; public SaJsonKeyProperties(SaJsonValueType valueType, CustomAnnotationCategories category, string description) { ValueType = valueType; Category = category; Description = description; } } } ================================================ FILE: SAUtils/Schema/SaJsonSchema.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using System.Text; using ErrorHandling.Exceptions; using OptimizedCore; using VariantAnnotation.IO; using VariantAnnotation.SA; namespace SAUtils.Schema { public sealed class SaJsonSchema { private const string SchemaVersion = "http://json-schema.org/draft-06/schema#"; public int TotalItems { get; set; } private readonly StringBuilder _sb; private readonly JsonObject _jsonObject; private readonly Dictionary _keyAnnotation = new Dictionary(); private IEnumerable Keys { get; set; } // Keys not used to generate the NSA file, but in the Nirvana JSON output private string[] NonSaKeys { get; set; } = { }; internal readonly Dictionary KeyCounts = new Dictionary(); private Action> _jsonStringGenerationAction; private bool _finalized; internal SaJsonSchema(StringBuilder sb) { _sb = sb; _jsonObject = new JsonObject(sb); } public static SaJsonSchema Create(StringBuilder sb, string jsonTag, SaJsonValueType primaryType, IEnumerable jsonKeys) { var jsonSchema = new SaJsonSchema(sb) { Keys = jsonKeys }; // The root level schema for a SA if (jsonTag != null) { jsonSchema._jsonObject.StartObject(); jsonSchema.AddSchemaVersion(); // SA json is an object jsonSchema.AddJsonDataType(JsonDataType.Object); jsonSchema._jsonObject.StartObjectWithKey(jsonTag); } jsonSchema.AddValueTypes(primaryType); return jsonSchema; } public void SetNonSaKeys(string[] nonSaKeys) { NonSaKeys = nonSaKeys; } private void AddAnnotation(SaJsonKeyAnnotation annotation) { if (annotation.Properties != null) { AddAnnotationProperties(annotation); } else { _sb.Append(annotation.Schema); _jsonObject.EndObject(); } } private void AddAnnotationProperties(SaJsonKeyAnnotation annotation) { AddValueTypes(annotation.Properties.ValueType); int numComplexTypes = annotation.Properties.ValueType.JsonDataTypes.Count(x => x.IsComplexType()); while (numComplexTypes > 0) { _jsonObject.EndObject(); numComplexTypes--; } if (annotation.Properties.Category != CustomAnnotationCategories.Unknown) _jsonObject.AddStringValue("category", annotation.Properties.Category.ToString()); if (annotation.Properties.Description != null) _jsonObject.AddStringValue("description", annotation.Properties.Description); } private void AddValueTypes(SaJsonValueType jsonValueType) { foreach (var dataType in jsonValueType.JsonDataTypes) { AddJsonDataType(dataType); } } private void AddJsonDataType(JsonDataType jsonType) { _jsonObject.AddStringValue("type", jsonType.ToTypeString()); if (jsonType.IsComplexType()) _jsonObject.StartObjectWithKey(jsonType.GetSchemaKey()); } private void AddSchemaVersion() => _jsonObject.AddStringValue("$schema", SchemaVersion); private SaJsonValueType GetJsonType(string key) => _keyAnnotation[key].Properties?.ValueType; private CustomAnnotationCategories GetCategory(string key) => _keyAnnotation[key].Properties?.Category ?? 0; public void AddAnnotation(string key, SaJsonKeyAnnotation annotation) { _keyAnnotation.Add(key, annotation); KeyCounts.Add(key, 0); } public override string ToString() { if (!_finalized) FinalizeSchema(); return _sb.ToString(); } private void FinalizeSchema() { var requiredKeys = new List(); foreach (string key in Keys) { int counts = KeyCounts[key]; if (counts == 0 && !NonSaKeys.Contains(key)) continue; // boolean is always considered as optional if (counts == TotalItems && !GetJsonType(key).Equals(SaJsonValueType.Bool)) requiredKeys.Add(key); OutputKeyAnnotation(key); } _jsonObject.EndObject(); OutputRequiredKeys(requiredKeys); DisallowExtraProperites(); _jsonObject.EndAllObjects(); _finalized = true; } private void OutputRequiredKeys(IReadOnlyCollection requiredKeys) { if (requiredKeys.Count > 0) _jsonObject.AddStringValues("required", requiredKeys); } private void DisallowExtraProperites() { _jsonObject.AddStringValue("additionalProperties", "false", false); } private Action> GetJsonStringGenerationAction() { var actions = new List>(); foreach (string key in Keys) { if (NonSaKeys.Contains(key)) continue; var intendedType = GetJsonType(key); if (intendedType.Equals(SaJsonValueType.String)) { actions.Add((jsonObject, value) => CountKeyIfAdded(jsonObject.AddStringValue(key, value[0]), key)); } else if (intendedType.Equals(SaJsonValueType.Bool)) { actions.Add((jsonObject, value) => CountKeyIfAdded(jsonObject.AddBoolValue(key, CheckAndGetBoolFromString(value[0])), key)); } else if (intendedType.Equals(SaJsonValueType.Number)) { actions.Add((jsonObject, value) => { if (value[0] == null) return; var doubleValue = CheckAndGetNullableDoubleFromString(value[0]); CustomAnnotationCategories keyCategory = GetCategory(key); CountKeyIfAdded(keyCategory == CustomAnnotationCategories.AlleleFrequency ? jsonObject.AddDoubleValue(key, doubleValue, "0.######") : jsonObject.AddStringValue(key, value[0], false), key); }); } else if (intendedType.Equals(SaJsonValueType.StringArray)) { actions.Add((jsonObject, value) => CountKeyIfAdded(jsonObject.AddStringValues(key, value), key)); } else { throw new Exception($"Unknown data type {intendedType}"); } } return (jsonObject, strings) => { foreach (var (action, str) in actions.Zip(strings, (a, b) => (a, b))) { action(jsonObject, str); } TotalItems++; }; } public void CountKeyIfAdded(bool keyAdded, string key) { if (keyAdded) KeyCounts[key]++; } public string GetJsonString(List values) { if (_jsonStringGenerationAction == null) _jsonStringGenerationAction = GetJsonStringGenerationAction(); if (values.Count != Keys.Count(x => !NonSaKeys.Contains(x))) throw new UserErrorException("Please provide one and only one value for each JSON key."); var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); _jsonStringGenerationAction(jsonObject, values); return StringBuilderPool.GetStringAndReturn(sb); } internal void OutputKeyAnnotation(string key) { _jsonObject.StartObjectWithKey(key); var annotation = _keyAnnotation[key]; AddAnnotation(annotation); _jsonObject.EndObject(); } internal static bool CheckAndGetBoolFromString(string value) { switch (value.ToLower()) { case "true": return true; case "false": case "": case ".": return false; default: throw new UserErrorException($"{value} is not a valid boolean."); } } internal static double? CheckAndGetNullableDoubleFromString(string value) { if (value == "." || value == "") return null; if (double.TryParse(value, out double doubleValue)) return doubleValue; throw new UserErrorException($"{value} is not a valid number."); } public SaJsonSchema GetSubSchema(string key) { if (!_keyAnnotation.TryGetValue(key, out var annotation)) throw new KeyNotFoundException($"{key} is not JSON key."); return annotation.Schema; } } } ================================================ FILE: SAUtils/Schema/SaJsonValueType.cs ================================================ using System; using System.Linq; using VariantAnnotation.SA; namespace SAUtils.Schema { public sealed class SaJsonValueType : IEquatable { public JsonDataType[] JsonDataTypes { get; } public static readonly SaJsonValueType Number = Create(JsonDataType.Number); public static readonly SaJsonValueType Bool = Create(JsonDataType.Bool); public static readonly SaJsonValueType String = Create(JsonDataType.String); public static readonly SaJsonValueType Object = Create(JsonDataType.Object); public static readonly SaJsonValueType Array = Create(JsonDataType.Array); public static readonly SaJsonValueType StringArray = Create(JsonDataType.Array, JsonDataType.String); public static readonly SaJsonValueType ObjectArray = Create(JsonDataType.Array, JsonDataType.Object); private SaJsonValueType(params JsonDataType[] dataTypes) { JsonDataTypes = dataTypes; } private static SaJsonValueType Create(params JsonDataType[] dataTypes) { if (dataTypes.Length > 2) throw new ArgumentException("At most two JSON data types are allowed."); if (dataTypes.Length == 2 && !dataTypes[0].IsComplexType()) throw new ArgumentException("The first data type must a complex type when two data types provided."); return new SaJsonValueType(dataTypes); } private bool JsonTypeEquals(params JsonDataType[] dataTypes) => JsonDataTypes.Length == dataTypes.Length && JsonDataTypes.SequenceEqual(dataTypes); public bool Equals(SaJsonValueType other) { if (ReferenceEquals(null, other)) return false; return ReferenceEquals(this, other) || JsonTypeEquals(other.JsonDataTypes); } public override int GetHashCode() { if (JsonDataTypes == null) return 0; unchecked { return JsonDataTypes.Aggregate(17, (current, jsonDataType) => (current * 1201) ^ jsonDataType.GetHashCode()); } } } } ================================================ FILE: SAUtils/SpliceAi/SpliceAiDb.cs ================================================ using System; using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Caches; using VariantAnnotation.IO.Caches; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.SpliceAi { public static class SpliceAiDb { private static string _inputFile; private static string _compressedReference; private static string _transcriptCachePrefix; private static string _outputDirectory; private static string _geneInfoFile; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "cache|c=", "Transcript cache prefix", v => _transcriptCachePrefix = v }, { "gene|g=", "SpliceAi gene data", v => _geneInfoFile = v }, { "in|i=", "input VCF file path", v => _inputFile = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_transcriptCachePrefix, "transcript cache file", "--cache") .CheckInputFilenameExists(CacheConstants.TranscriptPath(_transcriptCachePrefix), "transcript cache prefix", "--cache") .HasRequiredParameter(_inputFile, "SpliceAI VCF file", "--in") .CheckInputFilenameExists(_inputFile, "SpliceAI VCF file", "--in") .HasRequiredParameter(_geneInfoFile, "SpliceAi gene data", "--gene") .CheckInputFilenameExists(_geneInfoFile, "SpliceAi gene data", "--gene") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database containing 1000 Genomes allele frequencies", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); TranscriptCacheData transcriptData; using (var transcriptCacheReader = new TranscriptCacheReader(FileUtilities.GetReadStream(CacheConstants.TranscriptPath(_transcriptCachePrefix)))) { transcriptData = transcriptCacheReader.Read(referenceProvider.RefIndexToChromosome); } var spliceIntervals = SpliceUtilities.GetSpliceIntervals(referenceProvider, transcriptData); var nirEnstToGeneSymbols = SpliceUtilities.GetEnstToGeneSymbols(referenceProvider, transcriptData); Dictionary spliceAiEnstToGeneSymbols; using (var reader = new StreamReader(GZipUtilities.GetAppropriateReadStream(_geneInfoFile))) { spliceAiEnstToGeneSymbols = SpliceUtilities.GetSpliceAiGeneSymbols(reader); } var spliceAiToNirvanaGeneSymbols = SpliceUtilities.GetSymbolMapping(spliceAiEnstToGeneSymbols, nirEnstToGeneSymbols); Console.WriteLine($"Mapped {spliceAiToNirvanaGeneSymbols.Count} spliceAI gene symbols to Nirvana gene symbols (out of {spliceAiEnstToGeneSymbols.Count})"); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); string outFileName = $"{version.Name}_{version.Version}"; using (var spliceAiParser = new SpliceAiParser( GZipUtilities.GetAppropriateReadStream(_inputFile), referenceProvider, spliceIntervals, spliceAiToNirvanaGeneSymbols)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.SpliceAiTag, true, true, SaCommon.SchemaVersion, false)) { nsaWriter.Write(spliceAiParser.GetItems()); } Console.WriteLine($"Total number of entries from Splice AI: {SpliceAiParser.Count}"); return ExitCodes.Success; } } } ================================================ FILE: SAUtils/SpliceAi/SpliceAiItem.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace SAUtils.SpliceAi { public sealed class SpliceAiItem:ISupplementaryDataItem { public Chromosome Chromosome { get; } public int Position { get; set; } public string RefAllele { get; set; } public string AltAllele { get; set; } public string Hgnc { get; set; } public const double MinSpliceAiScore = 0.1; private readonly bool _isSpliceAdjacent; private readonly double _acceptorGainScore; private readonly double _acceptorLossScore; private readonly double _donorGainScore; private readonly double _donorLossScore; private readonly int _acceptorGainPosition; private readonly int _acceptorLossPosition; private readonly int _donorGainPosition; private readonly int _donorLossPosition; public SpliceAiItem(Chromosome chromosome, int position, string refAllele, string altAllele, string hgnc, double acceptorGainScore, double acceptorLossScore, double donorGainScore, double donorLossScore, int acceptorGainPosition, int acceptorLossPosition, int donorGainPosition, int donorLossPosition, bool isSpliceAdjacent) { Chromosome = chromosome; Position = position; RefAllele = refAllele; AltAllele = altAllele; Hgnc = hgnc; _acceptorGainScore = acceptorGainScore; _acceptorLossScore = acceptorLossScore; _donorGainScore = donorGainScore; _donorLossScore = donorLossScore; _acceptorGainPosition = acceptorGainPosition; _acceptorLossPosition = acceptorLossPosition; _donorGainPosition = donorGainPosition; _donorLossPosition = donorLossPosition; _isSpliceAdjacent = isSpliceAdjacent; } public string GetJsonString() { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddStringValue("hgnc", Hgnc); if (_isSpliceAdjacent) { jsonObject.AddDoubleValue("acceptorGainScore", _acceptorGainScore, "0.#"); jsonObject.AddDoubleValue("acceptorGainDistance", _acceptorGainPosition); jsonObject.AddDoubleValue("acceptorLossScore", _acceptorLossScore, "0.#"); jsonObject.AddDoubleValue("acceptorLossDistance", _acceptorLossPosition); jsonObject.AddDoubleValue("donorGainScore", _donorGainScore, "0.#"); jsonObject.AddDoubleValue("donorGainDistance", _donorGainPosition); jsonObject.AddDoubleValue("donorLossScore", _donorLossScore, "0.#"); jsonObject.AddDoubleValue("donorLossDistance", _donorLossPosition); } else { if (_acceptorGainScore >= MinSpliceAiScore) { jsonObject.AddDoubleValue("acceptorGainScore", _acceptorGainScore, "0.#"); jsonObject.AddDoubleValue("acceptorGainDistance", _acceptorGainPosition); } if (_acceptorLossScore >= MinSpliceAiScore) { jsonObject.AddDoubleValue("acceptorLossScore", _acceptorLossScore, "0.#"); jsonObject.AddDoubleValue("acceptorLossDistance", _acceptorLossPosition); } if (_donorGainScore >= MinSpliceAiScore) { jsonObject.AddDoubleValue("donorGainScore", _donorGainScore, "0.#"); jsonObject.AddDoubleValue("donorGainDistance", _donorGainPosition); } if (_donorLossScore >= MinSpliceAiScore) { jsonObject.AddDoubleValue("donorLossScore", _donorLossScore, "0.#"); jsonObject.AddDoubleValue("donorLossDistance", _donorLossPosition); } } return StringBuilderPool.GetStringAndReturn(sb); } public string InputLine { get; set; } } } ================================================ FILE: SAUtils/SpliceAi/SpliceAiParser.cs ================================================ using System; using System.Collections.Generic; using System.Data; using System.IO; using Intervals; using IO; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.SpliceAi { public sealed class SpliceAiParser:IDisposable { private readonly Stream _stream; private readonly ISequenceProvider _sequenceProvider; private readonly Dictionary> _spliceIntervals; private readonly HashSet _unresolvedSymbols; public static int Count; private string _geneSymbol; private double _acceptorGainScore; private double _acceptorLossScore; private double _donorGainScore; private double _donorLossScore; private int _acceptorGainPosition; private int _acceptorLossPosition; private int _donorGainPosition; private int _donorLossPosition; private readonly Dictionary _spliceToNirvanaSymbols; public SpliceAiParser(Stream stream, ISequenceProvider sequenceProvider, Dictionary> spliceIntervals, Dictionary spliceToNirGeneSymbols) { _stream = stream; _sequenceProvider = sequenceProvider; _spliceIntervals = spliceIntervals; _spliceToNirvanaSymbols = spliceToNirGeneSymbols; _unresolvedSymbols = new HashSet(); } public IEnumerable GetItems() { using (var reader = FileUtilities.GetStreamReader(_stream)) { string line; while ((line = reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // comments may contain the Format field if (line.OptimizedStartsWith('#')) { if (line.Contains("Format:")) GetFieldIndices(line); continue; } var item = ExtractItem(line); if (item == null) continue; UpdateGeneSymbol(item); if (string.IsNullOrEmpty(item.Hgnc)) continue; yield return item; } } Console.WriteLine($"{_unresolvedSymbols.Count} unresolved gene symbols encountered. Symbols:"); foreach (var symbol in _unresolvedSymbols) { Console.Write(symbol+','); } } //##INFO= private int _geneSymbolIndex = -1; private int _dsAgIndex = -1; private int _dsAlIndex = -1; private int _dsDgIndex = -1; private int _dsDlIndex = -1; private int _dpAgIndex = -1; private int _dpAlIndex = -1; private int _dpDgIndex = -1; private int _dpDlIndex = -1; private const string GeneSymbolTag = "SYMBOL"; private const string DsAgTag = "DS_AG"; private const string DsAlTag = "DS_AL"; private const string DsDgTag = "DS_DG"; private const string DsDlTag = "DS_DL"; private const string DpAgTag = "DP_AG"; private const string DpAlTag = "DP_AL"; private const string DpDgTag = "DP_DG"; private const string DpDlTag = "DP_DL"; private void GetFieldIndices(string line) { var format = line.Split("Format:")[1]; format = format.EndsWith("\">") ? format.Substring(0, format.Length - 2): format; var fields = format.OptimizedSplit('|'); _geneSymbolIndex = Array.IndexOf(fields, GeneSymbolTag); _dsAgIndex = Array.IndexOf(fields, DsAgTag); _dsDgIndex = Array.IndexOf(fields, DsDgTag); _dsAlIndex = Array.IndexOf(fields, DsAlTag); _dsDlIndex = Array.IndexOf(fields, DsDlTag); _dpAgIndex = Array.IndexOf(fields, DpAgTag); _dpDgIndex = Array.IndexOf(fields, DpDgTag); _dpAlIndex = Array.IndexOf(fields, DpAlTag); _dpDlIndex = Array.IndexOf(fields, DpDlTag); } /// /// Extracts a splice AI item from the specified VCF line. /// /// /// private SpliceAiItem ExtractItem(string vcfLine) { var splitLine = vcfLine.Split('\t'); if (splitLine.Length < VcfCommon.InfoIndex+1) return null; var chromosomeName = splitLine[VcfCommon.ChromIndex]; if (!_sequenceProvider.RefNameToChromosome.ContainsKey(chromosomeName)) return null; var chromosome = _sequenceProvider.RefNameToChromosome[chromosomeName]; var position = int.Parse(splitLine[VcfCommon.PosIndex]); var refAllele = splitLine[VcfCommon.RefIndex]; var altAllele = splitLine[VcfCommon.AltIndex]; if (altAllele.Contains(',')) throw new DataException($"multiple alt allele present for {chromosome}-{position}"); var start = position; //skipping insertions/deletions that were shifted if (VariantUtils.IsLeftShiftPossible(refAllele, altAllele)) return null; (start, refAllele, altAllele) = BiDirectionalTrimmer.Trim(start, refAllele, altAllele); var end = start + refAllele.Length - 1; var isSpliceAdjacent = _spliceIntervals[chromosome.Index].OverlapsAny(start, end); ParseInfoField(splitLine[VcfCommon.InfoIndex]); if (!HasSignificantScore() && !isSpliceAdjacent) return null; Count++; return new SpliceAiItem(chromosome, start, refAllele, altAllele, _geneSymbol, _acceptorGainScore, _acceptorLossScore, _donorGainScore, _donorLossScore, _acceptorGainPosition, _acceptorLossPosition, _donorGainPosition, _donorLossPosition, isSpliceAdjacent); } private void UpdateGeneSymbol(SpliceAiItem item) { if (_spliceToNirvanaSymbols.TryGetValue(item.Hgnc, out var nirHgnc)) item.Hgnc = nirHgnc; else { _unresolvedSymbols.Add(item.Hgnc); } } private bool HasSignificantScore() { return _acceptorLossScore >= SpliceAiItem.MinSpliceAiScore || _acceptorGainScore >= SpliceAiItem.MinSpliceAiScore || _donorGainScore >= SpliceAiItem.MinSpliceAiScore || _donorLossScore >= SpliceAiItem.MinSpliceAiScore; } //1 69091 . A C . . SpliceAI=C|OR4F5|0.01|0.00|0.00|0.00|42|25|24|2 private void ParseInfoField(string infoFields) { Clear(); if (infoFields == "" || infoFields == ".") return; var values = infoFields.OptimizedSplit('|'); _geneSymbol = values[_geneSymbolIndex]; _acceptorGainScore = Convert.ToDouble(values[_dsAgIndex]); _acceptorLossScore = Convert.ToDouble(values[_dsAlIndex]); _donorGainScore = Convert.ToDouble(values[_dsDgIndex]); _donorLossScore = Convert.ToDouble(values[_dsDlIndex]); _acceptorGainPosition = Convert.ToInt32(values[_dpAgIndex]); _acceptorLossPosition = Convert.ToInt32(values[_dpAlIndex]); _donorGainPosition = Convert.ToInt32(values[_dpDgIndex]); _donorLossPosition = Convert.ToInt32(values[_dpDlIndex]); } private void Clear() { _geneSymbol = null; _acceptorGainScore = 0; _acceptorLossScore = 0; _donorGainScore = 0; _donorLossScore = 0; _acceptorGainPosition = int.MaxValue; _acceptorLossPosition = int.MaxValue; _donorGainPosition = int.MaxValue; _donorLossPosition = int.MaxValue; } public void Dispose() { _stream?.Dispose(); } } } ================================================ FILE: SAUtils/SpliceAi/SpliceUtilities.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using Intervals; using OptimizedCore; using VariantAnnotation.Caches; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; namespace SAUtils.SpliceAi { public static class SpliceUtilities { public const int SpliceFlankLength = 15; public static Dictionary> GetSpliceIntervals(ISequenceProvider sequenceProvider, TranscriptCacheData transcriptData) { var cache = transcriptData.GetCache(); var spliceIntervalDict = new Dictionary>(sequenceProvider.RefIndexToChromosome.Count); foreach (var chromIndex in sequenceProvider.RefIndexToChromosome.Keys) { var spliceIntervals = new List>(8 * 1024); var overlappingTranscripts = cache.TranscriptIntervalForest.GetAllOverlappingValues(chromIndex, 1, int.MaxValue); if (overlappingTranscripts == null) continue; foreach (var transcript in overlappingTranscripts) { bool isFirstExon = true; foreach (var transcriptRegion in transcript.TranscriptRegions) { if (transcriptRegion.Type != TranscriptRegionType.Exon) continue; var firstSplicePosition = transcriptRegion.Start; var secondSplicePosition = transcriptRegion.End; var firstInterval = new Interval(firstSplicePosition - SpliceFlankLength, firstSplicePosition + SpliceFlankLength, 0); var secondInterval = new Interval(secondSplicePosition - SpliceFlankLength, secondSplicePosition + SpliceFlankLength, 0); if(!isFirstExon) spliceIntervals.Add(firstInterval); spliceIntervals.Add(secondInterval); isFirstExon = false; } //remove the last added interval since this is the tail of the last exon- which is not a splice site if(spliceIntervals.Count > 0)spliceIntervals.RemoveAt(spliceIntervals.Count - 1); } spliceIntervalDict[chromIndex] = new IntervalArray(spliceIntervals.OrderBy(x => x.Begin).ThenBy(x => x.End).ToArray()); } return spliceIntervalDict; } public static Dictionary GetEnstToGeneSymbols(ISequenceProvider sequenceProvider, TranscriptCacheData transcriptData) { var cache = transcriptData.GetCache(); var enstToGeneSymbols = new Dictionary(); foreach (var chromIndex in sequenceProvider.RefIndexToChromosome.Keys) { var overlappingTranscripts = cache.TranscriptIntervalForest.GetAllOverlappingValues(chromIndex, 1, int.MaxValue); if (overlappingTranscripts == null) continue; foreach (var transcript in overlappingTranscripts) { if (transcript.Id.WithoutVersion.StartsWith("ENST")) enstToGeneSymbols[transcript.Id.WithoutVersion] = transcript.Gene.Symbol; } } return enstToGeneSymbols; } public static Dictionary GetSpliceAiGeneSymbols(StreamReader reader) { var enstToGeneSymbols = new Dictionary(); string line; while ((line = reader.ReadLine()) != null) { var splits = line.OptimizedSplit('\t'); var geneSymbol = splits[0]; var ensemblId = splits[1].OptimizedSplit('.')[0]; enstToGeneSymbols[ensemblId] = geneSymbol; } return enstToGeneSymbols; } public static Dictionary GetSymbolMapping(Dictionary spliceAiEnstToGeneSymbols, Dictionary nirEnstToGeneSymbols) { var spliceToNirSymbols= new Dictionary(); foreach (var (spliceEnst, spliceGene) in spliceAiEnstToGeneSymbols) { if (nirEnstToGeneSymbols.TryGetValue(spliceEnst, out var nirGene)) spliceToNirSymbols[spliceGene] = nirGene; } return spliceToNirSymbols; } } } ================================================ FILE: SAUtils/gnomAD/GnomadSnvMain.cs ================================================ using System; using System.IO; using System.Threading.Tasks; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using ErrorHandling.Exceptions; using IO; using OptimizedCore; using SAUtils.InputFileParsers; using SAUtils.NsaConcatenator; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.gnomAD { public sealed class GnomadSnvMain { private static string _genomeDirectory; private static string _exomeDirectory; private static string _compressedReference; private static string _outputDirectory; private static string _tempDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "genome|g=", "input directory containing VCF (and .version) files with genomic frequencies", v => _genomeDirectory = v }, { "exome|e=", "input directory containing VCF (and .version) files with exomic frequencies", v => _exomeDirectory = v }, { "temp|t=", "output temp directory for intermediate (per chrom) NSA files", v => _tempDirectory = v }, { "out|o=", "output directory for NSA file", v => _outputDirectory = v } }; var commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckDirectoryExists(_genomeDirectory, "input directory containing genome vcf files", "--genome") .CheckDirectoryExists(_outputDirectory, "output Supplementary directory", "--out") .SkipBanner() .ShowHelpMenu("Reads provided supplementary data files and populates tsv files", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { //clearing temp directory if (!Directory.Exists(_tempDirectory)) Directory.CreateDirectory(_tempDirectory); Console.WriteLine($"Cleaning {SaCommon.SaFileSuffix} and {SaCommon.IndexSuffix} files from temp directory {_tempDirectory}"); foreach (var file in Directory.GetFiles(_tempDirectory, $"*{SaCommon.SaFileSuffix}")) { File.Delete(file); } foreach (var file in Directory.GetFiles(_tempDirectory, $"*{SaCommon.SaFileSuffix}{SaCommon.IndexSuffix}")) { File.Delete(file); } var version = GetVersion(); var genomeFiles = GetVcfFiles(_genomeDirectory); var exomeFiles = GetVcfFiles(_exomeDirectory); const int degOfParalleleism = 12; //hard coding since we are IO bound and stressing the disk doesn't help Console.WriteLine($"Creating merged gnomAD database file from {genomeFiles.Length + exomeFiles.Length} input files. Degree of parallelism {degOfParalleleism}"); Parallel.ForEach( genomeFiles, new ParallelOptions { MaxDegreeOfParallelism = degOfParalleleism }, genomeFile => CreateNsa(exomeFiles, genomeFile, version) ); string outFileName = Path.Combine(_outputDirectory, $"{version.Name}_{version.Version}"); //concat the nsa files Console.WriteLine("Concatenating per chromosome nsa files"); var tempNsaFiles = Directory.GetFiles(_tempDirectory, $"*{SaCommon.SaFileSuffix}"); ConcatUtilities.ConcatenateNsaFiles(tempNsaFiles, outFileName); return ExitCodes.Success; } private static void CreateNsa(string[] exomeFiles, string genomeFile, DataSourceVersion version) { Console.WriteLine($"Processing file: {genomeFile}"); var outName = Path.GetFileNameWithoutExtension(genomeFile); using (var exomeReader = GetExomeReader(exomeFiles, genomeFile)) using (var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference))) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_tempDirectory, outName + SaCommon.SaFileSuffix))) using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_tempDirectory, outName + SaCommon.SaFileSuffix + SaCommon.IndexSuffix))) using (var nsaWriter = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.GnomadTag, true, false, SaCommon.SchemaVersion, false)) using (var reader = GZipUtilities.GetAppropriateStreamReader(genomeFile)) { var gnomadReader = new GnomadSnvReader(reader, exomeReader, referenceProvider); var count = nsaWriter.Write(gnomadReader.GetCombinedItems()); Console.WriteLine($"Wrote {count} items to NSA file."); } } private static StreamReader GetExomeReader(string[] exomeFileNames, string genomeFileName) { if (exomeFileNames == null || exomeFileNames.Length == 0) return null; string chromName = GetChromName(genomeFileName); string exomeFileName = null; foreach (string fileName in exomeFileNames) { string exomeChrom = GetChromName(fileName); if (chromName != exomeChrom) continue; exomeFileName = fileName; break; } return string.IsNullOrEmpty(exomeFileName) ? null : GZipUtilities.GetAppropriateStreamReader(exomeFileName); } private static string GetChromName(string filePath) { // the files are named in a consistent format that allows us to match files by chrom names // e.g. gnomad.exomes.r2.1.sites.grch38.chr1_noVEP.vcf.gz or chr18.vcf.bgz var fileName = Path.GetFileName(filePath); foreach (var component in fileName.OptimizedSplit('.')) { if (component.StartsWith("chr")) return component.OptimizedSplit('_')[0]; } return null; } private static DataSourceVersion GetVersion() { var genomeVersionFiles = Directory.GetFiles(_genomeDirectory, "*.version"); if (genomeVersionFiles.Length != 1) throw new InvalidDataException($"Only one .version file should exist in: {_genomeDirectory}"); var genomeVersion = DataSourceVersionReader.GetSourceVersion(genomeVersionFiles[0]); if (string.IsNullOrEmpty(_exomeDirectory)) return genomeVersion; var exomeVersionFiles = Directory.GetFiles(_exomeDirectory, "*.version"); if (exomeVersionFiles.Length != 1) throw new InvalidDataException($"Only one .version file should exist in: {_exomeDirectory}"); var exomeVersion = DataSourceVersionReader.GetSourceVersion(genomeVersionFiles[0]); if (genomeVersion.Version != exomeVersion.Version) throw new DataMisalignedException( $"Version mismatch! Genome version: {genomeVersion.Version}, Exome Version: {exomeVersion.Version}."); return genomeVersion; } private static string[] GetVcfFiles(string directory) { if (string.IsNullOrEmpty(directory)) return new string[]{}; // the files might have gz or bgz extensions var files = Directory.GetFiles(directory, "*.vcf.bgz"); if(files.Length == 0) files = Directory.GetFiles(directory, "*.vcf.gz"); if (files.Length == 0) throw new UserErrorException($"{directory} does not contain any VCF files"); return files; } } } ================================================ FILE: SAUtils/gnomAD/GnomadSnvReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using OptimizedCore; using SAUtils.DataStructures; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using Variants; namespace SAUtils.gnomAD { public sealed class GnomadSnvReader { private readonly StreamReader _genomeReader; private readonly StreamReader _exomeReader; private readonly ISequenceProvider _sequenceProvider; private int[] _acAll; private int[] _acAfr; private int[] _acAmr; private int[] _acEas; private int[] _acFin; private int[] _acNfe; private int[] _acOth; private int[] _acAsj; private int[] _acSas; private int _anAll; private int _anAfr; private int _anAmr; private int _anEas; private int _anFin; private int _anNfe; private int _anOth; private int _anAsj; private int _anSas; //male numbers private int[] _acMale; private int _anMale; private int[] _hcMale; //female numbers private int[] _acFemale; private int _anFemale; private int[] _hcFemale; private int[] _hcAll; private int[] _hcAfr; private int[] _hcAmr; private int[] _hcEas; private int[] _hcFin; private int[] _hcNfe; private int[] _hcOth; private int[] _hcAsj; private int[] _hcSas; // controls private int[] _control_acAll; private int _control_anAll; private int? _totalDepth; public GnomadSnvReader(StreamReader genomeReader, StreamReader exomeReader, ISequenceProvider sequenceProvider) { _genomeReader = genomeReader; _exomeReader = exomeReader; _sequenceProvider = sequenceProvider; } private void Clear() { _acAll = null; _acAfr = null; _acAmr = null; _acEas = null; _acFin = null; _acNfe = null; _acOth = null; _acAsj = null; _acSas = null; _anAll = 0; _anAfr = 0; _anAmr = 0; _anEas = 0; _anFin = 0; _anNfe = 0; _anOth = 0; _anAsj = 0; _anSas = 0; _acMale = null; _anMale = 0; _hcMale = null; _acFemale = null; _anFemale = 0; _hcFemale = null; _hcAll = null; _hcAfr = null; _hcAmr = null; _hcEas = null; _hcFin = null; _hcNfe = null; _hcOth = null; _hcAsj = null; _hcSas = null; //control _control_acAll = null; _control_anAll = 0; _totalDepth = null; } /// /// Merging genomic an exomic items to create one stream of gnomad entries /// /// public IEnumerable GetCombinedItems() { using (var genomeEnumerator = GetItems(_genomeReader, GnomadDataType.Genome).GetEnumerator()) using (var exomeEnumerator = GetItems(_exomeReader, GnomadDataType.Exome).GetEnumerator()) { var hasGenomicItem = genomeEnumerator.MoveNext(); var hasExomeItem = exomeEnumerator.MoveNext(); var minHeap = new MinHeap(GnomadItem.CompareTo); while (hasExomeItem && hasGenomicItem) { var genomeItem = genomeEnumerator.Current; var exomeItem = exomeEnumerator.Current; var position = Math.Min(genomeItem.Position, exomeItem.Position); while (hasGenomicItem && genomeItem.Position == position) { //all items for a position should be gathered so as to resolve conflicts properly minHeap.Add(GnomadUtilities.GetNormalizedItem(genomeItem, _sequenceProvider)); hasGenomicItem = genomeEnumerator.MoveNext(); genomeItem = genomeEnumerator.Current; } while (hasExomeItem && exomeItem.Position == position) { minHeap.Add(GnomadUtilities.GetNormalizedItem(exomeItem, _sequenceProvider)); hasExomeItem = exomeEnumerator.MoveNext(); exomeItem = exomeEnumerator.Current; } // at this point, the min heap should not be empty int heapPosition = minHeap.GetMin().Position; while (minHeap.Count() > 0 && heapPosition < position - VariantUtils.MaxUpstreamLength) { var (genomeItems, exomeItems) = GetMinItems(minHeap); foreach (var item in GnomadUtilities.GetMergedItems(genomeItems, exomeItems).Values) { if (item.AllAlleleNumber == null || item.AllAlleleNumber.Value == 0) continue; yield return item; } } } //flush out the last positions in heap while (minHeap.Count() > 0) { var (genomeItems, exomeItems) = GetMinItems(minHeap); foreach (var item in GnomadUtilities.GetMergedItems(genomeItems, exomeItems).Values) yield return item; } //now, only one of the iterator is left if (hasGenomicItem) foreach (var item in GetRemainingItems(genomeEnumerator)) yield return item; if (hasExomeItem) foreach (var item in GetRemainingItems(exomeEnumerator)) yield return item; } } private static (Dictionary<(string refAllele, string altAllele), GnomadItem> genomeItems, Dictionary<(string refAllele, string altAllele), GnomadItem> exomeItems) GetMinItems(MinHeap minHeap) { var genomeItems = new List(); var exomeItems = new List(); if (minHeap.Count() == 0) return (null, null); var position = minHeap.GetMin().Position; while (minHeap.Count() > 0 && minHeap.GetMin().Position == position) { var item = minHeap.ExtractMin(); if (item.DataType == GnomadDataType.Genome) genomeItems.Add(item); else exomeItems.Add(item); } genomeItems = SuppDataUtilities.RemoveConflictingAlleles(genomeItems, false); exomeItems = SuppDataUtilities.RemoveConflictingAlleles(exomeItems, false); var genomeItemsByAllele = new Dictionary<(string refAllele, string altAllele), GnomadItem>(); foreach (var item in genomeItems) { genomeItemsByAllele.Add((item.RefAllele, item.AltAllele), (GnomadItem) item); } var exomeItemsByAllele = new Dictionary<(string refAllele, string altAllele), GnomadItem>(); foreach (var item in exomeItems) { exomeItemsByAllele.Add((item.RefAllele, item.AltAllele), (GnomadItem) item); } return (genomeItemsByAllele, exomeItemsByAllele); } private IEnumerable GetRemainingItems(IEnumerator enumerator) { do { var item = enumerator.Current; if (item == null) yield break; if (item.AllAlleleNumber == null || item.AllAlleleNumber.Value == 0) continue; yield return GnomadUtilities.GetNormalizedItem(item, _sequenceProvider); } while (enumerator.MoveNext()); } /// /// Parses a source file and return an enumeration object containing /// all the data objects that have been extracted. /// /// private IEnumerable GetItems(StreamReader reader, GnomadDataType type) { if (reader == null) yield break; using (reader) { string line; while ((line = reader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.OptimizedStartsWith('#')) continue; var items = ExtractItems(line, type); if (items == null) continue; foreach (var item in items) { yield return item; } } } } /// /// Extracts a gnomad item(s) from the specified VCF line. /// /// /// /// private List ExtractItems(string line, GnomadDataType type) { if (line == null) return null; var splitLine = line.OptimizedSplit('\t'); if (splitLine.Length < 8) return null; Clear(); var chromosome = splitLine[VcfCommon.ChromIndex]; if (!_sequenceProvider.RefNameToChromosome.ContainsKey(chromosome)) return null; var chrom = _sequenceProvider.RefNameToChromosome[chromosome]; var position = int.Parse(splitLine[VcfCommon.PosIndex]); var refAllele = splitLine[VcfCommon.RefIndex]; var altAlleles = splitLine[VcfCommon.AltIndex].OptimizedSplit(','); var filters = splitLine[VcfCommon.FilterIndex]; var infoFields = splitLine[VcfCommon.InfoIndex]; var hasFailedFilters = !(filters.Equals("PASS") || filters.Equals(".")); // parses the info fields and extract frequencies, coverage, num samples. ParseInfoField(infoFields); var gnomadItemsList = new List(); for (int i = 0; i < altAlleles.Length; i++) { gnomadItemsList.Add(new GnomadItem( chrom, position, refAllele, altAlleles[i], _totalDepth, _anAll, _anAfr, _anAmr, _anEas, _anFin, _anNfe, _anOth, _anAsj, _anSas, _anMale, _anFemale, GetCount(_acAll, i), GetCount(_acAfr, i), GetCount(_acAmr, i), GetCount(_acEas, i), GetCount(_acFin, i), GetCount(_acNfe, i), GetCount(_acOth, i), GetCount(_acAsj, i), GetCount(_acSas, i), GetCount(_acMale, i), GetCount(_acFemale, i), GetCount(_hcAll, i), GetCount(_hcAfr, i), GetCount(_hcAmr, i), GetCount(_hcEas, i), GetCount(_hcFin, i), GetCount(_hcNfe, i), GetCount(_hcOth, i), GetCount(_hcAsj, i), GetCount(_hcSas, i), GetCount(_hcMale, i), GetCount(_hcFemale, i), //controls _control_anAll, GetCount(_control_acAll, i), hasFailedFilters, type, line )); } return gnomadItemsList; } private static int? GetCount(int[] counts, int i) { if (counts == null) return null; if (i >= counts.Length) return null; return counts[i]; } /// /// split up the info field and extract information from each of them. /// /// private void ParseInfoField(string infoFields) { if (infoFields == "" || infoFields == ".") return; var infoItems = infoFields.OptimizedSplit(';'); foreach (string infoItem in infoItems) { (string key, string value) = infoItem.OptimizedKeyValue(); // sanity check if (value != null) SetInfoField(key, value); } } /// /// Get a key value pair and using the key, set appropriate values /// /// /// private void SetInfoField(string vcfId, string value) { switch (vcfId) { case "AC": _acAll = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_XY": _acMale = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_XX": _acFemale = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_afr": _acAfr = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_amr": _acAmr = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_eas": _acEas = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_fin": _acFin = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_nfe": _acNfe = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_oth": _acOth = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_asj": _acAsj = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AC_sas": _acSas = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "AN": _anAll = Convert.ToInt32(value); break; case "AN_XY": _anMale = Convert.ToInt32(value); break; case "AN_XX": _anFemale = Convert.ToInt32(value); break; case "AN_afr": _anAfr = Convert.ToInt32(value); break; case "AN_amr": _anAmr = Convert.ToInt32(value); break; case "AN_eas": _anEas = Convert.ToInt32(value); break; case "AN_fin": _anFin = Convert.ToInt32(value); break; case "AN_nfe": _anNfe = Convert.ToInt32(value); break; case "AN_oth": _anOth = Convert.ToInt32(value); break; case "AN_asj": _anAsj = Convert.ToInt32(value); break; case "AN_sas": _anSas = Convert.ToInt32(value); break; case "nhomalt": _hcAll = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_XY": _hcMale = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_XX": _hcFemale = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_afr": _hcAfr = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_amr": _hcAmr = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_eas": _hcEas = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_fin": _hcFin = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_nfe": _hcNfe = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_oth": _hcOth = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_asj": _hcAsj = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "nhomalt_sas": _hcSas = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; // controls case "AN_controls_and_biobanks": _control_anAll = Convert.ToInt32(value); break; case "AC_controls_and_biobanks": _control_acAll = value.OptimizedSplit(',').Select(val => Convert.ToInt32(val)).ToArray(); break; case "VarDP": _totalDepth = Convert.ToInt32(value); break; } } } } ================================================ FILE: SAUtils/gnomAD/GnomadSvBedParser.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using SAUtils.DataStructures; using SAUtils.ParseUtils; using Variants; namespace SAUtils.gnomAD; public sealed class GnomadSvBedParser : GnomadSvParser { public GnomadSvBedParser( StreamReader reader, Dictionary refNameDict ) : base(reader, refNameDict) { TsvIndices = new TsvIndices { Chromosome = 0, Start = 1, End = 2, VariantId = 3, SvType = 4, Filters = 241, AllAlleleNumber = 35, AllAlleleCount = 36, AllAlleleFrequency = 37, AllHomCount = 41, MaleAlleleNumber = 45, MaleAlleleCount = 46, MaleAlleleFrequency = 47, MaleHomCount = 51, FemaleAlleleNumber = 60, FemaleAlleleCount = 61, FemaleAlleleFrequency = 62, FemaleHomCount = 66, AfrAlleleNumber = 71, AfrAlleleCount = 72, AfrAlleleFrequency = 73, AfrHomCount = 77, AmrAlleleNumber = 105, AmrAlleleCount = 106, AmrAlleleFrequency = 107, AmrHomCount = 111, EasAlleleNumber = 139, EasAlleleCount = 140, EasAlleleFrequency = 141, EasHomCount = 145, EurAlleleNumber = 173, EurAlleleCount = 174, EurAlleleFrequency = 175, EurHomCount = 179, OthAlleleNumber = 207, OthAlleleCount = 208, OthAlleleFrequency = 209, OthHomCount = 211, }; } protected override GnomadSvItem ParseLine(string inputLine) { var splitLine = new SplitLine(in inputLine, in Delimiter); string chromosomeName = splitLine.GetString(TsvIndices.Chromosome); if (!RefNameDict.ContainsKey(chromosomeName)) return null; Chromosome chromosome = RefNameDict[chromosomeName]; int? start = splitLine.ParseInteger(TsvIndices.Start); int? end = splitLine.ParseInteger(TsvIndices.End); if (start == null || end == null) throw new InvalidDataException($"Invalid Data on Line {inputLine}"); VariantType svType = SvTypeMapper(splitLine.GetString(TsvIndices.SvType)); // Ignoring BND for now if (svType == VariantType.translocation_breakend) return null; // For some reason the in the source file, the end position is +1 for insertions if (svType == VariantType.insertion) end--; start += 2; // +1 start is 0-based in BED format, also +1 for padding base if (start > end) (start, end) = (end, start); string filters = splitLine.GetString(TsvIndices.Filters); bool hasFailedFilters = SaUtilsCommon.HasFailedFilters(filters); return new GnomadSvItem(chromosome, inputLine) { Start = (int) start, End = (int) end, VariantId = splitLine.GetString(TsvIndices.VariantId), SvType = SvTypeMapper(splitLine.GetString(TsvIndices.SvType)), AllAlleleNumber = splitLine.ParseInteger(TsvIndices.AllAlleleNumber), AllAlleleCount = splitLine.ParseInteger(TsvIndices.AllAlleleCount), AllAlleleFrequency = splitLine.ParseDouble(TsvIndices.AllAlleleFrequency), AllHomCount = splitLine.ParseInteger(TsvIndices.AllHomCount), MaleAlleleNumber = splitLine.ParseInteger(TsvIndices.MaleAlleleNumber), MaleAlleleCount = splitLine.ParseInteger(TsvIndices.MaleAlleleCount), MaleAlleleFrequency = splitLine.ParseDouble(TsvIndices.MaleAlleleFrequency), MaleHomCount = splitLine.ParseInteger(TsvIndices.MaleHomCount), FemaleAlleleNumber = splitLine.ParseInteger(TsvIndices.FemaleAlleleNumber), FemaleAlleleCount = splitLine.ParseInteger(TsvIndices.FemaleAlleleCount), FemaleAlleleFrequency = splitLine.ParseDouble(TsvIndices.FemaleAlleleFrequency), FemaleHomCount = splitLine.ParseInteger(TsvIndices.FemaleHomCount), AfrAlleleNumber = splitLine.ParseInteger(TsvIndices.AfrAlleleNumber), AfrAlleleCount = splitLine.ParseInteger(TsvIndices.AfrAlleleCount), AfrAlleleFrequency = splitLine.ParseDouble(TsvIndices.AfrAlleleFrequency), AfrHomCount = splitLine.ParseInteger(TsvIndices.AfrHomCount), AmrAlleleNumber = splitLine.ParseInteger(TsvIndices.AmrAlleleNumber), AmrAlleleCount = splitLine.ParseInteger(TsvIndices.AmrAlleleCount), AmrAlleleFrequency = splitLine.ParseDouble(TsvIndices.AmrAlleleFrequency), AmrHomCount = splitLine.ParseInteger(TsvIndices.AmrHomCount), EasAlleleNumber = splitLine.ParseInteger(TsvIndices.EasAlleleNumber), EasAlleleCount = splitLine.ParseInteger(TsvIndices.EasAlleleCount), EasAlleleFrequency = splitLine.ParseDouble(TsvIndices.EasAlleleFrequency), EasHomCount = splitLine.ParseInteger(TsvIndices.EasHomCount), EurAlleleNumber = splitLine.ParseInteger(TsvIndices.EurAlleleNumber), EurAlleleCount = splitLine.ParseInteger(TsvIndices.EurAlleleCount), EurAlleleFrequency = splitLine.ParseDouble(TsvIndices.EurAlleleFrequency), EurHomCount = splitLine.ParseInteger(TsvIndices.EurHomCount), OthAlleleNumber = splitLine.ParseInteger(TsvIndices.OthAlleleNumber), OthAlleleCount = splitLine.ParseInteger(TsvIndices.OthAlleleCount), OthAlleleFrequency = splitLine.ParseDouble(TsvIndices.OthAlleleFrequency), OthHomCount = splitLine.ParseInteger(TsvIndices.OthHomCount), HasFailedFilters = hasFailedFilters }; } private static VariantType SvTypeMapper(string svType) { // All possible values found in data (with counts): // BND: 52604 // CN=0: 1108 // CPX: 4778 // CTX: 8 // DEL: 169635 // DUP: 49571 // INS: 31443 // INS:ME: 672 // INS:ME:ALU: 60475 // INS:ME:LINE1: 10018 // INS:ME:SVA: 6417 // INV: 748 // Total: 387477 return svType switch { "BND" => VariantType.translocation_breakend, "CN=0" => VariantType.deletion, "CPX" => VariantType.complex_structural_alteration, "CTX" => VariantType.translocation_breakend, "DEL" => VariantType.deletion, "DUP" => VariantType.duplication, "INS" => VariantType.insertion, "INS:ME" => VariantType.mobile_element_insertion, "INS:ME:ALU" => VariantType.mobile_element_insertion, "INS:ME:LINE1" => VariantType.mobile_element_insertion, "INS:ME:SVA" => VariantType.mobile_element_insertion, "INV" => VariantType.inversion, _ => throw new InvalidDataException("unknown svType") }; } } ================================================ FILE: SAUtils/gnomAD/GnomadSvMain.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using ErrorHandling.Exceptions; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.gnomAD; public static class GnomadSvMain { private static string _inputFileName; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "gnomADV2 BED or TSV file", v => _inputFileName = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; var commandLineExample = $"{command} [options]"; ExitCodes exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .CheckInputFilenameExists(_inputFileName, "gnomADV2 BED or TSV file", "--in") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database from gnomAD v2 structural variant annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); DataSourceVersion version = DataSourceVersionReader.GetSourceVersion(_inputFileName + ".version"); string outFileName = $"{version.Name}_{version.Version}".Replace(' ', '_'); using StreamReader reader = GZipUtilities.GetAppropriateStreamReader(_inputFileName); using GnomadSvParser gnomadSvParser = _inputFileName.Substring(_inputFileName.Length - 6) switch { "tsv.gz" => new GnomadSvTsvParser(reader, referenceProvider.RefNameToChromosome), "bed.gz" => new GnomadSvBedParser(reader, referenceProvider.RefNameToChromosome), _ => throw new InvalidFileFormatException("Input file should end in '.tsv.gz' or '.bed.gz'") }; using FileStream nsiStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.IntervalFileSuffix)); using var nsiWriter = new NsiWriter( nsiStream, version, referenceProvider.Assembly, SaCommon.GnomadStructuralVariant, ReportFor.StructuralVariants, SaCommon.SchemaVersion ); nsiWriter.Write(gnomadSvParser.GetItems()); return ExitCodes.Success; } } ================================================ FILE: SAUtils/gnomAD/GnomadSvParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using SAUtils.DataStructures; using SAUtils.ParseUtils; namespace SAUtils.gnomAD; public abstract class GnomadSvParser : IDisposable { private const char CommentChar = '#'; private readonly StreamReader _reader; protected readonly Dictionary RefNameDict; protected readonly char Delimiter = '\t'; protected TsvIndices TsvIndices; protected GnomadSvParser( StreamReader reader, Dictionary refNameDict ) { _reader = reader; RefNameDict = refNameDict; } public IEnumerable GetItems() { string line; while ((line = _reader.ReadLine()) != null) { // Skip empty lines and comment lines if (string.IsNullOrWhiteSpace(line) || line.OptimizedStartsWith(CommentChar)) continue; GnomadSvItem gnomadSvItem = ParseLine(line); if (gnomadSvItem == null) continue; yield return gnomadSvItem; } } protected abstract GnomadSvItem ParseLine(string inputLine); public void Dispose() { _reader?.Dispose(); } } ================================================ FILE: SAUtils/gnomAD/GnomadSvTsvParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using SAUtils.DataStructures; using SAUtils.ParseUtils; using Variants; namespace SAUtils.gnomAD; public sealed class GnomadSvTsvParser : GnomadSvParser { public GnomadSvTsvParser( StreamReader reader, Dictionary refNameDict ) : base(reader, refNameDict) { TsvIndices = new TsvIndices() { Chromosome = 7, Start = 10, End = 13, VariantId = 1, SvType = 2, AllAlleleCount = 33, AllAlleleFrequency = 34, AllAlleleNumber = 35 }; } protected override GnomadSvItem ParseLine(string inputLine) { var splitLine = new SplitLine(in inputLine, in Delimiter); string chromosomeName = splitLine.GetString(TsvIndices.Chromosome); if (!RefNameDict.ContainsKey(chromosomeName)) return null; Chromosome chromosome = RefNameDict[chromosomeName]; int? start = splitLine.ParseInteger(TsvIndices.Start); int? end = splitLine.ParseInteger(TsvIndices.End); if (start == null || end == null) throw new InvalidDataException($"Invalid Data on Line {inputLine}"); VariantType svType = SvTypeMapper(splitLine.GetString(TsvIndices.SvType)); // Ignoring BND for now if (svType == VariantType.translocation_breakend) return null; start += 1; // +1 for padding base if (start > end) { (start, end) = (end, start); } // 'allele_count': 'AC=21,AFR_AC=10,AMR_AC=9,EAS_AC=0,EUR_AC=2,OTH_AC=0' // 'allele_frequency': 'AF=0.038889,AFR_AF=0.044643,AMR_AF=0.03913,EAS_AF=0,EUR_AF=0.023256,OTH_AF=0' // 'allele_number': 'AN=540,AFR_AN=224,AMR_AN=230,EAS_AN=0,EUR_AN=86,OTH_AN=0' Dictionary countDict = ParseValues(splitLine.GetString(TsvIndices.AllAlleleCount), "AC", SplitLine.ParseInteger); Dictionary frequencyDict = ParseValues(splitLine.GetString(TsvIndices.AllAlleleFrequency), "AF", SplitLine.ParseDouble); Dictionary numberDict = ParseValues(splitLine.GetString(TsvIndices.AllAlleleNumber), "AN", SplitLine.ParseInteger); return new GnomadSvItem(chromosome, inputLine) { Start = (int) start, End = (int) end, VariantId = splitLine.GetString(TsvIndices.VariantId), SvType = svType, AllAlleleNumber = numberDict["ALL"], AllAlleleCount = countDict["ALL"], AllAlleleFrequency = frequencyDict["ALL"], AfrAlleleNumber = numberDict["AFR"], AfrAlleleCount = countDict["AFR"], AfrAlleleFrequency = frequencyDict["AFR"], AmrAlleleNumber = numberDict["AMR"], AmrAlleleCount = countDict["AMR"], AmrAlleleFrequency = frequencyDict["AMR"], EasAlleleNumber = numberDict["EAS"], EasAlleleCount = countDict["EAS"], EasAlleleFrequency = frequencyDict["EAS"], EurAlleleNumber = numberDict["EUR"], EurAlleleCount = countDict["EUR"], EurAlleleFrequency = frequencyDict["EUR"], OthAlleleNumber = numberDict["OTH"], OthAlleleCount = countDict["OTH"], OthAlleleFrequency = frequencyDict["OTH"] }; } private static Dictionary ParseValues(string subString, string keyType, Func parsingFunction) { // 'allele_count': 'AC=21,AFR_AC=10,AMR_AC=9,EAS_AC=0,EUR_AC=2,OTH_AC=0' string[] splitValues = subString.OptimizedSplit(','); Dictionary parsedDict = new(); foreach (string splitValue in splitValues) { (string key, string value) = splitValue.OptimizedKeyValue(); if (!key.Equals(keyType)) { string dictKey = key.Replace($"_{keyType}", ""); parsedDict[dictKey] = parsingFunction(value); continue; } parsedDict["ALL"] = parsingFunction(value); } return parsedDict; } private static VariantType SvTypeMapper(string svType) { // https://www.ncbi.nlm.nih.gov/dbvar/content/var_summary/#nstd166 // All possible values found in data (with counts): // alu insertion: 61351 // copy number variation: 11383 // deletion: 161218 // duplication: 44560 // insertion: 26038 // inversion: 727 // line1 insertion: 10017 // mobile element insertion: 655 // sequence alteration: 4733 // sva insertion: 6547 // Total: 327229 return svType switch { "alu insertion" => VariantType.mobile_element_insertion, "copy number variation" => VariantType.copy_number_variation, "deletion" => VariantType.deletion, "duplication" => VariantType.duplication, "insertion" => VariantType.insertion, "inversion" => VariantType.inversion, "line1 insertion" => VariantType.mobile_element_insertion, "mobile element insertion" => VariantType.mobile_element_insertion, "sequence alteration" => VariantType.structural_alteration, "sva insertion" => VariantType.mobile_element_insertion, _ => throw new InvalidDataException("unknown svType") }; } } ================================================ FILE: SAUtils/gnomAD/GnomadUtilities.cs ================================================ using System; using System.Collections.Generic; using SAUtils.DataStructures; using SAUtils.ParseUtils; using VariantAnnotation.Interface.Providers; using Variants; namespace SAUtils.gnomAD { public static class GnomadUtilities { public static Dictionary<(string refAllele, string altAllele), GnomadItem> GetMergedItems( Dictionary<(string refAllele, string altAllele), GnomadItem> genomeItems, Dictionary<(string refAllele, string altAllele), GnomadItem> exomeItems) { if (genomeItems == null) return exomeItems; if (exomeItems == null) return genomeItems; var mergedItems = new Dictionary<(string refAllele, string altAllele), GnomadItem>(); // take care of the genomeItems and merge if needed foreach (var (key, value) in genomeItems) { mergedItems.Add(key, exomeItems.TryGetValue(key, out var exomeValue) ? MergeItems(value, exomeValue) : value); exomeItems.Remove(key); } foreach (var (key, value) in exomeItems) { mergedItems.Add(key, value); } return mergedItems; } public static GnomadItem GetNormalizedItem(GnomadItem item, ISequenceProvider sequenceProvider) { var (alignedPos, alignedRef, alignedAlt) = VariantUtils.TrimAndLeftAlign(item.Position, item.RefAllele, item.AltAllele, sequenceProvider.Sequence); if (item.Position == alignedPos && item.RefAllele == alignedRef && item.AltAllele == alignedAlt) return item; return new GnomadItem( item.Chromosome, alignedPos, alignedRef, alignedAlt, item.Depth, item.AllAlleleNumber, item.AfrAlleleNumber, item.AmrAlleleNumber, item.EasAlleleNumber, item.FinAlleleNumber, item.NfeAlleleNumber, item.OthAlleleNumber, item.AsjAlleleNumber, item.SasAlleleNumber, item.MaleAlleleNumber, item.FemaleAlleleNumber, item.AllAlleleCount, item.AfrAlleleCount, item.AmrAlleleCount, item.EasAlleleCount, item.FinAlleleCount, item.NfeAlleleCount, item.OthAlleleCount, item.AsjAlleleCount, item.SasAlleleCount, item.MaleAlleleCount, item.FemaleAlleleCount, item.AllHomCount, item.AfrHomCount, item.AmrHomCount, item.EasHomCount, item.FinHomCount, item.NfeHomCount, item.OthHomCount, item.AsjHomCount, item.SasHomCount, item.MaleHomCount, item.FemaleHomCount, //controls item.ControlsAllAlleleNumber, item.ControlsAllAlleleCount, item.HasFailedFilters, item.DataType, item.InputLine) ; } private static GnomadItem MergeItems(GnomadItem item1, GnomadItem item2) { if (item1.Chromosome.Index != item2.Chromosome.Index || item1.Position != item2.Position || item1.RefAllele != item2.RefAllele || item1.AltAllele != item2.AltAllele) throw new DataMisalignedException( $"Trying to merge unequal variants at {item1.Chromosome.UcscName}:{item1.Position} and {item2.Chromosome.UcscName}:{item2.Position}"); if (item1.DataType == item2.DataType) throw new DataMisalignedException($"Trying to merge different data types at {item1.Chromosome.UcscName}:{item1.Position}"); return new GnomadItem(item1.Chromosome, item1.Position, item1.RefAllele, item1.AltAllele, SaParseUtilities.Add(item1.Depth, item2.Depth), SaParseUtilities.Add(item1.AllAlleleNumber, item2.AllAlleleNumber), SaParseUtilities.Add(item1.AfrAlleleNumber, item2.AfrAlleleNumber), SaParseUtilities.Add(item1.AmrAlleleNumber, item2.AmrAlleleNumber), SaParseUtilities.Add(item1.EasAlleleNumber, item2.EasAlleleNumber), SaParseUtilities.Add(item1.FinAlleleNumber, item2.FinAlleleNumber), SaParseUtilities.Add(item1.NfeAlleleNumber, item2.NfeAlleleNumber), SaParseUtilities.Add(item1.OthAlleleNumber, item2.OthAlleleNumber), SaParseUtilities.Add(item1.AsjAlleleNumber, item2.AsjAlleleNumber), SaParseUtilities.Add(item1.SasAlleleNumber, item2.SasAlleleNumber), SaParseUtilities.Add(item1.MaleAlleleNumber, item2.MaleAlleleNumber), SaParseUtilities.Add(item1.FemaleAlleleNumber, item2.FemaleAlleleNumber), SaParseUtilities.Add(item1.AllAlleleCount, item2.AllAlleleCount), SaParseUtilities.Add(item1.AfrAlleleCount, item2.AfrAlleleCount), SaParseUtilities.Add(item1.AmrAlleleCount, item2.AmrAlleleCount), SaParseUtilities.Add(item1.EasAlleleCount, item2.EasAlleleCount), SaParseUtilities.Add(item1.FinAlleleCount, item2.FinAlleleCount), SaParseUtilities.Add(item1.NfeAlleleCount, item2.NfeAlleleCount), SaParseUtilities.Add(item1.OthAlleleCount, item2.OthAlleleCount), SaParseUtilities.Add(item1.AsjAlleleCount, item2.AsjAlleleCount), SaParseUtilities.Add(item1.SasAlleleCount, item2.SasAlleleCount), SaParseUtilities.Add(item1.MaleAlleleCount, item2.MaleAlleleCount), SaParseUtilities.Add(item1.FemaleAlleleCount, item2.FemaleAlleleCount), SaParseUtilities.Add(item1.AllHomCount, item2.AllHomCount), SaParseUtilities.Add(item1.AfrHomCount, item2.AfrHomCount), SaParseUtilities.Add(item1.AmrHomCount, item2.AmrHomCount), SaParseUtilities.Add(item1.EasHomCount, item2.EasHomCount), SaParseUtilities.Add(item1.FinHomCount, item2.FinHomCount), SaParseUtilities.Add(item1.NfeHomCount, item2.NfeHomCount), SaParseUtilities.Add(item1.OthHomCount, item2.OthHomCount), SaParseUtilities.Add(item1.AsjHomCount, item2.AsjHomCount), SaParseUtilities.Add(item1.SasHomCount, item2.SasHomCount), SaParseUtilities.Add(item1.MaleHomCount, item2.MaleHomCount), SaParseUtilities.Add(item1.FemaleHomCount, item2.FemaleHomCount), //control SaParseUtilities.Add(item1.ControlsAllAlleleNumber, item2.ControlsAllAlleleNumber), SaParseUtilities.Add(item1.ControlsAllAlleleCount, item2.ControlsAllAlleleCount), item1.HasFailedFilters || item2.HasFailedFilters, item1.DataType, item1.InputLine + '\n' + item2.InputLine ); } } } ================================================ FILE: SAUtils/gnomAD/LcrInterval.cs ================================================ using Genome; using VariantAnnotation.Interface.SA; namespace SAUtils.gnomAD { public class LcrInterval:ISuppIntervalItem { public int Start { get; } public int End { get; } public Chromosome Chromosome { get; } public string GetJsonString() => string.Empty; public LcrInterval(Chromosome chromosome, int start, int end) { Chromosome = chromosome; Start = start; End = end; } } } ================================================ FILE: SAUtils/gnomAD/LcrRegionParser.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; namespace SAUtils.gnomAD { public sealed class LcrRegionParser:IDisposable { private readonly StreamReader _reader; private readonly ISequenceProvider _refProvider; private int _nRegionSize; public LcrRegionParser(StreamReader reader, ISequenceProvider refProvider) { _reader = reader; _refProvider = refProvider; } public void Dispose() => _reader?.Dispose(); public IEnumerable GetItems() { using (var reader = _reader) { string line; while ((line = reader.ReadLine()) != null) { if (line == string.Empty || line.StartsWith("#")) continue; ISuppIntervalItem region; try { region = GetLcrRegion(line); if(region ==null) continue; } catch (Exception e) { Console.WriteLine(e); e.Data["Line"] = line; throw; } yield return region; } } Console.WriteLine($"Total size of N-regions:{_nRegionSize}"); } private ISuppIntervalItem GetLcrRegion(string line) { (string chromName, int start, int end) = ParsePosition(line); if (chromName==null) return null; //unknown chromosome var chromosome = _refProvider.RefNameToChromosome[chromName]; if (chromosome.IsEmpty()) return null; if (_refProvider.Assembly == GenomeAssembly.GRCh38) start++; return IsNRegion(chromosome, start, end) ? null : new LcrInterval(chromosome, start, end); } private bool IsNRegion(Chromosome chrom, int start, int end) { if (_refProvider == null) return false; _refProvider.LoadChromosome(chrom); var sequence = _refProvider.Sequence.Substring(start - 1, end - start + 1); if (sequence == null) return false; foreach (char c in sequence) { if (c != 'N' && c != 'n') return false; } _nRegionSize+=end-start+1; return true; } private (string ChromName, int Start, int End) ParsePosition(string line) { var splits = line.Split(':', '-', '\t'); var chrom = splits[0]; if (!_refProvider.RefNameToChromosome.ContainsKey(chrom)) return (null, 0, 0); var start = int.Parse(splits[1]); var end = int.Parse(splits[2]); return (chrom, start, end); } } } ================================================ FILE: SAUtils/gnomAD/LcrRegionsMain.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.gnomAD { public class LcrRegionsMain { private static string _referenceSequencePath; private static string _inputFile; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _referenceSequencePath = v }, { "in|i=", "input file path (along with a .version file)", v => _inputFile = v }, { "out|o=", "output directory for NSI file", v => _outputDirectory = v } }; var commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_referenceSequencePath, "compressed reference sequence file name", "--ref") .CheckInputFilenameExists(_inputFile, "input file with LCR regions", "--ref") .CheckDirectoryExists(_outputDirectory, "output Supplementary directory", "--out") .SkipBanner() .ShowHelpMenu("Reads provided supplementary data files and populates tsv files", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var refProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_referenceSequencePath)); var version = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version"); var outFileName = $"{version.Name}_{version.Version}"; using (var parser = new LcrRegionParser(GZipUtilities.GetAppropriateStreamReader(_inputFile), refProvider)) using (var stream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.LcrFileSuffix))) using (var writer = new NsiWriter(stream, version, refProvider.Assembly, SaCommon.LowComplexityRegionTag, ReportFor.AllVariants, SaCommon.NsiSchemaVersion)) { writer.Write(parser.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: SAUtils/makeDgvDb/Main.cs ================================================ using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using IO; using SAUtils.InputFileParsers; using SAUtils.InputFileParsers.DGV; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace SAUtils.makeDgvDb { public static class Main { private static string _inputFileName; private static string _compressedReference; private static string _outputDirectory; public static ExitCodes Run(string command, string[] commandArgs) { var ops = new OptionSet { { "ref|r=", "compressed reference sequence file", v => _compressedReference = v }, { "in|i=", "DGV VCFfile", v => _inputFileName = v }, { "out|o=", "output directory", v => _outputDirectory = v } }; string commandLineExample = $"{command} [options]"; var exitCode = new ConsoleAppBuilder(commandArgs, ops) .Parse() .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref") .HasRequiredParameter(_inputFileName, "DGV VCFfile", "--in") .CheckInputFilenameExists(_inputFileName, "DGV VCFfile", "--in") .HasRequiredParameter(_outputDirectory, "output directory", "--out") .CheckDirectoryExists(_outputDirectory, "output directory", "--out") .SkipBanner() .ShowHelpMenu("Creates a supplementary database with ClinVar annotations", commandLineExample) .ShowErrors() .Execute(ProgramExecution); return exitCode; } private static ExitCodes ProgramExecution() { var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)); var version = DataSourceVersionReader.GetSourceVersion(_inputFileName + ".version"); string outFileName = $"{version.Name}_{version.Version}"; using(var dgvReader = new DgvReader(GZipUtilities.GetAppropriateStreamReader(_inputFileName), referenceProvider.RefNameToChromosome)) using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.IntervalFileSuffix))) using(var nsiWriter = new NsiWriter(nsaStream, version, referenceProvider.Assembly, SaCommon.DgvTag, ReportFor.StructuralVariants, SaCommon.SchemaVersion)) { nsiWriter.Write(dgvReader.GetItems()); } return ExitCodes.Success; } } } ================================================ FILE: Sandbox/AminoAcidAligner/AlignmentBuilder.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; namespace AminoAcidAligner { public class AlignmentBuilder { public readonly string TranscriptId; public string Chromosome; private readonly Dictionary _speciesAlignment; public AlignmentBuilder(string id) { TranscriptId = id; _speciesAlignment = new Dictionary(100);//since we are doing 100 way alignment } public void Add(string transcriptId, string species, string sequence) { if (TranscriptId != transcriptId) return; if (_speciesAlignment.TryGetValue(species, out var sb)) { sb.Append(sequence); } else { _speciesAlignment[species] = new StringBuilder(); _speciesAlignment[species].Append(sequence); } } public override string ToString() { if(!CheckAlignments()) throw new DataMisalignedException($"Alignment issues found for {TranscriptId}"); var sb = new StringBuilder(); foreach (var (species, sequence) in _speciesAlignment) { sb.Append($"{species}\t{sequence}\n"); } return sb.ToString(); } private bool CheckAlignments() { var length = -1; //checking if all the alignments have same length foreach (var sequence in _speciesAlignment.Values) { if (length == -1) length = sequence.Length; if (length != sequence.Length) return false; } //check if there are any '-' es in Human StringBuilder humanSb; if (!_speciesAlignment.TryGetValue("hg38", out humanSb) && !_speciesAlignment.TryGetValue("hg19", out humanSb)) return true; var hg38Sequence = humanSb.ToString(); if (hg38Sequence.Contains('-')) Console.WriteLine($"Human sequence contains - in {TranscriptId}"); return true; } public string GetScoresLine() { var sb = new StringBuilder(); StringBuilder humanSb; string humanSequence=null; if (_speciesAlignment.TryGetValue("hg38", out humanSb) || _speciesAlignment.TryGetValue("hg19", out humanSb)) { humanSequence = humanSb.ToString(); } if(humanSequence == null) throw new InvalidDataException($"No human sequence available for {TranscriptId}"); sb.Append($"{TranscriptId}\t{Chromosome}\t{humanSequence}"); var residueCount = new int[humanSequence.Length]; Array.Fill(residueCount, 0); foreach (var alignment in _speciesAlignment.Values) { for (int i = 0; i < humanSequence.Length; i++) { if (humanSequence[i] == alignment[i]) residueCount[i]++; } } sb.Append('\t'); sb.Append(string.Join(',', residueCount.Select(x => 100 * x / _speciesAlignment.Count))); return sb.ToString(); } } } ================================================ FILE: Sandbox/AminoAcidAligner/AminoAcidAligner.csproj ================================================  Exe net6.0 ================================================ FILE: Sandbox/AminoAcidAligner/ExonToTranscript.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Compression.Utilities; namespace AminoAcidAligner { public static class ExonToTranscript { public static void Main(string[] args) { Console.WriteLine("Aggregate exon alignments into transcript alignments"); if (args.Length != 3) { Console.WriteLine("usage: dotnet AminoAcidAligner.dll [input exon alignment FASTA file] [output transcript alignment file] [output AA conservation scores file]"); return; } var exonAlignmentFile = args[0]; var transcriptAlignmentFile = args[1]; var conservationScoresFile = args[2]; using (var reader = GZipUtilities.GetAppropriateStreamReader(exonAlignmentFile)) using (var writer = GZipUtilities.GetStreamWriter(transcriptAlignmentFile)) using (var scoresWriter = GZipUtilities.GetStreamWriter(conservationScoresFile)) { var count = CreateTranscriptAlignments(reader, writer, scoresWriter); Console.WriteLine($"Created {count} transcript alignments"); } } /// /// merges multiple exon (amino acid) alignments to create transcript alignment /// /// Stream reader for the input FASTA file with exon alignment /// Stream writer for the output file with transcript alignment /// Stream writer for the output file with conservation scores(percentage) /// number if transcripts alignments created /// private static int CreateTranscriptAlignments(StreamReader reader, StreamWriter writer, StreamWriter scoresWriter) { string name = null; string sequence = null; var count = 0; AlignmentBuilder alignmentBuilder = null; scoresWriter.WriteLine("#Ensembl\tChromosome\tProteinSequence\tPercent Conservation at each AA residue"); while (((name, sequence)= GetNextSequence(reader)) != (null, null)) { (string transcriptId, string species, string chromosome) = Utilities.ParseSequenceName(name); if(alignmentBuilder == null) alignmentBuilder = new AlignmentBuilder(transcriptId); if (alignmentBuilder.TranscriptId != transcriptId) { writer.WriteLine(alignmentBuilder.Chromosome); writer.WriteLine(alignmentBuilder.TranscriptId); writer.WriteLine(alignmentBuilder.ToString()); scoresWriter.WriteLine(alignmentBuilder.GetScoresLine()); alignmentBuilder = new AlignmentBuilder(transcriptId); count++; } alignmentBuilder.Add(transcriptId, species, sequence); if (species == "hg38" || species == "hg19") alignmentBuilder.Chromosome = chromosome; } return count; } private static (string name, string sequence) GetNextSequence(StreamReader reader) { var name = reader.ReadLine(); while (name=="") { name = reader.ReadLine(); } if (name == null) return (null, null); if(!name.StartsWith('>')) throw new DataMisalignedException($"FASTQ entry does not start with >. Observed name: {name}"); var sequence = reader.ReadLine(); if (sequence == null) throw new DataMisalignedException($"No sequence found for {name}"); return (name, sequence); } } } ================================================ FILE: Sandbox/AminoAcidAligner/Utilities.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using Compression.Utilities; namespace AminoAcidAligner { public static class Utilities { public static (string TranscriptId, string Species, string Chromosome) ParseSequenceName(string name) { //>ENST00000641515.2_hg38_1_2 3 0 0 chr1:65565-65573+ var terms = name.Split('_', ' ', '\t'); var transcriptId = terms[0].TrimStart('>'); //remove versions for Ensembl transcripts if (transcriptId.StartsWith("ENST")) transcriptId = transcriptId.Split('.')[0]; var species = terms[1]; string chrom = null; chrom = terms.Length > 7 && string.IsNullOrEmpty(terms[7])? null: terms[7].Split(':')[0]; return (transcriptId, species, chrom); } /// /// using the CCDS file to find equivalence between Ensembl /// /// input file name /// ensembl to RefSeq transcript mapping //#ccds original_member current_member source nucleotide_ID protein_ID status_in_CCDS sequence_status // CCDS2.2 1 0 NCBI NM_152486.2 NP_689699.2 Updated 0 // CCDS2.2 0 1 NCBI NM_152486.3 NP_689699.2 Accepted 1 // CCDS2.2 1 1 EBI,WTSI ENST00000342066.7 ENSP00000342313.3 Accepted public static List> GroupTranscripts(string fileName) { var ccdsToTranscriptIds = new Dictionary>(); const int ccdsIndex = 0; const int transcriptIndex = 4; using (var reader = GZipUtilities.GetAppropriateStreamReader(fileName)) { string line; while ((line = reader.ReadLine()) != null) { if(line.StartsWith('#')) continue; var terms = line.Split('\t'); var ccds = terms[ccdsIndex]; var transcriptId = terms[transcriptIndex]; //remove versions for Ensembl transcripts if (transcriptId.StartsWith("ENST")) transcriptId = transcriptId.Split('.')[0]; if (ccdsToTranscriptIds.TryGetValue(ccds, out var transcriptIds)) { transcriptIds.Add(transcriptId); } else ccdsToTranscriptIds.Add(ccds, new HashSet(){transcriptId}); } } return ccdsToTranscriptIds.Values.ToList(); } public static Dictionary> GetEquivalentIds(List> transcriptGroups) { var idToGroup = new Dictionary>(); foreach (var transcriptGroup in transcriptGroups) { foreach (var transcript in transcriptGroup) { idToGroup.Add(transcript, transcriptGroup); } } return idToGroup; } } } ================================================ FILE: Sandbox/GenerateMustGenotypeVcf/ConfigurationSettings.cs ================================================ namespace GenerateMustGenotypeVcf { public static class ConfigurationSettings { #region members // filenames public static string OneKGenomeVcf; public static string ClinVarVcf; public static string CosmicVcf; public static string CompressedReferencePath; public static bool IsHg19; #endregion } } ================================================ FILE: Sandbox/GenerateMustGenotypeVcf/GenerateMustGenotypeVcf.csproj ================================================  Exe net6.0 ..\bin\$(Configuration) ================================================ FILE: Sandbox/GenerateMustGenotypeVcf/GenerateMustGenotypeVcfsMain.cs ================================================ using CommandLine.Handlers; using CommandLine.NDesk.Options; using CommandLine.VersionProvider; using VariantAnnotation.DataStructures; namespace GenerateMustGenotypeVcf { sealed class GenerateMustGenotypeVcfsMain : AbstractCommandLineHandler { public static int Main(string[] args) { var ops = new OptionSet { { "onek=", "input 1000Genomes vcf file", v => ConfigurationSettings.OneKGenomeVcf= v }, { "cvr=", "input clinvar vcf file", v => ConfigurationSettings.ClinVarVcf= v }, { "cos=", "input cosmic vcf file", v => ConfigurationSettings.CosmicVcf= v }, { "ref=", "compressed reference sequence", v => ConfigurationSettings.CompressedReferencePath= v }, { "hg19", "need file for hg19", v=>ConfigurationSettings.IsHg19 = v !=null } }; var commandLineExample = "--onek --cos --cvr --out --ref "; var generateMustGenotype = new GenerateMustGenotypeVcfsMain("Generates a must genotype vcf containing all ref minor positions in 1000 Genomes",ops, commandLineExample, Constants.Authors); generateMustGenotype.Execute(args); return generateMustGenotype.ExitCode; } public GenerateMustGenotypeVcfsMain(string programDescription, OptionSet ops, string commandLineExample, string programAuthors, IVersionProvider versionProvider = null) : base(programDescription, ops, commandLineExample, programAuthors, versionProvider) { } protected override void ValidateCommandLine() { CheckInputFilenameExists(ConfigurationSettings.OneKGenomeVcf, "input 1000 genomes vcf", "--onek", false); CheckInputFilenameExists(ConfigurationSettings.ClinVarVcf, "input clinvar vcf", "--cvr",false); CheckInputFilenameExists(ConfigurationSettings.CosmicVcf, "input cosmic vcf", "--cos", false); CheckInputFilenameExists(ConfigurationSettings.CompressedReferencePath, "compressed reference sequence", "--ref"); } protected override void ProgramExecution() { using (var refMinorExtractor = new MustGenotypeExtractor(ConfigurationSettings.CompressedReferencePath,ConfigurationSettings.OneKGenomeVcf, ConfigurationSettings.ClinVarVcf, ConfigurationSettings.CosmicVcf,ConfigurationSettings.IsHg19 )) { refMinorExtractor.ExtractEntries(); } } } } ================================================ FILE: Sandbox/GenerateMustGenotypeVcf/MustGenotypeExtractor.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using VariantAnnotation.Algorithms; using VariantAnnotation.DataStructures; using VariantAnnotation.DataStructures.CompressedSequence; using VariantAnnotation.FileHandling; using VariantAnnotation.FileHandling.Compression; using VariantAnnotation.FileHandling.VCF; using VariantAnnotation.Interface; using VariantAnnotation.Utilities; namespace GenerateMustGenotypeVcf { public sealed class MustGenotypeExtractor:IDisposable { private readonly StreamReader _oneKGenomeReader; private readonly StreamReader _clinvarReader; private readonly StreamReader _cosmicReader; private readonly GenomeAssembly _assembly; private readonly DataFileManager _dataFileManager; private readonly ICompressedSequence _compressedSequence; private int _refMinorCount; private int _clinvarCount; private int _cosmicCount; private const int CosmicMinCount = 5; private const double RefMinorFreq = 0.95; private const int SmallVariantMaxLength = 50; private const string RefMinorFileName = "RefMinorAllelev5.preprocess.vcf.gz"; private const string IsisClinicalIndelFileName = "IsasClinicalIndelsv4.preprocess.vcf.gz"; private const string OncogenicFileName = "OncogenicSitesv3.preprocess.vcf.gz"; private readonly List _grch37Contigs = new List() { "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", }; private readonly List _hg19Contigs = new List() { "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", }; private readonly List _grch38Contigs = new List() { "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=" }; private readonly List _refMinorGrch37HeaderLines = new List() { "##fileformat=VCFv4.1", "##Description=RefMinor positions (ref allele frequency < 0.05) extracted from 1000 Genomes data", "##FILTER=", "##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz", "##source=1000GenomesPhase3Pipeline", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO" }; private readonly List _refMinorHg19HeaderLines = new List() { "##fileformat=VCFv4.1", "##Description=RefMinor positions (ref allele frequency < 0.05) extracted from 1000 Genomes data", "##FILTER=", "##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz", "##source=1000GenomesPhase3Pipeline", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO" }; private readonly List _refMinorGRCh38HeaderLines = new List() { "##fileformat=VCFv4.1", "##Description=RefMinor positions (ref allele frequency < 0.05) extracted from 1000 Genomes data", "##FILTER=", "##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz", "##source=1000GenomesPhase3Pipeline", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "##contig=", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO" }; public MustGenotypeExtractor(string compressedSeqPath, string oneKGenomeVcf,string clinvarVcf, string cosmicVcf, bool isHg19 = false) { _compressedSequence = new CompressedSequence(); _dataFileManager = new DataFileManager(new CompressedSequenceReader(FileUtilities.GetReadStream(compressedSeqPath),_compressedSequence ),_compressedSequence); _assembly = _compressedSequence.GenomeAssembly == GenomeAssembly.GRCh37 && isHg19? GenomeAssembly.hg19:_compressedSequence.GenomeAssembly; if (_assembly == GenomeAssembly.Unknown) throw new Exception("Genome assembly must be either GRCh37 or GRCh38"); if(_compressedSequence.GenomeAssembly == GenomeAssembly.GRCh38 && isHg19) throw new Exception("reference sequence is GRCh38 while generating hg19 files"); _oneKGenomeReader = string.IsNullOrEmpty(oneKGenomeVcf)? null: GZipUtilities.GetAppropriateStreamReader(oneKGenomeVcf); _clinvarReader = string.IsNullOrEmpty(clinvarVcf) ? null : GZipUtilities.GetAppropriateStreamReader(clinvarVcf); _cosmicReader = string.IsNullOrEmpty(cosmicVcf) ? null : GZipUtilities.GetAppropriateStreamReader(cosmicVcf); } #region IDisposable private bool _isDisposed; /// /// public implementation of Dispose pattern callable by consumers. /// public void Dispose() { Dispose(true); } /// /// protected implementation of Dispose pattern. /// private void Dispose(bool disposing) { lock (this) { if (_isDisposed) return; if (disposing) { // Free any other managed objects here. Close(); } // Free any unmanaged objects here. _isDisposed = true; } } private void Close() { _oneKGenomeReader?.Dispose(); _clinvarReader?.Dispose(); _cosmicReader?.Dispose(); } #endregion public void ExtractEntries() { ExtractFromClinVar(); if (_clinvarCount > 0) Console.WriteLine($"{_clinvarCount} non-SNVs extracted from clinvar"); ExtractFromCosmic(); if (_cosmicCount > 0) Console.WriteLine($"{_cosmicCount} entries with count > {CosmicMinCount} extracted from Cosmic"); ExtractFromOneKg(); if (_refMinorCount > 0) Console.WriteLine($"{_refMinorCount} ref minor positions extracted from 1000 G"); } private void ExtractFromCosmic() { if (_cosmicReader == null) return; var needParseHeaderLine = true; using (var writer = GZipUtilities.GetStreamWriter(OncogenicFileName)) { string line; while ((line = _cosmicReader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; //copy required header lines if (line.StartsWith("#") && needParseHeaderLine) { ProcessHeaderLine(writer, line); continue; } if(line.StartsWith("#")) continue; needParseHeaderLine = false; var fields = line.Split('\t'); if (IsLargeVariants(fields[VcfCommon.RefIndex], fields[VcfCommon.AltIndex])) continue; if (! HasMinCount(fields[VcfCommon.InfoIndex])) continue; _cosmicCount++; var chrName = GetChrName(fields[VcfCommon.ChromIndex]); //skip mito for hg19 if (_assembly == GenomeAssembly.hg19 && (chrName == "chrM" || chrName == "MT")) continue; var pos = Convert.ToInt32(fields[VcfCommon.PosIndex]); var refAllele = fields[VcfCommon.RefIndex]; if (ValidateReference(chrName, pos, refAllele)) writer.Write(chrName + '\t' + pos + '\t' + fields[VcfCommon.IdIndex] + '\t' + refAllele + '\t' + fields[VcfCommon.AltIndex] + '\t' + ".\t.\t.\n"); } } } private void ProcessHeaderLine(StreamWriter writer, string line) { if (line.StartsWith("##fileformat=")) { writer.WriteLine("##fileformat=VCFv4.1"); } if (IsRequiredHeaderLine(line)) { writer.Write(line + "\n"); return; } //if we have seen the chrom header if (!line.StartsWith("#CHROM")) return; writer.Write($"##Description=COSMIC variants having count greater or equal to {CosmicMinCount}" + "\n"); WriteContigLines(writer); writer.Write(line + "\n"); } private void WriteContigLines(StreamWriter writer) { List contigLines = null; if (_assembly == GenomeAssembly.GRCh37) contigLines = _grch37Contigs; if (_assembly == GenomeAssembly.GRCh38) contigLines = _grch38Contigs; if (_assembly == GenomeAssembly.hg19) contigLines = _hg19Contigs; if (contigLines == null) return; foreach (var contigLine in contigLines) { writer.Write(contigLine + "\n"); } } private static bool IsRequiredHeaderLine(string line) { return line.StartsWith("##source=") || line.StartsWith("##reference="); } private static bool IsLargeVariants(string refAllele, string altAlleles) { foreach (var altAllele in altAlleles.Split(',')) { var trimmedAlleles = BiDirectionalTrimmer.Trim(1, refAllele, altAllele); var trimmedRef = trimmedAlleles.Item2; var trimmedAlt = trimmedAlleles.Item3; if (trimmedRef.Length > SmallVariantMaxLength || trimmedAlt.Length > SmallVariantMaxLength) return true; } return false; } private static bool HasMinCount(string info) { var infoFields = info.Split(';'); foreach (var infoField in infoFields) { if (!infoField.StartsWith("CNT=")) continue; var count = Convert.ToInt32(infoField.Substring(4)); return count >= CosmicMinCount; } return false; } private void ExtractFromClinVar() { if (_clinvarReader == null) return; using (var writer = GZipUtilities.GetStreamWriter(IsisClinicalIndelFileName)) { string line; while ((line = _clinvarReader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; //copy required header lines if (line.StartsWith("#")) { ProcessHeaderLine(writer, line); continue; } var fields = line.Split('\t'); if (IsSnv(fields[VcfCommon.RefIndex], fields[VcfCommon.AltIndex])) continue; _clinvarCount++; var chrName = GetChrName(fields[VcfCommon.ChromIndex]); //skip mito for hg19 if (_assembly == GenomeAssembly.hg19 && (chrName == "chrM" || chrName == "MT")) continue; var pos = Convert.ToInt32(fields[VcfCommon.PosIndex]); var refAllele = fields[VcfCommon.RefIndex]; if(ValidateReference(chrName,pos,refAllele)) writer.Write(chrName + '\t' + pos + '\t' + fields[VcfCommon.IdIndex] + '\t' + refAllele + '\t' + fields[VcfCommon.AltIndex] + '\t' + ".\t.\t.\n"); } } } private bool ValidateReference(string chromosome, int pos, string refAllele) { var refIndex = _compressedSequence.Renamer.GetReferenceIndex(chromosome); if (refIndex == ChromosomeRenamer.UnknownReferenceIndex) return false; _dataFileManager.LoadReference(refIndex, () => { }); return _compressedSequence.Substring(pos - 1, refAllele.Length) == refAllele; } private void ExtractFromOneKg() { if (_oneKGenomeReader == null) return; using (var writer = GZipUtilities.GetStreamWriter(RefMinorFileName)) { List headerLines = null; if (_assembly == GenomeAssembly.GRCh37) headerLines = _refMinorGrch37HeaderLines; if (_assembly == GenomeAssembly.hg19) headerLines = _refMinorHg19HeaderLines; if (_assembly == GenomeAssembly.GRCh38) headerLines = _refMinorGRCh38HeaderLines; if (headerLines == null) throw new Exception("Unknown assembly for RefMinor Extraction"); foreach (var headerLine in headerLines) writer.Write(headerLine + "\n"); string line; while ((line = _oneKGenomeReader.ReadLine()) != null) { // Skip empty lines. if (string.IsNullOrWhiteSpace(line)) continue; // Skip comments. if (line.StartsWith("#"))continue; var fields = line.Split('\t'); if (!IsRefMinorPosition(fields[VcfCommon.InfoIndex])) continue; _refMinorCount++; var chrName = GetChrName(fields[VcfCommon.ChromIndex]); //skip mito for hg19 if (_assembly == GenomeAssembly.hg19 && (chrName == "chrM" || chrName == "MT")) continue; var pos = Convert.ToInt32(fields[VcfCommon.PosIndex]); var refAllele = fields[VcfCommon.RefIndex]; if (ValidateReference(chrName, pos, refAllele)) writer.Write(chrName + '\t' + pos + '\t' + fields[VcfCommon.IdIndex] + '\t' + refAllele + '\t' + fields[VcfCommon.AltIndex] + '\t' + ".\t.\t.\n"); } } } private string GetChrName(string chromosome) { var chrName = _assembly == GenomeAssembly.GRCh38 || _assembly == GenomeAssembly.hg19 ? "chr" + chromosome : chromosome; if (chrName == "chrMT") chrName = "chrM"; return chrName; } private static bool IsSnv(string refAllele, string altAlleles) { if (!IsSnv(refAllele)) return false; return altAlleles.Split(',').All(IsSnv); } private static bool IsSnv(string allele) { if (allele.Length != 1) return false; allele = allele.ToUpper(); if (allele == "A" || allele == "C" || allele == "G" || allele == "T") return true; return false; } private static bool IsRefMinorPosition(string info) { var infoFields = info.Split(';'); foreach (var infoField in infoFields) { if (! infoField.StartsWith("AF=")) continue; var totalAltAlleleFreq = 0.0; foreach (var freq in infoField.Substring(3).Split(',')) { totalAltAlleleFreq+=Convert.ToDouble(freq); } return totalAltAlleleFreq >= RefMinorFreq; } return false; } } } ================================================ FILE: Sandbox/Piano/ConfigurationSettings.cs ================================================ namespace Piano { public static class ConfigurationSettings { // filenames public static string InputCachePrefix; public static string VcfPath; public static string RefSequencePath; public static string OutputFileName; // parameters public static bool ForceMitochondrialAnnotation; } } ================================================ FILE: Sandbox/Piano/Piano.cs ================================================  using System; using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using Compression.Utilities; using ErrorHandling; using VariantAnnotation; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.Sequence; using VariantAnnotation.IO.Caches; using Vcf; using VariantAnnotation.Interface; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.IO; using VariantAnnotation.Providers; namespace Piano { sealed class Piano { #region members private const string OutHeader = "#Chrom\tPos\tRefAllele\tAltAllele\tGeneSymbol\tGeneId\tTranscriptID\tProteinID\tProteinPos\tUpstream\tAAchange\tDownstream\tConsequences"; private readonly PerformanceMetrics _performanceMetrics = PerformanceMetrics.Instance; #endregion private ExitCodes ProgramExecution() { var sequenceProvider = ProviderUtilities.GetSequenceProvider(ConfigurationSettings.RefSequencePath); var transcriptAnnotationProvider = ProviderUtilities.GetTranscriptAnnotationProvider(ConfigurationSettings.InputCachePrefix, sequenceProvider); var annotator = ProviderUtilities.GetAnnotator(transcriptAnnotationProvider, sequenceProvider); var dataSourceVesions = new List(); dataSourceVesions.AddRange(transcriptAnnotationProvider.DataSourceVersions); using (var outputWriter = new StreamWriter(ConfigurationSettings.OutputFileName)) using (var vcfReader =new VcfReader(GZipUtilities.GetAppropriateReadStream(ConfigurationSettings.VcfPath), sequenceProvider.GetChromosomeDictionary(), null, false)) { try { if (vcfReader.IsRcrsMitochondrion && annotator.GenomeAssembly == GenomeAssembly.GRCh37 || annotator.GenomeAssembly == GenomeAssembly.GRCh38 || ConfigurationSettings.ForceMitochondrialAnnotation) annotator.EnableMitochondrialAnnotation(); int previousChromIndex = -1; IPosition position; // var sortedVcfChecker = new SortedVcfChecker(); outputWriter.WriteLine(OutHeader); while ((position = vcfReader.GetNextPosition()) != null) { // sortedVcfChecker.CheckVcfOrder(position.Chromosome.UcscName); previousChromIndex = UpdatePerformanceMetrics(previousChromIndex, position.Chromosome); var annotatedPosition = annotator.Annotate(position); WriteAnnotatedPostion(annotatedPosition, outputWriter); } } catch (Exception e) { e.Data[ExitCodeUtilities.VcfLine] = vcfReader.VcfLine; throw; } } return ExitCodes.Success; } private int UpdatePerformanceMetrics(int previousChromIndex, Chromosome chromosome) { if (chromosome.Index != previousChromIndex) { if (previousChromIndex != -1) _performanceMetrics.StopReference(); _performanceMetrics.StartReference(chromosome.UcscName); previousChromIndex = chromosome.Index; } return previousChromIndex; } private static void WriteAnnotatedPostion(IAnnotatedPosition annotatedPosition, StreamWriter writer) { //"#Chrom\tPos\tRefAllele\tAltAllele\tGeneSymbol\tGeneId\tTranscriptID\tProteinID\tProteinPos\tUpstream\tAAchange\tDownstream\tConsequences"; if (annotatedPosition.AnnotatedVariants == null || annotatedPosition.AnnotatedVariants.Length == 0) return; for (int i = 0; i < annotatedPosition.AnnotatedVariants.Length; i++) { var annotatedVariant = annotatedPosition.AnnotatedVariants[i]; var chromosome = annotatedPosition.Position.VcfFields[VcfCommon.ChromIndex]; var position = annotatedPosition.Position.Start; var refAllele = annotatedPosition.Position.RefAllele; var altAllele = annotatedPosition.Position.AltAlleles[i]; foreach (var ensemblTranscript in annotatedVariant.EnsemblTranscripts) { var transcript = ensemblTranscript; if(transcript.ToString()==null) continue; var line = chromosome + "\t" + position + "\t" + refAllele + "\t" + altAllele + "\t" + transcript; writer.WriteLine(line); } foreach (var refSeqTranscript in annotatedVariant.RefSeqTranscripts) { var transcript = refSeqTranscript ; if (transcript.ToString() == null) continue; var line = chromosome + "\t" + position + "\t" + refAllele + "\t" + altAllele + "\t" + transcript; writer.WriteLine(line); } } } static int Main(string[] args) { var ops = new OptionSet { { "cache|c=", "input cache {prefix}", v => ConfigurationSettings.InputCachePrefix = v }, { "in|i=", "input VCF {path}", v => ConfigurationSettings.VcfPath = v }, { "out|o=", "output {file path} ", v => ConfigurationSettings.OutputFileName = v }, { "ref|r=", "input compressed reference sequence {path}", v => ConfigurationSettings.RefSequencePath = v }, { "force-mt", "forces to annotate mitochondria variants", v => ConfigurationSettings.ForceMitochondrialAnnotation = v != null } }; var commandLineExample = "-i -d -r -o "; var piano = new Piano(); var exitCode = new ConsoleAppBuilder(args, ops) .UseVersionProvider(new VersionProvider()) .Parse() .CheckInputFilenameExists(ConfigurationSettings.VcfPath, "vcf", "--in", true, "-") .CheckInputFilenameExists(ConfigurationSettings.RefSequencePath, "reference sequence", "--ref") .CheckInputFilenameExists(CacheConstants.TranscriptPath(ConfigurationSettings.InputCachePrefix), "transcript cache", "--cache") .HasRequiredParameter(ConfigurationSettings.OutputFileName, "output file stub", "--out") .ShowBanner(Constants.Authors) .ShowHelpMenu("peptide annotation", commandLineExample) .ShowErrors() .Execute(piano.ProgramExecution); return (int)exitCode; } } } ================================================ FILE: Sandbox/Piano/Piano.csproj ================================================  Exe net6.0 ..\bin\$(Configuration) ================================================ FILE: Sandbox/Piano/PianoAnnotatedTranscript.cs ================================================ using System; using System.Collections.Generic; using System.Globalization; using System.Linq; using System.Text; using VariantAnnotation.Algorithms; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Intervals; namespace Piano { public class PianoAnnotatedTranscript:IAnnotatedTranscript { public void SerializeJson(StringBuilder sb) { throw new System.NotImplementedException(); } public ITranscript Transcript { get; } public string ReferenceAminoAcids { get; } public string AlternateAminoAcids { get; } public string ReferenceCodons { get; } public string AlternateCodons { get; } public IMappedPositions MappedPositions { get; } public string HgvsCoding { get; } public string HgvsProtein { get; } public PredictionScore Sift { get; } public PredictionScore PolyPhen { get; } public IEnumerable Consequences { get; } public IGeneFusionAnnotation GeneFusionAnnotation { get; } public string UpstreamAminoAcids { get; } public string DownStreamAminoAcids { get; } public PianoAnnotatedTranscript(ITranscript transcript, string referenceAminoAcids, string alternateAminoAcids, IMappedPositions mappedPositions, string upstreamAminoAcids, string downStreamAminoAcids,IEnumerable consequences) { Transcript = transcript; ReferenceAminoAcids = referenceAminoAcids; AlternateAminoAcids = alternateAminoAcids; MappedPositions = mappedPositions; UpstreamAminoAcids = upstreamAminoAcids; DownStreamAminoAcids = downStreamAminoAcids; Consequences = consequences; } public override string ToString() { if (MappedPositions.ProteinInterval.Start == null || MappedPositions.ProteinInterval.End == null) return null; var geneId = Transcript.Source == Source.Ensembl ? Transcript.Gene.EnsemblId.ToString() : Transcript.Gene.EntrezGeneId.ToString(); var downStreamAminoAcids = string.IsNullOrEmpty(DownStreamAminoAcids) ? "." : DownStreamAminoAcids; var upstreamAminoAcids = string.IsNullOrEmpty(UpstreamAminoAcids) ? "." : UpstreamAminoAcids; var line = Transcript.Gene.Symbol + "\t" + geneId + "\t" + CombineIdAndVersion(Transcript.Id,Transcript.Version) + "\t" + CombineIdAndVersion(Transcript.Translation.ProteinId,Transcript.Translation.ProteinVersion) + "\t" + GetNullablePositionRange(MappedPositions.ProteinInterval) + "\t" + upstreamAminoAcids + "\t" + GetAlleleString(ReferenceAminoAcids, AlternateAminoAcids) + "\t" + downStreamAminoAcids+"\t"+string.Join(',', Consequences?.Select(ConsequenceUtil.GetConsequence)); return line; } private static string GetAlleleString(string a, string b) { return a == b ? a : $"{(string.IsNullOrEmpty(a) ? "-" : a)}/{(string.IsNullOrEmpty(b) ? "-" : b)}"; } private static string CombineIdAndVersion(ICompactId id, byte version) => id + "." + version; private static string GetNullablePositionRange(NullableInterval interval) { if (interval.Start == null && interval.End == null) return null; if (interval.Start == null) return "?-" + interval.End.Value; if (interval.End == null) return interval.Start.Value + "-?"; var start = interval.Start.Value; var end = interval.End.Value; if (start > end) Swap.Int(ref start, ref end); return start == end ? start.ToString(CultureInfo.InvariantCulture) : start + "-" + end; } } } ================================================ FILE: Sandbox/Piano/PianoAnnotationProvider.cs ================================================ using System.Collections.Generic; using System.IO; using VariantAnnotation.Caches; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.Sequence; using VariantAnnotation.IO.Caches; using VariantAnnotation.TranscriptAnnotation; using VariantAnnotation.Utilities; namespace Piano { public class PianoAnnotationProvider:IAnnotationProvider { public string Name { get; } public GenomeAssembly GenomeAssembly { get; } public IEnumerable DataSourceVersions { get; } private readonly ITranscriptCache _transcriptCache; private readonly ISequence _sequence; private ushort _currentRefIndex = ushort.MaxValue; public PianoAnnotationProvider(string pathPrefix, ISequenceProvider sequenceProvider) { Name = "Transcript annotation provider"; _sequence = sequenceProvider.Sequence; _transcriptCache = InitiateCache(FileUtilities.GetReadStream(CacheConstants.TranscriptPath(pathPrefix)), sequenceProvider.GetChromosomeIndexDictionary(), sequenceProvider.GenomeAssembly, sequenceProvider.NumRefSeqs); GenomeAssembly = _transcriptCache.GenomeAssembly; DataSourceVersions = _transcriptCache.DataSourceVersions; } private static TranscriptCache InitiateCache(Stream stream, IDictionary chromosomeIndexDictionary, GenomeAssembly genomeAssembly, ushort numRefSeq) { TranscriptCache cache; using (var reader = new TranscriptCacheReader(stream, genomeAssembly, numRefSeq)) cache = reader.Read(chromosomeIndexDictionary); return cache; } public void Annotate(IAnnotatedPosition annotatedPosition) { if (annotatedPosition.AnnotatedVariants == null || annotatedPosition.AnnotatedVariants.Length == 0) return; var refIndex = annotatedPosition.Position.Chromosome.Index; LoadPredictionCaches(refIndex); AddTranscripts(annotatedPosition); } private void LoadPredictionCaches(ushort refIndex) { if (refIndex == _currentRefIndex) return; if (refIndex == ushort.MaxValue) { ClearCache(); return; } _currentRefIndex = refIndex; } private void ClearCache() { _currentRefIndex = ushort.MaxValue; } private void AddTranscripts(IAnnotatedPosition annotatedPosition) { var overlappingTranscripts = _transcriptCache.GetOverlappingFlankingTranscripts(annotatedPosition.Position); if (overlappingTranscripts == null) { // todo: handle intergenic variants return; } foreach (var annotatedVariant in annotatedPosition.AnnotatedVariants) { var annotatedTranscripts = new List(); PianoAnnotationUtils.GetAnnotatedTranscripts(annotatedVariant.Variant, overlappingTranscripts, _sequence, annotatedTranscripts); if (annotatedTranscripts.Count == 0) continue; foreach (var annotatedTranscript in annotatedTranscripts) { if (annotatedTranscript.Transcript.Source == Source.Ensembl) annotatedVariant.EnsemblTranscripts.Add(annotatedTranscript); else annotatedVariant.RefSeqTranscripts.Add(annotatedTranscript); } } } } } ================================================ FILE: Sandbox/Piano/PianoAnnotationUtils.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Algorithms; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Intervals; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Sequence; namespace Piano { public static class PianoAnnotationUtils { private static readonly AminoAcids AminoAcidsProvider = new AminoAcids(false); private static readonly AminoAcids MitoAminoAcidsProvider = new AminoAcids(true); public static void GetAnnotatedTranscripts(IVariant variant, ITranscript[] transcriptCandidates, ISequence compressedSequence, IList annotatedTranscripts) { foreach (var transcript in transcriptCandidates) { if (transcript.Overlaps(variant) && !variant.Behavior.ReducedTranscriptAnnotation) { var annotatedTranscript = GetAnnotatedTranscript(variant, compressedSequence, transcript); if (annotatedTranscript != null) annotatedTranscripts.Add(annotatedTranscript); } } } private static IAnnotatedTranscript GetAnnotatedTranscript(IVariant variant, ISequence compressedSequence, ITranscript transcript) { var acidsProvider = variant.Chromosome.UcscName == "chrM" ? MitoAminoAcidsProvider : AminoAcidsProvider; var annotatedTranscript = PianoTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, compressedSequence,acidsProvider); return annotatedTranscript; } } } ================================================ FILE: Sandbox/Piano/PianoAnnotator.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using ErrorHandling.Exceptions; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.Interface; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.GeneAnnotation; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.Sequence; namespace Piano { public class PianoAnnotator:IAnnotator { private readonly IAnnotationProvider _taProvider; private readonly ISequenceProvider _sequenceProvider; private bool _annotateMito; public GenomeAssembly GenomeAssembly { get; } public PianoAnnotator(IAnnotationProvider taProvider, ISequenceProvider sequenceProvider) { _taProvider = taProvider; _sequenceProvider = sequenceProvider; GenomeAssembly = GetGenomeAssembly(); } private GenomeAssembly GetGenomeAssembly() { var assemblies = new Dictionary(); if (_taProvider != null) assemblies[_taProvider.GenomeAssembly] = _taProvider.Name; if (_sequenceProvider != null) assemblies[_sequenceProvider.GenomeAssembly] = _sequenceProvider.Name; if (assemblies.Count == 0) return GenomeAssembly.Unknown; if (assemblies.Count == 1) return assemblies.First().Key; foreach (var assembly in assemblies) { Console.WriteLine($"{assembly.Value} has genome assembly {assembly.Key}"); } throw new InconsistantGenomeAssemblyException(); } public IAnnotatedPosition Annotate(IPosition position) { if (position == null) return null; var annotatedVariants = GetAnnotatedVariants(position.Variants); var annotatedPosition = new AnnotatedPosition(position, annotatedVariants); if (annotatedPosition.AnnotatedVariants == null || annotatedPosition.AnnotatedVariants.Length == 0 || position.Chromosome.UcscName == "chrM" && !_annotateMito ) return annotatedPosition; _sequenceProvider?.Annotate(annotatedPosition); _taProvider.Annotate(annotatedPosition); return annotatedPosition; } private static IAnnotatedVariant[] GetAnnotatedVariants(IVariant[] variants) { if (variants?[0].Behavior == null) return null; var numVariants = variants.Length; var annotatedVariants = new IAnnotatedVariant[numVariants]; for (var i = 0; i < numVariants; i++) annotatedVariants[i] = new AnnotatedVariant(variants[i]); return annotatedVariants; } public IList GetAnnotatedGenes() { return null; } public void EnableMitochondrialAnnotation() { _annotateMito = true; } } } ================================================ FILE: Sandbox/Piano/PianoTranscriptAnnotator.cs ================================================ using System; using System.Collections.Generic; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.AnnotatedPositions.Consequence; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Intervals; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Sequence; namespace Piano { public static class PianoTranscriptAnnotator { private const int FlankingAminoAcidLength = 15; public static IAnnotatedTranscript GetAnnotatedTranscript(ITranscript transcript, IVariant variant, ISequence refSequence, AminoAcids aminoAcidsProvider) { var mappedPositions = MappedPositionsUtils.ComputeMappedPositions(variant.Start, variant.End, transcript); var transcriptRefAllele = HgvsUtilities.GetTranscriptAllele(variant.RefAllele, transcript.Gene.OnReverseStrand); var transcriptAltAllele = HgvsUtilities.GetTranscriptAllele(variant.AltAllele, transcript.Gene.OnReverseStrand); var codonsAndAminoAcids = GetCodonsAndAminoAcids(transcript, refSequence, transcriptRefAllele, transcriptAltAllele, variant, mappedPositions, aminoAcidsProvider); var referenceCodons = codonsAndAminoAcids.Item1; var alternateCodons = codonsAndAminoAcids.Item2; var referenceAminoAcids = codonsAndAminoAcids.Item3; var alternateAminoAcids = codonsAndAminoAcids.Item4; var insertionInStartCodonAndNoimpact = variant.Type == VariantType.insertion && mappedPositions.ProteinInterval.Start <= 1 && alternateAminoAcids.EndsWith(referenceAminoAcids); var variantEffect = GetVariantEffect(transcript, variant, mappedPositions, referenceAminoAcids, alternateAminoAcids, referenceCodons, alternateCodons, insertionInStartCodonAndNoimpact); var consequences = GetConsequences(transcript, variant, variantEffect); var proteinBegin = mappedPositions.ProteinInterval.Start == null ? -1 : mappedPositions.ProteinInterval.Start.Value; var proteinEnd = mappedPositions.ProteinInterval.End == null ? -1 : mappedPositions.ProteinInterval.End.Value; var upStreamAminoAcids = GetFlankingPeptides(transcript.Translation?.PeptideSeq, proteinBegin, proteinEnd, FlankingAminoAcidLength, true); var downStreamAminoAcids = consequences.Contains(ConsequenceTag.frameshift_variant)? null: GetFlankingPeptides(transcript.Translation?.PeptideSeq, proteinBegin, proteinEnd, FlankingAminoAcidLength, false); return new PianoAnnotatedTranscript(transcript,referenceAminoAcids, alternateAminoAcids, mappedPositions,upStreamAminoAcids,downStreamAminoAcids,consequences); } private static string GetFlankingPeptides(string peptideSeq, int proteinBegin,int proteinEnd, int nBase, bool upStrem) { if (peptideSeq == null) return null; if (proteinBegin == -1 && proteinEnd == -1) return null; if (proteinBegin == -1) proteinBegin = proteinEnd; if (proteinEnd == -1) proteinEnd = proteinBegin; if (upStrem) { var peptideStart = Math.Max(1, proteinBegin - nBase); return peptideSeq.Substring(peptideStart - 1, (proteinBegin - peptideStart)); } var peptideEnd = Math.Min(peptideSeq.Length, proteinEnd + nBase); return peptideEnd > proteinEnd + 1 ? peptideSeq.Substring(proteinEnd, (peptideEnd - proteinEnd)) : ""; } private static Tuple GetCodonsAndAminoAcids(ITranscript transcript, ISequence refSequence, string transcriptRefAllele, string transcriptAltAllele, ISimpleVariant variant, IMappedPositions mappedPositions, AminoAcids aminoAcidsProvider) { var codingSequence = transcript.Translation == null ? null : new CodingSequence(refSequence, transcript.Translation.CodingRegion.Start, transcript.Translation.CodingRegion.End, transcript.CdnaMaps, transcript.Gene.OnReverseStrand, transcript.StartExonPhase); // compute codons and amino acids AssignCodonsAndAminoAcids(transcriptRefAllele, transcriptAltAllele, mappedPositions, codingSequence, aminoAcidsProvider, out string referenceCodons, out string alternateCodons, out string referenceAminoAcids, out string alternateAminoAcids); return Tuple.Create(referenceCodons ?? "", alternateCodons ?? "", referenceAminoAcids ?? "", alternateAminoAcids ?? ""); } private static void AssignCodonsAndAminoAcids(string transcriptRefAllele, string transcriptAltAllele, IMappedPositions mappedPositions, ISequence codingSequence, AminoAcids aminoAcidProvier, out string refCodons, out string altCodons, out string refAminoAcids, out string altAminoAcids) { AssignExtended(transcriptRefAllele, transcriptAltAllele, mappedPositions.CdsInterval, mappedPositions.ProteinInterval, codingSequence, out refCodons, out altCodons); aminoAcidProvier.Assign(refCodons, altCodons, out refAminoAcids, out altAminoAcids); } private static List GetConsequences(ITranscript transcript, IVariant variant, VariantEffect variantEffect) { var featureEffect = new FeatureVariantEffects(transcript, variant.Type, variant.Start, variant.End, variant.Behavior.StructuralVariantConsequence); var consequence = new Consequences(variantEffect, featureEffect); consequence.DetermineSmallVariantEffects(); return consequence.GetConsequences(); } private static VariantEffect GetVariantEffect(ITranscript transcript, ISimpleVariant variant, IMappedPositions mappedPositions, string refAminoAcids, string altAminoAcids, string refCodons, string altCodons, bool insertionInStartAndNoImpact) { var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineIntronicEffect(transcript.Introns, variant, variant.Type); positionalEffect.DetermineExonicEffect(transcript, variant, mappedPositions, variant.AltAllele, insertionInStartAndNoImpact); var variantEffect = new VariantEffect(positionalEffect, variant, transcript, refAminoAcids, altAminoAcids, refCodons, altCodons, mappedPositions.ProteinInterval.Start); return variantEffect; } private static void AssignExtended(string transcriptReferenceAllele, string transcriptAlternateAllele, NullableInterval cdsInterval, NullableInterval proteinInterval, ISequence codingSequence, out string refCodons, out string altCodons) { refCodons = null; altCodons = null; if (cdsInterval.Start == null || cdsInterval.End == null || proteinInterval.Start == null || proteinInterval.End == null) return; int aminoAcidStart = proteinInterval.Start.Value * 3 - 2; int aminoAcidEnd = proteinInterval.End.Value * 3; int prefixLen = cdsInterval.Start.Value - aminoAcidStart; int suffixLen = aminoAcidEnd - cdsInterval.End.Value; int start1 = aminoAcidStart - 1; int start2 = aminoAcidEnd - suffixLen; int maxSuffixLen = codingSequence.Length - start2; var atTailEnd = false; if (suffixLen > maxSuffixLen) { suffixLen = maxSuffixLen; atTailEnd = true; } if (suffixLen > maxSuffixLen) suffixLen = maxSuffixLen; string prefix = start1 + prefixLen < codingSequence.Length ? codingSequence.Substring(start1, prefixLen).ToLower() : "AAA"; string suffix = suffixLen > 0 ? codingSequence.Substring(start2, suffixLen).ToLower() : ""; var needExtend = !atTailEnd && !Codons.IsTriplet(prefixLen + suffixLen + transcriptAlternateAllele.Length); var extendedLen = (maxSuffixLen - suffixLen) > 45 ? 45 : (maxSuffixLen - suffixLen) / 3 * 3; if (needExtend) suffix = codingSequence.Substring(start2, suffixLen + extendedLen); refCodons = Codons.GetCodon(transcriptReferenceAllele, prefix, suffix); altCodons = Codons.GetCodon(transcriptAlternateAllele, prefix, suffix); } } } ================================================ FILE: Sandbox/Piano/ProviderUtilities.cs ================================================ using VariantAnnotation; using VariantAnnotation.Interface; using VariantAnnotation.Interface.GeneAnnotation; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using VariantAnnotation.Utilities; namespace Piano { public static class ProviderUtilities { public static ISequenceProvider GetSequenceProvider(string compressedReferencePath) { return new ReferenceSequenceProvider(FileUtilities.GetReadStream(compressedReferencePath)); } public static IAnnotationProvider GetTranscriptAnnotationProvider(string path, ISequenceProvider sequenceProvider) { return new PianoAnnotationProvider(path, sequenceProvider); } public static IAnnotator GetAnnotator(IAnnotationProvider taProvider, ISequenceProvider sequenceProvider) { return new PianoAnnotator(taProvider, sequenceProvider); } } } ================================================ FILE: Sandbox/Sandbox.sln ================================================  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 VisualStudioVersion = 15.0.26730.16 MinimumVisualStudioVersion = 10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "VariantAnnotation.Interface", "..\VariantAnnotation.Interface\VariantAnnotation.Interface.csproj", "{3D09B50F-73B4-4021-B5F0-2100574BD54B}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ErrorHandling", "..\ErrorHandling\ErrorHandling.csproj", "{2B6B916D-B9DD-4156-A486-F2835C1EE992}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "VariantAnnotation", "..\VariantAnnotation\VariantAnnotation.csproj", "{7030787E-D41A-4397-9472-40D79EEB3DD2}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SAUtils", "..\SAUtils\SAUtils.csproj", "{DDAF33F9-5925-4689-B438-D339A49E52CD}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CommandLine", "..\CommandLine\CommandLine.csproj", "{34A0FF3C-8E65-4378-B2FE-41E661C0B7B6}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CommonUtilities", "..\CommonUtilities\CommonUtilities.csproj", "{44D54D5C-E5E8-4622-9701-6B26C9A831A1}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Compression", "..\Compression\Compression.csproj", "{82F0CE32-D465-4E7B-91CA-A4B67763F433}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Vcf", "..\Vcf\Vcf.csproj", "{BCB37D8F-9B8A-4846-B441-6975CF67BADF}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AminoAcidAligner", "AminoAcidAligner\AminoAcidAligner.csproj", "{793E5969-C6A0-4072-9D6B-4878AA79C917}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "IO", "..\IO\IO.csproj", "{5A95D583-1B37-4AE9-BC38-FDCCCB3183CE}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Release|Any CPU = Release|Any CPU EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {E435C06D-762A-4BB8-9EF8-B75D02812737}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {E435C06D-762A-4BB8-9EF8-B75D02812737}.Debug|Any CPU.Build.0 = Debug|Any CPU {E435C06D-762A-4BB8-9EF8-B75D02812737}.Release|Any CPU.ActiveCfg = Release|Any CPU {E435C06D-762A-4BB8-9EF8-B75D02812737}.Release|Any CPU.Build.0 = Release|Any CPU {D71384D9-A24C-4F7A-BE25-AEA088C36E7B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {D71384D9-A24C-4F7A-BE25-AEA088C36E7B}.Debug|Any CPU.Build.0 = Debug|Any CPU {D71384D9-A24C-4F7A-BE25-AEA088C36E7B}.Release|Any CPU.ActiveCfg = Release|Any CPU {D71384D9-A24C-4F7A-BE25-AEA088C36E7B}.Release|Any CPU.Build.0 = Release|Any CPU {3D09B50F-73B4-4021-B5F0-2100574BD54B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {3D09B50F-73B4-4021-B5F0-2100574BD54B}.Debug|Any CPU.Build.0 = Debug|Any CPU {3D09B50F-73B4-4021-B5F0-2100574BD54B}.Release|Any CPU.ActiveCfg = Release|Any CPU {3D09B50F-73B4-4021-B5F0-2100574BD54B}.Release|Any CPU.Build.0 = Release|Any CPU {2B6B916D-B9DD-4156-A486-F2835C1EE992}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {2B6B916D-B9DD-4156-A486-F2835C1EE992}.Debug|Any CPU.Build.0 = Debug|Any CPU {2B6B916D-B9DD-4156-A486-F2835C1EE992}.Release|Any CPU.ActiveCfg = Release|Any CPU {2B6B916D-B9DD-4156-A486-F2835C1EE992}.Release|Any CPU.Build.0 = Release|Any CPU {7030787E-D41A-4397-9472-40D79EEB3DD2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {7030787E-D41A-4397-9472-40D79EEB3DD2}.Debug|Any CPU.Build.0 = Debug|Any CPU {7030787E-D41A-4397-9472-40D79EEB3DD2}.Release|Any CPU.ActiveCfg = Release|Any CPU {7030787E-D41A-4397-9472-40D79EEB3DD2}.Release|Any CPU.Build.0 = Release|Any CPU {DDAF33F9-5925-4689-B438-D339A49E52CD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {DDAF33F9-5925-4689-B438-D339A49E52CD}.Debug|Any CPU.Build.0 = Debug|Any CPU {DDAF33F9-5925-4689-B438-D339A49E52CD}.Release|Any CPU.ActiveCfg = Release|Any CPU {DDAF33F9-5925-4689-B438-D339A49E52CD}.Release|Any CPU.Build.0 = Release|Any CPU {34A0FF3C-8E65-4378-B2FE-41E661C0B7B6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {34A0FF3C-8E65-4378-B2FE-41E661C0B7B6}.Debug|Any CPU.Build.0 = Debug|Any CPU {34A0FF3C-8E65-4378-B2FE-41E661C0B7B6}.Release|Any CPU.ActiveCfg = Release|Any CPU {34A0FF3C-8E65-4378-B2FE-41E661C0B7B6}.Release|Any CPU.Build.0 = Release|Any CPU {44D54D5C-E5E8-4622-9701-6B26C9A831A1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {44D54D5C-E5E8-4622-9701-6B26C9A831A1}.Debug|Any CPU.Build.0 = Debug|Any CPU {44D54D5C-E5E8-4622-9701-6B26C9A831A1}.Release|Any CPU.ActiveCfg = Release|Any CPU {44D54D5C-E5E8-4622-9701-6B26C9A831A1}.Release|Any CPU.Build.0 = Release|Any CPU {82F0CE32-D465-4E7B-91CA-A4B67763F433}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {82F0CE32-D465-4E7B-91CA-A4B67763F433}.Debug|Any CPU.Build.0 = Debug|Any CPU {82F0CE32-D465-4E7B-91CA-A4B67763F433}.Release|Any CPU.ActiveCfg = Release|Any CPU {82F0CE32-D465-4E7B-91CA-A4B67763F433}.Release|Any CPU.Build.0 = Release|Any CPU {BCB37D8F-9B8A-4846-B441-6975CF67BADF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {BCB37D8F-9B8A-4846-B441-6975CF67BADF}.Debug|Any CPU.Build.0 = Debug|Any CPU {BCB37D8F-9B8A-4846-B441-6975CF67BADF}.Release|Any CPU.ActiveCfg = Release|Any CPU {BCB37D8F-9B8A-4846-B441-6975CF67BADF}.Release|Any CPU.Build.0 = Release|Any CPU {793E5969-C6A0-4072-9D6B-4878AA79C917}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {793E5969-C6A0-4072-9D6B-4878AA79C917}.Debug|Any CPU.Build.0 = Debug|Any CPU {793E5969-C6A0-4072-9D6B-4878AA79C917}.Release|Any CPU.ActiveCfg = Release|Any CPU {793E5969-C6A0-4072-9D6B-4878AA79C917}.Release|Any CPU.Build.0 = Release|Any CPU {5A95D583-1B37-4AE9-BC38-FDCCCB3183CE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {5A95D583-1B37-4AE9-BC38-FDCCCB3183CE}.Debug|Any CPU.Build.0 = Debug|Any CPU {5A95D583-1B37-4AE9-BC38-FDCCCB3183CE}.Release|Any CPU.ActiveCfg = Release|Any CPU {5A95D583-1B37-4AE9-BC38-FDCCCB3183CE}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {46D55784-DE62-489B-A761-1BF0A66DD2C1} EndGlobalSection EndGlobal ================================================ FILE: Sandbox/Sandbox.sln.DotSettings ================================================  SOLUTION ================================================ FILE: Sandbox/Scripts/ConvertCacheMatrix.pl ================================================ #!/usr/bin/perl use File::Find; use Data::Dumper; use Storable qw(fd_retrieve dclone); use Compress::Zlib; use MIME::Base64; use strict; $Data::Dumper::Sortkeys = 1; # Sort the keys in the output $Data::Dumper::Deepcopy = 1; # Enable deep copies of structures my @transcriptFiles = (); my @regulatoryFiles = (); my $numArgs = @ARGV; if($numArgs != 1) { print "USAGE: ConvertCacheMatrix.pl \n"; exit 1; } my ($srcDir) = @ARGV; if(! -d $srcDir) { print "ERROR: The directory ($srcDir) does not exist.\n"; exit 1; } find(\&wanted, $srcDir); foreach my $transcriptPath (@transcriptFiles) { print "- Dumping $transcriptPath.\n"; open my $fh, "zcat ".$transcriptPath." |"; my $cache; $cache = fd_retrieve($fh); close $fh; my $outputCache = dclone($cache); my $newPath = $transcriptPath; $newPath =~ s/\.gz$/_transcripts_data_dumper.txt.gz/g; # loop through each reference sequence foreach my $refSeq (keys %{$cache}) { print "refSeq: $refSeq\n"; # loop through each transcript my $numTranscripts = scalar @{$cache->{$refSeq}}; print "# transcripts: $numTranscripts\n"; for(my $transcriptIndex = 0; $transcriptIndex < $numTranscripts; $transcriptIndex++) { print "- evaluating transcript ".($transcriptIndex + 1)."... "; # evaluate the SIFT entry my $sift = $cache->{$refSeq}[$transcriptIndex]->{'_variation_effect_feature_cache'}->{'protein_function_predictions'}->{'sift'}->{'matrix'}; if(defined($sift)) { my $dest = Compress::Zlib::memGunzip($sift) or die "Cannot uncompress SIFT matrix: $gzerrno"; $outputCache->{$refSeq}[$transcriptIndex]->{'_variation_effect_feature_cache'}->{'protein_function_predictions'}->{'sift'}->{'matrix'} = encode_base64($dest, ""); } # evaluate the PolyPhen humvar entry my $polyphen = $cache->{$refSeq}[$transcriptIndex]->{'_variation_effect_feature_cache'}->{'protein_function_predictions'}->{'polyphen_humvar'}->{'matrix'}; if(defined($polyphen)) { my $dest = Compress::Zlib::memGunzip($polyphen) or die "Cannot uncompress PolyPhen matrix: $gzerrno"; $outputCache->{$refSeq}[$transcriptIndex]->{'_variation_effect_feature_cache'}->{'protein_function_predictions'}->{'polyphen_humvar'}->{'matrix'} = encode_base64($dest, ""); } # evaluate the PolyPhen humdiv entry my $polyphenDiv = $cache->{$refSeq}[$transcriptIndex]->{'_variation_effect_feature_cache'}->{'protein_function_predictions'}->{'polyphen_humdiv'}->{'matrix'}; if(defined($polyphenDiv)) { my $dest = Compress::Zlib::memGunzip($polyphenDiv) or die "Cannot uncompress PolyPhen humdiv matrix: $gzerrno"; $outputCache->{$refSeq}[$transcriptIndex]->{'_variation_effect_feature_cache'}->{'protein_function_predictions'}->{'polyphen_humdiv'}->{'matrix'} = encode_base64($dest, ""); } print "finished.\n"; } } open (my $MPS, "| /bin/gzip -9 -c > $newPath") or die "error starting gzip $!"; print $MPS Dumper($outputCache); close $MPS; } foreach my $regulatoryPath (@regulatoryFiles) { print "- Dumping $regulatoryPath.\n"; open my $fh, "zcat ".$regulatoryPath." |"; my $cache; $cache = fd_retrieve($fh); close $fh; my $newPath = $regulatoryPath; $newPath =~ s/\.gz$/_regulatory_regions_data_dumper.txt.gz/g; open (my $MPS, "| /bin/gzip -9 -c > $newPath") or die "error starting gzip $!"; print $MPS Dumper($cache); close $MPS; } # ======================================== sub wanted { my $filePath = $File::Find::name; if($filePath =~ /data_dumper/) { return; } if($filePath =~ /_reg.gz$/) { push(@regulatoryFiles, $filePath) if -f $filePath; return; } if($filePath =~ /_var.gz$/) { return; } if($filePath =~ /.gz$/) { push(@transcriptFiles, $filePath) if -f $filePath; return; } } ================================================ FILE: Sandbox/Scripts/StressTestUnitTests.ps1 ================================================ # ================ # global variables # ================ # ========= # main loop # ========= cd D:\Projects\NirvanaDevelopment\UnitTests dotnet build $loopCount = 1 do { Write-Host Write-Host "********************************" Write-Host "*** current loop: $loopCount" Write-Host "********************************" Write-Host $loopCount++ iex "dotnet test --no-build" Write-Host "last exit code: $lastExitCode or $?" } while ($LastExitCode -eq 0) ================================================ FILE: Sandbox/Scripts/UpdateCacheFiles.ps1 ================================================ # Configuration $UnfilteredRefSeq72Path = "E:\Data\Nirvana\Cache\Test\RefSeq\72\chr1.ndb" $Ensembl72Chr1Path = "E:\Data\Nirvana\Cache\12\Ensembl\72\chr1.ndb" $Ensembl79Chr1Path = "E:\Data\Nirvana\Cache\12\Ensembl\79\chr1.ndb" $Ensembl72Chr3Path = "E:\Data\Nirvana\Cache\12\Ensembl\72\chr3.ndb" $Ensembl72Chr4Path = "E:\Data\Nirvana\Cache\12\Ensembl\72\chr4.ndb" $Ensembl72Chr7Path = "E:\Data\Nirvana\Cache\12\Ensembl\72\chr7.ndb" $Ensembl72Chr10Path = "E:\Data\Nirvana\Cache\12\Ensembl\72\chr10.ndb" $Ensembl72Chr15Path = "E:\Data\Nirvana\Cache\12\Ensembl\72\chr15.ndb" $Ensembl72Chr17Path = "E:\Data\Nirvana\Cache\12\Ensembl\72\chr17.ndb" $OutputDir = "D:\Projects\Nirvana\NirvanaUnitTests\Resources\Caches" $ExtractTranscriptsBin = "d:\Projects\Nirvana\Sandbox\x64\Release\ExtractTranscripts.exe" $ExtractRegulatoryFeaturesBin = "d:\Projects\Nirvana\Sandbox\x64\Release\ExtractRegulatoryFeatures.exe" # ======================================= # extract the Ensembl regulatory features # ======================================= $Ensembl72RegulatoryFeatures = @("ENSR00000079256") ForEach ($regFeature in $Ensembl72RegulatoryFeatures) { $outputPath = "$($OutputDir)\$($regFeature)_Ensembl72.ndb" & $ExtractRegulatoryFeaturesBin -i $Ensembl72Chr1Path -o $outputPath -r $regFeature } $Ensembl79RegulatoryFeatures = @("ENSR00001584270") ForEach ($regFeature in $Ensembl79RegulatoryFeatures) { $outputPath = "$($OutputDir)\$($regFeature)_Ensembl79.ndb" & $ExtractRegulatoryFeaturesBin -i $Ensembl79Chr1Path -o $outputPath -r $regFeature } # ============================== # extract the RefSeq transcripts # ============================== $RefSeqTranscripts = @("CCDS30708.1", "CCDS58003.1", "CCDS877.1", "ENSESTT00000006045", "ENSESTT00000008349", "ENSESTT00000011387", "ENSESTT00000011417", "ENSESTT00000012399", "ENSESTT00000034529", "ENSESTT00000034591", "ENSESTT00000034721", "ENSESTT00000034761", "ENSESTT00000051657", "ENSESTT00000056515", "ENSESTT00000058286", "ENSESTT00000064454", "ENSESTT00000064869", "ENSESTT00000079558", "ENSESTT00000082723", "ENSESTT00000082768", "ENSESTT00000083199", "ENSESTT00000083507", "ENSESTT00000085167", "ENSESTT00000086709", "NM_000644.2", "NM_001258340.1", "NM_002524.4", "NM_007158.5", "NM_024011.2", "NM_152665.2", "NM_176877.2", "NM_178221.2", "NR_024321.1", "NR_026752.1", "NR_027120.1", "NR_034014.1", "NR_034015.1", "NR_039983.2", "NR_046018.2", "XM_003846383.1", "NM_001080484.1") ForEach ($transcript in $RefSeqTranscripts) { $outputPath = "$($OutputDir)\$($transcript)_RefSeq72.ndb" & $ExtractTranscriptsBin -i $UnfilteredRefSeq72Path -o $outputPath -t $transcript } # handle vcf entries & $ExtractTranscriptsBin -i $UnfilteredRefSeq72Path -o "$($OutputDir)\chr1_115256529_RefSeq72.ndb" -v "chr1\t115256529\t.\tT\tA\t.\tPASS\t.\tGT:GQX:DP:DPF\t0/0:99:34:2" & $ExtractTranscriptsBin -i $UnfilteredRefSeq72Path -o "$($OutputDir)\chr1_59758869_RefSeq72.ndb" -n chr1 -p 59758869 -r T -a G # ====================================== # extract the Ensembl transcripts (chr1) # ====================================== $EnsemblChr1Transcripts = @("ENST00000371614", "ENST00000255416", "ENST00000310991", "ENST00000327044", "ENST00000355439", "ENST00000368246", "ENST00000369535", "ENST00000374163", "ENST00000375759", "ENST00000378635", "ENST00000379407", "ENST00000487053", "ENST00000518655", "ENST00000391369") ForEach ($transcript in $EnsemblChr1Transcripts) { $outputPath = "$($OutputDir)\$($transcript)_Ensembl72.ndb" & $ExtractTranscriptsBin -i $Ensembl72Chr1Path -o $outputPath -t $transcript } # ====================================== # extract the Ensembl transcripts (chr3) # ====================================== $EnsemblChr3Transcripts = @("ENST00000422325") ForEach ($transcript in $EnsemblChr3Transcripts) { $outputPath = "$($OutputDir)\$($transcript)_Ensembl72.ndb" & $ExtractTranscriptsBin -i $Ensembl72Chr3Path -o $outputPath -t $transcript } # ====================================== # extract the Ensembl transcripts (chr4) # ====================================== $EnsemblChr4Transcripts = @("ENST00000288135") ForEach ($transcript in $EnsemblChr4Transcripts) { $outputPath = "$($OutputDir)\$($transcript)_Ensembl72.ndb" & $ExtractTranscriptsBin -i $Ensembl72Chr4Path -o $outputPath -t $transcript } # ====================================== # extract the Ensembl transcripts (chr7) # ====================================== $EnsemblChr7Transcripts = @("ENST00000275493") ForEach ($transcript in $EnsemblChr7Transcripts) { $outputPath = "$($OutputDir)\$($transcript)_Ensembl72.ndb" & $ExtractTranscriptsBin -i $Ensembl72Chr7Path -o $outputPath -t $transcript } # ======================================= # extract the Ensembl transcripts (chr10) # ======================================= $EnsemblChr10Transcripts = @("ENST00000348795") ForEach ($transcript in $EnsemblChr10Transcripts) { $outputPath = "$($OutputDir)\$($transcript)_Ensembl72.ndb" & $ExtractTranscriptsBin -i $Ensembl72Chr10Path -o $outputPath -t $transcript } # ======================================= # extract the Ensembl transcripts (chr15) # ======================================= $EnsemblChr15Transcripts = @("ENST00000543887") ForEach ($transcript in $EnsemblChr15Transcripts) { $outputPath = "$($OutputDir)\$($transcript)_Ensembl72.ndb" & $ExtractTranscriptsBin -i $Ensembl72Chr15Path -o $outputPath -t $transcript } # ======================================= # extract the Ensembl transcripts (chr17) # ======================================= $EnsemblChr17Transcripts = @("ENST00000269305", "ENST00000576171") ForEach ($transcript in $EnsemblChr17Transcripts) { $outputPath = "$($OutputDir)\$($transcript)_Ensembl72.ndb" & $ExtractTranscriptsBin -i $Ensembl72Chr17Path -o $outputPath -t $transcript } ================================================ FILE: Sandbox/Scripts/UpdateMiniSaFiles.ps1 ================================================ #################################################################### # This program is used to update just the miniSA, CA, and CI files # #################################################################### # ================ # global variables # ================ $NirvanaRootDir="E:\Data\Nirvana" $SaRootDir="$NirvanaRootDir\SA" $IntermediateTsvsDir="$NirvanaRootDir\IntermediateTsvs" $NirvanaSourceDir="D:\Projects\NirvanaDevelopment" $ResourcesDir="$NirvanaSourceDir\UnitTests\Resources" $RefVersion="5" $SaVersion="38.2" $CustomIntervalsBed="$ResourcesDir\customIntervals.bed" $GRCh37="$NirvanaRootDir\References\$RefVersion\Homo_sapiens.GRCh37.Nirvana.dat" $GRCh38="$NirvanaRootDir\References\$RefVersion\Homo_sapiens.GRCh38.Nirvana.dat" # unit test resource directories $miniSAGRCh37="$ResourcesDir\MiniSuppAnnot" $miniSAGRCh38="$ResourcesDir\MiniSuppAnnot\hg38" $miniCIGRCh37="$ResourcesDir\MiniSuppAnnot\CustomIntervals" $miniCAGRCh37="$ResourcesDir\MiniSuppAnnot\CustomAnnotations" $directoryIntegrity="$ResourcesDir\DirectoryIntegrity" # intermediate TSV directories $HgmdTsv="$IntermediateTsvsDir\HGMD" $IcslIntervalsTsv="$IntermediateTsvsDir\IcslIntervals" $InternalAfTsv="$IntermediateTsvsDir\InternalAF" # SA directories $SaGRCh37="$SaRootDir\$SaVersion\GRCh37" $SaGRCh38="$SaRootDir\$SaVersion\GRCh38" $SaHgmd="$SaRootDir\HGMD" $SaIcslIntervals="$SaRootDir\IcslIntervals" $SaInternalAF="$SaRootDir\InternalAF" $SaUtils="$NirvanaSourceDir\bin\Release\netcoreapp1.1\SAUtils.dll" $ExtractMiniSA="dotnet $SaUtils extractMiniSA" # ========= # functions # ========= function bg() { Param ($name, $job) $script=[scriptblock]::Create($job) Start-Job -Name $name -ScriptBlock $script } function updateMiniSA(){ Param($name,$miniSADir,$SADir,$ref) Get-ChildItem $miniSADir -Filter *.nsa | Foreach-Object { $miniSAfile=$_.BaseName $refName,$start,$end = $miniSAfile.Split('_',3) bg $name "$ExtractMiniSA --in $SADir\$refName.nsa --begin $start --end $end --ref $Ref --out $miniSADir" } } function updateMiniCA(){ Param($name,$outputDir,$SADir,$ref,$targetDataSource) Get-ChildItem $outputDir -Filter *.nsa | Foreach-Object { $miniCAfile=$_.BaseName $refName,$start,$end,$dataSource = $miniCAfile.Split('_',4) if($dataSource -match $targetDataSource) { bg $name "$ExtractMiniSA --in $SADir\$refName.nsa --begin $start --end $end --ref $Ref --out $outputDir -n $targetDataSource" } } } function copyIfNewer() { Param($sourceDir, $destDir, $filename) $localFile = Get-Item "$destDir\$filename" $remoteFile = Get-Item "$sourceDir\$filename" if ($remoteFile.LastWriteTime -gt $localFile.LastWriteTime) { Copy-Item $remoteFile $localFile } } # =========================== # create the IcslIntervals SA # =========================== $IcslIntervalsChr1 = "$SaIcslIntervals\chr1.nsa" if (!(Test-Path $IcslIntervalsChr1)) { New-Item -ItemType Directory -Force -Path $IcslIntervalsTsv | Out-Null & dotnet $SaUtils createTSV --bed $CustomIntervalsBed -r $GRCh37 -o $IcslIntervalsTsv & dotnet $SaUtils createSA -r $GRCh37 -d $IcslIntervalsTsv -o $SaIcslIntervals } # ================== # create the HGMD SA # ================== $HgmdChr1 = "$SaHgmd\chr1.nsa" if (!(Test-Path $HgmdChr1)) { New-Item -ItemType Directory -Force -Path $SaHgmd | Out-Null & dotnet $SaUtils createSA -r $GRCh37 -d $HgmdTsv -o $SaHgmd } # ======================== # create the InternalAF SA # ======================== $InternalAfChr1 = "$SaInternalAF\chr1.nsa" if (!(Test-Path $InternalAfChr1)) { New-Item -ItemType Directory -Force -Path $SaInternalAF | Out-Null & dotnet $SaUtils createSA -r $GRCh37 -d $InternalAfTsv -o $SaInternalAF } # =============================== # copy chrM to DirectoryIntegrity # =============================== copyIfNewer $SaGRCh37 $directoryIntegrity "chrM.nsa" copyIfNewer $SaGRCh37 $directoryIntegrity "chrM.nsa.idx" # ============= # update miniSA # ============= updateMiniSA "SA-37" $miniSAGRCh37 $SaGRCh37 $GRCh37 updateMiniSA "SA-38" $miniSAGRCh38 $SaGRCh38 $GRCh38 # ==================================== # update the mini-CA and mini-CI files # ==================================== updateMiniCA "hgmd-37" $miniCAGRCh37 $SaHgmd $GRCh37 "hgmd" updateMiniCA "internalAF-37" $miniCAGRCh37 $SaInternalAF $GRCh37 "internalAF" updateMiniCA "IcslIntervals-37" $miniCIGRCh37 $SaIcslIntervals $GRCh37 "IcslIntervals" Get-Job | Wait-Job ================================================ FILE: Sandbox/Scripts/updateSA.ps1 ================================================ ############## # This program is used to update SA , miniSA and minCA when the SA schema changes. # please update the file path whenever updated the datasource ############## # ================ # global variables # ================ $NirvanaRootDir="E:\Data\Nirvana" $NirvanaSourceDir="D:\Projects\Nirvana" $ExternalDataRootDir="\\ussd-prd-isi04\Nirvana\Development\IntermediateTsvs" $RefVersion=5.2 $currentSAversion=40.1 $GRCh37="$NirvanaRootDir\References\$RefVersion\Homo_sapiens.GRCh37.Nirvana.dat" $GRCh38="$NirvanaRootDir\References\$RefVersion\Homo_sapiens.GRCh38.Nirvana.dat" $miniSAGRCh37="$NirvanaSourceDir\UnitTests\Resources\MiniSuppAnnot" $miniSAGRCh38="$NirvanaSourceDir\UnitTests\Resources\MiniSuppAnnot\hg38" $SAOutGRCh37="$NirvanaRootDir\SupplementaryDatabase\$currentSAversion\GRCh37" $SAOutGRCh38="$NirvanaRootDir\SupplementaryDatabase\$currentSAversion\GRCh38" $CreateSupplementaryDatabase="dotnet $NirvanaSourceDir\bin\Release\netcoreapp1.1\SAUtils.dll createSA" $ExtractMiniSAdb="dotnet $NirvanaSourceDir\bin\Release\netcoreapp1.1\SAUtils.dll extractMiniSA" $SAisilonPath="\\ussd-prd-isi04\Nirvana\Development\SupplementaryDatabase\$currentSAversion" $PhylopFolder="\\ussd-prd-isi04\Nirvana\SupplementaryDatabase\PhyloP\latest" #$OmimDatabase="\\ussd-prd-isi04\Nirvana\Development\OmimDatabase\3\genePhenotypeMap.mim" # ================ # update files # ================ $CVR37="$ExternalDataRootDir\2017-04\GRCh37\clinvar_20170403.tsv.gz" $DBS37="$ExternalDataRootDir\2017-04\GRCh37\dbsnp_150.tsv.gz" $GLOBAl37="$ExternalDataRootDir\2017-04\GRCh37\globalAllele_150.tsv.gz" $CSM37="$ExternalDataRootDir\2017-04\GRCh37\cosmic_80.tsv.gz" $DGV37="$ExternalDataRootDir\2017-04\GRCh37\dgv_20160515.interval.tsv.gz" $CLINGEN37="$ExternalDataRootDir\2017-04\GRCh37\clinGen_20160414.interval.tsv.gz" $CVR38="$ExternalDataRootDir\2017-04\GRCh38\clinvar_20170403.tsv.gz" $DBS38="$ExternalDataRootDir\2017-04\GRCh38\dbsnp_150.tsv.gz" $GLOBAl38="$ExternalDataRootDir\2017-04\GRCh38\globalAllele_150.tsv.gz" $CSM38="$ExternalDataRootDir\2017-04\GRCh38\cosmic_80.tsv.gz" $DGV38="$ExternalDataRootDir\2017-04\GRCh38\dgv_20160515.interval.tsv.gz" $CLINGEN38="$ExternalDataRootDir\2017-04\GRCh38\clinGen_unknown.interval.tsv.gz" # ================== # files won't update # ================== $ONEK37="$ExternalDataRootDir\2017-04\GRCh37\oneKg_Phase_3_v5a.tsv.gz" $ONEKSV37="$ExternalDataRootDir\2017-04\GRCh37\oneKg_Phase_3_v5a.interval.tsv.gz" $EXAC37="$ExternalDataRootDir\2017-04\GRCh37\exac_0.3.1.tsv.gz" $EVS37="$ExternalDataRootDir\2017-04\GRCh37\evs_2.tsv.gz" $RefMinor37="$ExternalDataRootDir\2017-04\GRCh37\RefMinor_Phase_3_v5a.tsv.gz" $EVS38="$ExternalDataRootDir\2017-04\GRCh38\evs_2.tsv.gz" $ONEK38="$ExternalDataRootDir\2017-04\GRCh38\oneKg_Phase_3_v3plus.tsv.gz" $RefMinor38="$ExternalDataRootDir\2017-04\GRCh38\RefMinor_Phase_3_v3plus.tsv.gz" function bg() { Param ($name, $job) $script=[scriptblock]::Create($job) Start-Job -Name $name -ScriptBlock $script } function updateMiniSA(){ Param($name,$miniSADir,$SADir,$ref) Get-ChildItem $miniSADir -Filter *.nsa | Foreach-Object { $miniSAfile=$_.BaseName $refName,$start,$end = $miniSAfile.Split('_',3) bg $name "$ExtractMiniSAdb --in $SADir\$refName.nsa --begin $start --end $end --ref $Ref --out $miniSADir" } } # ========================================= # Create Supplementary database # ========================================= mkdir $SAOutGRCh37 mkdir $SAOutGRCh38 #============================ # copy OMIM #============================ Copy-Item $OmimDatabase $SAOutGRCh37 Copy-Item $OmimDatabase $SAOutGRCh38 bg "SA-37" "$CreateSupplementaryDatabase --out $SAOutGRCh37 --ref $GRCh37 -t $DBS37 -t $CSM37 -t $EVS37 -t $CVR37 -t $ONEK37 -i $ONEKSV37 -i $DGV37 -i $CLINGEN37 -t $EXAC37 -t $GLOBAl37 -t $RefMinor37" bg "SA-38" "$CreateSupplementaryDatabase --out $SAOutGRCh38 --ref $GRCh38 -t $DBS38 -t $CSM38 -t $EVS38 -t $CVR38 -t $ONEK38 -i $DGV38 -i $CLINGEN38 -t $GLOBAl38 -t $RefMinor38" get-job|wait-job # ========================= # update miniSA # ========================= updateMiniSA "update-37" $miniSAGRCh37 $SAOutGRCh37 $GRCh37 updateMiniSA "update-38" $miniSAGRCh38 $SAOutGRCh38 $GRCh38 get-job|wait-job #=========================== #update custom annotation #=========================== function updateMiniCA(){ Param($name,$miniCADir,$CADir,$ref) Get-ChildItem $miniCADir -Filter *.nsa | Foreach-Object { $miniCAfile=$_.BaseName $refName,$start,$end = $miniCAfile.Split('_',3) bg $name "$ExtractMiniSAdb --in $CADir\$refName.nsa --begin $start --end $end --ref $Ref --name --out $miniSADir" } } ########## # copy the SA to isilon ######### mkdir $SAisilonPath Copy-Item $SAOutGRCh37 $SAisilonPath\GRCh37 -Force -Recurse Copy-Item $SAOutGRCh38 $SAisilonPath\GRCh38 -Force -Recurse Import-Module PSCX #============================ # Add hardLink to phylop #============================ Get-ChildItem "$PhylopFolder\GRCh37" -Filter *.npd | Foreach-Object { $npdFile=$_.Name New-HardLink "$SAisilonPath\GRCh37\$npdFile" "$PhylopFolder\GRCh37\$npdFile" } Get-ChildItem "$PhylopFolder\GRCh38" -Filter *.npd | Foreach-Object { $npdFile=$_.Name New-HardLink "$SAisilonPath\GRCh38\$npdFile" "$PhylopFolder\GRCh38\$npdFile" } ================================================ FILE: Sandbox/UnitTests/Piano/PianoAnnotatedTranscriptTests.cs ================================================ using System; using System.Collections.Generic; using Moq; using Piano; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Intervals; using Xunit; namespace UnitTests { public class PianoAnnotatedTranscriptTests { [Fact] public void Empty_upstreamAminoAcids_return_dot() { var mockedTranscript = new Mock(); mockedTranscript.Setup(x => x.Source).Returns(Source.Ensembl); mockedTranscript.Setup(x => x.Gene.EnsemblId.ToString()).Returns("ENSG12345"); mockedTranscript.Setup(x => x.Gene.Symbol).Returns("TestGene"); mockedTranscript.Setup(x => x.Id).Returns(CompactId.Convert("ENST124")); mockedTranscript.Setup(x => x.Version).Returns(1); mockedTranscript.Setup(x => x.Translation.ProteinId).Returns(CompactId.Convert("ENSP123456")); mockedTranscript.Setup(x => x.Translation.ProteinVersion).Returns(2); var mappedPosition = new Mock(); mappedPosition.Setup(x => x.ProteinInterval).Returns(new NullableInterval(100, 100)); var transcript = new PianoAnnotatedTranscript(mockedTranscript.Object, "A", "R",mappedPosition.Object, "", "ATYRGD", new List {ConsequenceTag.missense_variant}); var expectedOut = "TestGene ENSG12345 ENST124.1 ENSP123456.2 100 . A/R ATYRGD missense_variant"; Assert.Equal(expectedOut,transcript.ToString()); } [Fact] public void refSeq_gene_return_entrezId() { var mockedTranscript = new Mock(); mockedTranscript.Setup(x => x.Source).Returns(Source.RefSeq); mockedTranscript.Setup(x => x.Gene.EntrezGeneId.ToString()).Returns("12345"); mockedTranscript.Setup(x => x.Gene.Symbol).Returns("TestGene"); mockedTranscript.Setup(x => x.Id).Returns(CompactId.Convert("NM_124")); mockedTranscript.Setup(x => x.Version).Returns(1); mockedTranscript.Setup(x => x.Translation.ProteinId).Returns(CompactId.Convert("NP_342")); mockedTranscript.Setup(x => x.Translation.ProteinVersion).Returns(2); var mappedPosition = new Mock(); mappedPosition.Setup(x => x.ProteinInterval).Returns(new NullableInterval(100, 101)); var transcript = new PianoAnnotatedTranscript(mockedTranscript.Object, "AT", "GR", mappedPosition.Object, "KILGF", "ATYRGD", new List { ConsequenceTag.missense_variant ,ConsequenceTag.splice_region_variant}); var expectedOut = "TestGene 12345 NM_124.1 NP_342.2 100-101 KILGF AT/GR ATYRGD missense_variant,splice_region_variant"; Assert.Equal(expectedOut, transcript.ToString()); } } } ================================================ FILE: Sandbox/UnitTests/Piano/PianoTests.cs ================================================ using Moq; using Piano; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Intervals; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Sequence; using Vcf; using Xunit; namespace UnitTests { public class PianoTests { private const string Enst00000343938GenomicSequence = "GAGGGCGGGGCGAGGGCGGGGCGGTGGGCGGGGACGGGGCCCGCACGGCGGCTACGGCCTAGGTGAGCGGCTCGGACTCGGCGGCCGCACCTGCCCAACCCAACCCGCACGGTCCGGAAGTCGCCGAGGGGCCGGGAGCGGGAGGGGACGTCGTCCTAGAGGGCCGGAGCGGGCGGGCGGCCGAGGACCCGGCTCCCGCGCAGGACGGAGCCGTGGCTCAGGTCGGCCCCTCCCCAACACCACCCCGGGCCTCCGCCCCTTCCTGGGCCTCTCGGTGGAGCAGGGACCCGAACCGGTGCCCATCCAGTCCGGTGCCATCTGAAGCCCCCTTCCCAGGTGAGACTCGTAGCGCTCGCTCGACAGGGTCTGGTCCCACCCACAAGGCCTGGGGCGCCGTGGGGCCCCGTCTCCTGCTGGCCCCCCAGCCTGCTGTCAGCCCCCGTGCTCTGTGCTCAGGCCGCCCTCGCGCCCGGCCCTGACCTTGGGCCGTTGGGCTGCCCTGGGAAAGGCCTGGAGGTGTCCTGGGTCACCTTCCTGGGCTGGCAAGCTGCCTGCCTCCTGCACAGCCACTGCCCTTCCTGTTGTTACCGAGCCACCAGCCACAGCTCTGAGAAGCTCCTGGCAGCTTCTGTTTGCCACTGGCTCGAATCTGGGCAGGAAGGCAAGGCCCGCAGAATATCTGGTGACCAAGAAGGAAACCCCAGAGCCTCAGAGACCATCTTCTCAGTGGACAAAATTAAGGCCCGAGGAGGGGAGGGGCGTGCTGGAAGTCTATGGGACTGCATCTTTCTGAGGCCCAGGAGCAGCCATCCCCCACACCTGAAGCCCGGTGAGCTCACATCTGGGGCCTCCGCCTGGTGCCAAGCATGCAACCCAACCTGTGGGGCCTGCAACGCCAGGCTTCAGCACCCTGCAGGCACCAGTGCTCCAGCAGCCTGGGCCACGGGCTGGGCAGGGCTTGCAGCCCATGATCCCTAGTGATGAAGGGCCCAGTCCTAGGGTGCTGAGCAACCTGCCCACCTGCTCCTGGCCAGGAGCTCTCACCACGGCTGGGTGCCCTTCCCCCTCCCCCACCGATGGAGTCCCTGCAGCCAGGGAGGCCAGGACAGGGCTCCCAGCACCAACCGGCCTAGGAACCCCCAGGCCCTCTTCCTGGTCGAGGTGGAATGCAGCTGACTCTCAGGTTCCCCAGAGCAGGTGCGGGCCCGTGGGGCACCCGGGGAGACAGGGCAAGGGTGCTTGGCAACACTCACACAAAGCATGGGTGCCTGGATGTCTGTGGATCTGTGGAGTGACTATGTGAATGCCAGCAGAATCCAAAGCAGGGCCTGGGCCACTCGTGGAAGGCTCCCTAGGGCTAGTACAAGAGCCTCGTGGCAATCTTCTGAGTGGTAAAACCCATCTGTGTGGGACATGGAGTTTCAGCAACAGGAGTGAAAACACGTGTCCATCCATCCAGCAAGTGCCAGCCCTACAGCCTCTTTTCTGCTTTTGGGGATGTAGCAGTGAGGAAGATGGGGCAGCCTGCCCGGCAGCATCCCCCCACCCCCGGCCCCACCTGTCTCTGCTTTCTGCTGTGTCTGTTTTCTTGTCTAGGACTTCAGAACTTCCTGTCTTTGTTGTCATCTGACCCCACCCCAGATGGCTGCTCGCACTCCCCATGCACCCAGATAGATGGCTAGGATGGTGCTTGGCTCTCGGCAGGGGCTTAGTATTTCTCCAGCTGGTAAAAGCAGATACAGCATCTAGAGAGAGAAACAAAAACAAGAAAGCACCAGCAGAGACACCTGCTGCAGACAGCGGGGCCTAGTGGTCTGATAAAGCCAGAGGGGGCCACTCTCGGGGTCAGGGACTGACACGGAGTCAGTGGCCTGATCCACAGGAGGGGCTGTGCCAAGGTCCCTGAATGCGCAATCCTGATGAAGGGTGGGTCAGGGTGGTGTGCCTGAGAGCCTGCGGCTTGGCTGGGAGCAGAGCCAGGCAGCTCCTGGGAGGAAGCTCCATGAGGGGCATGAGTGTTCAGTGAGCGGCAATGGGATCGCAGCTATTTTGTTCCCCTCCACACACAGAAAATGAGCCACAGAGCAAGCTGACCCCAGCGACACAGCCCCCCAGCCCTACTGTATTTCCGTTCCTATCAAAAAATGGATGACTCGGAGACAGGTTTCAATCTGAAAGTCGTCCTGGTCAGTTTCAAGCAGTGTCTCGATGAGAAGGAAGAGGTCTTGCTGGACCCCTACATTGCCAGCTGGAAGGGCCTGGTCAGGTGCGTGTGCCAGGGCTGCCTCCTGAGGTGGGCGCTCCCCTGGCCCGAGTCCCATATGTGGCATCTGCCTCCCGACTGCCTGTCCCCACCAGCTTTGCTGCCCGTTTCCAGATGGGTGTGAGCCCCCGCAGGCTGGGCAGCGTCCCCTGCACCCCAGGCGGGCTGCCCCAGGCCTGGGCGAGGACTCGAGCCCCGCTCCCTTCCACAGGTTTCTGAACAGCCTGGGCACCATCTTCTCATTCATCTCCAAGGACGTGGTCTCCAAGCTGCGGATCATGGAGCGCCTCAGGGGCGGCCCGCAGAGCGAGCACTACCGCAGCCTGCAGGCCATGGTGGCCCACGAGCTGAGCAACCGGCTGGTGGACCTGGAGCGCCGCTCCCACCACCCGGAGTCTGGCTGCCGGACGGTGCTGCGCCTGCACCGCGCCCTGCACTGGCTGCAGCTGTTCCTGGAGGGCCTGCGTACCAGCCCCGAGGACGCACGCACCTCCGCGCTCTGCGCCGACTCCTACAACGCCTCGCTGGCCGCCTACCACCCCTGGGTCGTGCGCCGCGCCGTCACCGTGGCCTTCTGCACGCTGCCCACACGCGAGGTCTTCCTGGAGGCCATGAACGTGGGGCCCCCGGAGCAGGCCGTGCAGATGCTAGGCGAGGCCCTCCCCTTCATCCAGCGTGTCTACAACGTCTCCCAGAAGCTCTACGCCGAGCACTCCCTGCTGGACCTGCCCTAGGGGCGGGAAGCCAGGGCCGCACCGGCTTTCCTGCTGCAGATCTGGGCTGCGGTGGCCAGGGCCGTGAGTCCCGTGGCAGAGCCTTCTGGGCGCTGCGGGAACAGGAGATCCTCTGTCGCCCCTGTGAGCTGAGCTGGTTAGGAACCACAGACTGTGACAGAGAAGGTGGCGACCAGCCCAGAAGAGGCCCACCCTCTCGGTCCGGAACAAGACGCCTCGGCCACGGCTCCCCCTCGGCCTATTACACGCGTGCGCAGCCAGGCCTCGCCAGGGTGCGGTGCAGAGCAGAGCAGGCAGGGGTGGGGGCCGGGCCTGCAAGAGCCCGAAAGGTCGCCACCCCCTAGCCTGTGGGGTGCATCTGCGAACCAGGGTGAAGTCACAGGTCCCGGGGTGTGGAGGCTCCATCCTTTCTCCTTTCTGCCAGCCGATGTGTCCTCATCTCAGGCCCGTGCCTGGGACCCCGTGTCTGCCCAGGTGGGCAGCCTTGAGCCCAGGGGACTCAGTGCCCTCCATGCCCTGGCTGGCAGAAACCCTCAACAGCAGTCTGGGCACTGTGGGGCTCTCCCCGCCTCTCCTGCCTTGTTTGCCCCTCAGCGTGCCAGGCAGACTGGGGGCAGGACAGCCGGAAGCTGAGACCAAGGCTCCTCACAGAAGGGCCCAGGAAGTCCCCGCCCTTGGGACAGCCTCCTCCGTAGCCCCTGCACGGCACCAGTTCCCCGAGGGACGCAGCAGGCCGCCTCCCGCAGCGGCCGTGGGTCTGCACAGCCCAGCCCAGCCCAAGGCCCCCAGGAGCTGGGACTCTGCTACACCCAGTGAAATGCTGTGTCCCTTCTCCCCCGTGCCCCTTGATGCCCCCTCCCCACAGTGCTCAGGAGACCCGTGGGGCACGGAACAGGAGGGTCTGGACCCTGTGGCCCAGCCAAAGGCTACCAGACAGCCACAACCAGCCCAGCCACCATCCAGTGCCTGGGGCCTGGCCACTGGCTCTTCACAGTGGACCCCAGCACCTCGGGGTGGCAGAGGGACGGCCCCCACGGCCCAGCAGACATGCGAGCTTCCAGAGTGCAATCTATGTGATGTCTTCCAACGTTAATAAATCACACAGCCTCCCAGGAGGGAGACGCTGGGGTGCAC"; private static ITranscript GetMockedTranscriptOnForwardStrand() { var mockedTranscript = new Mock(); //get info from ENST00000343938.4 var chromosome = new Chromosome("chr1", "1", 0); var start = 1260147; var end = 1264277; var introns = new IInterval[] { new Interval(1260483, 1262215), new Interval(1262413, 1262620) }; var cdnaMaps = new ICdnaCoordinateMap[] { new CdnaCoordinateMap(1260147, 1260482, 1, 336), new CdnaCoordinateMap(1262216, 1262412, 337, 533), new CdnaCoordinateMap(1262621, 1264277, 534, 2160), }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CdnaCoordinateMap(1262291, 1263143, 412, 1056)); translation.SetupGet(x => x.ProteinId).Returns(CompactId.Convert("ENST00000343938")); translation.SetupGet(x => x.ProteinVersion).Returns(4); translation.SetupGet(x => x.PeptideSeq).Returns( "MDDSETGFNLKVVLVSFKQCLDEKEEVLLDPYIASWKGLVRFLNSLGTIFSFISKDVVSKLRIMERLRGGPQSEHYRSLQAMVAHELSNRLVDLERRSHHPESGCRTVLRLHRALHWLQLFLEGLRTSPEDARTSALCADSYNASLAAYHPWVVRRAVTVAFCTLPTREVFLEAMNVGPPEQAVQMLGEALPFIQRVYNVSQKLYAEHSLLDLP"); var gene = new Mock(); gene.SetupGet(x => x.OnReverseStrand).Returns(false); gene.SetupGet(x => x.EnsemblId).Returns(CompactId.Convert("ENSG00000224051 ")); gene.SetupGet(x => x.Symbol).Returns("CPTP"); mockedTranscript.SetupGet(x => x.Id).Returns(CompactId.Convert("ENST00000343938")); mockedTranscript.SetupGet(x => x.Source).Returns(Source.Ensembl); mockedTranscript.SetupGet(x => x.Version).Returns(4); mockedTranscript.SetupGet(x => x.Chromosome).Returns(chromosome); mockedTranscript.SetupGet(x => x.Start).Returns(start); mockedTranscript.SetupGet(x => x.End).Returns(end); mockedTranscript.SetupGet(x => x.Gene).Returns(gene.Object); mockedTranscript.SetupGet(x => x.Introns).Returns(introns); mockedTranscript.SetupGet(x => x.CdnaMaps).Returns(cdnaMaps); mockedTranscript.SetupGet(x => x.Translation).Returns(translation.Object); mockedTranscript.SetupGet(x => x.TotalExonLength).Returns(2190); return mockedTranscript.Object; } [Fact] public void MissenseVariant() { var transcript = GetMockedTranscriptOnForwardStrand(); var chromosome = new Chromosome("chr1", "1", 0); var variant = new Variant(chromosome, 1262295, 1262295, "A", "C", VariantType.SNV, "1:1262295:A>C", false, false, null, null, new AnnotationBehavior(false, false, false, false, false, false)); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var result = PianoTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, new AminoAcids(false)); var expectedResult = "CPTP ENSG000000224051 ENST00000343938.4 ENST00000343938.4 2 M D/A DSETGFNLKVVLVSF missense_variant"; Assert.Equal(expectedResult, result.ToString()); } [Fact] public void missense_variant_in_TSS_returns() { var transcript = GetMockedTranscriptOnForwardStrand(); var chromosome = new Chromosome("chr1", "1", 0); var variant = new Variant(chromosome, 1262291, 1262291, "A", "C", VariantType.SNV, "1:1262291:A>C", false, false, null, null, new AnnotationBehavior(false, false, false, false, false, false)); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var result = PianoTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, new AminoAcids(false)); var expectedResult = "CPTP ENSG000000224051 ENST00000343938.4 ENST00000343938.4 1 . M/L DDSETGFNLKVVLVS start_lost"; Assert.Equal(expectedResult, result.ToString()); } [Fact] public void synounymous_mutation_returns_no_change() { var transcript = GetMockedTranscriptOnForwardStrand(); var chromosome = new Chromosome("chr1", "1", 0); var variant = new Variant(chromosome, 1262347, 1262347, "G", "A", VariantType.SNV, "1:1262347:G>A", false, false, null, null, new AnnotationBehavior(false, false, false, false, false, false)); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var result = PianoTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, new AminoAcids(false)); var expectedResult = "CPTP ENSG000000224051 ENST00000343938.4 ENST00000343938.4 19 SETGFNLKVVLVSFK Q CLDEKEEVLLDPYIA synonymous_variant"; Assert.Equal(expectedResult, result.ToString()); } [Fact] public void Frameshift_mutation_returns_no_downStreamAminoAcids() { var transcript = GetMockedTranscriptOnForwardStrand(); var chromosome = new Chromosome("chr1", "1", 0); var variant = new Variant(chromosome, 1262347, 1262348, "GT", "G", VariantType.deletion, "vid", false, false, null, null, new AnnotationBehavior(false, false, false, false, false, false)); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var result = PianoTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, new AminoAcids(false)); var expectedResult = "CPTP ENSG000000224051 ENST00000343938.4 ENST00000343938.4 19-20 SETGFNLKVVLVSFK QCLDEKEEVLLDPYIAS/QVSMRRKRSCWTPTLPX . frameshift_variant"; Assert.Equal(expectedResult, result.ToString()); } } } ================================================ FILE: Sandbox/UnitTests/Piano/SimpleSequence.cs ================================================ using VariantAnnotation.Interface.Sequence; namespace UnitTests { public sealed class SimpleSequence : ISequence { private readonly string _sequence; private readonly int _zeroBasedStartOffset; public int Length => _zeroBasedStartOffset + _sequence.Length; public SimpleSequence(string s, int zeroBasedStartOffset = 0) { _zeroBasedStartOffset = zeroBasedStartOffset; _sequence = s; } public string Substring(int offset, int length) { if (offset - _zeroBasedStartOffset + length > _sequence.Length || offset < _zeroBasedStartOffset) return ""; return _sequence.Substring(offset - _zeroBasedStartOffset, length); } } } ================================================ FILE: Sandbox/UnitTests/Resources/ConflicitingEntries1000G.vcf ================================================ 1 90 . AT A,AC 100 . . 1 91 . T A,G 100 . . 1 99 . AT A,AC 100 . . 1 100 . T TC,G 100 . . 1 100 . T C,G 100 . . X 60072 . G C 100 PASS AC=64;AF=0.0127796;AN=5008;NS=2504;DP=12897;AMR_AF=0.0144;AFR_AF=0.0363;EUR_AF=0.005;SAS_AF=0.001;EAS_AF=0;AA=.|||;VT=SNP X 60072 . G C,T 100 PASS AC=71,462;AF=0.0141773,0.0922524;AN=5008;NS=2504;DP=12897;AMR_AF=0.0159,0.0173;AFR_AF=0.0408,0.1165;EUR_AF=0.005,0.0318;SAS_AF=0.001,0.1728;EAS_AF=0,0.0942;AA=.|||;VT=SNP;MULTI_ALLELIC 4 47016909 rs552911847;rs71193895 GTATT GTATTTATT,G 100 PASS AC=17,843;AF=0.00339457,0.168331;AN=5008;NS=2504;DP=14760;EAS_AF=0.001,0.0526;AMR_AF=0,0.2608;AFR_AF=0.0121,0.0545;EUR_AF=0,0.332;SAS_AF=0,0.2076;VT=INDEL;MULTI_ALLELIC 4 47016909 rs111662489 GTATTTATT G 100 PASS AC=2944;AF=0.587859;AN=5008;NS=2504;DP=14760;EAS_AF=0.6617;AMR_AF=0.549;AFR_AF=0.6634;EUR_AF=0.4911;SAS_AF=0.5368;VT=INDEL 4 47016909 rs202176827 GTATTTATTTATT G 100 PASS AC=414;AF=0.0826677;AN=5008;NS=2504;DP=14760;EAS_AF=0.0883;AMR_AF=0.0764;AFR_AF=0.0204;EUR_AF=0.0805;SAS_AF=0.1677;VT=INDEL 4 47016909 rs558472223;rs557145274;rs553321222;rs202176827 GTATTTATTTATT GTATTTATTTATTTATT,GTATTTATT,GTATT,G 100 PASS AC=16,849,2937,405;AF=0.00319489,0.169529,0.586462,0.0808706;AN=5008;NS=2504;DP=14760;EAS_AF=0.001,0.0496,0.6607,0.0883;AMR_AF=0,0.2651,0.5461,0.0749;AFR_AF=0.0113,0.0545,0.6604,0.0189;EUR_AF=0,0.338,0.4911,0.0746;SAS_AF=0,0.2076,0.5368,0.1677;VT=INDEL;MULTI_ALLELIC ================================================ FILE: Sandbox/UnitTests/Resources/RefMinorAllele.vcf ================================================ 1 15255 rs541857151 G C 100 PASS AC=1;AF=0.000199681;AN=5008;NS=2504;DP=27519;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=g|||;VT=SNP 1 15260 rs561825427 C T 100 PASS AC=2;AF=0.000399361;AN=5008;NS=2504;DP=26100;EAS_AF=0;AMR_AF=0;AFR_AF=0.0015;EUR_AF=0;SAS_AF=0;AA=c|||;VT=SNP 1 15274 rs62636497 A G,T 100 PASS AC=1739,3230;AF=0.349244,0.640974;AN=5008;NS=2504;DP=23255;EAS_AF=0.4812,0.5188;AMR_AF=0.2752,0.7205;AFR_AF=0.323,0.6369;EUR_AF=0.2922,0.7078;SAS_AF=0.3497,0.6472;AA=g|||;VT=SNP;MULTI_ALLELIC 1 15418 rs564536632 G A 100 PASS AC=1;AF=0.000199681;AN=5008;NS=2504;DP=42394;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=g|||;VT=SNP 1 15585 rs533630043 G A 100 PASS AC=5;AF=0.998403;AN=5008;NS=2504;DP=29383;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0.0008;EUR_AF=0.002;SAS_AF=0.001;AA=g|||;VT=SNP ================================================ FILE: Sandbox/UnitTests/Resources/Test1000GFile.vcf ================================================ 1 10177 . A AC 100 PASS AC=2130;AF=0.425319;AN=5008;NS=2504;DP=103152;EAS_AF=0.3363;AMR_AF=0.3602;AFR_AF=0.4909;EUR_AF=0.4056;SAS_AF=0.4949;AA=|||unknown(NO_COVERAGE) 1 10235 . T TA 100 PASS AC=6;AF=0.00119808;AN=5008;NS=2504;DP=78015;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0;SAS_AF=0.0051;AA=|||unknown(NO_COVERAGE) 1 10352 rs145072688 T TA 100 PASS AC=2191;AF=0.4375;AN=5008;NS=2504;DP=88915;EAS_AF=0.4306;AMR_AF=0.4107;AFR_AF=0.4788;EUR_AF=0.4264;SAS_AF=0.4192;AA=|||unknown(NO_COVERAGE) 1 10505 . A T 100 PASS AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9632;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.||| 1 10506 . C G 100 PASS AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9676;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.||| 1 15274 rs201931625 A G,T 100 PASS AC=1739,3210;AF=0.347244,0.640974;AN=5008;NS=2504;DP=23255;EAS_AF=0.4812,0.5188;AMR_AF=0.2752,0.7205;AFR_AF=0.323,0.6369;EUR_AF=0.2922,0.7078;SAS_AF=0.3497,0.6472;AA=g||| ================================================ FILE: Sandbox/UnitTests/Resources/TestCosmicParser.Coding.vcf ================================================ ##fileformat=VCFv4.1 ##source=COSMICv71 ##reference=GRCh37 ##fileDate=20141104 ##comment="Missing nucleotide details indicate ambiguity during curation process" ##comment="URL stub for COSM ID field (use numeric portion of ID)='http://cancer.sanger.ac.uk/cosmic/mutation/overview?id='" ##comment="REF and ALT sequences are both forward strand ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO 17 7577520 COSM11929 AT GA . . GENE=TP53;STRAND=+;CDS=c.146A>C;AA=p.H49P;CNT=1 3 41266082 COSM27285 C T . . GENE=CTNNB1;STRAND=+;CDS=c.134A>C;AA=p.D45A;CNT=1 7 55242484 COSM29274 T C . . GENE=EGFR;STRAND=+;CDS=c.140A>C;AA=p.H47P;CNT=1 ================================================ FILE: Sandbox/UnitTests/Resources/TestCosmicParser.NonCoding.vcf ================================================ ##fileformat=VCFv4.1 ##source=COSMICv71 ##reference=GRCh37 ##fileDate=20141104 ##comment="Missing nucleotide details indicate ambiguity during curation process" ##comment="URL stub for COSM ID field (use numeric portion of ID)='http://cancer.sanger.ac.uk/cosmic/mutation/overview?id='" ##comment="REF and ALT sequences are both forward strand ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO 14 81610259 COSN26416 A G . . GENE=TSHR;STRAND=+;CDS=c.134A>C;AA=p.D45A;CNT=1 3 178936116 COSN27489 GT C . . GENE=PIK3CA;STRAND=+;CDS=c.146A>C;AA=p.H49P;CNT=1 3 178916648 COSN27496 G A . . GENE=PIK3CA;STRAND=+;CDS=c.255C>A;AA=p.I85I;CNT=1 4 178916648 COSN27497 G A . . GENE=PIK3CA;STRAND=+;CDS=c.255C>A;AA=p.I85I;CNT=1 ================================================ FILE: Sandbox/UnitTests/Resources/TestCosmicParser.tsv ================================================ Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype Primary histology Histology subtype Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity Mutation GRCh37 genome position Mutation GRCh37 strand SNP FATHMM prediction Mutation somatic status Pubmed_PMID ID_STUDY Sample source Tumour origin Age Comments TSHR ENST00000541158 2295 12373 1103576 1103576 1017828 thyroid NS adenoma-nodule-goitre NS n COSN26415 c.1856A>G p.D619G Substitution - Missense het 14:81610258-81610258 + n PASSENGER/OTHER Confirmed somatic variant . . 10595453 surgery fresh/frozen NS TSHR ENST00000541158 2295 12373 1136601 1136601 1049165 thyroid NS adenoma-nodule-goitre NS n COSN26416 c.1856A>G p.D619G Substitution - Missense 14:81610259-81610259 + n PASSENGER/OTHER Reported in another cancer sample as somatic . . 18694911 surgery - NOS NS TP53 ENST00000269305 1182 11998 G1205 1378050 1288091 haematopoietic_and_lymphoid_tissue lymph_node lymphoid_neoplasm MALT_lymphoma n COSM11929 c.760_761AT>GA p.I254D Substitution - Missense 17:7577520-7577521 - Variant of unknown origin . . 8541549 NS NS Grade:High grade CTNNB1 ENST00000349496 2346 2514 1127061 1127061 1039915 liver NS other hepatoblastoma n COSM27285 c.79C>T p.Q27* Substitution - Nonsense 3:41266082-41266082 + n Confirmed somatic variant . . 17962810 surgery-fixed NS 10.8 PIK3CA NM_006218.1 3207 8975 1747707 1747707 1652683 liver NS carcinoma hepatocellular_carcinoma n COSN27489 c.1658_1659GT>C p.S553fs*7 Complex - frameshift 3:178936116-178936117 + Confirmed somatic variant . . 22258409 surgery-fixed NS PIK3CA NM_006218.1 3207 8975 2023854 2023854 1906049 salivary_gland NS carcinoma myoepithelial_carcinoma n COSN27496 c.35G>A p.G12D Substitution - Missense 3:178916648-178916648 + n CANCER Reported in another cancer sample as somatic . . 23933559 surgery - NOS NS EGFR ENST00000275493 3633 3236 1188169 1188169 1100068 thyroid NS carcinoma papillary_carcinoma n COSM29274 c.2254T>C p.S752P Substitution - Missense het 7:55242484-55242484 + n CANCER Variant of unknown origin . . 19253367 surgery-fixed primary 66 Drug Response:Gefitinib clinical partial response,Grade:Some Grade data are given in publication,Metastatic site:brain,Metastatic site:lung,Metastatic site:lymph node,Stage:Some Stage data are given in publication ================================================ FILE: Sandbox/UnitTests/Resources/TestWigParser.wig ================================================ fixedStep chrom=chr3 start=400601 step=100 11 22 33 fixedStep chrom=chr3 start=400601 step=100 span=5 11 22 33 ================================================ FILE: Sandbox/UnitTests/Resources/mini.WigFix ================================================ fixedStep chrom=chr1 start=100 step=1 0.064 0.058 0.064 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.000 0.000 0.000 0.000 0.000 0.000 0.058 fixedStep chrom=chr1 start=175 step=1 0.064 0.058 0.064 0.058 0.064 0.058 0.058 -2.088 0.064 0.058 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 -2.363 0.064 0.064 0.064 0.064 0.000 0.064 0.064 0.058 0.064 0.064 -2.096 0.064 -2.039 0.064 0.064 0.064 0.064 0.064 -2.363 0.064 -2.381 0.064 0.064 0.064 -2.305 0.064 0.058 0.064 fixedStep chrom=chr1 start=250 step=1 0.058 0.064 0.000 0.064 0.058 -2.305 0.064 0.064 0.064 0.058 0.058 -2.096 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.058 0.064 0.058 0.058 -2.088 0.064 0.058 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 -2.363 0.064 0.064 0.064 0.064 0.000 0.064 0.064 0.058 0.064 0.064 -2.096 0.064 -2.039 0.064 0.064 0.064 0.064 0.064 -2.363 0.064 -2.381 0.064 0.064 0.064 -2.305 0.064 0.058 0.064 0.064 0.058 0.064 0.058 0.064 0.058 0.058 -2.088 0.064 0.058 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 -2.363 0.064 0.064 0.064 0.064 0.000 0.064 0.064 0.058 0.064 0.064 -2.096 0.064 -2.039 0.064 0.064 0.064 0.064 0.064 -2.363 0.064 -2.381 0.064 0.064 0.064 -2.305 0.064 0.058 0.064 0.064 0.058 0.064 0.058 0.064 0.058 0.058 -2.088 0.064 0.058 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 -2.363 0.064 0.064 0.064 0.064 0.000 0.064 0.064 0.058 0.064 0.064 -2.096 0.064 -2.039 0.064 0.064 0.064 0.064 0.064 -2.363 0.064 -2.381 0.064 0.064 0.064 -2.305 0.064 0.058 0.064 ================================================ FILE: Sandbox/UnitTests/Resources/missingLastVariantHgmd.vcf ================================================ ##fileformat=VCFv4.1 ##Copyright=HGMD. Not for redistribution. ##source=HGMD_PRO_2015.2 ##reference=hg19 ##comment="REF and ALT sequences are both on forward strand of reference assembly" ##IAE_TOP= ##IAE_INFO= ##IAE_INFO= ##IAE_INFO= ##IAE_INFO= ##IAE_INFO= ##IAE_INFO= ##IAE_INFO= ##IAE_INFO= ##IAE_INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO=T;PROT=NP_005092.1:p.Q55*;PHEN=Idiopathic_basal_ganglia_calcification;ACC=CM1411641 chr1 949696 . C CG . . CLASS=DM;MUT=ALT;GENE=ISG15;STRAND=+;DNA=NM_005101.3:c.339dupG;PHEN=Mycobacterial_disease_mendelian_susceptibility_to;ACC=CI128669 chr3 361508 . C T . . CLASS=DP;MUT=ALT;GENE=CHL1;STRAND=+;DNA=NM_006614.3:c.49C>T;PROT=NP_006605.2:p.L17F;DB=rs2272522;PHEN=Schizophrenia_association_with;ACC=CM023348 chr3 1269501 . G A . . CLASS=DM?;MUT=ALT;GENE=CNTN6;STRAND=+;DNA=NM_014461.3:c.183-1G>A;PHEN=Adenomatous_polyposis_coli;ACC=CS1410394 chr3 1363515 . TA T . . CLASS=DM?;MUT=ALT;GENE=CNTN6;STRAND=+;DNA=NM_014461.3:c.944delA;PHEN=Adenomatous_polyposis_coli;ACC=CD1410396 chr4 367647 . C T . . CLASS=DM;MUT=ALT;GENE=ZNF141;STRAND=+;DNA=NM_003441.2:c.1421C>T;PROT=NP_003432.1:p.T474I;PHEN=Postaxial_polydactyly_type_A;ACC=CM130005 chr4 437663 . C T . . CLASS=DM?;MUT=ALT;GENE=ZNF721;STRAND=-;DNA=NM_133474.3:c.593G>A;PROT=NP_597731.2:p.R198H;PHEN=Schizophrenia;ACC=CM142691 chr4 619535 . CCCGCC CGAGGACGGCCTGCGA . . CLASS=DM;MUT=ALT;GENE=PDE6B;STRAND=+;DNA=NM_000283.3:c.121_125delCCGCCinsGAGGACGGCCTGCGA;PHEN=Retinitis_pigmentosa_autosomal_recessive;ACC=CX148735 ================================================ FILE: Sandbox/UnitTests/Resources/testClinGenUnifier.txt ================================================ 9 chr1 757092 2394455 nssv1604129 0 . 757092 2394455 17 "ID,Name,Alias,parent,Dbxref,var_origin,Start_range,End_range,clinical_int,copy_number,remapScore,validated,sample_name,phenotype,Variant_seq,Reference_seq,var_type," "16087,nssv1604129,ISCA_INST_v5_2927,nsv869079,URL:www.ncbi.nlm.nih.gov/dbvar/variants/nsv869079%2CClinVar:SCV000178149,Not tested,.%2C757093,2394455%2C.,Pathogenic,1,1,Pass,Unknown,Developmental delay AND/OR other significant developmental or morphological phenotypes,-%2C.,~,copy_number_loss," 9 chr1 757092 2394455 nssv1495164 0 . 757092 2394455 17 "ID,Name,Alias,parent,Dbxref,var_origin,Start_range,End_range,clinical_int,copy_number,remapScore,validated,sample_name,phenotype,Variant_seq,Reference_seq,var_type," "12784,nssv1495164,ISCA_INST_8724,nsv869079,URL:www.ncbi.nlm.nih.gov/dbvar/variants/nsv869079%2CClinVar:SCV000178148,Maternal,.%2C757093,2394455%2C.,Uncertain significance,4,1,Pass,Unknown,Developmental delay AND/OR other significant developmental or morphological phenotypes,~%2C.,-,copy_number_gain," 9 chr1 779726 2558913 nssv582353 0 . 779726 2558913 18 "ID,Name,Alias,parent,Dbxref,var_origin,Start_range,End_range,clinical_int,copy_number,remapScore,validated,sample_name,phenotype,phenotype_id,Variant_seq,Reference_seq,var_type," "8885,nssv582353,ISCA_ret_INST_5468,nsv529358,URL:www.ncbi.nlm.nih.gov/dbvar/variants/nsv529358%2CClinVar:SCV000196301,Not tested,.%2C779727,2558913%2C.,Pathogenic,1,1,Pass,Unknown,Hypotelorism%2CMicrocephaly%2CShort stature,HP:0000252%2CHP:0000601%2CHP:0004322%2CMedGen:C0349588%2CMedGen:C1845868%2CMedGen:CN000563,-%2C.,~,copy_number_loss," 9 chr1 779726 2558913 nssv582220 0 . 779726 2558913 16 "ID,Name,Alias,parent,Dbxref,var_origin,Start_range,End_range,clinical_int,copy_number,remapScore,sample_name,phenotype,Variant_seq,Reference_seq,var_type," "8754,nssv582220,ISCA_ret_INST_5335,nsv529358,URL:www.ncbi.nlm.nih.gov/dbvar/variants/nsv529358%2CClinVar:SCV000196302,Not tested,.%2C779727,2558913%2C.,Pathogenic,1,1,Unknown,Developmental delay AND/OR other significant developmental or morphological phenotypes,-%2C.,~,copy_number_loss," 0 chr1 65410207 68057686 nssv1610460 0 . 65410207 68057686 17 ID,Name,Alias,parent,Dbxref,var_origin,Start_range,End_range,clinical_int,copy_number,validated,sample_name,phenotype,phenotype_id,Variant_seq,Reference_seq,var_type, 6399,nssv1610460,ISCA_INST_v6_4056,nsv932267,URL:www.ncbi.nlm.nih.gov/dbvar/variants/nsv932267%2CClinVar:SCV000181748,Not tested,.%2C65410208,68057686%2C.,Uncertain significance,1,Pass,Unknown,Intellectual disability%2CPanhypopituitarism%2CShort stature,HP:0000871%2CHP:0001249%2CHP:0004322%2CMedGen:C0349588%2CMedGen:C1843367%2CMedGen:CN000817,-%2C.,~,copy_number_loss, 26 chr1 145601945 146944906 nssv581879 0 . 145601945 146944906 16 "ID,Name,Alias,parent,Dbxref,var_origin,Start_range,End_range,clinical_int,copy_number,remapScore,sample_name,phenotype,Variant_seq,Reference_seq,var_type," "12076,nssv581879,ISCA_ret_INST_4990,nsv530955,URL:www.ncbi.nlm.nih.gov/dbvar/variants/nsv530955%2CClinVar:SCV000175616,Not tested,.%2C145601946,146944906%2C.,Benign,3,1.60363,Unknown,Developmental delay AND/OR other significant developmental or morphological phenotypes,~%2C.,-,copy_number_gain," 26 chr1 145601945 146944906 nssv584556 0 . 145601945 146944906 18 "ID,Name,Alias,parent,Dbxref,var_origin,Start_range,End_range,clinical_int,copy_number,remapScore,sample_name,phenotype,phenotype_id,gender,Variant_seq,Reference_seq,var_type," "14691,nssv584556,ISCA_INST_2924,nsv530955,URL:www.ncbi.nlm.nih.gov/dbvar/variants/nsv530955%2CClinVar:SCV000175617,Not tested,.%2C145601946,146944906%2C.,Uncertain significance,3,1.60363,ISCA_id_2774,Developmental delay AND/OR other significant developmental or morphological phenotypes%2CSpecific learning disability,HP:0001328%2CMedGen:CN001216,F,~%2C.,-,copy_number_gain," 26 chr1 146987840 148234205 nssv581879 0 . 146987840 148234205 16 "ID,Name,Alias,parent,Dbxref,var_origin,Start_range,End_range,clinical_int,copy_number,remapScore,sample_name,phenotype,Variant_seq,Reference_seq,var_type," "12077,nssv581879,ISCA_ret_INST_4990,nsv530955,URL:www.ncbi.nlm.nih.gov/dbvar/variants/nsv530955%2CClinVar:SCV000175616,Not tested,.%2C146987841,148234205%2C.,Benign,3,1.48829,Unknown,Developmental delay AND/OR other significant developmental or morphological phenotypes,~%2C.,-,copy_number_gain," 26 chr1 146987840 148234205 nssv584556 0 . 146987840 148234205 18 "ID,Name,Alias,parent,Dbxref,var_origin,Start_range,End_range,clinical_int,copy_number,remapScore,sample_name,phenotype,phenotype_id,gender,Variant_seq,Reference_seq,var_type," "14692,nssv584556,ISCA_INST_2924,nsv530955,URL:www.ncbi.nlm.nih.gov/dbvar/variants/nsv530955%2CClinVar:SCV000175617,Not tested,.%2C146987841,148234205%2C.,Uncertain significance,3,1.48829,ISCA_id_2774,Developmental delay AND/OR other significant developmental or morphological phenotypes%2CSpecific learning disability,HP:0001328%2CMedGen:CN001216,F,~%2C.,-,copy_number_gain," ================================================ FILE: Sandbox/UnitTests/Resources/tmpPopInfo.txt ================================================ Population Code Population Description Super Population Code Sequence Data Available Alignment Data Available Variant Data Available CHB Han Chinese in Bejing, China EAS 1 1 1 JPT Japanese in Tokyo, Japan EAS 1 1 1 CEU Utah Residents (CEPH) with Northern and Western Ancestry EUR 1 1 1 TSI Toscani in Italia EUR 1 1 1 FIN Finnish in Finland EUR 1 1 1 ================================================ FILE: Sandbox/UnitTests/Resources/tmpSampleInfo.txt ================================================ Sample Population Gender CHB0001 CHB male CHB0002 CHB male CHB0003 CHB female JPT0001 JPT female CEU0001 CEU male TSI0001 TSI male FIN0001 FIN male JPT0002 JPT female CEU0002 CEU female TSI0002 TSI female FIN0002 FIN female JPT0003 JPT male CEU0003 CEU male TSI0003 TSI male FIN0003 FIN male FIN0004 FIN male ================================================ FILE: Sandbox/UnitTests/UnitTests.csproj ================================================  net6.0 ..\bin\$(Configuration) ================================================ FILE: Sandbox/UnitTests/Utilities/ResourceUtilities.cs ================================================ using System.IO; using System.Reflection; namespace UnitTests.Utilities { public static class ResourceUtilities { /// /// given a resource filename, this method returns a stream corresponding to the file if /// it exists. Otherwise a file not found exception is thrown. /// // ReSharper disable once UnusedParameter.Global public static Stream GetResourceStream(string resourcePath, bool checkMissingFile = true) { var stream = Assembly.GetEntryAssembly().GetManifestResourceStream(resourcePath); if (checkMissingFile && stream == null) { throw new FileNotFoundException($"ERROR: The embedded resource file ({resourcePath}) was not found."); } return stream; } } } ================================================ FILE: Sandbox/UnitTests/Utilities/Resources.cs ================================================ using System; using System.IO; namespace UnitTests.Utilities { public static class Resources { public static readonly string Top; public static string TopPath(string path) => Path.Combine(Top, path); static Resources() { var solutionDir = GetParentDirectory(AppContext.BaseDirectory, 3); Top = Path.Combine(solutionDir, "UnitTests", "Resources"); } private static string GetParentDirectory(string directory, int numLevels) { for (int i = 0; i < numLevels; i++) directory = Path.GetDirectoryName(directory); return directory; } } } ================================================ FILE: SingleAnnotationLambda/CacheConfiguration.cs ================================================ using System; using Genome; namespace SingleAnnotationLambda { public sealed class CacheConfiguration : IEquatable { private readonly GenomeAssembly _genomeAssembly; private readonly string _supplementaryAnnotations; private readonly int _vepVersion; public CacheConfiguration(GenomeAssembly genomeAssembly, string supplementaryAnnotations, int vepVersion) { _genomeAssembly = genomeAssembly; _supplementaryAnnotations = supplementaryAnnotations?.ToLower(); _vepVersion = vepVersion; } public bool Equals(CacheConfiguration other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; return _genomeAssembly == other._genomeAssembly && string.Equals(_supplementaryAnnotations, other._supplementaryAnnotations) && _vepVersion == other._vepVersion; } public override int GetHashCode() { unchecked { var hashCode = (int) _genomeAssembly; if (_supplementaryAnnotations != null) hashCode = (hashCode * 397) ^ _supplementaryAnnotations.GetHashCode(); hashCode = (hashCode * 397) ^ _vepVersion; return hashCode; } } public override string ToString() { return $"genome assembly: {_genomeAssembly}, SA: {_supplementaryAnnotations}, VEP: {_vepVersion}"; } } } ================================================ FILE: SingleAnnotationLambda/CacheUtilities.cs ================================================ using System.Linq; using Cloud; using Genome; namespace SingleAnnotationLambda { public static class CacheUtilities { public const int DefaultVepVersion = 91; private static readonly int[] SupportedVepVersions = { 84, 91 }; public static bool IsVepVersionSupported(int vepVersion) => SupportedVepVersions.Any(supportedVepVersion => vepVersion == supportedVepVersion); public static string GetSupportedVersions() => string.Join(", ", SupportedVepVersions); private static string UrlCombine(string baseUrl, string relativeUrl) => baseUrl.TrimEnd('/') + '/' + relativeUrl.TrimStart('/'); public static string GetCachePathPrefix(int vepVersion, GenomeAssembly genomeAssembly) { string suffix = $"{genomeAssembly}/{LambdaUrlHelper.DefaultCacheSource}"; switch (vepVersion) { case 84: return UrlCombine($"{LambdaUrlHelper.GetBaseUrl() +LambdaUrlHelper.S3CacheFolderBase}/26/VEP84/", suffix); default: return UrlCombine($"{LambdaUrlHelper.GetCacheFolder()}", suffix); } } } } ================================================ FILE: SingleAnnotationLambda/ExceptionHandler.cs ================================================ using System; using System.IO; using Cloud.Notifications; using Cloud.Utilities; using ErrorHandling; using IO; namespace SingleAnnotationLambda { public static class ExceptionHandler { public static Stream GetStream(string id, string snsTopicArn, Exception e) { Logger.Log(e); GC.Collect(); string snsMessage = SNS.CreateMessage(e.Message, "exception", e.StackTrace); SNS.SendMessage(snsTopicArn, snsMessage); ErrorCategory errorCategory = ExceptionUtilities.ExceptionToErrorCategory(e); string message = GetMessage(errorCategory, e.Message); LogUtilities.LogObject("Result", message); return SingleResult.Create(id, message, null); } private static string GetMessage(ErrorCategory errorCategory, string exceptionMessage) { if (errorCategory == ErrorCategory.UserError) return "User error: " + FirstCharToLower(exceptionMessage); return "Nirvana error: an unexpected annotation error occurred while annotating this variant."; } private static string FirstCharToLower(string input) => string.IsNullOrEmpty(input) || char.IsLower(input[0]) ? input : char.ToLowerInvariant(input[0]) + input.Substring(1); } } ================================================ FILE: SingleAnnotationLambda/SingleAnnotationLambda.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Reflection; using System.Text; using Amazon.Lambda.Core; using Cloud; using Cloud.Messages.Single; using Cloud.Utilities; using CommandLine.Utilities; using ErrorHandling.Exceptions; using Genome; using IO; using Nirvana; using OptimizedCore; using VariantAnnotation; using VariantAnnotation.Interface; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Utilities; using JsonWriter = VariantAnnotation.IO.JsonWriter; [assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.Json.JsonSerializer))] namespace SingleAnnotationLambda { // ReSharper disable once UnusedMember.Global // ReSharper disable once UnusedType.Global public class SingleAnnotationLambda { private const int MaxNumCacheConfigurations = 2; private readonly Dictionary _cacheConfigurationToAnnotationResources = new Dictionary(); private readonly LinkedList _recentCacheConfigurations = new LinkedList(); // ReSharper disable once UnusedMember.Global public Stream Run(SingleConfig config, ILambdaContext context) { string snsTopicArn = null; Stream response; try { LogUtilities.UpdateLogger(context.Logger, null); LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion); LogUtilities.LogObject("Config", config); LogUtilities.Log(new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey }); LambdaUtilities.GarbageCollect(); snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey); config.Validate(); GenomeAssembly genomeAssembly = GenomeAssemblyHelper.Convert(config.genomeAssembly); var cacheConfiguration = new CacheConfiguration(genomeAssembly, config.supplementaryAnnotations, config.vepVersion); bool preloadRequired = !string.IsNullOrEmpty(config.supplementaryAnnotations); AnnotationResources annotationResources = GetAndCacheAnnotationResources(config, cacheConfiguration); if (genomeAssembly!=GenomeAssembly.hg19) annotationResources.Annotator.EnableMitochondrialAnnotation(); (IPosition position, string[] sampleNames) = config.GetPositionAndSampleNames(annotationResources.SequenceProvider, annotationResources.RefMinorProvider); if (position.Chromosome.IsEmpty()) throw new UserErrorException($"An unknown chromosome was specified ({config.variant.chromosome})"); string annotationResult = GetPositionAnnotation(position, annotationResources, sampleNames, preloadRequired); response = SingleResult.Create(config.id, LambdaUrlHelper.SuccessMessage, annotationResult); } catch (Exception exception) { response = ExceptionHandler.GetStream(config.id, snsTopicArn, exception); } return response; } private AnnotationResources GetAndCacheAnnotationResources(SingleConfig input, CacheConfiguration cacheConfiguration) { if (_cacheConfigurationToAnnotationResources.TryGetValue(cacheConfiguration, out AnnotationResources annotationResources)) { if (!_recentCacheConfigurations.Last.Value.Equals(cacheConfiguration)) { _recentCacheConfigurations.Remove(cacheConfiguration); _recentCacheConfigurations.AddLast(cacheConfiguration); Logger.WriteLine($"Cached configurations: {string.Join("; ", _recentCacheConfigurations)}"); } return annotationResources; } if (_recentCacheConfigurations.Count == MaxNumCacheConfigurations) { CacheConfiguration configurationToRemove = _recentCacheConfigurations.First.Value; _recentCacheConfigurations.RemoveFirst(); _cacheConfigurationToAnnotationResources.Remove(configurationToRemove); GC.Collect(); GC.WaitForPendingFinalizers(); } Logger.WriteLine($"Creating annotation resources for {cacheConfiguration}"); annotationResources = GetAnnotationResources(input); _cacheConfigurationToAnnotationResources[cacheConfiguration] = annotationResources; _recentCacheConfigurations.AddLast(cacheConfiguration); Logger.WriteLine($"Cached configurations: {string.Join("; ", _recentCacheConfigurations)}"); return annotationResources; } private static AnnotationResources GetAnnotationResources(SingleConfig lambdaConfig) { GenomeAssembly genomeAssembly = GenomeAssemblyHelper.Convert(lambdaConfig.genomeAssembly); string cachePathPrefix = CacheUtilities.GetCachePathPrefix(lambdaConfig.vepVersion, genomeAssembly); string nirvanaS3Ref = LambdaUrlHelper.GetRefUrl(genomeAssembly); string annotatorVersion = "Nirvana " + CommandLineUtilities.GetVersion(Assembly.GetAssembly(typeof(SingleAnnotationLambda))); var metrics = new PerformanceMetrics(); Logger.WriteLine($"Cache prefix: {cachePathPrefix}"); //todo: get customStrTsv from lambdaConfig var annotationResources = new AnnotationResources(nirvanaS3Ref, cachePathPrefix, null, lambdaConfig.customAnnotations, null, false, false, metrics) { AnnotatorVersionTag = annotatorVersion }; return annotationResources; } private static string GetPositionAnnotation(IPosition position, IAnnotationResources resources, string[] sampleNames, bool preloadRequired) { if (preloadRequired) resources.SingleVariantPreLoad(position); IAnnotatedPosition annotatedPosition = resources.Annotator.Annotate(position); var sb = annotatedPosition?.GetJsonStringBuilder(); if (sb == null) throw new UserErrorException("No variant is provided for annotation"); string json = StringBuilderPool.GetStringAndReturn(sb); if (json == null) throw new UserErrorException("No variant is provided for annotation"); var outputJsonStream = new MemoryStream(); using (var jsonWriter = new JsonWriter(outputJsonStream, null, resources, Date.CurrentTimeStamp, sampleNames, true)) { WriteAnnotatedPosition(annotatedPosition, jsonWriter, json); jsonWriter.WriteGenes(resources.Annotator.GetGeneAnnotations()); } outputJsonStream.Position = 0; return Encoding.UTF8.GetString(outputJsonStream.ToArray()); } private static void WriteAnnotatedPosition(IAnnotatedPosition annotatedPosition, IJsonWriter jsonWriter, string jsonOutput) => jsonWriter.WritePosition(annotatedPosition.Position, jsonOutput); } } ================================================ FILE: SingleAnnotationLambda/SingleAnnotationLambda.csproj ================================================  net6.0 true Lambda bin\$(Configuration) ================================================ FILE: SingleAnnotationLambda/SingleConfigExtensions.cs ================================================ using System.IO; using Cloud.Messages.Single; using ErrorHandling.Exceptions; using Genome; using MitoHeteroplasmy; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using Vcf; using Vcf.VariantCreator; namespace SingleAnnotationLambda { public static class SingleConfigExtensions { public static void Validate(this SingleConfig config) { if (string.IsNullOrEmpty(config.id)) throw new UserErrorException("Please specify the id."); if (string.IsNullOrEmpty(config.genomeAssembly)) throw new UserErrorException("Please specify the genome assembly."); if (config.variant == null) throw new UserErrorException("Please specify the variant (chromosome, position, reference allele, and alt alleles)."); config.ValidateSupplementaryAnnotations(); config.ValidateVepVersion(); config.variant?.Validate(); } private static void ValidateSupplementaryAnnotations(this SingleConfig config) { if (string.IsNullOrEmpty(config.supplementaryAnnotations)) return; if (SupplementaryAnnotationUtilities.IsValueSupported(config.supplementaryAnnotations)) return; throw new UserErrorException($"An invalid supplementary annotation value ({config.supplementaryAnnotations}) was specified. Please choose one of the following values: {SupplementaryAnnotationUtilities.GetSupportedValues()}"); } private static void ValidateVepVersion(this SingleConfig config) { if (config.vepVersion == 0) config.vepVersion = CacheUtilities.DefaultVepVersion; if (CacheUtilities.IsVepVersionSupported(config.vepVersion)) return; throw new UserErrorException($"An invalid VEP version ({config.vepVersion}) was specified. Please choose one of the following versions: {CacheUtilities.GetSupportedVersions()}"); } public static (IPosition, string[]) GetPositionAndSampleNames(this SingleConfig config, ISequenceProvider sequenceProvider, IRefMinorProvider refMinorProvider) => (ToPosition(config.variant.GetVcfFields(), sequenceProvider, refMinorProvider), config.variant.sampleNames); private static IPosition ToPosition(string[] vcfFields, ISequenceProvider sequenceProvider, IRefMinorProvider refMinorProvider) { Chromosome chromosome = ReferenceNameUtilities.GetChromosome(sequenceProvider.RefNameToChromosome, vcfFields[VcfCommon.ChromIndex]); sequenceProvider.LoadChromosome(chromosome); (int start, bool foundError) = vcfFields[VcfCommon.PosIndex].OptimizedParseInt32(); if (foundError) throw new InvalidDataException($"Unable to convert the VCF position to an integer: {vcfFields[VcfCommon.PosIndex]}"); SimplePosition simplePosition = SimplePosition.GetSimplePosition(chromosome, start, vcfFields, new NullVcfFilter()); var variantFactory = new VariantFactory(sequenceProvider.Sequence, new VariantId()); var mitoHeteroplasmyProvider = new MitoHeteroplasmyProvider(); return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory); } } } ================================================ FILE: SingleAnnotationLambda/SingleResult.cs ================================================ using System.Data; using System.IO; using System.Text; using Cloud; using Cloud.Utilities; using Newtonsoft.Json; namespace SingleAnnotationLambda { public static class SingleResult { private const string OutputBeforeNirvanaJson = ",\"annotation\":"; private const string OutputEnd = "}"; public static Stream Create(string id, string status, string nirvanaJson) { string statusJson = JsonConvert.SerializeObject(status); string outputStart = $"{{\"id\":\"{id}\",\"status\":{statusJson}"; string output; if (status == LambdaUrlHelper.SuccessMessage) { if (nirvanaJson == null) throw new NoNullAllowedException("Nirvana annotation cannot be null when the job is successful."); output = outputStart + OutputBeforeNirvanaJson + nirvanaJson + OutputEnd; } else { output = outputStart + OutputEnd; } LogUtilities.LogObject("Result", output); var outputStream = new MemoryStream(Encoding.UTF8.GetBytes(output)); return outputStream; } } } ================================================ FILE: SingleAnnotationLambda/SupplementaryAnnotationUtilities.cs ================================================ using System.Linq; namespace SingleAnnotationLambda { public static class SupplementaryAnnotationUtilities { private static readonly string[] SupportedValues = { "latest", "release" }; public static bool IsValueSupported(string supplementaryAnnotations) { string sa = supplementaryAnnotations?.ToLower(); return SupportedValues.Any(supportedValue => sa == supportedValue); } public static string GetSupportedValues() => string.Join(", ", SupportedValues); } } ================================================ FILE: Tabix/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: Tabix/BgzfBlockVcfReader.cs ================================================ using System.IO; using Compression.FileHandling; using Genome; using OptimizedCore; namespace Tabix { public static class BgzfBlockVcfReader { public static bool FindVariantsInBlocks(Stream stream, long beginOffset, long endOffset, BgzfBlock block, Chromosome chromosome, int start, int end) { stream.Position = beginOffset; while (stream.Position <= endOffset) { string blockString = block.Read(stream); if (HasVcfPositionsOnInterval(blockString, chromosome, start, end)) return true; } return false; } internal static bool HasVcfPositionsOnInterval(string s, Chromosome chromosome, int start, int end) { string[] rawLines = s.OptimizedSplit('\n'); foreach (string line in rawLines) { string[] cols = line.Split('\t', 3); if (cols.Length < 2) continue; string chromosomeName = cols[0]; string positionString = cols[1]; if (chromosomeName != chromosome.EnsemblName && chromosomeName != chromosome.UcscName) continue; if (!int.TryParse(positionString, out int position)) continue; if (position > end) break; if (position >= start && position <= end) return true; } return false; } } } ================================================ FILE: Tabix/BinUtilities.cs ================================================ using System.Collections.Generic; namespace Tabix { internal static class BinUtilities { internal static int FirstBin(int bin) => ((1 << ((bin << 1) + bin)) - 1) / 7; internal static int ParentBin(int bin) => (bin - 1) >> 3; internal static int BottomBin(int bin) { var level = 0; for (int b = bin; b != 0; b = ParentBin(b)) level++; return (bin - FirstBin(level)) << (Constants.NumLevels - level) * 3; } /// /// assumes begin is 0-based /// internal static int ConvertPositionToBin(int begin) => 4681 + (begin >> Constants.MinShift); internal static IEnumerable OverlappingBinsWithVariants(int begin, int end, Dictionary idToChunks) { var overlappingBins = new List(); if (begin >= end) return overlappingBins; int shift = Constants.InitialShift; if (end >= Constants.MaxReferenceLength) end = Constants.MaxReferenceLength; var level = 0; var levelStartBin = 0; for (--end; level <= Constants.NumLevels; shift -= 3, levelStartBin += 1 << ((level << 1) + level), level++) { int beginBin = levelStartBin + (begin >> shift); int endBin = levelStartBin + (end >> shift); for (int bin = beginBin; bin <= endBin; bin++) { if (idToChunks.ContainsKey(bin)) overlappingBins.Add(bin); } } return overlappingBins; } } } ================================================ FILE: Tabix/Constants.cs ================================================ namespace Tabix { internal static class Constants { internal const int TabixMagic = 21578324; internal const int MinShift = 14; internal const int NumLevels = 5; // ReSharper disable once UnusedMember.Global internal const int VcfFormat = 2; internal const int InitialShift = 29; internal const int MaxReferenceLength = 536_870_912; } } ================================================ FILE: Tabix/Index.cs ================================================ using System.Collections.Generic; namespace Tabix { public sealed class Index { // ReSharper disable NotAccessedField.Global // ReSharper disable MemberCanBePrivate.Global public readonly int Format; public readonly int SequenceNameIndex; public readonly int BeginIndex; public readonly int EndIndex; public readonly char CommentChar; public readonly int NumLinesToSkip; // ReSharper restore MemberCanBePrivate.Global // ReSharper restore NotAccessedField.Global public readonly ReferenceIndex[] ReferenceSequences; internal readonly Dictionary RefNameToTabixIndex; public Index(int format, int sequenceNameIndex, int beginIndex, int endIndex, char commentChar, int numLinesToSkip, ReferenceIndex[] referenceSequences, Dictionary refNameToTabixIndex) { Format = format; SequenceNameIndex = sequenceNameIndex; BeginIndex = beginIndex; EndIndex = endIndex; CommentChar = commentChar; NumLinesToSkip = numLinesToSkip; ReferenceSequences = referenceSequences; RefNameToTabixIndex = refNameToTabixIndex; } } } ================================================ FILE: Tabix/Interval.cs ================================================ namespace Tabix { public struct Interval { public readonly ulong Begin; public readonly ulong End; public Interval(ulong begin, ulong end) { Begin = begin; End = end; } } } ================================================ FILE: Tabix/Reader.cs ================================================ using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Text; using Compression.FileHandling; using Genome; namespace Tabix { public static class Reader { // ReSharper disable once MemberCanBePrivate.Global public static Index Read(BinaryReader reader, Dictionary refNameToChromosome) { int magic = reader.ReadInt32(); if (magic != Constants.TabixMagic) throw new InvalidDataException("This does not seem to be a tabix file. Did you use a GZipStream?"); int numReferenceSequences = reader.ReadInt32(); int format = reader.ReadInt32(); int sequenceNameIndex = reader.ReadInt32() - 1; int sequenceBeginIndex = reader.ReadInt32() - 1; int sequenceEndIndex = reader.ReadInt32() - 1; var commentChar = (char)reader.ReadInt32(); int numLinesToSkip = reader.ReadInt32(); int concatenatedSequenceNameLen = reader.ReadInt32(); byte[] concatenatedNames = reader.ReadBytes(concatenatedSequenceNameLen); string[] referenceSequenceNames = GetReferenceSequenceNames(concatenatedNames, numReferenceSequences); var referenceSequences = new ReferenceIndex[numReferenceSequences]; var refNameToTabixIndex = new Dictionary(numReferenceSequences); for (ushort i = 0; i < numReferenceSequences; i++) { string chromosomeName = referenceSequenceNames[i]; var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, chromosomeName); referenceSequences[i] = ReadReferenceSequence(reader, chromosome); refNameToTabixIndex[chromosome.UcscName] = i; refNameToTabixIndex[chromosome.EnsemblName] = i; } return new Index(format, sequenceNameIndex, sequenceBeginIndex, sequenceEndIndex, commentChar, numLinesToSkip, referenceSequences, refNameToTabixIndex); } public static Index GetTabixIndex(Stream tabixStream, Dictionary refNameToChromosome) { using (var binaryReader = new BinaryReader(new BlockGZipStream(tabixStream, CompressionMode.Decompress))) { return Read(binaryReader, refNameToChromosome); } } private static string[] GetReferenceSequenceNames(byte[] concatenatedBytes, int numRefSeqs) { var refSeqNames = new string[numRefSeqs]; IEnumerable nullIndexes = GetNullIndexes(concatenatedBytes, numRefSeqs); var startIndex = 0; var index = 0; foreach (int nullIndex in nullIndexes) { refSeqNames[index++] = Encoding.ASCII.GetString(concatenatedBytes, startIndex, nullIndex - startIndex); startIndex = nullIndex + 1; } return refSeqNames; } private static IEnumerable GetNullIndexes(IReadOnlyList bytes, int numRefSeqs) { var nullPositions = new int[numRefSeqs]; var index = 0; for (var pos = 0; pos < bytes.Count; pos++) if (bytes[pos] == 0) nullPositions[index++] = pos; return nullPositions; } private static ReferenceIndex ReadReferenceSequence(BinaryReader reader, Chromosome chromosome) { int numBins = reader.ReadInt32(); var idToChunks = new Dictionary(); for (var i = 0; i < numBins; i++) { (int id, Interval[] chunks) = ReadBin(reader); idToChunks[id] = chunks; } int numLinearFileOffsets = reader.ReadInt32(); var linearFileOffsets = new ulong[numLinearFileOffsets]; int firstNonZero = -1; for (var i = 0; i < numLinearFileOffsets; i++) { linearFileOffsets[i] = reader.ReadUInt64(); if (firstNonZero == -1 && linearFileOffsets[i] != 0) firstNonZero = i; } for (var i = 0; i < firstNonZero; i++) linearFileOffsets[i] = linearFileOffsets[firstNonZero]; return new ReferenceIndex(chromosome, idToChunks, linearFileOffsets); } private static (int Id, Interval[] Chunks) ReadBin(BinaryReader reader) { int id = reader.ReadInt32(); int numChunks = reader.ReadInt32(); var chunks = new Interval[numChunks]; for (var i = 0; i < numChunks; i++) chunks[i] = ReadChunk(reader); return (id, chunks); } private static Interval ReadChunk(BinaryReader reader) { ulong begin = reader.ReadUInt64(); ulong end = reader.ReadUInt64(); return new Interval(begin, end); } } } ================================================ FILE: Tabix/ReferenceIndex.cs ================================================ using System.Collections.Generic; using Genome; namespace Tabix { public sealed class ReferenceIndex { public readonly Chromosome Chromosome; public readonly Dictionary IdToChunks; // for each 16 kbp interval public readonly ulong[] LinearFileOffsets; public ReferenceIndex(Chromosome chromosome, Dictionary idToChunks, ulong[] linearFileOffsets) { Chromosome = chromosome; IdToChunks = idToChunks; LinearFileOffsets = linearFileOffsets; } } } ================================================ FILE: Tabix/Search.cs ================================================ using System.Collections.Generic; using System.IO; using Compression.FileHandling; using Genome; namespace Tabix { // ReSharper disable once UnusedMember.Global public sealed class Search { private readonly Index _index; private readonly Stream _vcfStream; public Search(Index index, Stream vcfStream) { _index = index; _vcfStream = vcfStream; } // ReSharper disable once UnusedMember.Global public bool HasVariants(string chromosomeName, int begin, int end) { var refSeq = _index.GetTabixReferenceSequence(chromosomeName); if (refSeq == null) return false; int adjBegin = SearchUtilities.AdjustBegin(begin); IEnumerable bins = BinUtilities.OverlappingBinsWithVariants(adjBegin, end, refSeq.IdToChunks); var block = new BgzfBlock(); foreach (int bin in bins) { refSeq.IdToChunks.TryGetValue(bin, out Interval[] chunks); if (HasVariantsInBin(refSeq.Chromosome, begin, end, block, chunks)) return true; } return false; } private bool HasVariantsInBin(Chromosome chromosome, int begin, int end, BgzfBlock block, Interval[] intervals) { (long minVirtualOffset, long maxVirtualOffset) = SearchUtilities.GetMinMaxVirtualFileOffset(intervals); long minOffset = VirtualPosition.From(minVirtualOffset).FileOffset; long maxOffset = VirtualPosition.From(maxVirtualOffset).FileOffset; return BgzfBlockVcfReader.FindVariantsInBlocks(_vcfStream, minOffset, maxOffset, block, chromosome, begin, end); } } } ================================================ FILE: Tabix/SearchUtilities.cs ================================================ namespace Tabix { public static class SearchUtilities { // ReSharper disable once UnusedMember.Global public static long GetOffset(this Index index, string chromosomeName, int begin) { var refSeq = index.GetTabixReferenceSequence(chromosomeName); if (refSeq == null) return -1; // N.B. tabix assumes begin is 0-based and end is 1-based int end = begin; begin = AdjustBegin(begin); if (begin == 0) return refSeq.LinearFileOffsets.FirstNonZeroValue(); ulong minOffset = GetMinOffset(refSeq, begin); ulong maxOffset = GetMaxOffset(refSeq, end); int bin = BinUtilities.ConvertPositionToBin(begin); if (refSeq.IdToChunks.TryGetValue(bin, out Interval[] chunks)) return GetMinOverlapOffset(chunks, minOffset, maxOffset); int linearIndex = begin >> Constants.MinShift; if (linearIndex >= refSeq.LinearFileOffsets.Length) return -1; return (long)refSeq.LinearFileOffsets[linearIndex]; } internal static int AdjustBegin(int begin) { // N.B. tabix assumes begin is 0-based and end is 1-based begin--; if (begin < 0) begin = 0; return begin; } // ReSharper disable once ParameterTypeCanBeEnumerable.Global internal static long FirstNonZeroValue(this ulong[] offsets) { foreach (ulong offset in offsets) { if (offset == 0) continue; return (long)offset; } return -1; } internal static ReferenceIndex GetTabixReferenceSequence(this Index index, string chromosomeName) { if (string.IsNullOrEmpty(chromosomeName)) return null; return !index.RefNameToTabixIndex.TryGetValue(chromosomeName, out ushort tabixIndex) ? null : index.ReferenceSequences[tabixIndex]; } internal static long GetMinOverlapOffset(Interval[] chunks, ulong minOffset, ulong maxOffset) { if (chunks == null) return 0; var minOverlapOffset = ulong.MaxValue; // ReSharper disable once LoopCanBeConvertedToQuery foreach (var chunk in chunks) { if (chunk.End > minOffset && chunk.Begin < maxOffset && chunk.Begin < minOverlapOffset) minOverlapOffset = chunk.Begin; } return (long)minOverlapOffset; } internal static ulong GetMinOffset(ReferenceIndex refSeq, int begin) { int bin = BinUtilities.FirstBin(Constants.NumLevels) + (begin >> Constants.MinShift); do { if (refSeq.IdToChunks.ContainsKey(bin)) break; int firstBin = (BinUtilities.ParentBin(bin) << 3) + 1; if (bin > firstBin) bin--; else bin = BinUtilities.ParentBin(bin); } while (bin != 0); int bottomBin = BinUtilities.BottomBin(bin); return refSeq.LinearFileOffsets[bottomBin]; } internal static ulong GetMaxOffset(ReferenceIndex refSeq, int end) { int bin = BinUtilities.FirstBin(Constants.NumLevels) + ((end - 1) >> Constants.MinShift) + 1; while (true) { while (bin % 8 == 1) bin = BinUtilities.ParentBin(bin); if (bin == 0) return ulong.MaxValue; if (refSeq.IdToChunks.TryGetValue(bin, out Interval[] chunks) && chunks.Length > 0) return chunks[0].Begin; bin++; } } internal static (long MinOffset, long MaxOffset) GetMinMaxVirtualFileOffset(Interval[] intervals) { int numIntervals = intervals.Length; var minBegin = (long)intervals[0].Begin; var minEnd = (long)intervals[0].End; for (var i = 1; i < numIntervals; i++) { var interval = intervals[i]; var begin = (long)interval.Begin; var end = (long)interval.End; if (begin < minBegin) minBegin = begin; if (end > minEnd) minEnd = end; } return (minBegin, minEnd); } } } ================================================ FILE: Tabix/Tabix.csproj ================================================  net6.0 ..\bin\$(Configuration) ================================================ FILE: Tabix/VirtualPosition.cs ================================================ namespace Tabix { public static class VirtualPosition { public static (long FileOffset, int BlockOffset) From(long virtualPosition) { unchecked { return ((virtualPosition >> 16) & 0xFFFFFFFFFFFFL, (int)(virtualPosition & 0xffff)); } } // ReSharper disable once UnusedMember.Global public static long To(long fileOffset, int blockOffset) => (fileOffset << 16) | ((long)blockOffset & 0xffff); } } ================================================ FILE: UnitTests/AnnotationLambda/AnnotationLambdaTests.cs ================================================ using System.IO; using Cloud.Messages.Annotation; using Compression.Utilities; using IO; using Tabix; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.AnnotationLambda { public sealed class AnnotationLambdaTests { [Fact] public void GetTabixVirtualPosition_AsExpected() { var annotationConfig = new AnnotationConfig { vcfUrl = "anywhere/input.vcf.gz", tabixUrl = Resources.TopPath("Mother_chr22.genome.vcf.gz.tbi"), annotationRange = new AnnotationRange(new AnnotationPosition("chr22", 20_000_000), new AnnotationPosition("chr22", 30_000_000)) }; var tabixStream = FileUtilities.GetReadStream(annotationConfig.tabixUrl); var indexReader = new BinaryReader(GZipUtilities.GetAppropriateReadStream(annotationConfig.tabixUrl)); var expectedPosition = Reader.Read(indexReader, ChromosomeUtilities.RefNameToChromosome).GetOffset("chr22", annotationConfig.annotationRange.Start.Position); var virtualPosition = global::AnnotationLambda.AnnotationLambda.GetTabixVirtualPosition(annotationConfig.annotationRange, tabixStream, ChromosomeUtilities.RefNameToChromosome); Assert.Equal(expectedPosition, virtualPosition); } [Fact] public void GetTabixVirtualPosition_ReturnZeroWhenNoRangeSpecified() { Assert.Equal(0, global::AnnotationLambda.AnnotationLambda.GetTabixVirtualPosition(null, null, null)); } } } ================================================ FILE: UnitTests/AnnotationLambda/S3UtilitiesTests.cs ================================================ using AnnotationLambda; using Xunit; namespace UnitTests.AnnotationLambda { public sealed class S3UtilitiesTests { [Theory] [InlineData("/Test/", "bob", "Test/bob")] [InlineData("Test/", "bob", "Test/bob")] [InlineData("/Test", "bob", "Test/bob")] [InlineData("Test", "bob", "Test/bob")] [InlineData("", "bob", "bob")] [InlineData(null, "bob", "bob")] [InlineData("/", "bob", "bob")] public void GetKey_Theory(string outputDir, string filename, string expectedResult) { var observedResult = S3Utilities.GetKey(outputDir, filename); Assert.Equal(expectedResult, observedResult); } } } ================================================ FILE: UnitTests/CacheUtils/DataDumperImport/DataStructures/Import/ImportNodeExtensionsTests.cs ================================================ using System.IO; using CacheUtils.DataDumperImport.DataStructures.Import; using Xunit; namespace UnitTests.CacheUtils.DataDumperImport.DataStructures.Import { public sealed class ImportNodeExtensionsTests { [Fact] public void GetInt32_Nominal() { var node = new StringKeyValueNode("bob", "123"); var observedResult = node.GetInt32(); Assert.Equal(123, observedResult); } [Fact] public void GetInt32_ReturnMinusOne_WhenNull() { var node = new StringKeyValueNode("bob", null); var observedResult = node.GetInt32(); Assert.Equal(-1, observedResult); } [Fact] public void GetInt32_ThrowException_When_NotNumber() { var node = new StringKeyValueNode("bob", "123N"); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var observedResult = node.GetInt32(); }); } [Fact] public void GetString_ThrowException_When_NotCorrectType() { var node = new ObjectKeyValueNode("bob", null); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var observedResult = node.GetString(); }); } [Fact] public void GetString_ReturnNull_IfEmptyOrMinus() { var node = new StringKeyValueNode("bob", "-"); var observedResult = node.GetString(); Assert.Null(observedResult); node = new StringKeyValueNode("bob", ""); observedResult = node.GetString(); Assert.Null(observedResult); } [Fact] public void GetBool_ReturnTrue() { var node = new StringKeyValueNode("bob", "1"); var observedResult = node.GetBool(); Assert.True(observedResult); } [Fact] public void GetBool_ReturnFalse() { var node = new StringKeyValueNode("bob", "0"); var observedResult = node.GetBool(); Assert.False(observedResult); } [Fact] public void IsUndefined_ReturnTrue() { var node = new StringKeyValueNode("bob", null); var observedResult = node.IsUndefined(); Assert.True(observedResult); } [Fact] public void IsUndefined_ReturnFalse() { var node = new StringKeyValueNode("bob", "test"); var observedResult = node.IsUndefined(); Assert.False(observedResult); } [Fact] public void IsUndefined_ReturnFalse_IncorrectType() { var node = new ObjectKeyValueNode("bob", null); var observedResult = node.IsUndefined(); Assert.False(observedResult); } } } ================================================ FILE: UnitTests/CacheUtils/DataDumperImport/FauxRegex/RegexDecisionTreeTests.cs ================================================ using System; using CacheUtils.DataDumperImport.FauxRegex; using CacheUtils.DataDumperImport.IO; using Xunit; namespace UnitTests.CacheUtils.DataDumperImport.FauxRegex { public sealed class RegexDecisionTreeTests { [Fact] public void GetEntryType_RootObjectKeyValue() { var results = RegexDecisionTree.GetEntryType("$VAR1 = {"); Assert.Equal(EntryType.RootObjectKeyValue, results.Type); Assert.Equal("$VAR1", results.Key); Assert.Null(results.Value); } [Fact] public void GetEntryType_ListObjectKeyValue() { var results = RegexDecisionTree.GetEntryType(" '1' => ["); Assert.Equal(EntryType.ListObjectKeyValue, results.Type); Assert.Equal("1", results.Key); Assert.Null(results.Value); } [Fact] public void GetEntryType_OpenBraces() { var results = RegexDecisionTree.GetEntryType(" bless( {"); Assert.Equal(EntryType.OpenBraces, results.Type); Assert.Null(results.Key); Assert.Null(results.Value); } [Fact] public void GetEntryType_StringKeyValue() { var results = RegexDecisionTree.GetEntryType(" '_ccds' => 'CCDS44137.1',"); Assert.Equal(EntryType.StringKeyValue, results.Type); Assert.Equal("_ccds", results.Key); Assert.Equal("CCDS44137.1", results.Value); } [Fact] public void GetEntryType_DigitKeyValue() { var results = RegexDecisionTree.GetEntryType(" 'phase' => -1,"); Assert.Equal(EntryType.DigitKeyValue, results.Type); Assert.Equal("phase", results.Key); Assert.Equal("-1", results.Value); } [Fact] public void GetEntryType_EndBracesWithDataType() { var results = RegexDecisionTree.GetEntryType(" }, 'Bio::EnsEMBL::Exon' ),"); Assert.Equal(EntryType.EndBracesWithDataType, results.Type); Assert.Equal("Bio::EnsEMBL::Exon", results.Key); Assert.Null(results.Value); } [Fact] public void GetEntryType_EndBraces() { var results = RegexDecisionTree.GetEntryType(" },"); Assert.Equal(EntryType.EndBraces, results.Type); Assert.Null(results.Key); Assert.Null(results.Value); } [Fact] public void GetEntryType_ObjectKeyValue() { var results = RegexDecisionTree.GetEntryType(" 'next' => bless( {"); Assert.Equal(EntryType.ObjectKeyValue, results.Type); Assert.Equal("next", results.Key); Assert.Null(results.Value); } [Fact] public void GetEntryType_UndefKeyValue() { var results = RegexDecisionTree.GetEntryType(" 'adaptor' => undef,"); Assert.Equal(EntryType.UndefKeyValue, results.Type); Assert.Equal("adaptor", results.Key); Assert.Null(results.Value); } [Fact] public void GetEntryType_EmptyListKeyValue() { var results = RegexDecisionTree.GetEntryType(" 'seq_edits' => [],"); Assert.Equal(EntryType.EmptyListKeyValue, results.Type); Assert.Equal("seq_edits", results.Key); Assert.Null(results.Value); } [Fact] public void GetEntryType_EmptyValueKeyValue() { var results = RegexDecisionTree.GetEntryType(" 'cell_types' => {},"); Assert.Equal(EntryType.EmptyValueKeyValue, results.Type); Assert.Equal("cell_types", results.Key); Assert.Null(results.Value); } [Fact] public void GetEntryType_ReferenceStringKeyValue() { var results = RegexDecisionTree.GetEntryType(" 'transcript' => $VAR1->{'22'}[0],"); Assert.Equal(EntryType.ReferenceStringKeyValue, results.Type); Assert.Equal("transcript", results.Key); Assert.Equal("$VAR1->{'22'}[0]", results.Value); } [Fact] public void GetEntryType_DigitKey() { var results = RegexDecisionTree.GetEntryType(" 0,"); Assert.Equal(EntryType.DigitKey, results.Type); Assert.Equal("0", results.Key); Assert.Null(results.Value); } [Theory] [InlineData("'next' => bless( [")] [InlineData("A.B,")] [InlineData("$VAR1 = [")] public void GetEntryType_ThrowsNotImplementedException(string s) { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var results = RegexDecisionTree.GetEntryType(s); }); } [Theory] [InlineData("123", true)] [InlineData("-123", true)] [InlineData("12A", false)] public void OnlyDigits(string s, bool expectedResult) { var observedResult = RegexDecisionTree.OnlyDigits(s); Assert.Equal(expectedResult, observedResult); } } } ================================================ FILE: UnitTests/CacheUtils/DataDumperImport/FileHandling/DataDumperReaderTests.cs ================================================ using System.IO; using System.Text; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.IO; using Xunit; namespace UnitTests.CacheUtils.DataDumperImport.FileHandling { public sealed class DataDumperReaderTests { [Fact] public void GetRootNode_EndToEnd() { ObjectKeyValueNode rootNode; using (var ms = new MemoryStream()) { using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { writer.WriteLine("$VAR1 = {"); writer.WriteLine(" '22' => {"); writer.WriteLine(" 'RegulatoryFeature' => ["); writer.WriteLine(" bless( {"); writer.WriteLine(" 'seq' => 'AGGGG'"); writer.WriteLine(" 'tmp_frequencies' => '87 167 281 56 8 744 40 107 851 5 333 54 12 56 104 372 82 117 402"); writer.WriteLine("291 145 49 800 903 13 528 433 11 0 3 12 0 8 733 13 482 322 181"); writer.WriteLine("76 414 449 21 0 65 334 48 32 903 566 504 890 775 5 507 307 73 266"); writer.WriteLine("459 187 134 36 2 91 11 324 18 3 9 341 8 71 67 17 37 396 59"); writer.WriteLine("'"); writer.WriteLine(" 'cell_types' => {},"); writer.WriteLine(" '_bound_lengths' => ["); writer.WriteLine(" 0,"); writer.WriteLine(" 0"); writer.WriteLine(" ],"); writer.WriteLine(" 'transcript' => $VAR1->{'1'}[0],"); writer.WriteLine(" }, 'Bio::EnsEMBL::Funcgen::RegulatoryFeature' )"); writer.WriteLine(" ]"); writer.WriteLine(" }"); writer.WriteLine(" };"); } ms.Position = 0; using (var reader = new DataDumperReader(ms)) rootNode = reader.GetRootNode(); } Assert.NotNull(rootNode); var node = rootNode; Assert.Equal("$VAR1", node.Key); var chr22Node = node.Value.Values[0] as ObjectKeyValueNode; Assert.NotNull(chr22Node); Assert.Equal("22", chr22Node.Key); var rfNode = chr22Node.Value.Values[0] as ListObjectKeyValueNode; Assert.NotNull(rfNode); Assert.Equal("RegulatoryFeature", rfNode.Key); var blessNode = rfNode.Values[0] as ObjectValueNode; Assert.NotNull(blessNode); Assert.Null(blessNode.Key); Assert.Equal("Bio::EnsEMBL::Funcgen::RegulatoryFeature", blessNode.Type); var nodes = blessNode.Values; var seqNode = nodes[0] as StringKeyValueNode; Assert.NotNull(seqNode); Assert.Equal("seq", seqNode.Key); Assert.Equal("AGGGG", seqNode.Value); var tmpFreqNode = nodes[1] as StringKeyValueNode; Assert.NotNull(tmpFreqNode); Assert.Equal("tmp_frequencies", tmpFreqNode.Key); Assert.Equal("87 167 281 56 8 744 40 107 851 5 333 54 12 56 104 372 82 117 402 291 145 49 800 903 13 528 433 11 0 3 12 0 8 733 13 482 322 181 76 414 449 21 0 65 334 48 32 903 566 504 890 775 5 507 307 73 266 459 187 134 36 2 91 11 324 18 3 9 341 8 71 67 17 37 396 59", tmpFreqNode.Value); var cellTypesNode = nodes[2] as StringKeyValueNode; Assert.NotNull(cellTypesNode); Assert.Equal("cell_types", cellTypesNode.Key); Assert.Null(cellTypesNode.Value); var boundLengthsNode = nodes[3] as ListObjectKeyValueNode; Assert.NotNull(boundLengthsNode); Assert.Equal("_bound_lengths", boundLengthsNode.Key); var bl1Node = boundLengthsNode.Values[0] as StringValueNode; Assert.NotNull(bl1Node); Assert.Equal("0", bl1Node.Key); var bl2Node = boundLengthsNode.Values[1] as StringValueNode; Assert.NotNull(bl2Node); Assert.Equal("0", bl2Node.Key); var transcriptNode = nodes[4] as StringKeyValueNode; Assert.NotNull(transcriptNode); Assert.Equal("transcript", transcriptNode.Key); Assert.Equal("$VAR1->{'1'}[0]", transcriptNode.Value); } [Fact] public void GetRootNode_ObjectValue_UnhandledEntryType_ThrowsException() { Assert.Throws(delegate { using (var ms = new MemoryStream()) { using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { writer.WriteLine("$VAR1 = {"); writer.WriteLine(" bless( {"); writer.WriteLine(" 0"); writer.WriteLine(" }, 'Bio::EnsEMBL::Funcgen::RegulatoryFeature' )"); writer.WriteLine(" };"); } ms.Position = 0; using (var reader = new DataDumperReader(ms)) reader.GetRootNode(); } }); } [Fact] public void GetRootNode_ListObjectKeyValue_UnhandledEntryType_ThrowsException() { Assert.Throws(delegate { using (var ms = new MemoryStream()) { using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { writer.WriteLine("$VAR1 = {"); writer.WriteLine(" '_bound_lengths' => ["); writer.WriteLine(" 'seq' => 'AGGGG'"); writer.WriteLine(" ]"); writer.WriteLine(" };"); } ms.Position = 0; using (var reader = new DataDumperReader(ms)) reader.GetRootNode(); } }); } [Fact] public void GetRootNode_EmptyStream_ThrowsException() { Assert.Throws(delegate { using (var ms = new MemoryStream()) { using (var reader = new DataDumperReader(ms)) reader.GetRootNode(); } }); } [Fact] public void GetRootNode_NoRootObject_ThrowsException() { Assert.Throws(delegate { using (var ms = new MemoryStream()) { using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { writer.WriteLine("'seq' => 'AGGGG'"); } ms.Position = 0; using (var reader = new DataDumperReader(ms)) reader.GetRootNode(); } }); } } } ================================================ FILE: UnitTests/CacheUtils/DataDumperImport/Import/ImportRegulatoryFeatureTests.cs ================================================ using System.IO; using System.Text; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.Import; using CacheUtils.DataDumperImport.IO; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Caches; using Xunit; namespace UnitTests.CacheUtils.DataDumperImport.Import { public sealed class ImportRegulatoryFeatureTests { private readonly ObjectValueNode _regulatoryFeatureNode; public ImportRegulatoryFeatureTests() { var dataDumperOutput = GetDataDumperOutput(); _regulatoryFeatureNode = GetObjectValueNode(dataDumperOutput); } #region Data::Dumper output data private static string GetDataDumperOutput() { return @"$VAR1 = { '22' => { 'RegulatoryFeature' => [ bless( { '_analysis_id' => 16, '_bound_lengths' => [ 0, 0 ], '_vep_feature_type' => 'RegulatoryFeature', 'cell_types' => { 'A549' => 'INACTIVE', 'Aorta' => 'NA', 'B_cells_(PB)_Roadmap' => 'NA', 'CD14+CD16-_monocyte_(CB)' => 'NA', 'CD14+CD16-_monocyte_(VB)' => 'NA', 'CD4+_ab_T_cell_(VB)' => 'NA', 'CD8+_ab_T_cell_(CB)' => 'NA', 'CM_CD4+_ab_T_cell_(VB)' => 'NA', 'DND-41' => 'INACTIVE', 'EPC_(VB)' => 'NA', 'Fetal_Adrenal_Gland' => 'NA', 'Fetal_Intestine_Large' => 'NA', 'Fetal_Intestine_Small' => 'NA', 'Fetal_Muscle_Leg' => 'NA', 'Fetal_Muscle_Trunk' => 'NA', 'Fetal_Stomach' => 'NA', 'Fetal_Thymus' => 'NA', 'GM12878' => 'INACTIVE', 'Gastric' => 'NA', 'H1-mesenchymal' => 'NA', 'H1-neuronal_progenitor' => 'NA', 'H1-trophoblast' => 'NA', 'H1ESC' => 'INACTIVE', 'H9' => 'NA', 'HMEC' => 'INACTIVE', 'HSMM' => 'INACTIVE', 'HSMMtube' => 'INACTIVE', 'HUVEC' => 'INACTIVE', 'HUVEC_prol_(CB)' => 'NA', 'HeLa-S3' => 'INACTIVE', 'HepG2' => 'REPRESSED', 'IMR90' => 'INACTIVE', 'K562' => 'ACTIVE', 'Left_Ventricle' => 'NA', 'Lung' => 'NA', 'M0_macrophage_(CB)' => 'NA', 'M0_macrophage_(VB)' => 'NA', 'M1_macrophage_(CB)' => 'NA', 'M1_macrophage_(VB)' => 'NA', 'M2_macrophage_(CB)' => 'NA', 'M2_macrophage_(VB)' => 'NA', 'MSC_(VB)' => 'NA', 'Monocytes-CD14+' => 'INACTIVE', 'Monocytes-CD14+_(PB)_Roadmap' => 'NA', 'NH-A' => 'INACTIVE', 'NHDF-AD' => 'INACTIVE', 'NHEK' => 'INACTIVE', 'NHLF' => 'INACTIVE', 'Natural_Killer_cells_(PB)' => 'NA', 'Osteobl' => 'INACTIVE', 'Ovary' => 'NA', 'Pancreas' => 'NA', 'Placenta' => 'NA', 'Psoas_Muscle' => 'NA', 'Right_Atrium' => 'NA', 'Small_Intestine' => 'NA', 'Spleen' => 'NA', 'T_cells_(PB)_Roadmap' => 'NA', 'Thymus' => 'NA', 'eosinophil_(VB)' => 'NA', 'erythroblast_(CB)' => 'NA', 'iPS-20b' => 'NA', 'iPS_DF_19.11' => 'NA', 'iPS_DF_6.9' => 'NA', 'naive_B_cell_(VB)' => 'NA', 'neutrophil_(CB)' => 'NA', 'neutrophil_(VB)' => 'NA', 'neutrophil_myelocyte_(BM)' => 'NA' }, 'dbID' => '71269', 'end' => '50555915', 'epigenome_count' => 1, 'feature_type' => 'TF_binding_site', 'regulatory_build_id' => 1, 'slice' => bless( { 'circular' => 0, 'coord_system' => bless( { 'dbID' => '2', 'default' => 1, 'name' => 'chromosome', 'rank' => '1', 'sequence_level' => 0, 'top_level' => 0, 'version' => 'GRCh37' }, 'Bio::EnsEMBL::CoordSystem' ), 'end' => '51304566', 'seq_region_length' => '51304566', 'seq_region_name' => '22', 'start' => 1, 'strand' => 1 }, 'Bio::EnsEMBL::Slice' ), 'stable_id' => 'ENSR00000394520', 'start' => '50555633', 'strand' => 0 }, 'Bio::EnsEMBL::Funcgen::RegulatoryFeature' ) ] } };"; } #endregion private static ObjectValueNode GetObjectValueNode(string dataDumperOutput) { ObjectKeyValueNode rootNode; using (var ms = new MemoryStream()) { using (var reader = new StringReader(dataDumperOutput)) using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { while (true) { var line = reader.ReadLine(); if (line == null) break; writer.WriteLine(line); } } ms.Position = 0; using (var reader = new DataDumperReader(ms)) rootNode = reader.GetRootNode(); } var chr22Node = rootNode.Value.Values[0] as ObjectKeyValueNode; Assert.NotNull(chr22Node); var regulatoryFeatureNodes = chr22Node.Value.Values[0] as ListObjectKeyValueNode; Assert.NotNull(regulatoryFeatureNodes); return regulatoryFeatureNodes.Values[0] as ObjectValueNode; } [Fact] public void Parse_Nominal() { var regulatoryRegion = ImportRegulatoryFeature.Parse(_regulatoryFeatureNode, ChromosomeUtilities.Chr1); Assert.NotNull(regulatoryRegion); Assert.Equal(ChromosomeUtilities.Chr1.Index, regulatoryRegion.Chromosome.Index); Assert.Equal(50555633, regulatoryRegion.Start); Assert.Equal(50555915, regulatoryRegion.End); Assert.Equal("ENSR00000394520", regulatoryRegion.Id.WithoutVersion); Assert.Equal(RegulatoryRegionType.TF_binding_site, regulatoryRegion.Type); } } } ================================================ FILE: UnitTests/CacheUtils/DataDumperImport/Import/ImportTranscriptTests.cs ================================================ using System.IO; using System.Text; using CacheUtils.DataDumperImport.DataStructures; using CacheUtils.DataDumperImport.DataStructures.Import; using CacheUtils.DataDumperImport.Import; using CacheUtils.DataDumperImport.IO; using Intervals; using UnitTests.TestUtilities; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.CacheUtils.DataDumperImport.Import { public sealed class ImportTranscriptTests { private readonly ObjectValueNode _transcriptNode; public ImportTranscriptTests() { var dataDumperOutput = GetDataDumperOutput(); _transcriptNode = GetObjectValueNode(dataDumperOutput); } #region Data::Dumper output data private static string GetDataDumperOutput() { return @"$VAR1 = { '22' => [ bless( { '_ccds' => 'CCDS14080.1', '_gene' => bless( { 'end' => '50051190', 'stable_id' => 'ENSG00000188511', 'start' => '49808176', 'strand' => -1 }, 'Bio::EnsEMBL::Gene' ), '_gene_hgnc_id' => '28010', '_gene_phenotype' => 0, '_gene_stable_id' => 'ENSG00000188511', '_gene_symbol' => 'C22orf34', '_gene_symbol_source' => 'HGNC', '_protein' => 'ENSP00000394865', '_refseq' => 'NM_014577.1', '_swissprot' => '-', '_trans_exon_array' => [ bless( { 'end' => '50051152', 'end_phase' => 1, 'phase' => -1, 'stable_id' => 'ENSE00001657619', 'start' => '50051053', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), bless( { 'end' => '49834861', 'end_phase' => -1, 'phase' => 1, 'stable_id' => 'ENSE00001694252', 'start' => '49834525', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), bless( { 'end' => '49810577', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001775575', 'start' => '49810464', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), bless( { 'end' => '49810384', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001669960', 'start' => '49810251', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), bless( { 'end' => '49809684', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001595042', 'start' => '49808176', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ) ], '_trembl' => 'F2Z342', '_uniparc' => 'UPI00004105EF', '_variation_effect_feature_cache' => { 'codon_table' => 1, 'five_prime_utr' => bless( { '_root_verbose' => 0, 'primary_seq' => bless( { '_nowarnonempty' => undef, '_root_verbose' => 0, 'alphabet' => 'dna', 'display_id' => 'ENST00000414287', 'length' => 45, 'seq' => 'GCT' }, 'Bio::PrimarySeq' ) }, 'Bio::Seq' ), 'introns' => [ bless( { 'adaptor' => undef, 'analysis' => undef, 'dbID' => undef, 'end' => '50051052', 'next' => bless( { 'end' => '49834861', 'end_phase' => -1, 'phase' => 1, 'stable_id' => 'ENSE00001694252', 'start' => '49834525', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'prev' => bless( { 'end' => '50051152', 'end_phase' => 1, 'phase' => -1, 'stable_id' => 'ENSE00001657619', 'start' => '50051053', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'seqname' => undef, 'slice' => bless( { 'circular' => 0, 'coord_system' => bless( { 'dbID' => '2', 'default' => 1, 'name' => 'chromosome', 'rank' => '1', 'sequence_level' => 0, 'top_level' => 0, 'version' => 'GRCh37' }, 'Bio::EnsEMBL::CoordSystem' ), 'end' => '51304566', 'seq_region_length' => '51304566', 'seq_region_name' => '22', 'start' => 1, 'strand' => 1 }, 'Bio::EnsEMBL::Slice' ), 'start' => '49834862', 'strand' => -1 }, 'Bio::EnsEMBL::Intron' ), bless( { 'adaptor' => undef, 'analysis' => undef, 'dbID' => undef, 'end' => '49834524', 'next' => bless( { 'end' => '49810577', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001775575', 'start' => '49810464', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'prev' => bless( { 'end' => '49834861', 'end_phase' => -1, 'phase' => 1, 'stable_id' => 'ENSE00001694252', 'start' => '49834525', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'seqname' => undef, 'slice' => bless( { 'circular' => 0, 'coord_system' => bless( { 'dbID' => '2', 'default' => 1, 'name' => 'chromosome', 'rank' => '1', 'sequence_level' => 0, 'top_level' => 0, 'version' => 'GRCh37' }, 'Bio::EnsEMBL::CoordSystem' ), 'end' => '51304566', 'seq_region_length' => '51304566', 'seq_region_name' => '22', 'start' => 1, 'strand' => 1 }, 'Bio::EnsEMBL::Slice' ), 'start' => '49810578', 'strand' => -1 }, 'Bio::EnsEMBL::Intron' ), bless( { 'adaptor' => undef, 'analysis' => undef, 'dbID' => undef, 'end' => '49810463', 'next' => bless( { 'end' => '49810384', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001669960', 'start' => '49810251', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'prev' => bless( { 'end' => '49810577', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001775575', 'start' => '49810464', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'seqname' => undef, 'slice' => bless( { 'circular' => 0, 'coord_system' => bless( { 'dbID' => '2', 'default' => 1, 'name' => 'chromosome', 'rank' => '1', 'sequence_level' => 0, 'top_level' => 0, 'version' => 'GRCh37' }, 'Bio::EnsEMBL::CoordSystem' ), 'end' => '51304566', 'seq_region_length' => '51304566', 'seq_region_name' => '22', 'start' => 1, 'strand' => 1 }, 'Bio::EnsEMBL::Slice' ), 'start' => '49810385', 'strand' => -1 }, 'Bio::EnsEMBL::Intron' ), bless( { 'adaptor' => undef, 'analysis' => undef, 'dbID' => undef, 'end' => '49810250', 'next' => bless( { 'end' => '49809684', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001595042', 'start' => '49808176', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'prev' => bless( { 'end' => '49810384', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001669960', 'start' => '49810251', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'seqname' => undef, 'slice' => bless( { 'circular' => 0, 'coord_system' => bless( { 'dbID' => '2', 'default' => 1, 'name' => 'chromosome', 'rank' => '1', 'sequence_level' => 0, 'top_level' => 0, 'version' => 'GRCh37' }, 'Bio::EnsEMBL::CoordSystem' ), 'end' => '51304566', 'seq_region_length' => '51304566', 'seq_region_name' => '22', 'start' => 1, 'strand' => 1 }, 'Bio::EnsEMBL::Slice' ), 'start' => '49809685', 'strand' => -1 }, 'Bio::EnsEMBL::Intron' ) ], 'mapper' => bless( { 'cdna_coding_end' => '225', 'cdna_coding_start' => 46, 'exon_coord_mapper' => bless( { '_is_sorted' => 0, '_pair_cdna' => { 'CDNA' => [ bless( { 'from' => bless( { 'end' => 100, 'id' => 'cdna', 'start' => 1 }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '50051152', 'id' => 'genome', 'start' => '50051053' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ), bless( { 'from' => bless( { 'end' => '437', 'id' => 'cdna', 'start' => 101 }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '49834861', 'id' => 'genome', 'start' => '49834525' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ), bless( { 'from' => bless( { 'end' => '551', 'id' => 'cdna', 'start' => '438' }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '49810577', 'id' => 'genome', 'start' => '49810464' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ), bless( { 'from' => bless( { 'end' => '685', 'id' => 'cdna', 'start' => '552' }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '49810384', 'id' => 'genome', 'start' => '49810251' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ), bless( { 'from' => bless( { 'end' => '2194', 'id' => 'cdna', 'start' => '686' }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '49809684', 'id' => 'genome', 'start' => '49808176' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ) ] }, '_pair_genomic' => { 'GENOME' => [ bless( { 'from' => bless( { 'end' => 100, 'id' => 'cdna', 'start' => 1 }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '50051152', 'id' => 'genome', 'start' => '50051053' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ), bless( { 'from' => bless( { 'end' => '437', 'id' => 'cdna', 'start' => 101 }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '49834861', 'id' => 'genome', 'start' => '49834525' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ), bless( { 'from' => bless( { 'end' => '551', 'id' => 'cdna', 'start' => '438' }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '49810577', 'id' => 'genome', 'start' => '49810464' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ), bless( { 'from' => bless( { 'end' => '685', 'id' => 'cdna', 'start' => '552' }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '49810384', 'id' => 'genome', 'start' => '49810251' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ), bless( { 'from' => bless( { 'end' => '2194', 'id' => 'cdna', 'start' => '686' }, 'Bio::EnsEMBL::Mapper::Unit' ), 'ori' => -1, 'to' => bless( { 'end' => '49809684', 'id' => 'genome', 'start' => '49808176' }, 'Bio::EnsEMBL::Mapper::Unit' ) }, 'Bio::EnsEMBL::Mapper::Pair' ) ] }, 'from' => 'cdna', 'from_cs' => undef, 'pair_count' => 5, 'to' => 'genomic', 'to_cs' => undef }, 'Bio::EnsEMBL::Mapper' ), 'start_phase' => -1 }, 'Bio::EnsEMBL::TranscriptMapper' ), 'peptide' => 'MIV', 'protein_features' => [ bless( { 'analysis' => bless( { '_display_label' => 'Low complexity (Seg)' }, 'Bio::EnsEMBL::Analysis' ), 'end' => '58', 'hseqname' => 'seg', 'start' => '39' }, 'Bio::EnsEMBL::ProteinFeature' ) ], 'protein_function_predictions' => { 'polyphen_humdiv' => bless( { 'analysis' => 'polyphen', 'matrix' => 'VkVQ-humdiv', 'matrix_compressed' => 1, 'peptide_length' => undef, 'sub_analysis' => 'humdiv', 'translation_md5' => '84229aef711b14371f4c0c6f5ec78ebe' }, 'Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix' ), 'polyphen_humvar' => bless( { 'analysis' => 'polyphen', 'matrix' => 'VkVQ-humvar', 'matrix_compressed' => 1, 'peptide_length' => undef, 'sub_analysis' => 'humvar', 'translation_md5' => '84229aef711b14371f4c0c6f5ec78ebe' }, 'Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix' ), 'sift' => bless( { 'analysis' => 'sift', 'matrix' => 'VkVQ-sift', 'matrix_compressed' => 1, 'peptide_length' => undef, 'sub_analysis' => undef, 'translation_md5' => '63fc5b02b6c430f970688d120e14647c' }, 'Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix' ) }, 'seq_edits' => [ bless( { 'alt_seq' => 'U', 'code' => '_selenocysteine', 'description' => undef, 'end' => '667', 'name' => 'Selenocysteine', 'start' => '667' }, 'Bio::EnsEMBL::SeqEdit' ) ], 'sorted_exons' => [ bless( { 'end' => '49809684', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001595042', 'start' => '49808176', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), bless( { 'end' => '49810384', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001669960', 'start' => '49810251', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), bless( { 'end' => '49810577', 'end_phase' => -1, 'phase' => -1, 'stable_id' => 'ENSE00001775575', 'start' => '49810464', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), bless( { 'end' => '49834861', 'end_phase' => -1, 'phase' => 1, 'stable_id' => 'ENSE00001694252', 'start' => '49834525', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), bless( { 'end' => '50051152', 'end_phase' => 1, 'phase' => -1, 'stable_id' => 'ENSE00001657619', 'start' => '50051053', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ) ], 'three_prime_utr' => bless( { '_root_verbose' => 0, 'primary_seq' => bless( { '_nowarnonempty' => undef, '_root_verbose' => 0, 'alphabet' => 'dna', 'display_id' => 'ENST00000414287', 'length' => '1969', 'seq' => 'CAC' }, 'Bio::PrimarySeq' ) }, 'Bio::Seq' ), 'translateable_seq' => 'ATG' }, '_vep_lazy_loaded' => 1, 'attributes' => [ bless( { 'code' => 'miRNA', 'name' => 'Micro RNA', 'value' => '62-83' }, 'Bio::EnsEMBL::Attribute' ), bless( { 'code' => 'cds_start_NF', 'name' => 'CDS start not found', 'value' => '1' }, 'Bio::EnsEMBL::Attribute' ) ], 'biotype' => 'nonsense_mediated_decay', 'cdna_coding_end' => '225', 'cdna_coding_start' => 46, 'coding_region_end' => undef, 'coding_region_start' => undef, 'dbID' => '2441076', 'description' => undef, 'end' => '50051152', 'is_canonical' => 1, 'slice' => bless( { 'circular' => 0, 'coord_system' => bless( { 'dbID' => '2', 'default' => 1, 'name' => 'chromosome', 'rank' => '1', 'sequence_level' => 0, 'top_level' => 0, 'version' => 'GRCh37' }, 'Bio::EnsEMBL::CoordSystem' ), 'end' => '51304566', 'seq_region_length' => '51304566', 'seq_region_name' => '22', 'start' => 1, 'strand' => 1 }, 'Bio::EnsEMBL::Slice' ), 'source' => 'havana', 'stable_id' => 'ENST00000414287', 'start' => '49808176', 'strand' => -1, 'translation' => bless( { 'dbID' => '1232784', 'end' => 125, 'end_exon' => bless( { 'end' => '49834861', 'end_phase' => -1, 'phase' => 1, 'stable_id' => 'ENSE00001694252', 'start' => '49834525', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'seq' => undef, 'stable_id' => 'ENSP00000394865', 'start' => 46, 'start_exon' => bless( { 'end' => '50051152', 'end_phase' => 1, 'phase' => 1, 'stable_id' => 'ENSE00001657619', 'start' => '50051053', 'strand' => -1 }, 'Bio::EnsEMBL::Exon' ), 'transcript' => $VAR1->{'22'}[0], 'version' => 1 }, 'Bio::EnsEMBL::Translation' ), 'version' => 1 }, 'Bio::EnsEMBL::Transcript' ) ] };"; } #endregion private static ObjectValueNode GetObjectValueNode(string dataDumperOutput) { ObjectKeyValueNode rootNode; using (var ms = new MemoryStream()) { using (var reader = new StringReader(dataDumperOutput)) using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { while (true) { var line = reader.ReadLine(); if (line == null) break; writer.WriteLine(line); } } ms.Position = 0; using (var reader = new DataDumperReader(ms)) rootNode = reader.GetRootNode(); } var chr22Node = rootNode.Value.Values[0] as ListObjectKeyValueNode; Assert.NotNull(chr22Node); return chr22Node.Values[0] as ObjectValueNode; } [Fact] public void Parse_Nominal() { var mutableTranscript = ImportTranscript.Parse(_transcriptNode, ChromosomeUtilities.Chr1, Source.Ensembl); Assert.NotNull(mutableTranscript); Assert.Equal(ChromosomeUtilities.Chr1.Index, mutableTranscript.Chromosome.Index); Assert.Equal(49808176, mutableTranscript.Start); Assert.Equal(50051152, mutableTranscript.End); Assert.Equal("ENST00000414287", mutableTranscript.Id); Assert.Equal(1, mutableTranscript.Version); Assert.Equal("CCDS14080.1", mutableTranscript.CcdsId); Assert.Equal("NM_014577.1", mutableTranscript.RefSeqId); Assert.Equal(Source.Ensembl, mutableTranscript.Source); Assert.Equal(49808176, mutableTranscript.Gene.Start); Assert.Equal(50051190, mutableTranscript.Gene.End); Assert.Equal("ENSG00000188511", mutableTranscript.Gene.GeneId); Assert.Equal("C22orf34", mutableTranscript.Gene.Symbol); Assert.Equal(28010, mutableTranscript.Gene.HgncId); Assert.Equal(ChromosomeUtilities.Chr1.Index, mutableTranscript.Gene.Chromosome.Index); Assert.True(mutableTranscript.Gene.OnReverseStrand); Assert.Equal(GeneSymbolSource.HGNC, mutableTranscript.Gene.SymbolSource); Assert.Equal(5, mutableTranscript.Exons.Length); Assert.Equal(50051053, mutableTranscript.Exons[0].Start); Assert.Equal(50051152, mutableTranscript.Exons[0].End); Assert.Equal(-1, mutableTranscript.Exons[0].Phase); Assert.Equal(2194, mutableTranscript.TotalExonLength); Assert.Equal(4, mutableTranscript.Introns.Length); Assert.Equal(49834862, mutableTranscript.Introns[0].Start); Assert.Equal(50051052, mutableTranscript.Introns[0].End); Assert.Equal("ATG", mutableTranscript.TranslateableSequence); Assert.Equal(new IInterval[] { new Interval(62, 83) }, mutableTranscript.MicroRnas); Assert.True(mutableTranscript.CdsStartNotFound); Assert.False(mutableTranscript.CdsEndNotFound); Assert.Equal(new[] { 667 }, mutableTranscript.SelenocysteinePositions); Assert.Equal(1, mutableTranscript.StartExonPhase); Assert.Equal(BioType.nonsense_mediated_decay, mutableTranscript.BioType); Assert.True(mutableTranscript.IsCanonical); Assert.Equal(5, mutableTranscript.CdnaMaps.Length); Assert.Equal(50051053, mutableTranscript.CdnaMaps[0].Start); Assert.Equal(50051152, mutableTranscript.CdnaMaps[0].End); Assert.Equal(1, mutableTranscript.CdnaMaps[0].CdnaStart); Assert.Equal(100, mutableTranscript.CdnaMaps[0].CdnaEnd); Assert.Equal(49834737, mutableTranscript.CodingRegion.Start); Assert.Equal(50051107, mutableTranscript.CodingRegion.End); Assert.Equal(46, mutableTranscript.CodingRegion.CdnaStart); Assert.Equal(225, mutableTranscript.CodingRegion.CdnaEnd); Assert.Equal("ENSP00000394865", mutableTranscript.ProteinId); Assert.Equal(1, mutableTranscript.ProteinVersion); Assert.Equal("MIV", mutableTranscript.PeptideSequence); Assert.Equal("VkVQ-sift", mutableTranscript.SiftData); Assert.Equal("VkVQ-humvar", mutableTranscript.PolyphenData); } } } ================================================ FILE: UnitTests/CacheUtils/Genes/Combiners/CombinerUtilsTests.cs ================================================ using System.IO; using CacheUtils.Genes.Combiners; using CacheUtils.Genes.DataStructures; using Intervals; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.CacheUtils.Genes.Combiners { public sealed class CombinerUtilsTests { [Fact] public void Merge_DifferentCombinations() { var interval = new Interval(17369, 17436); var uga37 = new UgaGene(ChromosomeUtilities.Chr1, interval, null, true, "102466751", null, "MIR6859-1", 50039); var uga38 = new UgaGene(ChromosomeUtilities.Chr1, null, interval, true, null, "ENSG00000278267", "MIR6859-1", 50039); var observedResult = CombinerUtils.Merge(uga37, uga38); Assert.Equal("102466751", observedResult.EntrezGeneId); Assert.Equal("ENSG00000278267", observedResult.EnsemblId); } [Fact] public void Merge_ThrowException_IfValuesDifferent() { var interval = new Interval(17369, 17436); var uga37 = new UgaGene(ChromosomeUtilities.Chr1, interval, null, true, "102466751", "ENSG00000278267", "MIR6859-1", 50039); var uga38 = new UgaGene(ChromosomeUtilities.Chr1, null, interval, true, "000000000", "ENSG00000278267", "MIR6859-1", 50039); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var observedResult = CombinerUtils.Merge(uga37, uga38); }); } } } ================================================ FILE: UnitTests/CacheUtils/Genes/Combiners/HgncIdCombinerTests.cs ================================================ using System.Collections.Generic; using CacheUtils.Genes.Combiners; using CacheUtils.Genes.DataStructures; using Intervals; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.CacheUtils.Genes.Combiners { public sealed class HgncIdCombinerTests { private readonly HgncIdCombiner _combiner = new HgncIdCombiner(); [Fact] public void Combine_CombineWhenAllIdsMatch() { var interval = new Interval(17369, 17436); var uga37 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, interval, null, true, "102466751", "ENSG00000278267", "MIR6859-1", 50039) }; var uga38 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, null, interval, true, "102466751", "ENSG00000278267", "MIR6859-1", 50039) }; var observedResults = new List(); _combiner.Combine(observedResults, uga37, uga38); Assert.Single(observedResults); var observedGene = observedResults[0]; Assert.Equal("102466751", observedGene.EntrezGeneId); Assert.Equal("ENSG00000278267", observedGene.EnsemblId); Assert.Equal(interval, observedGene.GRCh37); Assert.Equal(interval, observedGene.GRCh38); } [Fact] public void Combine_DoNotCombine_MixedStrands() { var interval = new Interval(17369, 17436); var uga37 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, interval, null, true, "102466751", "ENSG00000278267", "MIR6859-1", 50039) }; var uga38 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, null, interval, false, "102466751", "ENSG00000278267", "MIR6859-1", 50039) }; var observedResults = new List(); _combiner.Combine(observedResults, uga37, uga38); Assert.Equal(2, observedResults.Count); Assert.True(observedResults[0].OnReverseStrand); Assert.False(observedResults[1].OnReverseStrand); } [Fact] public void Combine_MIR6859_CombineWhenMissingGeneId() { var interval = new Interval(17369, 17436); var uga37 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, interval, null, true, "102466751", null, "MIR6859-1", 50039) }; var uga38 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, null, interval, true, "102466751", "ENSG00000278267", "MIR6859-1", 50039) }; var observedResults = new List(); _combiner.Combine(observedResults, uga37, uga38); Assert.Single(observedResults); var observedGene = observedResults[0]; Assert.Equal("102466751", observedGene.EntrezGeneId); Assert.Equal("ENSG00000278267", observedGene.EnsemblId); Assert.Equal(interval, observedGene.GRCh37); Assert.Equal(interval, observedGene.GRCh38); } } } ================================================ FILE: UnitTests/CacheUtils/Genes/Combiners/PartitionCombinerTests.cs ================================================ using System.Collections.Generic; using CacheUtils.Genes.Combiners; using CacheUtils.Genes.DataStructures; using Intervals; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.CacheUtils.Genes.Combiners { public sealed class PartitionCombinerTests { private readonly PartitionCombiner _combiner = new PartitionCombiner(); [Fact] public void Combine_MergeIfSameIds_EntrezGeneOnly() { var interval = new Interval(17369, 17436); var uga37 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, interval, null, true, "102466751", null, "MIR6859-1", 50039) }; var uga38 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, null, interval, true, "102466751", null, "MIR6859-1", 50039) }; var observedResults = new List(); _combiner.Combine(observedResults, uga37, uga38); Assert.Single(observedResults); var observedGene = observedResults[0]; Assert.Equal("102466751", observedGene.EntrezGeneId); Assert.Null(observedGene.EnsemblId); Assert.Equal(interval, observedGene.GRCh37); Assert.Equal(interval, observedGene.GRCh38); } [Fact] public void Combine_MergeIfSameIds_EnsemblOnly() { var interval = new Interval(17369, 17436); var uga37 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, interval, null, true, null, "ENSG00000278267", "MIR6859-1", 50039) }; var uga38 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, null, interval, true, null, "ENSG00000278267", "MIR6859-1", 50039) }; var observedResults = new List(); _combiner.Combine(observedResults, uga37, uga38); Assert.Single(observedResults); var observedGene = observedResults[0]; Assert.Equal("ENSG00000278267", observedGene.EnsemblId); Assert.Null(observedGene.EntrezGeneId); Assert.Equal(interval, observedGene.GRCh37); Assert.Equal(interval, observedGene.GRCh38); } [Fact] public void Combine_DoNotCombine_MixedIds() { var interval = new Interval(17369, 17436); var uga37 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, interval, null, true, "102466751", null, "MIR6859-1", 50039) }; var uga38 = new HashSet { new UgaGene(ChromosomeUtilities.Chr1, null, interval, true, "102466751", "ENSG00000278267", "MIR6859-1", 50039) }; var observedResults = new List(); _combiner.Combine(observedResults, uga37, uga38); Assert.Equal(2, observedResults.Count); Assert.Equal("ENSG00000278267", observedResults[0].EnsemblId); Assert.Null(observedResults[1].EnsemblId); } } } ================================================ FILE: UnitTests/CacheUtils/Genes/GeneFlattenerTests.cs ================================================ using System.Collections.Generic; using CacheUtils.DataDumperImport.DataStructures; using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.Genes; using Xunit; namespace UnitTests.CacheUtils.Genes { public sealed class GeneFlattenerTests { [Fact] public void Flatten_AllGenesShouldBeCombined() { var genes = new List { new MutableGene(null, 100, 120, false, null, GeneSymbolSource.Unknown, "test", -1), new MutableGene(null, 110, 115, false, null, GeneSymbolSource.Unknown, "test", -1), new MutableGene(null, 120, 130, false, null, GeneSymbolSource.Unknown, "test", -1) }; var flatGenes = GeneFlattener.FlattenWithSameId(genes); Assert.Single(flatGenes); var flatGene = flatGenes[0]; Assert.Equal(100, flatGene.Start); Assert.Equal(130, flatGene.End); } [Fact] public void Flatten_ReturnSameGene_WhenListHasOneEntry() { var genes = new List { new MutableGene(null, 100, 120, false, null, GeneSymbolSource.Unknown, "test", -1) }; var flatGenes = GeneFlattener.FlattenWithSameId(genes); Assert.Single(flatGenes); Assert.Equal(genes[0].Start, flatGenes[0].Start); Assert.Equal(genes[0].End, flatGenes[0].End); } [Fact] public void Flatten_ReturnNull_WhenInputNull() { var flatGenes = GeneFlattener.FlattenWithSameId(null as List); Assert.Null(flatGenes); } [Fact] public void Flatten_NoGenesShouldBeCombined() { var genes = new List { new MutableGene(null, 100, 120, false, null, GeneSymbolSource.Unknown, "test", -1), new MutableGene(null, 130, 140, false, null, GeneSymbolSource.Unknown, "test", -1), new MutableGene(null, 150, 160, false, null, GeneSymbolSource.Unknown, "test", -1) }; var flatGenes = GeneFlattener.FlattenWithSameId(genes); Assert.Equal(3, flatGenes.Count); for (int i = 0; i < flatGenes.Count; i++) { Assert.Equal(genes[i].Start, flatGenes[i].Start); Assert.Equal(genes[i].End, flatGenes[i].End); } } } } ================================================ FILE: UnitTests/CacheUtils/Genes/Utilities/DictionaryUtilitiesTests.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.Genes.DataStructures; using CacheUtils.Genes.Utilities; using Xunit; namespace UnitTests.CacheUtils.Genes.Utilities { public sealed class DictionaryUtilitiesTests { [Fact] public void GetSingleValueDict_OneKey_OneValue() { var uga1 = new UgaGene(null, null, null, true, "102466751", null, "MIR6859-1", 50039); var uga2 = new UgaGene(null, null, null, true, null, "ENSG00000278267", "MIR6859-1", 50039); var genes = new List { uga1, uga2 }; var observedResult = genes.GetSingleValueDict(x => x.EnsemblId); Assert.NotNull(observedResult); Assert.Single(observedResult); Assert.True(observedResult.ContainsKey("ENSG00000278267")); } [Fact] public void GetSingleValueDict_ThrowException_IfMultipleValuesShareKey() { var uga1 = new UgaGene(null, null, null, true, null, "ENSG00000278267", "MIR6859-1", 50039); var uga2 = new UgaGene(null, null, null, true, null, "ENSG00000278267", "MIR6859-1", 50039); var genes = new List { uga1, uga2 }; Assert.Throws(delegate { // ReSharper disable once UnusedVariable var observedResult = genes.GetSingleValueDict(x => x.EnsemblId); }); } [Fact] public void GetMultiValueDict_OneKey_WithTwoValues() { var uga1 = new UgaGene(null, null, null, true, "102466751", null, "MIR6859-1", 50039); var uga2 = new UgaGene(null, null, null, true, null, "ENSG00000278267", "MIR6859-1", 50039); var uga3 = new UgaGene(null, null, null, true, null, "ENSG00000278267", "MIR6859-1", 50039); var genes = new List { uga1, uga2, uga3 }; var observedResult = genes.GetMultiValueDict(x => x.EnsemblId); Assert.NotNull(observedResult); Assert.Single(observedResult); var firstEntry = observedResult["ENSG00000278267"]; Assert.NotNull(firstEntry); Assert.Equal(2, firstEntry.Count); } [Fact] public void GetKeyValueDict_OneKey_OneValue() { var uga1 = new UgaGene(null, null, null, true, "102466751", null, "MIR6859-1", 50039); var uga2 = new UgaGene(null, null, null, true, null, "ENSG00000278267", "MIR6859-1", 50039); var uga3 = new UgaGene(null, null, null, true, null, "ENSG00000278267", "MIR6859-1", 50039); var genes = new List { uga1, uga2, uga3 }; var observedResult = genes.GetKeyValueDict(x => x.EnsemblId, x => x.HgncId); Assert.NotNull(observedResult); Assert.Single(observedResult); var hgncId = observedResult["ENSG00000278267"]; Assert.Equal(50039, hgncId); } [Fact] public void CreateIndex_ThreeValues() { const string a = "tom"; const string b = "jane"; const string c = "sally"; var genes = new List { a, b, c }; var observedResult = genes.CreateIndex(); Assert.NotNull(observedResult); Assert.Equal(3, observedResult.Count); Assert.Equal(0, observedResult[a]); Assert.Equal(1, observedResult[b]); Assert.Equal(2, observedResult[c]); } } } ================================================ FILE: UnitTests/CacheUtils/IO/Caches/TranscriptCacheWriterTests.cs ================================================ using System.Collections.Generic; using CacheUtils.TranscriptCache; using Xunit; namespace UnitTests.CacheUtils.IO.Caches { public sealed class TranscriptCacheWriterTests { [Fact] public void CreateIndex_PopulatedDictionary() { var strings = new[] { "A", "B", "D", "P", "Z" }; var dict = TranscriptCacheWriter.CreateIndex(strings, EqualityComparer.Default); Assert.NotNull(dict); Assert.Equal(3, dict["P"]); } [Fact] public void CreateIndex_EmptyDictionary_WhenInputNull() { var dict = TranscriptCacheWriter.CreateIndex(null, EqualityComparer.Default); Assert.NotNull(dict); Assert.Empty(dict); } } } ================================================ FILE: UnitTests/CacheUtils/TranscriptCache/TranscriptRegionMergerTests.cs ================================================ using CacheUtils.DataDumperImport.DataStructures.Mutable; using CacheUtils.TranscriptCache; using CacheUtils.TranscriptCache.Comparers; using Intervals; using UnitTests.TestUtilities; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.CacheUtils.TranscriptCache { public sealed class TranscriptRegionMergerTests { private readonly TranscriptRegionComparer _comparer = new TranscriptRegionComparer(); [Fact] public void GetTranscriptRegions_OneExon() { var cdnaMaps = new[] { new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 64571756, 64572037, 2569, 2850) }; var exons = new[] { new MutableExon(ChromosomeUtilities.Chr5, 64571756, 64572037, 0) }; var expectedRegions = new ITranscriptRegion[] { new MutableTranscriptRegion(TranscriptRegionType.Exon, 1, 64571756, 64572037, 2569, 2850) }; var observedRegions = TranscriptRegionMerger.GetTranscriptRegions(cdnaMaps, exons, null, false); Assert.Single(observedRegions); Assert.Equal(expectedRegions, observedRegions, _comparer); } [Fact] public void GetTranscriptRegions_WithGap_Forward() { var exons = new[] { new MutableExon(ChromosomeUtilities.Chr5, 89623195, 89624305, 0), new MutableExon(ChromosomeUtilities.Chr5, 89653782, 89653866, 0), new MutableExon(ChromosomeUtilities.Chr5, 89690803, 89690846, 0), new MutableExon(ChromosomeUtilities.Chr5, 89692770, 89693008, 0), new MutableExon(ChromosomeUtilities.Chr5, 89702368, 89702526, 0), new MutableExon(ChromosomeUtilities.Chr5, 89711875, 89712016, 0), new MutableExon(ChromosomeUtilities.Chr5, 89717610, 89717776, 0), new MutableExon(ChromosomeUtilities.Chr5, 89720651, 89720875, 0), new MutableExon(ChromosomeUtilities.Chr5, 89725044, 89731687, 0) }; var introns = new IInterval[] { new Interval(89624306, 89653781), new Interval(89653867, 89690802), new Interval(89690847, 89692769), new Interval(89693009, 89702367), new Interval(89702527, 89711874), new Interval(89712017, 89717609), new Interval(89717777, 89720650), new Interval(89720876, 89725043) }; var cdnaMaps = new[] { new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89623195, 89623860, 1, 666), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89623862, 89624305, 667, 1110), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89653782, 89653866, 1111, 1195), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89690803, 89690846, 1196, 1239), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89692770, 89693008, 1240, 1478), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89702368, 89702526, 1479, 1637), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89711875, 89712016, 1638, 1779), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89717610, 89717776, 1780, 1946), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89720651, 89720875, 1947, 2171), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 89725044, 89731687, 2172, 8815) }; var expectedRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 89623195, 89623860, 1, 666), new TranscriptRegion(TranscriptRegionType.Gap, 1, 89623861, 89623861, 666, 667), new TranscriptRegion(TranscriptRegionType.Exon, 1, 89623862, 89624305, 667, 1110), new TranscriptRegion(TranscriptRegionType.Intron, 1, 89624306, 89653781, 1110, 1111), new TranscriptRegion(TranscriptRegionType.Exon, 2, 89653782, 89653866, 1111, 1195), new TranscriptRegion(TranscriptRegionType.Intron, 2, 89653867, 89690802, 1195, 1196), new TranscriptRegion(TranscriptRegionType.Exon, 3, 89690803, 89690846, 1196, 1239), new TranscriptRegion(TranscriptRegionType.Intron, 3, 89690847, 89692769, 1239, 1240), new TranscriptRegion(TranscriptRegionType.Exon, 4, 89692770, 89693008, 1240, 1478), new TranscriptRegion(TranscriptRegionType.Intron, 4, 89693009, 89702367, 1478, 1479), new TranscriptRegion(TranscriptRegionType.Exon, 5, 89702368, 89702526, 1479, 1637), new TranscriptRegion(TranscriptRegionType.Intron, 5, 89702527, 89711874, 1637, 1638), new TranscriptRegion(TranscriptRegionType.Exon, 6, 89711875, 89712016, 1638, 1779), new TranscriptRegion(TranscriptRegionType.Intron, 6, 89712017, 89717609, 1779, 1780), new TranscriptRegion(TranscriptRegionType.Exon, 7, 89717610, 89717776, 1780, 1946), new TranscriptRegion(TranscriptRegionType.Intron, 7, 89717777, 89720650, 1946, 1947), new TranscriptRegion(TranscriptRegionType.Exon, 8, 89720651, 89720875, 1947, 2171), new TranscriptRegion(TranscriptRegionType.Intron, 8, 89720876, 89725043, 2171, 2172), new TranscriptRegion(TranscriptRegionType.Exon, 9, 89725044, 89731687, 2172, 8815) }; var observedRegions = TranscriptRegionMerger.GetTranscriptRegions(cdnaMaps, exons, introns, false); Assert.Equal(19, observedRegions.Length); Assert.Equal(expectedRegions, observedRegions, _comparer); } [Fact] public void GetTranscriptRegions_WithGap_Reverse() { var exons = new[] { new MutableExon(ChromosomeUtilities.Chr5, 64571756, 64574228, 2), new MutableExon(ChromosomeUtilities.Chr5, 64575621, 64575829, 0), new MutableExon(ChromosomeUtilities.Chr5, 64578301, 64578407, 0), new MutableExon(ChromosomeUtilities.Chr5, 64578866, 64578927, 0) }; var introns = new IInterval[] { new Interval(64574229, 64575620), new Interval(64575830, 64578300), new Interval(64578408, 64578865) }; var cdnaMaps = new[] { new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 64571756, 64572037, 2569, 2850), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 64572039, 64574228, 379, 2568), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 64575621, 64575829, 170, 378), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 64578301, 64578407, 63, 169), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 64578866, 64578927, 1, 62) }; var expectedRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 4, 64571756, 64572037, 2569, 2850), new TranscriptRegion(TranscriptRegionType.Gap, 4, 64572038, 64572038, 2568, 2569), new TranscriptRegion(TranscriptRegionType.Exon, 4, 64572039, 64574228, 379, 2568), new TranscriptRegion(TranscriptRegionType.Intron, 3, 64574229, 64575620, 378, 379), new TranscriptRegion(TranscriptRegionType.Exon, 3, 64575621, 64575829, 170, 378), new TranscriptRegion(TranscriptRegionType.Intron, 2, 64575830, 64578300, 169, 170), new TranscriptRegion(TranscriptRegionType.Exon, 2, 64578301, 64578407, 63, 169), new TranscriptRegion(TranscriptRegionType.Intron, 1, 64578408, 64578865, 62, 63), new TranscriptRegion(TranscriptRegionType.Exon, 1, 64578866, 64578927, 1, 62) }; var observedRegions = TranscriptRegionMerger.GetTranscriptRegions(cdnaMaps, exons, introns, true); Assert.Equal(9, observedRegions.Length); Assert.Equal(expectedRegions, observedRegions, _comparer); } [Fact] public void GetTranscriptRegions_Reverse() { var exons = new[] { new MutableExon(ChromosomeUtilities.Chr1, 20977055, 20977207, 1), new MutableExon(ChromosomeUtilities.Chr1, 20976856, 20977050, 1) }; var introns = new IInterval[] { new Interval(20977051, 20977054) }; var cdnaMaps = new[] { new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 20977055, 20977207, 1, 153), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 20976856, 20977050, 154, 348) }; var expectedRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 2, 20976856, 20977050, 154, 348), new TranscriptRegion(TranscriptRegionType.Intron, 1, 20977051, 20977054, 153, 154), new TranscriptRegion(TranscriptRegionType.Exon, 1, 20977055, 20977207, 1, 153) }; var observedRegions = TranscriptRegionMerger.GetTranscriptRegions(cdnaMaps, exons, introns, true); Assert.Equal(3, observedRegions.Length); Assert.Equal(expectedRegions, observedRegions, _comparer); } [Fact] public void GetTranscriptRegions_TwoExonsNoGap_Forward() { var exons = new[] { new MutableExon(ChromosomeUtilities.Chr12, 7079944, 7080253, 1), new MutableExon(ChromosomeUtilities.Chr12, 7083501, 7083602, 2), new MutableExon(ChromosomeUtilities.Chr12, 7083714, 7083855, 2), new MutableExon(ChromosomeUtilities.Chr12, 7084252, 7084310, 1), new MutableExon(ChromosomeUtilities.Chr12, 7084391, 7084540, 2), new MutableExon(ChromosomeUtilities.Chr12, 7084858, 7085165, 2) }; var introns = new IInterval[] { new Interval(7080254, 7083500), new Interval(7083603, 7083713), new Interval(7083856, 7084251), new Interval(7084311, 7084390), new Interval(7084541, 7084857) }; var cdnaMaps = new[] { new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 7079944, 7080212, 1, 269), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 7080213, 7080253, 271, 311), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 7083501, 7083602, 312, 413), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 7083714, 7083855, 414, 555), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 7084252, 7084310, 556, 614), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 7084391, 7084540, 615, 764), new MutableTranscriptRegion(TranscriptRegionType.Exon, 0, 7084858, 7085165, 765, 1072) }; var expectedRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 7079944, 7080212, 1, 269), new TranscriptRegion(TranscriptRegionType.Exon, 1, 7080213, 7080253, 271, 311), new TranscriptRegion(TranscriptRegionType.Intron, 1, 7080254, 7083500, 311, 312), new TranscriptRegion(TranscriptRegionType.Exon, 2, 7083501, 7083602, 312, 413), new TranscriptRegion(TranscriptRegionType.Intron, 2, 7083603, 7083713, 413, 414), new TranscriptRegion(TranscriptRegionType.Exon, 3, 7083714, 7083855, 414, 555), new TranscriptRegion(TranscriptRegionType.Intron, 3, 7083856, 7084251, 555, 556), new TranscriptRegion(TranscriptRegionType.Exon, 4, 7084252, 7084310, 556, 614), new TranscriptRegion(TranscriptRegionType.Intron, 4, 7084311, 7084390, 614, 615), new TranscriptRegion(TranscriptRegionType.Exon, 5, 7084391, 7084540, 615, 764), new TranscriptRegion(TranscriptRegionType.Intron, 5, 7084541, 7084857, 764, 765), new TranscriptRegion(TranscriptRegionType.Exon, 6, 7084858, 7085165, 765, 1072) }; var observedRegions = TranscriptRegionMerger.GetTranscriptRegions(cdnaMaps, exons, introns, false); Assert.Equal(12, observedRegions.Length); Assert.Equal(expectedRegions, observedRegions, _comparer); } } } ================================================ FILE: UnitTests/CacheUtils/Utilities/AccessionUtilitiesTests.cs ================================================ using System.IO; using CacheUtils.Utilities; using Xunit; namespace UnitTests.CacheUtils.Utilities { public sealed class AccessionUtilitiesTests { [Fact] public void GetMaxVersion_Dupl() { const string expectedId = "NM_004522.2_dupl6"; const byte expectedVersion = 1; var observedResult = AccessionUtilities.GetMaxVersion("NM_004522.2_dupl6", 1); Assert.Equal(expectedId, observedResult.Id); Assert.Equal(expectedVersion, observedResult.Version); } [Fact] public void GetMaxVersion_IdVersionMax() { const string expectedId = "NM_004522"; const byte expectedVersion = 2; var observedResult = AccessionUtilities.GetMaxVersion("NM_004522.2", 1); Assert.Equal(expectedId, observedResult.Id); Assert.Equal(expectedVersion, observedResult.Version); } [Fact] public void GetMaxVersion_SuppliedVersionMax() { const string expectedId = "NM_004522"; const byte expectedVersion = 3; var observedResult = AccessionUtilities.GetMaxVersion("NM_004522.2", 3); Assert.Equal(expectedId, observedResult.Id); Assert.Equal(expectedVersion, observedResult.Version); } [Fact] public void GetAccessionNumber_ReturnNumber_RefSeq() { const int expectedResult = 4522; var observedResult = AccessionUtilities.GetAccessionNumber("NM_004522"); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetAccessionNumber_ReturnNumber_Ensembl() { const int expectedResult = 515242; var observedResult = AccessionUtilities.GetAccessionNumber("ENST00000515242"); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetAccessionNumber_ReturnMinusOne() { const int expectedResult = -1; var observedResult = AccessionUtilities.GetAccessionNumber(null); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetAccessionNumber_ThrowException_IfUnderlineMissingRefSeq() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var observedResult = AccessionUtilities.GetAccessionNumber("NM004522"); }); } } } ================================================ FILE: UnitTests/CacheUtils/Utilities/RemoteFileTests.cs ================================================ using System; using CacheUtils.Utilities; using VariantAnnotation.Utilities; using Xunit; namespace UnitTests.CacheUtils.Utilities { public sealed class RemoteFileTests { [Fact] public void GetFilename_WithoutUrlPrefix() { string expectedResult = $"ccds_1000_{Date.GetDate(DateTime.Now.Ticks)}.txt"; var observedResult = RemoteFile.GetFilename("ccds_1000.txt", true); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetFilename_WithoutDate() { const string expectedResult = "CCDS2Sequence.20160908.txt"; var observedResult = RemoteFile.GetFilename("ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS2Sequence.20160908.txt", false); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetFilename_WithUrlPrefix() { string expectedResult = $"CCDS2Sequence.20160908_{Date.GetDate(DateTime.Now.Ticks)}.txt"; var observedResult = RemoteFile.GetFilename("ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS2Sequence.20160908.txt", true); Assert.Equal(expectedResult, observedResult); } } } ================================================ FILE: UnitTests/Cloud/ConsistencyTests.cs ================================================ using Cloud; using VariantAnnotation.SA; using Xunit; namespace UnitTests.Cloud; public sealed class ConsistencyTests { [Fact] public void Consistency_with_SAUtils() { Assert.Equal(LambdaUrlHelper.SaSchemaVersion, SaCommon.SchemaVersion); } } ================================================ FILE: UnitTests/Cloud/JsonUtilitiesTests.cs ================================================ using System.IO; using System.Linq; using System.Text; using Cloud.Utilities; using Xunit; namespace UnitTests.Cloud { public sealed class JsonUtilitiesTests { [Fact] public void Serialize_AsExpected() { var inputObject = new[] { new ObjectExample {Name = "Ada", Age = 8, Skills = new []{"dancing", "skating"}}, new ObjectExample {Name = "Bob", Age = 10, Skills = new []{"programming"}} }; var memStream = JsonUtilities.Serialize(inputObject); const string expectedString = "[{\"Name\":\"Ada\",\"Age\":8,\"Skills\":[\"dancing\",\"skating\"]},{\"Name\":\"Bob\",\"Age\":10,\"Skills\":[\"programming\"]}]"; var expectedStream = new MemoryStream(Encoding.ASCII.GetBytes(expectedString)); Assert.Equal(expectedStream.Length, memStream.Length); Assert.True(expectedStream.ToArray().SequenceEqual(memStream.ToArray())); } [Fact] public void Stringify_AsExpected() { var inputObject = new[] { new ObjectExample {Name = "Ken", Age = 16, Skills = new[] {"boxing"}}, new ObjectExample {Name = "Armanda", Age = 18, Skills = new[] {"cooking"}} }; const string expectedString = "[{\"Name\":\"Ken\",\"Age\":16,\"Skills\":[\"boxing\"]},{\"Name\":\"Armanda\",\"Age\":18,\"Skills\":[\"cooking\"]}]"; Assert.Equal(expectedString, JsonUtilities.Stringify(inputObject)); } } public sealed class ObjectExample { public string Name; public int Age; public string[] Skills; } } ================================================ FILE: UnitTests/Cloud/LambdaUrlHelperTests.cs ================================================ using System; using Cloud; using Cloud.Utilities; using Genome; using IO; using ReferenceSequence; using VariantAnnotation.SA; using Xunit; namespace UnitTests.Cloud { public sealed class LambdaUrlHelperTests { [Fact] public void GetDataUrlBase_AsExpected() { Environment.SetEnvironmentVariable("NirvanaDataUrlBase", "http://somewhere.on.the.earth/"); Assert.Equal($"http://somewhere.on.the.earth/ab0cf104f39708eabd07b8cb67e149ba-Cache/{CacheConstants.DataVersion}/", LambdaUrlHelper.GetCacheFolder()); Assert.Equal($"http://somewhere.on.the.earth/d95867deadfe690e40f42068d6b59df8-References/{ReferenceSequenceCommon.HeaderVersion}/Homo_sapiens.", LambdaUrlHelper.GetRefPrefix()); } [Fact] public void GetS3RefLocation_AsExpected() { Environment.SetEnvironmentVariable("NirvanaDataUrlBase", "whatever"); Assert.Equal(LambdaUrlHelper.GetRefPrefix() + "GRCh37" + LambdaUrlHelper.RefSuffix, LambdaUrlHelper.GetRefUrl(GenomeAssembly.GRCh37)); } [Fact] public void GetS3_SaManifest_Location_AsExpected() { Environment.SetEnvironmentVariable("NirvanaDataUrlBase", "http://nirvana-annotations.s3.us-west-2.amazonaws.com/"); var saManifestUrl = LambdaUtilities.GetManifestUrl("latest", GenomeAssembly.GRCh38, SaCommon.SchemaVersion); HttpUtilities.ValidateUrl(saManifestUrl); } [Fact] public void GetS3_SaManifest_Location_from_config() { var saManifestUrl = LambdaUtilities.GetManifestUrl("latest", GenomeAssembly.GRCh38, SaCommon.SchemaVersion); HttpUtilities.ValidateUrl(saManifestUrl); } } } ================================================ FILE: UnitTests/Cloud/RedactionUtilitiesTests.cs ================================================ using Cloud; using Xunit; namespace UnitTests.Cloud { public sealed class RedactionUtilitiesTests { [Fact] public void Redact_PresignedUrl() { const string json = "{\"id\":\"e96a15ab-13f8-48cd-b3b8-ca37aca8480f\",\"genomeAssembly\":\"GRCh37\",\"vcfUrl\":\"https://s3.amazonaws.com/illumina-early-access/Test.vcf.gz?AWSAccessKeyId=AKISKSD87A3C4&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\",\"tabixUrl\":\"https://s3.amazonaws.com/illumina-early-access/Test.vcf.gz.tbi?AWSAccessKeyId=AKISKSD87A3C4&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\",\"outputDir\":{\"bucketName\":\"illumina-early-access\",\"region\":\"us-east-1\",\"path\":\"/5a2a3c8c-3744-422d-b343/\",\"accessKey\":\"AKIAIOSFODNN7EXAMPLE\",\"secretKey\":\"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\",\"sessionToken\":\"AQoEXAMPLEH4aoAH0gNCAPyJxz4BlCFFxWNE1OPTgk5TthT+FvwqnKwRcOIfrRh3c/LTo6UDdyJwOOvEVPvLXCrrrUtdnniCEXAMPLE/IvU1dYUg2RVAJBanLiHb4IgRmpRV3zrkuWJOgQs8IZZaIv2BXIa2R4OlgkBN9bkUDNCJiBeb/AXlzBBko7b15fjrBs2+cTQtpZ3CYWFXG8C5zqx37wnOE49mRl/+OtkIKGO7fAE\"},\"supplementaryAnnotations\":\"latest\",\"customAnnotations\":[{\"nsaUrl\":\"https://s3.amazonaws.com/illumina-early-access/ClinVar.nsa?AWSAccessKeyId=AKISKSD87A3C4&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\",\"idxUrl\":\"https://s3.amazonaws.com/illumina-early-access/ClinVar.nsa.idx?AWSAccessKeyId=AKISKSD87A3C4&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\"},{\"nsiUrl\":\"https://s3.amazonaws.com/illumina-early-access/ClinVar.nsi?AWSAccessKeyId=AKISKSD87A3C4&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\"},{\"ngaUrl\":\"https://s3.amazonaws.com/illumina-early-access/ClinVar.nga?AWSAccessKeyId=AKISKSD87A3C4&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\"}]}"; const string expected = "{\"id\":\"e96a15ab-13f8-48cd-b3b8-ca37aca8480f\",\"genomeAssembly\":\"GRCh37\",\"vcfUrl\":\"https://s3.amazonaws.com/illumina-early-access/Test.vcf.gz?AWSAccessKeyId=XXXXXXXXXXXXX&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\",\"tabixUrl\":\"https://s3.amazonaws.com/illumina-early-access/Test.vcf.gz.tbi?AWSAccessKeyId=XXXXXXXXXXXXX&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\",\"outputDir\":{\"bucketName\":\"illumina-early-access\",\"region\":\"us-east-1\",\"path\":\"/5a2a3c8c-3744-422d-b343/\",\"accessKey\":\"XXXXXXXXXXXXXXXXXXXX\",\"secretKey\":\"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\",\"sessionToken\":\"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\"},\"supplementaryAnnotations\":\"latest\",\"customAnnotations\":[{\"nsaUrl\":\"https://s3.amazonaws.com/illumina-early-access/ClinVar.nsa?AWSAccessKeyId=XXXXXXXXXXXXX&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\",\"idxUrl\":\"https://s3.amazonaws.com/illumina-early-access/ClinVar.nsa.idx?AWSAccessKeyId=XXXXXXXXXXXXX&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\"},{\"nsiUrl\":\"https://s3.amazonaws.com/illumina-early-access/ClinVar.nsi?AWSAccessKeyId=XXXXXXXXXXXXX&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\"},{\"ngaUrl\":\"https://s3.amazonaws.com/illumina-early-access/ClinVar.nga?AWSAccessKeyId=XXXXXXXXXXXXX&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f\"}]}"; string observed = json.Redact(); Assert.Equal(expected, observed); } [Fact] public void Redact_AwsSignatureVersion4() { const string json = "{\"id\":\"Test\",\"genomeAssembly\":\"GRCh38\",\"vcfUrl\":\"https://illumina-dev.s3.us-west-2.amazonaws.com/Annotation/input/test.vcf.gz?X-Amz-Expires=604800&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAZYATIEHL37L46ZIO/20191007/us-west-2/s3/aws4_request&X-Amz-Date=20191007T222533Z&X-Amz-SignedHeaders=host&X-Amz-Signature=44433f0ec4875323d8e82084469f4e34b6384aead83f9c176595b96badaba3f8\",\"tabixUrl\":\"https://illumina-dev.s3.us-west-2.amazonaws.com/Annotation/input/test.vcf.gz.tbi?X-Amz-Expires=604800&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAZYATIEHL37L46ZIO/20191007/us-west-2/s3/aws4_request&X-Amz-Date=20191007T222533Z&X-Amz-SignedHeaders=host&X-Amz-Signature=19cd9c1244cf156952746e85bfc4977946a80a1110205a3dae9b578647dacd50\",\"outputDir\":{\"bucketName\":\"illumina-early-access\",\"region\":\"us-east-1\",\"path\":\"/5a2a3c8c-3744-422d-b343/\",\"accessKey\":\"AKIAIOSFODNN7EXAMPLE\",\"secretKey\":\"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\",\"sessionToken\":\"AQoEXAMPLEH4aoAH0gNCAPyJxz4BlCFFxWNE1OPTgk5TthT+FvwqnKwRcOIfrRh3c/LTo6UDdyJwOOvEVPvLXCrrrUtdnniCEXAMPLE/IvU1dYUg2RVAJBanLiHb4IgRmpRV3zrkuWJOgQs8IZZaIv2BXIa2R4OlgkBN9bkUDNCJiBeb/AXlzBBko7b15fjrBs2+cTQtpZ3CYWFXG8C5zqx37wnOE49mRl/+OtkIKGO7fAE\"},\"supplementaryAnnotations\":\"latest\",\"customAnnotations\":null}"; const string expected = "{\"id\":\"Test\",\"genomeAssembly\":\"GRCh38\",\"vcfUrl\":\"https://illumina-dev.s3.us-west-2.amazonaws.com/Annotation/input/test.vcf.gz?X-Amz-Expires=604800&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=XXXXXXXXXXXXXXXXXXXX/20191007/us-west-2/s3/aws4_request&X-Amz-Date=20191007T222533Z&X-Amz-SignedHeaders=host&X-Amz-Signature=44433f0ec4875323d8e82084469f4e34b6384aead83f9c176595b96badaba3f8\",\"tabixUrl\":\"https://illumina-dev.s3.us-west-2.amazonaws.com/Annotation/input/test.vcf.gz.tbi?X-Amz-Expires=604800&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=XXXXXXXXXXXXXXXXXXXX/20191007/us-west-2/s3/aws4_request&X-Amz-Date=20191007T222533Z&X-Amz-SignedHeaders=host&X-Amz-Signature=19cd9c1244cf156952746e85bfc4977946a80a1110205a3dae9b578647dacd50\",\"outputDir\":{\"bucketName\":\"illumina-early-access\",\"region\":\"us-east-1\",\"path\":\"/5a2a3c8c-3744-422d-b343/\",\"accessKey\":\"XXXXXXXXXXXXXXXXXXXX\",\"secretKey\":\"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\",\"sessionToken\":\"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\"},\"supplementaryAnnotations\":\"latest\",\"customAnnotations\":null}"; string observed = json.Redact(); Assert.Equal(expected, observed); } } } ================================================ FILE: UnitTests/Cloud/S3PathTests.cs ================================================ using Cloud.Messages; using ErrorHandling.Exceptions; using Xunit; namespace UnitTests.Cloud { public sealed class S3PathTests { [Theory] [InlineData("/this/is/a/folder/", false)] [InlineData("/this/is/a/file", true)] public void ValidatePathFormat_AsExpected(string path, bool isDirectory) { Assert.Throws(() => S3Path.ValidatePathFormat(path, isDirectory)); } [Fact] public void FormatPath_AsExpected() { Assert.Equal("to/the/file", S3Path.FormatPath("/to/the/file")); Assert.Equal("to/the/directory/", S3Path.FormatPath("/to/the/directory/")); } } } ================================================ FILE: UnitTests/Cloud/SaUrlsTests.cs ================================================ using Cloud.Messages; using ErrorHandling.Exceptions; using Xunit; namespace UnitTests.Cloud { public sealed class SaUrlsTests { [Theory] [InlineData("test.nsa", "test.idx", "test.nsi", "test.nga")] [InlineData(null, "test.idx", "test.nsi", "test.nga")] [InlineData("test.nsa", "test.idx", null, "test.nga")] [InlineData("test.nsa", "test.idx", "test.nsi", null)] [InlineData(null, "test.idx", null, null)] [InlineData(null, null, null, null)] [InlineData("test.nsa", null, null, null)] public void SetSaType_InvalidValues_ThrowException(string nsaUrl, string idxUrl, string nsiUrl, string ngaUrl) { var saUrls = new SaUrls{nsaUrl = nsaUrl, idxUrl = idxUrl, nsiUrl = nsiUrl, ngaUrl = ngaUrl}; Assert.Throws(() => saUrls.GetSaType()); } [Theory] [InlineData("test.nsa", "test.idx", null, null, CustomSaType.Nsa)] [InlineData(null, null, "test.nsi", null, CustomSaType.Nsi)] [InlineData(null, null, null, "test.nga", CustomSaType.Nga)] public void SetSaType_AsExpected(string nsaUrl, string idxUrl, string nsiUrl, string ngaUrl, CustomSaType expectSaType) { var saUrls = new SaUrls { nsaUrl = nsaUrl, idxUrl = idxUrl, nsiUrl = nsiUrl, ngaUrl = ngaUrl }; Assert.Equal(expectSaType, saUrls.SaType); } } } ================================================ FILE: UnitTests/Cloud/UploadUtilitiesTests.cs ================================================ using System; using System.Net; using System.Security.Cryptography; using System.Threading.Tasks; using Amazon.S3.Model; using Cloud.Utilities; using IO; using Moq; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Cloud { public sealed class UploadUtilitiesTests { private readonly FileMetadata _metadata = new FileMetadata(new byte[] { 0, 1, 2, 3, 4, 5, 6 }, 1); private readonly string _filePath = Resources.TopPath("clinvar.dict"); private readonly AesCryptoServiceProvider _aes = new AesCryptoServiceProvider(); private static Mock GetS3ClientMock() { var s3ClientMock = new Mock(); s3ClientMock.Setup(x => x.PutObjectAsync(It.IsAny())).ReturnsAsync(new PutObjectResponse()); return s3ClientMock; } private static Mock GetS3ClientMockAlwaysFail() { var s3ClientMock = new Mock(); s3ClientMock.Setup(x => x.PutObjectAsync(It.IsAny())).ThrowsAsync(new WebException()); return s3ClientMock; } private static Mock GetS3ClientMockCanWorkAfterRetries() { var s3ClientMock = new Mock(); s3ClientMock.SetupSequence(x => x.PutObjectAsync(It.IsAny())) .ThrowsAsync(new WebException()) .ThrowsAsync(new WebException()) .ReturnsAsync(new PutObjectResponse()) .ThrowsAsync(new WebException()); return s3ClientMock; } [Fact] public void TryDecryptUpload_AsExpected() { var s3ClientMock = GetS3ClientMock(); Assert.True(s3ClientMock.Object.TryDecryptUpload("bucket", "bob.json.gz", _filePath, _aes, _metadata)); s3ClientMock.Verify(x => x.PutObjectAsync(It.IsAny()), Times.Once); } [Fact] public void TryDecryptUpload_FileNotFound() { var s3ClientMock = GetS3ClientMock(); Assert.False(s3ClientMock.Object.TryDecryptUpload("bucket", "bob.json.gz", "bob123", _aes, _metadata)); s3ClientMock.Verify(x => x.PutObjectAsync(It.IsAny()), Times.Never); } [Fact] public void DecryptUpload_OnlyPutOnceWhenSuccess() { var s3ClientMock = GetS3ClientMock(); s3ClientMock.Object.DecryptUpload("bucket", "bob.json.gz", _filePath, _aes, _metadata, 1); s3ClientMock.Verify(x => x.PutObjectAsync(It.IsAny()), Times.Once); } [Fact] public void DecryptUpload_SuccessWithRetries() { var s3ClientMock = GetS3ClientMockCanWorkAfterRetries(); s3ClientMock.Object.DecryptUpload("bucket", "bob.json.gz", _filePath, _aes, _metadata, 1); s3ClientMock.Verify(x => x.PutObjectAsync(It.IsAny()), Times.Exactly(3)); } [Fact] public void DecryptUpload_TimeOutWhenFail() { var timeOut = TimeSpan.FromMilliseconds(500); var s3ClientMockAlwaysFail = GetS3ClientMockAlwaysFail(); var failTask = Task.Run(() => s3ClientMockAlwaysFail.Object.DecryptUpload("bucket", "bob.json.gz", _filePath, _aes, _metadata, 1)); Assert.False(Task.WaitAll(new[] { failTask }, timeOut)); s3ClientMockAlwaysFail.Verify(x => x.PutObjectAsync(It.IsAny()), Times.AtLeast(2)); } } } ================================================ FILE: UnitTests/CommandLine/Builders/ConsoleAppBuilderDataTests.cs ================================================ using CommandLine.Builders; using CommandLine.NDesk.Options; using VariantAnnotation.Providers; using Xunit; namespace UnitTests.CommandLine.Builders { public sealed class ConsoleAppBuilderDataTests { [Fact] public void VersionProvider_Set() { var ops = new OptionSet { { "test=", "test", v => { } } }; var data = new ConsoleAppBuilder(null, ops).UseVersionProvider(new VersionProvider()) .Parse() .Data; Assert.True(data.VersionProvider is VersionProvider); } } public sealed class ConsoleAppValidatorTests { [Fact] public void ShowBanner_EnabledOutput() { var ops = new OptionSet { { "test=", "test", v => { } } }; var banner = new ConsoleAppBuilder(null, ops).UseVersionProvider(new VersionProvider()) .Parse() .ShowBanner("authors"); Assert.True(banner is ConsoleAppBanner); } } } ================================================ FILE: UnitTests/CommandLine/Builders/ConsoleAppBuilderTests.cs ================================================ using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using Xunit; namespace UnitTests.CommandLine.Builders { public sealed class ConsoleAppBuilderTests { [Fact] public void Parse_UnsupportedOption() { var ops = new OptionSet { { "test=", "test", v => { } } }; var data = new ConsoleAppBuilder(new[] { "--if", "-" }, ops) .Parse() .Data; Assert.Single(data.Errors); Assert.Equal(2, data.UnsupportedOps.Count); } [Fact] public void Parse_Version() { var ops = new OptionSet { { "test=", "test", v => { } } }; var validator = new ConsoleAppBuilder(new[] {"--version"}, ops) .Parse(); Assert.True(validator.Data.ShowVersion); var exitCode = validator .CheckInputFilenameExists("dummy", "vcf", "--in") .ShowBanner("authors") .ShowHelpMenu("description", "example") .ShowErrors() .Execute(() => ExitCodes.Success); Assert.Equal(ExitCodes.Success, exitCode); } [Fact] public void Parse_HelpMenu() { var ops = new OptionSet { { "test=", "test", v => { } } }; var validator = new ConsoleAppBuilder(new[] { "--help" }, ops) .Parse(); Assert.True(validator.Data.ShowHelpMenu); var exitCode = validator .CheckInputFilenameExists("dummy", "vcf", "--in") .ShowBanner("authors") .ShowHelpMenu("description", "example") .ShowErrors() .Execute(() => ExitCodes.Success); Assert.Equal(ExitCodes.Success, exitCode); } [Fact] public void Parse_ShowOutput() { var ops = new OptionSet { { "test=", "test", v => { } } }; var exitCode = new ConsoleAppBuilder(new[] { "--test", "test" }, ops) .Parse() .ShowBanner("authors") .ShowHelpMenu("description", "example") .ShowErrors() .Execute(() => ExitCodes.Success); Assert.Equal(ExitCodes.Success, exitCode); } } } ================================================ FILE: UnitTests/CommandLine/Builders/TopLevelAppBuilderTests.cs ================================================ using System.Collections.Generic; using CommandLine.Builders; using ErrorHandling; using Xunit; namespace UnitTests.CommandLine.Builders { public sealed class TopLevelAppBuilderTests { private readonly Dictionary _ops; public TopLevelAppBuilderTests() { _ops = new Dictionary { ["combine"] = new TopLevelOption("combine cache directories", EmptyMethod) }; } private static ExitCodes EmptyMethod(string command, string[] args) => ExitCodes.Success; [Fact] public void Parse_UnsupportedOption() { var validator = new TopLevelAppBuilder(new[] {"--if", "-"}, _ops).Parse(); Assert.True(validator.Data.Errors.Count > 0); var exitCode = validator .ShowBanner("banner") .ShowHelpMenu("help") .ShowErrors() .Execute(); Assert.Equal(ExitCodes.UnknownCommandLineOption, exitCode); } [Fact] public void Parse_ShowHelpMenu() { var validator = new TopLevelAppBuilder(null, _ops).Parse(); Assert.True(validator.Data.ShowHelpMenu); var exitCode = validator .ShowBanner("banner") .ShowHelpMenu("help") .ShowErrors() .Execute(); Assert.Equal(ExitCodes.MissingCommandLineOption, exitCode); } [Fact] public void Parse_Nominal() { var exitCode = new TopLevelAppBuilder(new[] { "combine", "dummy" }, _ops) .Parse() .ShowBanner("banner") .ShowHelpMenu("help") .ShowErrors() .Execute(); Assert.Equal(ExitCodes.Success, exitCode); } } } ================================================ FILE: UnitTests/CommandLine/Builders/ValidationExtensionsTests.cs ================================================ using System.Collections.Generic; using System.IO; using CommandLine.Builders; using CommandLine.NDesk.Options; using ErrorHandling; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.CommandLine.Builders { public sealed class ValidationExtensionsTests { private static ExitCodes Execute(IConsoleAppValidator validator) { return validator .DisableOutput() .ShowBanner("authors") .ShowHelpMenu("description", "example") .ShowErrors() .Execute(() => ExitCodes.Success); } [Fact] public void CheckInputFilenameExists_FileExists_SuccessExitCode() { string randomPath = RandomPath.GetRandomPath(); File.Create(randomPath); var ops = new OptionSet { { "if=", "if", v => { } } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--if", randomPath }, ops) .Parse() .CheckInputFilenameExists(randomPath, "test", "--if")); Assert.Equal(ExitCodes.Success, exitCode); } [Fact] public void CheckInputFilenameExists_MissingFiles_FileNotFoundExitCode() { string randomPath = RandomPath.GetRandomPath() + ".anavrin"; var ops = new OptionSet { { "id=", "id", v => { } } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--if", randomPath }, ops) .Parse() .CheckInputFilenameExists(randomPath, "test", "--if")); Assert.Equal(ExitCodes.FileNotFound, exitCode); } [Fact] public void CheckInputFilenameExists_MissingArguments_MissingCommandLineExitCode() { var ops = new OptionSet { { "if=", "if", v => { } } }; var exitCode = Execute(new ConsoleAppBuilder(null, ops) .Parse() .CheckInputFilenameExists(null, "test", "--if")); Assert.Equal(ExitCodes.MissingCommandLineOption, exitCode); } [Fact] public void CheckInputFilenameExists_EmptyPath_MissingCommandLineExitCode() { var ops = new OptionSet { { "if=", "if", v => { } } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--if" }, ops) .Parse() .CheckInputFilenameExists(null, "test", "--if")); Assert.Equal(ExitCodes.MissingCommandLineOption, exitCode); } [Fact] public void CheckInputFilenameExists_IgnoredPath_SuccessExitCode() { var ops = new OptionSet { { "if=", "if", v => { } } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--if", "-" }, ops) .Parse() .CheckInputFilenameExists("-", "test", "--if",true, "-")); Assert.Equal(ExitCodes.Success, exitCode); } [Fact] public void CheckDirectoryExists_MissingDirectory_PathNotFoundExitCode() { var ops = new OptionSet { { "if=", "if", v => { } } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--if", "-" }, ops) .Parse() .CheckDirectoryExists("-", "test", "--if")); Assert.Equal(ExitCodes.PathNotFound, exitCode); } [Fact] public void CheckDirectoryExists_EmptyPath_MissingCommandLineOptionExitCode() { var ops = new OptionSet { { "if=", "if", v => { } } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--if", "-" }, ops) .Parse() .CheckDirectoryExists(null, "test", "--if")); Assert.Equal(ExitCodes.MissingCommandLineOption, exitCode); } [Fact] public void CheckEachFilenameExists_MissingFile_MissingCommandLineOptionExitCode() { var ops = new OptionSet { { "if=", "if", v => { } } }; var filenames = new List { "bob", null }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--if", "-" }, ops) .Parse() .CheckEachFilenameExists(filenames, "test", "--if")); Assert.Equal(ExitCodes.MissingCommandLineOption, exitCode); } [Fact] public void HasRequiredParameter_Exists_SuccessExitCode() { string observedString = default; const string expectedString = "foo"; var ops = new OptionSet { { "test=", "test", v => observedString = v } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--test", expectedString }, ops) .Parse() .HasRequiredParameter(observedString, "test", "--test")); Assert.Equal(expectedString, observedString); Assert.Equal(ExitCodes.Success, exitCode); } [Fact] public void HasRequiredDate_Exists_SuccessExitCode() { string observedDate = default; const string expectedDate = "2018-03-14"; var ops = new OptionSet { { "date=", "date", v => observedDate = v } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--date", expectedDate }, ops) .Parse() .HasRequiredDate(observedDate, "date", "--date")); Assert.Equal(expectedDate, observedDate); Assert.Equal(ExitCodes.Success, exitCode); } [Fact] public void HasRequiredDate_Exists_BadFormat() { string observedDate = default; var ops = new OptionSet { { "date=", "date", v => observedDate = v } }; var validator = new ConsoleAppBuilder(new[] { "--date", "garbage" }, ops) .Parse() .HasRequiredDate(observedDate, "date", "--date"); Assert.True(validator.Data.Errors.Count > 0); } [Fact] public void HasRequiredDate_DoesNotExist_MissingCommandLineExitCode() { string observedDate = default; var ops = new OptionSet { { "date=", "date", v => observedDate = v } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--bar", "bar" }, ops) .Parse() .HasRequiredDate(observedDate, "date", "--date")); Assert.Equal(ExitCodes.MissingCommandLineOption, exitCode); } [Fact] public void CheckOutputFilenameSuffix_True() { var ops = new OptionSet { { "date=", "date", v => { } } }; var validator = new ConsoleAppBuilder(new[] {"--date", "2018-03-14" }, ops) .Parse() .CheckOutputFilenameSuffix("test.json", ".json", "temp"); Assert.Equal(ExitCodes.Success, validator.Data.ExitCode); Assert.Empty(validator.Data.Errors); } [Fact] public void CheckOutputFilenameSuffix_False() { var ops = new OptionSet { { "date=", "date", v => { } } }; var validator = new ConsoleAppBuilder(new[] { "--date", "2018-03-14" }, ops) .Parse() .CheckOutputFilenameSuffix("test.json", ".gz", "temp"); Assert.NotEqual(ExitCodes.Success, validator.Data.ExitCode); Assert.True(validator.Data.Errors.Count > 0); } [Fact] public void HasRequiredParameter_DoesNotExist_MissingCommandLineExitCode() { string testString = default; const string expectedString = default; var ops = new OptionSet { {"test=", "test", v => testString = v}, {"bar=", "bar", v => { } } }; var exitCode = Execute(new ConsoleAppBuilder(new[] { "--bar", "bar" }, ops) .Parse() .HasRequiredParameter(testString, "test", "--test")); Assert.Equal(expectedString, testString); Assert.Equal(ExitCodes.MissingCommandLineOption, exitCode); } [Fact] public void HasRequiredParameter_MissingArguments_MissingCommandLineExitCode() { string observedString = default; var ops = new OptionSet { { "test=", "test", v => observedString = v } }; var exitCode = Execute(new ConsoleAppBuilder(null, ops) .Parse() .HasRequiredParameter(observedString, "test", "--test")); Assert.Equal(ExitCodes.MissingCommandLineOption, exitCode); } } } ================================================ FILE: UnitTests/CommandLine/NDesk.Options/OptionContextTests.cs ================================================ using System; using CommandLine.NDesk.Options; using Xunit; namespace UnitTests.CommandLine.NDesk.Options { public sealed class OptionContextTests { private readonly OptionSet _optionSet; public OptionContextTests() { _optionSet = new OptionSet { { "a=", "test", v => { /* ignore */ } } }; } [Fact] public void Should_ThrowException_When_ContextIsEmpty() { var optionContext = new OptionContext(); Assert.Throws(delegate { // ReSharper disable once UnusedVariable string ignore = optionContext.OptionValues[0]; }); } [Fact] public void Should_ThrowException_When_IndexGreaterThanLength() { var optionContext = new OptionContext { Option = _optionSet[0] }; Assert.Throws(delegate { // ReSharper disable once UnusedVariable string ignore = optionContext.OptionValues[2]; }); } [Fact] public void Should_ThrowException_When_RequiredValueMissing() { var optionContext = new OptionContext { Option = _optionSet[0], OptionName = "-a" }; Assert.Throws(delegate { // ReSharper disable once UnusedVariable string ignore = optionContext.OptionValues[0]; }); } } } ================================================ FILE: UnitTests/CommandLine/NDesk.Options/OptionSetTests.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Text; using CommandLine.NDesk.Options; using Xunit; namespace UnitTests.CommandLine.NDesk.Options { public sealed class OptionSetTests { private readonly OptionSet _optionSet; private string _a; public OptionSetTests() { _optionSet = new OptionSet { {"a=", "", v => _a = v}, {"b", "", v => { }}, {"c", "", v => { }}, {"n=", "", (int v) => { }} }; } [Fact] public void BundledValues() { var defines = new List(); var libs = new List(); bool debug = false; var optionSet = new OptionSet { { "D|define=", "", v => defines.Add (v) }, { "L|library:", "", v => libs.Add (v) }, { "Debug", "", v => debug = v != null }, { "E", "", v => { /* ignore */ } } }; optionSet.Parse(new[] { "-DNAME", "-D", "NAME2", "-Debug", "-L/foo", "-L", "/bar", "-EDNAME3" }); Assert.Equal(3, defines.Count); Assert.Equal("NAME", defines[0]); Assert.Equal("NAME2", defines[1]); Assert.Equal("NAME3", defines[2]); Assert.True(debug); Assert.Equal(2, libs.Count); Assert.Equal("/foo", libs[0]); Assert.Null(libs[1]); Assert.Throws(delegate { optionSet.Parse(new[] { "-EVALUENOTSUP" }); }); } [Fact] public void RequiredValues() { string a = null; int n = 0; var optionSet = new OptionSet { { "a=", "", v => a = v }, { "n=", "",(int v) => n = v } }; var extra = optionSet.Parse(new[] { "a", "-a", "s", "-n=42", "n" }); Assert.Equal(2, extra.Count); Assert.Equal("a", extra[0]); Assert.Equal("n", extra[1]); Assert.Equal("s", a); Assert.Equal(42, n); extra = optionSet.Parse(new[] { "-a=" }); Assert.Empty(extra); Assert.Equal("", a); } [Fact] public void OptionalValues() { string a = null; int n = -1; Foo foo = null; var optionSet = new OptionSet { {"a:", "", v => a = v}, {"n:", "", (int v) => n = v}, {"f:", "", (Foo v) => foo = v} }; optionSet.Parse(new[] { "-a=s" }); Assert.Equal("s", a); optionSet.Parse(new[] { "-a" }); Assert.Null(a); optionSet.Parse(new[] { "-a=" }); Assert.Equal("", a); optionSet.Parse(new[] { "-f", "A" }); Assert.Null(foo); optionSet.Parse(new[] { "-f" }); Assert.Null(foo); optionSet.Parse(new[] { "-n42" }); Assert.Equal(42, n); optionSet.Parse(new[] { "-n=42" }); Assert.Equal(42, n); Assert.Throws(delegate { optionSet.Parse(new[] { "-n=" }); }); } [Fact] public void BooleanValues() { bool a = false; var optionSet = new OptionSet { { "a", "", v => a = v != null } }; optionSet.Parse(new[] { "-a" }); Assert.True(a); optionSet.Parse(new[] { "-a+" }); Assert.True(a); optionSet.Parse(new[] { "-a-" }); Assert.False(a); } [Fact] public void CombinationPlatter() { int a = -1, b = -1; string av = null, bv = null; int help = 0; int verbose = 0; var optionSet = new OptionSet { { "a=", "", v => { a = 1; av = v; } }, { "b", "desc", v => {b = 2; bv = v;} }, { "v", "", v => { ++verbose; } }, { "h|?|help", "", v => { switch (v) { case "h": help |= 0x1; break; case "?": help |= 0x2; break; case "help": help |= 0x4; break; } } } }; var e = optionSet.Parse(new[] { "foo", "-v", "-a=42", "/b-", "-a", "64", "bar", "/h", "-?", "--help", "-v" }); Assert.Equal(2, e.Count); Assert.Equal("foo", e[0]); Assert.Equal("bar", e[1]); Assert.Equal(1, a); Assert.Equal("64", av); Assert.Equal(2, b); Assert.Null(bv); Assert.Equal(2, verbose); Assert.Equal(0x7, help); } [Fact] public void Should_ThrowException_When_MissingRequiredValue() { Assert.Throws(delegate { _optionSet.Parse(new[] { "-a" }); }); } [Fact] public void ShouldNot_ThrowException_When_ProvidingMoreOptionsThanExpected() { var ex = Record.Exception(() => { _optionSet.Parse(new[] { "-a", "-a" }); }); Assert.Null(ex); Assert.Equal("-a", _a); } [Fact] public void ShouldNot_ThrowException_When_ProvidingUnregisteredNamedOption() { var ex = Record.Exception(() => { _optionSet.Parse(new[] { "-a", "-b" }); }); Assert.Null(ex); Assert.Equal("-b", _a); } [Fact] public void Should_ThrowException_When_ArgumentNull() { Assert.Throws(delegate { _optionSet.Add(null); }); } [Fact] public void Should_ThrowException_With_InvalidType() { Assert.Throws(delegate { _optionSet.Parse(new[] { "-n", "value" }); }); } [Fact] public void Should_ThrowException_When_BundlingWithOptionRequiringValue() { Assert.Throws(delegate { _optionSet.Parse(new[] { "-cz", "extra" }); }); } [Fact] public void WriteOptionDescriptions() { var optionSet = new OptionSet { { "p|indicator-style=", "append / indicator to directories", v => {} }, { "color:", "controls color info", v => {} }, { "color2:", "set {color}", v => {} }, { "long-desc", "This has a really\nlong, multi-line description that also\ntests\n" + "the-builtin-supercalifragilisticexpialidicious-break-on-hyphen. " + "Also, a list:\n" + " item 1\n" + " item 2", v => {} }, { "long-desc2", "IWantThisDescriptionToBreakInsideAWordGeneratingAutoWordHyphenation.", v => {} }, { "long-desc3", "OnlyOnePeriod.AndNoWhitespaceShouldBeSupportedEvenWithLongDescriptions", v => {} }, { "long-desc4", "Lots of spaces in the middle 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 and more until the end.", v => {} }, { "long-desc5", "Lots of spaces in the middle - . - . - . - . - . - . - . - and more until the end.", v => {} }, { "h|?|help", "show help text", v => {} }, { "version", "output version information and exit", v => {} }, { "<>", "", v => {} } }; var expected = new StringBuilder(); expected.AppendLine(" -p, --indicator-style "); expected.AppendLine(" append / indicator to directories"); expected.AppendLine(" --color [] controls color info"); expected.AppendLine(" --color2 [] set color"); expected.AppendLine(" --long-desc This has a really"); expected.AppendLine(" long, multi-line description that also"); expected.AppendLine(" tests"); expected.AppendLine(" the-builtin-supercalifragilisticexpialidicious-"); expected.AppendLine(" break-on-hyphen. Also, a list:"); expected.AppendLine(" item 1"); expected.AppendLine(" item 2"); expected.AppendLine(" --long-desc2 IWantThisDescriptionToBreakInsideAWordGeneratingAu-"); expected.AppendLine(" toWordHyphenation."); expected.AppendLine(" --long-desc3 OnlyOnePeriod."); expected.AppendLine(" AndNoWhitespaceShouldBeSupportedEvenWithLongDesc-"); expected.AppendLine(" riptions"); expected.AppendLine(" --long-desc4 Lots of spaces in the middle 1 2 3 4 5 6 7 8 9 0"); expected.AppendLine(" 1 2 3 4 5 and more until the end."); expected.AppendLine(" --long-desc5 Lots of spaces in the middle - . - . - . - . - . -"); expected.AppendLine(" . - . - and more until the end."); expected.AppendLine(" -h, -?, --help show help text"); expected.AppendLine(" --version output version information and exit"); var actual = new StringWriter(); optionSet.WriteOptionDescriptions(actual); Assert.Equal(expected.ToString(), actual.ToString()); } [Fact] public void OptionBundling() { string a, b, c, f; a = b = c = f = null; var optionSet = new OptionSet { { "a", "", v => a = "a" }, { "b", "", v => b = "b" }, { "c", "", v => c = "c" }, { "f=", "", v => f = v } }; var extra = optionSet.Parse(new[] { "-abcf", "foo", "bar" }); Assert.Single(extra); Assert.Equal("bar", extra[0]); Assert.Equal("a", a); Assert.Equal("b", b); Assert.Equal("c", c); Assert.Equal("foo", f); } [Fact] public void HaltProcessing() { var optionSet = new OptionSet { { "a", "", v => {} }, { "b", "", v => {} } }; var e = optionSet.Parse(new[] { "-a", "-b", "--", "-a", "-b" }); Assert.Equal(2, e.Count); Assert.Equal("-a", e[0]); Assert.Equal("-b", e[1]); } private sealed class ContextCheckerOption : Option { private readonly string _eName; private readonly string _eValue; private readonly int _index; public ContextCheckerOption(string p, string d, string eName, string eValue, int index) : base(p, d, 1) { _eName = eName; _eValue = eValue; _index = index; } protected override void OnParseComplete(OptionContext c) { Assert.Equal(1, c.OptionValues.Count); Assert.Equal(c.OptionValues[0], _eValue); Assert.Equal(c.OptionName, _eName); Assert.Equal(c.OptionIndex, _index); Assert.Equal(c.Option, this); Assert.Equal(c.Option.Description, Description); } } [Fact] public void OptionContext() { var optionSet = new OptionSet { new ContextCheckerOption ("a=", "a desc", "/a", "a-val", 1), new ContextCheckerOption ("b", "b desc", "--b+", "--b+", 2), new ContextCheckerOption ("c=", "c desc", "--c", "C", 3), new ContextCheckerOption ("d", "d desc", "/d-", null, 4) }; Assert.Equal(4, optionSet.Count); optionSet.Parse(new[] { "/a", "a-val", "--b+", "--c=C", "/d-" }); } [Fact] public void DefaultHandler() { var extra = new List(); var optionSet = new OptionSet { { "<>", "", v => extra.Add (v) } }; var e = optionSet.Parse(new[] { "-a", "b", "--c=D", "E" }); Assert.Empty(e); Assert.Equal(4, extra.Count); Assert.Equal("-a", extra[0]); Assert.Equal("b", extra[1]); Assert.Equal("--c=D", extra[2]); Assert.Equal("E", extra[3]); } [Fact] public void MixedDefaultHandler() { var tests = new List(); var optionSet = new OptionSet { { "t|<>=", "", v => tests.Add (v) } }; var e = optionSet.Parse(new[] { "-tA", "-t:B", "-t=C", "D", "--E=F" }); Assert.Empty(e); Assert.Equal(5, tests.Count); Assert.Equal("A", tests[0]); Assert.Equal("B", tests[1]); Assert.Equal("C", tests[2]); Assert.Equal("D", tests[3]); Assert.Equal("--E=F", tests[4]); } [Fact] public void DefaultHandlerRuns() { var formats = new Dictionary>(); string format = "foo"; var optionSet = new OptionSet { { "f|format=", "", v => format = v }, { "<>", "", v => { if (!formats.TryGetValue (format, out var f)) { f = new List (); formats.Add (format, f); } f.Add (v); } } }; var e = optionSet.Parse(new[] { "a", "b", "-fbar", "c", "d", "--format=baz", "e", "f" }); Assert.Empty(e); Assert.Equal(3, formats.Count); Assert.Equal(2, formats["foo"].Count); Assert.Equal("a", formats["foo"][0]); Assert.Equal("b", formats["foo"][1]); Assert.Equal(2, formats["bar"].Count); Assert.Equal("c",formats["bar"][0]); Assert.Equal("d",formats["bar"][1]); Assert.Equal(2, formats["baz"].Count); Assert.Equal("e", formats["baz"][0]); Assert.Equal("f", formats["baz"][1]); } // ReSharper disable once ClassNeverInstantiated.Local private class Foo { private readonly string _s; private Foo(string s) { _s = s; } public override string ToString() { return _s; } } } } ================================================ FILE: UnitTests/CommandLine/NDesk.Options/OptionsTests.cs ================================================ using System; using System.IO; using CommandLine.NDesk.Options; using Xunit; namespace UnitTests.CommandLine.NDesk.Options { public sealed class OptionsTests { [Fact] public void Should_ThrowException_When_PrototypeNull() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption(null, null); }); } [Fact] public void Should_ThrowException_When_PrototypeEmpty() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("", null); }); } [Fact] public void Should_ThrowException_When_OptionNameEmpty() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a|b||c=", null); }); } [Fact] public void Should_ThrowException_When_OptionTypesConflict() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a=|b:", null); }); } [Fact] public void Should_ThrowException_When_DefaultHandlerRequiresValue() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("<>=", null); }); } [Fact] public void Should_ThrowException_When_DefaultHandlerRequiresValues() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("<>:", null); }); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("t|<>=", null, 2); }); } [Fact] public void Should_Not_ThrowException() { // ReSharper disable NotAccessedVariable // ReSharper disable RedundantAssignment var ex = Record.Exception(() => { var option = new DefaultOption("a|b=", null, 2); option = new DefaultOption("t|<>=", null, 1); option = new DefaultOption("a", null, 0); }); // ReSharper restore RedundantAssignment // ReSharper restore NotAccessedVariable Assert.Null(ex); } [Fact] public void Should_ThrowException_When_MaxValueCountOutOfRange() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a", null, -1); }); } [Fact] public void Should_ThrowException_When_MaxValueCountZero_And_RequiredType() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a=", null, 0); }); } [Fact] public void Should_ThrowException_With_IllFormedSeparator() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a={", null); }); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a=}", null); }); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a={{}}", null); }); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a={}}", null); }); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a={}{", null); }); } [Fact] public void Should_ThrowException_When_CannotProvideSeparatorsWhenTakingOneValue() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a==", null); }); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a={}", null); }); Assert.Throws(delegate { // ReSharper disable once UnusedVariable var option = new DefaultOption("a=+-*/", null); }); } private sealed class DefaultOption : Option { public DefaultOption(string prototypes, string description) : base(prototypes, description, 1) {} public DefaultOption(string prototypes, string description, int c) : base(prototypes, description, c) {} protected override void OnParseComplete(OptionContext c) { throw new NotImplementedException(); } } } } ================================================ FILE: UnitTests/CommandLine/Utilities/BenchmarkTests.cs ================================================ using System; using CommandLine.Utilities; using Xunit; namespace UnitTests.CommandLine.Utilities { public sealed class BenchmarkTests { [Fact] public void ToHumanReadable_Days() { const string expectedString = "1:02:03:04.5"; var timeSpan = new TimeSpan(1, 2, 3, 4, 500); var observedString = Benchmark.ToHumanReadable(timeSpan); Assert.Equal(expectedString, observedString); } [Fact] public void ToHumanReadable_LessThanOneDay() { const string expectedString = "01:02:03.4"; var timeSpan = new TimeSpan(0, 1, 2, 3, 400); var observedString = Benchmark.ToHumanReadable(timeSpan); Assert.Equal(expectedString, observedString); } [Fact] public void Benchmark_EndToEnd() { var benchmark = new Benchmark(); var elapsedTime = benchmark.GetElapsedTime(); // perform some work double unitsPerSecond = Benchmark.GetElapsedIterationsPerSecond(elapsedTime, 100); Assert.True(unitsPerSecond > 0); } } } ================================================ FILE: UnitTests/CommandLine/Utilities/MemoryUtilitiesTests.cs ================================================ using CommandLine.Utilities; using Xunit; namespace UnitTests.CommandLine.Utilities { public sealed class MemoryUtilitiesTests { [Fact] public void ToHumanReadable_Convert_Bytes() { var observedValue = MemoryUtilities.ToHumanReadable(123); Assert.Equal("123 B", observedValue); } [Fact] public void ToHumanReadable_Convert_KiloBytes() { var observedValue = MemoryUtilities.ToHumanReadable(1_234); Assert.Equal("1.2 KB", observedValue); } [Fact] public void ToHumanReadable_Convert_MegaBytes() { var observedValue = MemoryUtilities.ToHumanReadable(1_234_567); Assert.Equal("1.2 MB", observedValue); } [Fact] public void ToHumanReadable_Convert_GigaBytes() { var observedValue = MemoryUtilities.ToHumanReadable(1_234_567_890); Assert.Equal("1.150 GB", observedValue); } } } ================================================ FILE: UnitTests/CommandLine/VersionProviders/DefaultVersionProviderTests.cs ================================================ using CommandLine.VersionProviders; using Xunit; namespace UnitTests.CommandLine.VersionProviders { public sealed class DefaultVersionProviderTests { [Fact] public void GetProgramVersion() { var versionProvider = new DefaultVersionProvider(); Assert.Equal(string.Empty, versionProvider.DataVersion); } } } ================================================ FILE: UnitTests/Compression/CompressionAlgorithmTests.cs ================================================ using System; using Compression.Algorithms; using UnitTests.Compression.FileHandling; using Xunit; namespace UnitTests.Compression { public sealed class CompressionAlgorithmTests { private const int NumOriginalBytes = 20000; private readonly byte[] _originalBytes; public CompressionAlgorithmTests() { _originalBytes = BlockStreamTests.GetRandomBytes(NumOriginalBytes); } [Theory] [InlineData(CompressionAlgorithms.Zlib)] [InlineData(CompressionAlgorithms.Zstandard)] public void RoundTrip(CompressionAlgorithms ca) { var compressionAlgorithm = GetCompressionAlgorithm(ca); int compressedBufferSize = compressionAlgorithm.GetCompressedBufferBounds(NumOriginalBytes); var observedCompressedBytes = new byte[compressedBufferSize]; var smallBuffer = new byte[10]; Assert.Throws(delegate { compressionAlgorithm.Compress(_originalBytes, NumOriginalBytes, null, compressedBufferSize); }); Assert.Throws(delegate { compressionAlgorithm.Compress(_originalBytes, NumOriginalBytes, smallBuffer, compressedBufferSize); }); int numCompressedBytes = compressionAlgorithm.Compress(_originalBytes, NumOriginalBytes, observedCompressedBytes, compressedBufferSize); int decompressedBufferSize = compressionAlgorithm.GetDecompressedLength(observedCompressedBytes, numCompressedBytes); var observedDecompressedBytes = new byte[decompressedBufferSize]; Assert.Throws(delegate { compressionAlgorithm.Decompress(observedCompressedBytes, numCompressedBytes, null, decompressedBufferSize); }); int numDecompressedBytes = compressionAlgorithm.Decompress(observedCompressedBytes, numCompressedBytes, observedDecompressedBytes, decompressedBufferSize); Assert.Equal(NumOriginalBytes, numDecompressedBytes); Assert.Equal(_originalBytes, observedDecompressedBytes); } private static ICompressionAlgorithm GetCompressionAlgorithm(CompressionAlgorithms ca) { switch (ca) { case CompressionAlgorithms.Zlib: return new Zlib(); case CompressionAlgorithms.Zstandard: return new Zstandard(); default: throw new InvalidOperationException($"Unknown compression algorithm: {ca}"); } } } public enum CompressionAlgorithms { Zlib, Zstandard } } ================================================ FILE: UnitTests/Compression/DataStructures/BlockTests.cs ================================================ using System; using System.IO; using Compression.Algorithms; using Compression.DataStructures; using Compression.FileHandling; using ErrorHandling.Exceptions; using UnitTests.Compression.FileHandling; using Xunit; namespace UnitTests.Compression.DataStructures { public sealed class BlockTests { private static readonly byte[] ExpectedDecompressedBytes; private static readonly int NumExpectedUncompressedBytes; private static readonly Zstandard Zstd = new Zstandard(1); static BlockTests() { const string polyPhenBase64 = "G4D6gHGAUIA0gDSA+oAagDKACoD//3GAUIBQgE2ANIAygBqAbYFxgAOASoAAgP//NIACgCSAGoACgAyASoACgBCAB4AMgAKABYAJgJWANIAEQ8gDiAOsA8oD///KA8oDvAO8A8oDjAO8A7wDugNOQ6wDugPKA8oDp4FpQ+2A//86Q6eBOkOqQkaBKkJpQ7KBKkJBQrKBsoGygSBCaUMGQwCAAIAHgASACIAEgBmAAoACgAKAGYAEgBSAB4D//wKAAYABgCSAC4ABgCSAB4ADgAKABYAZgAGAAYAAgAOAB4AUgAeA//8DgAKAAYAkgAiA//9mQ/BCdEJmQ/5BiwN+Qn5CCUJmQ/BC6ELwQvBC/kH+Qf5BiwNmQ3mAA0NhgWGBW0K7gAND1UFhgWWBA0NhgVeB1UHVQf//tYBhgYFDW0IBgDqAA4ABgBSAA4AMgA2AAIACgB6AAoAHgP//AIACgAOAA4A6gA2A//9KgAWADIA0gACASoAQgAWAB4BKgAmAEIAagBCAAoAHgAqASoA0gACAAIAAgACAAIAAgACAAIAAgACAAIAAgP//AIAAgACAAIAAgACAAIB5gPtCYYFhgVFCu4ADQ9VBYYFhgQNDYYFXgdVB1UH//7WAYYE/Q1FCAYAZgASAA4ACgAGAGYADgAOAAYAZgAeAC4AEgAiA//8DgAOAAIABgAuA4YAAgAiArID//6qAe4AXgESA6IAOgFyAXIBZgA2AIIBZgJCB6IAJQosDZkPwQnRC8EKIA4iBfkL///BCZkM3Q2ZD6EJ+Qn5CiIGLA/BCckKsA+NC40KsA+NCrAOTA+NCQUOsA3ZD//+lA6UD20LbQuNCrAOTA///SoACgACANIADgBqAEIACgAOASoAEgAuABIAEgAKAA4AEgEqANIAEQ8oDjAOqA8oD///KA8oDvAO8A8oDjAO8A7wDvQNTQ6wDugPXA8oDBEPIA4gDrAPKA///ygPKA7wDvAPKA4wDvAO8A7oDTkOsA7oDygPKA///dYAAgASAVIAEgDqAG4AGgAyAdYAGgByAFIAUgAeABYAJgOKAVIAAgACAAIAAgACAAIAAgACAAIAAgACA//8AgACAAIAAgACAAIABgACAAICqgCyAH4B7gP//6IAygBeAF4CqgCyARIBcgEaACIAOgA2AXoF7gAOASoAAgP//NIACgCSAGoACgAyASoACgBCAB4AMgAKABYAJgJWANIB+gPtCYYFhgVtCtYADQ8tBYYFhgQNDV4FhgdVBy0H//7WAYYE/Q1tCAIAzgAeABYAkgAKASoAGgASAAoAzgAyA//8VgBWAAoACgAKASoAkgARDygOMA6oDygP//8oDygO8A7wDygOMA7wDvAO6A1NDrAO6A8gDygMCgCOAAIACgA6A//8lgAGAA4ABgBuAA4AMgA6ADYACgAOAAIA1gBqAAYBXgACAAoAkgP//M4AagASABoBKgASADIAMgBiAAIAEgAiASoAkgP//JYADgAWACoAAgDWAAYAEgAGAE4AGgBWAE4AMgAOAAoAAgDWAGoByQqwD40LjQqwD40KqA5MD40I4Q6wDdkP//3ZDckPbQttC40KsA5MDAIAAgACAAID//wCAAIAAgACAAIAAgACAAIAAgACAAIAAgACAAYAAgHJCrAPjQuNCrAPjQqwDkwPjQkFDrAN2Q///pQOlA9tC20LjQqwDkwMAgKSAKoAggHuA///ogDKAF4AXgKqALIBEgFyAQoAIgA6ADYDogHuAAYBTgAWABYAtgACAPYATgAWACIA9gASAC4ATgBOA//8FgAuAVoA9gACAGIADgAOAAoADgBmAAIADgACABoAEgAeAC4AHgP//AYAAgCSACIABgACAA4AGgCSA//8zgAaABIAEgEqABIAMgBOADYAAgAKAA4BKgCSAeYD7QmGBYYFRQruAA0PVQWGBYYEDQ2GBV4HVQdVB//+1gGGBP0NRQgGAOIADgAKAJ4ACgCiACYACgAKAKIAFgACAB4AHgP//A4ADgHeAJ4D//0eABYAFgDKAA4BHgBGABYADgGWABYAegBGADIAAgAOAA4BlgDKACUKLA2ZD8EJ+QvBCiwOSgX5C///oQmZDN0NiQ+hCfkJ+QoiBiwPwQglCiwNmQ/BCfkLwQosDkoF+Qv//6EJmQzdDYkPoQn5CfkKIgYsD8EIBgDqAA4ABgB6AAoAcgAWAAYACgCiABYAAgP//BIACgAOAA4B3gBOA//9mQ/BC90JmQ5NCiwN+Qn5CCUJmQ/BC6ELwQvBC/kH+QQ5CiwNmQwWAdYAAgP//VIAEgDqAK4AEgBSAdYAEgBuAC4AUgASACIAPgOKAVIC1gANDW0LLQdVBV4E/Q36A1UF5gFFCW0JbQltCxULVQWGB//8/Q1tCCUKLA2ZD8EJ+QvBCiwOSgX5C///oQmZDN0NiQ+hCfkJ+QoiBiwPwQnJCrAP///1BkwNyQpADkwPjQj5DrANyQj5DPkN2Q3xC40I4Q6wDkAMJQosDZkPwQn5C8EKLA5KBfkL//+hCZkM3Q2JD6EJ+Qn5CiIGLA/BCckKsA///kkKTA3JCkAOTA+NCPkOsA3JCPkM+Q3ZDfELjQjhDrAOQA6eBaUPtgP//OkOngTpDqkJGgSpCaUOygSpCQUKygbKBsoEgQmlDBkNyQqwD//+SQpMDckKQA5MD40I+Q6wDckI+Qz5DdkN8QuNCOEOsA5ADp4FpQ+2A//86Q6eBOkOqQkaBKkJpQ7KBKkJBQrKBsoGygSBCaUMGQ3JCrAP//5JCkwNyQpADkwPjQj5DrANyQj5DPkN2Q3xC40I4Q6wDkANyQqwD///9QZMDckK3A5MD40I+Q6wDgUI+Qz5DdkN8QuNCOEOsA7cDCUKLA2ZD8EJ+QvBCiwOSgX5C///oQmZDN0NiQ+hCfkJ+QoiBiwPwQqeBaUPtgP//OkOngTpDqkI4gSpCaUOygSpCp4GygbKBsoEgQmlDBkO1gANDW0LLQdVBV4E/Q36A1UF5gFFCW0JbQltCxULVQWGB//8/Q1tCcoFhQzVD9EL//0dCs0LsQfRCmIBmQzVDZkM1Q/RCPEKzQmiBMkMPgI2AH0NPgU+B5ELmgCZD60ELgQuBJkNGgQaAoUI7Qv//EYDSgIVD5ELEgBdDJ4FdgBdDbYHPQn5C//8ngRBDY4EwQhCAjoDEgGOBu0F9Q9JCMIGNA///ooGNAw2AiQNpQ1pCN0ONAz+AaUNpQzdD1EGlQmRDvQOJA///U0M5Qs9BHENEgVND9kHPQc6AU0OQQoZC1UKQQoWADYDHgJ4DU0MOgF1Dr4FigSpDlYBdQ+VBYoEVgV1DNkIrQqRCpEL//xmA3YCkAypDbEK1A58DnwN8Q58D0ANzgJ8D//98Q7UDtQO1A58DgkMbQ0yBzwOfAwKA50FGgEaAx4AXgOdBMoAygA+A//80gAGAXoBDgAOAAoAPgPFCLIHdQsMD//8hgMMDFkOwA8MDn0KBQ8MDFkObA/1CYEPmQl9DmAPiA8EDAYArgSyAIYARgACAK4EAgB+AAYD//yGARoBGgEOAC4AQgACAf4FGgBxC0QMLQxJDxgNvQsUDxQMLQ4JD0QP//8YDpQOCQ5WAhkKlA90DxQOtQt0DoAOCQ9ADREPRA8YDgkOCQ9EDoANRgMYDxgP//z5DugPdA9ADUoF0Q01DE0P//9dCeEMBgRNDCoBDQk1DeEN4QxNDz0KSQgGBeEPhQeVCzQN7QgNDwwNQQsMDwgMDQ5oDzgNQgLYDoAN1Q///mUKaA9wDwwOKgM4DtgN5Q84De0LNA7MDeUNPQ84DwwP//8MDsANxQk9DKUPkA84DA4C9gA2ADYAPgAmAvYADgA2AAID//w2AKIAogBGAAYAAgASA/YAogEGAykIpQrRBpIBXgcpCA4DhgAKA//8pQilCKUK9QemAhYADgMpCyEHZQsoDdkPKQpcDZkPKA09DykJZgKwDqQP//1CBckM3Q2FDykLKA8oDJ4CXAytCd0IXQ1SBlwM5gG1CnoBPQy+AEENIQxBD6YD//56AwQN2Q11D+0HOA84D3wO8A+QD1gPOA7QD4APOA9sD3APcA///ugPQA+cD3wPmA+gD5wPnA+cD5wPnA+cD5wPnA+cD5wP//+cD5wPnA+cD5wPoA+cDIoB4Q9RBUoEXQ7eAeEPiQVKB//9NQ9RB2ELYQpNCEYC3gOiAsgNNQ4BD5APOA84D4AO8A+AD3APCA9cD5APOA4WB3APcA///ugPcA+cD4AOaQs4DwgN1Q7QDdUPOAytDcUOQQv//sQPCA7QDsgO+QsaAkELcA8IDUYG0A54DfkNbQ1tDtAP//3pD4IB6Q54DngO0A5sDW0OSQhWA0AOeA+cD5wPmA+YD5wPlA+cD5wPmA+cD5wP//+cD5wPnA+QD5gPnA+gD5wPhA+cD5APcA+cD5APnA+YD4QPkA+cD5QPmA///5APkA+UD5QPnA+YDGYB8gaCAY4AOgEqAtYD//0qABoDigJqA6YA2gemAR4AYgAGAKIEBgG5D4APHA0FD1wO0A9cD0AP//50D1wOaA9ADmgN3gJ0DsgPQA+AD1wNuQ9cD0APQA///xwPgA9FC0AN3gMUD0APXA9cD0APGA7QD0ULgA48DwAPnAy+B///gA8AD4APgA8AD3APkA8cD3APXA9wDwgPXA9sD5wPkA7ED5AP//9aA4APAA98D4AOHQ9YD4APAA9wDuwPQA7QD0APWA+cD3wO6QbQDKID//7QDJYC0A54DXUJ+Q7QD3UKeA0xDW0PFQRhDVkPQA7QDJIBjQgWAA4AHQj2AY0K2QWCAlYC7QmCA///FgMWAA4BcgMyAu0IHQhBCtAP//wyAtANdQpoDngPFQS9DtAPVQltDS0L7QhpC3UJWQ9ADsgNZgL0DyEJ9Qm5D+UG9AwaB+UH//4KBfUJzQ01DTUNLgFmAMYDUA6sDlgPgA8QDTkPgA8QD3APXA///tgPgA7QD3QOlQo+AlgPCA9ED5APdA+gD6AP//+cD6APnA+gD6APoA+gD6APoA+gD6APoA+gD6APoA+gD6AO9A+QD5APiA9wD4gPkA3CB4gP//+AD5APkA+QD5APiA9gDIkPmA+IDxgPkA+ID4gP//98D5gOPA+ID8EHfA+ID5APkA+ID3wPfA48D5gPQAyxC0gPIA6kDY0OpA9ID//+pAyJCuAPHA8gDyAPIA4RDOkMygN4DC0K6A+YD4gPiA+YD2gPmA+UD4QPeA+YD4gPlA+UD4wN3Q///3gPnA+YD5wPoA+gD6APoA+gD6APnA+gD5wPoA+gD6APoA+gD6APnA///6APoA+gD6AP//+cD6APnA+gD6APoA+gD6APoA+gD6APoA+gD6APoA+gD6AMTgIwDCoD//2dDGoFLgNFCZIGPgagDNYDyQo6AM0IjgY+BhYHKAz5D5wPoA+gD6APoA+gD6APoA+gD6APoA+gD///oA+gD6APoA+gD6APoA+cD6APnA///6APoA+gD6APnA+gD6APoA+gD5wPoA+cD6APoA+gD6APlA+gD5wPnA+gD5gPoA+gD5wPnA+gD5wPoA+gD6AP//+YD6APoA+gDYEPaA6cD/kJKQ6oD///AA1FDTEPaA6cD0wMVQnRDlAPAA58D1AODgDBD2gPTA9QDsAPTA9oDVoDLAxVCwAPaA9QD2gPaA8sDlAP//+YD1AO8A+QDygPYA+QDuAPiA+ID0gPWA+QDQELiA+AD3AOWA///1gPnA+QDHYDSA4gDZ0O6AyxC0gNjQzpDLELSA4gDhEO6A6cDIkL//yxC3gPIA75BmwN7Q3tDLENYQ7ID//9SQ9uAUkOcA3tDsgOYAytDOkIXgLIDe0PnA+gD5wP//+gD6APoA+gD5wPoA+gD6APoA+cD6APnA+gD6APoA+gD5wPoA+gD6APoA+gD6APoA+gD5wPoA+gD6APoA+gD5wP//+cD6APoA94D5gPlA+UD///lA+QD4QPjA8cD5gPmA+YD5gPmA+MD5QPaA+YDzEL+QtoD1APLA78DywPaA///ywOTQsoD0wPUA9oD2gO/A3RDjYDiA9QDc0PhA9oD0gPUA9ID4QNlQ9wDAYEmQ9oD3gPeA+IDwgP///lB5QPeA+gD6APoA+gD5wPoA+gD6APoA+cD6APoA+gD6APoA+gD6APoA+gD///nA+gD6APnA+gD5wPoA+gD5gPnA+gD5wPoA+cD///nA+cD6APoA+gDLELSA8gDqQOQA4gD0gP//6kDMYGnA8cDyAPIA7oDY0PdQh2A3gO6AwqAjQMzQjNCqEIjgagD///kQVmAjoBfQoBC8kKxQhqBIIAKgMoD8kLnA+gD6APoA+gD6APoA+gD6APnA+gD6APoA+gD6APnA///5wPoA+gD/kLaA58D9kLaA5QD1APUA///eEPTA5wDywN+gRWBUUOwA8AD4wPaA4ND4QPeA94D1QPaA+EDLELZAxxD2gPeA94D4QPeA8cD//8cQ+UD4QOnQtsDakNqQ9UDKkPbA4FDakMOQ9sDakOsA8MDrwP//1SADkPiA9UD5wPoA+gD5wPoA+gD6APoA+YD6APoA+cD6APnA///6APnA+gD6APoAwVD4AO2A7QD4AP//+AD1wOlA8gD4AOlA9cD1wPQA92AjAPQA+QD4AN2Q+AD4ID//+ADtAPcA9cDTkPIA+ADtgPYA8tBjAN6Q7YD0APkA90D5wPoA+gD6AP//+gD6APnA+gD5gPoA+gD6APoA+gD6APoA+cD6APnA+gD6AP//+cD6APnA+gD6APoA+gD6APoA+gD6APoA+gD6APoA+gD6ANWgH6BqgOUA8oD/kLaA7IDeEP+QtoDqgPKA8sDywP//zBDUUPiA9MDnEKDgL0DzQPaA31D2gO/A70DUUPaA8wD1APUA9MD//9aQ58D4gPaA+cD6APnA///6APoA+gD6APnA+gD6APoA+gD5wPoA+cD6APoA+gD6AMdQ9EDyAOpA///iAOUA2NDqQMiQtIDyAPIA8gDugOEQ4gD3UK6Ax2A3gPoA9YD///mA+ED5gPmA8cD4wPmA+MD5QOfA98D3gPiA+UD6APmA+cD6APoA+gD6APoA+gD5wPoA+cD6APoA+gD6APoA+gD5wP//+gD6APGA+YD3wPGA+ID2gP6QeIDsAPaA+QD1APiA9QD///TA9oD4gPmA9sD5wPoA+gD5wPoA+cD6APoA+YD5wPoA+cD6APnA///5wPnA+gD6APoA+cD6APoA+cD6APoA+gD6APmA+gD6APnA+gD5wP//+cD5wPoA+gD6APoA+gD6APoA+cD6APoA+gD6APnA+gD6APoA+gD6APoA+gD6APoA///5gPoA+cD5QPoA+cD6APoA+YD5wPoA+cD6AP//+cD5wPnA+cD6APoA+gD6AP//+cD6APnA+gD6APoA+gD6APoA+gD6APoA+gD6APoA+gD6APnA+gD6APoA///6APoA+cD6APmA+gD6APoA+gD6APoA+gD5wPoA+cDvQPkA+QD4gPcA+ID5ANAQuID///cA+QD5APkA+QD4APYA3hC5gPkA+gD6APoA+gD6APoA+gD6APoA+gD6APoA+gD6APoA+gD6APoA///5wPnA+gD6APoA+gD6APoA+cD6AP//+gD6APoA+gD6APoA+gD5wPoA+gDxgPmA98DuAPkA9oD5APiA///1APkA9MD4gPTA1yB1APaA+ID5gPkA+SAlQNNgReAlQP//0tDREPPQSRClQMkQhNDQ0KGQhiATYHPQpUDlQOWA+AD0QN2Q+ADxAPgA9wD//+2A+ADtAPdA7QDj4C2A8ID0QPkA+AD5wPoA+gD6APoA+gD6APnA+gD///oA+gD6APoA+gD6APoA+cD6APoA+cD6APnA///6APoA+gD6APnA+gD6APoA+gD5wPoA+cD6APoA+gD6AMJQrIDF4D//7IDOkKbA3tDs0FYQ7IDRUJ7Q89CWEO+QZVCd0PfA7ID///kA8MDwQPkA7gD5APiA8JC1APkA8JC4gPbA7QD10KnA9MD5gPkAyND4QPHA7MDlwOgA///2QPIA7sD4QOwA94D2gPHA3lCrQPIA+ED7kHnA+gD6APoA+gD6APoA+gD6APoA+gD6AP//+gD6APoA+gD6APoA+gD5wPoA+gD6APoA+gD6APoA+gD5wPoA+gD6APoA+gD5wP//+cD6APoA+cD6APoA+gD6APoA+gD5wPoA///6APoA+gD6APoA+gD6APnA+gD6APmA+gD6APoA+gD6APoA///6APlA+gD6APoA+gD6APoA+cD5QPoA+gDVEPhA94D2gPTA9oD4QP//9oDxELZA94D3gPeA94DxwOdA+6A5QPeA5sD5AOEQ4RD0wOEQ8SA2gOeA7oD3gOigf//ywOpA4FDmwPKA+QDwQPFA+YDXIH//+QD0wPkA+IDxQPiA+YDzQPiA9oD4gO6A9QD4gPnA+QDHkPNA80DwAOXgJsDvwNhQq8D//+yA80DzAPNA78DmwOvA2FCwANdQ2GAwQP2QhtCRUP4QcIDmIBuQjaAmwO1QkVD6YAbQ///N4CjgdcDnAOcA+YD2gPUA+QDxgPkA+ID1APUA+QD2gPrgOID4gP//8UD2gPmA+QDHUPSA6IDiAOiQogD//+UA2dDA0PSA4RDxwOnA4RDZ0OpA0BDyAM4gN4D5wPlA9oD5gPkA+YD5gP//+ID5gPjA+UD4wPMQt4D5APlA+gD5gPnA+gD6APnA+gD6APoA+gD///oA+gD6APoA+gD5wPnA+gD6APoA+gD5gPoA+cD5QPoA+cD6APoA+YD5wPoA+cD6AP//+cD5wPnA+cD6APoA70D5QPjA+ED4QP//+UDyAPhA8gD4QPjA+UD5QPjA94D1QNVQucD4wM3Q+YD5QPjA+YD2gPmA+UD4wPhA+YD5gP//+YD5gPcA+ED4gPoA+YD5wPoA+gD6APoA///6APoA+gD6APoA+gD6APoA+gD6APoA+gD6APoA8YD5APiA+IDXIHiA+QDqgPiA///4gPkA+QD5APiA98D3wO4A+QDtAOrA+AD4APdA9mA3QPgA05D3QP//90D4APgA+AD3APdA9gDdkPgA60DI0PhA70DvQPeA4ND4QPAA60DlwPhA6sD2QPZA9MD//9agbQD5wPhA/5C2gPUA6oDwgOnA9oDngNaQwBCFULAA9QDsgP//5EDkQOEQ9oD1APNgF1D80KSQvhBkkKGQ3+AtUETgP//X0M0QzRD60K/QR6BBoC5A/NC5wPoA+gD6APoA///6APoA+gD6APoA+gD6APoA+gD6APoA+gD6APoA1BD5APmQmuA2gOeA9kD2gPeQpsD3gOeA8gD//+BQ1BDngO8A+QD2gPfQcADMUMxQ8AD4ULAA5gDMUPhQs0DK0OYA5sDgUNjgP//4ULNA8ADvAPkA+QD4gPhA+ID5APyQeIDc0PiA+QD5APkA+QD4gPPA///5wPkA3pD4APIA1RD4AO0A+AD1wPggLYD4AO2A9cDpQP//6IDtAPRA+QD3QP//94DrgOdA9kDK0PeA8sDsQOUA94DsQPSA8sDywNYgXFDkAPmA94D0kLYA8QDuAOMA7YD2ANoQqgDU4HxQdAD0APHA8cDuANkQ///2APFA///6APoA+gD6APoA+gD6APoA+cD6APoA+gD6APoA+cD6APoA+gD6AMPgAdDu4AwgdSABYAjgFiA8IAFgICBD4AEQq2BrYH//52AW4ACgNSA5QPoA+cD5wPoA+YD6APoA+cD5wPoA+cD6APoA+gD///mA+gD6APoAyiAwEKBgc6AAoDOgMBCDoDIgAGA//+BgYGBgYEggZ6AdIABgAhDWYDnA+gD6APnA+gD5wPoA+gD5gPnA+gD5wPoA+cD///nA+cD6APoA+gD5wPoA+gD6APoA///6APoA+gD6APoA+gD6APoA+gD6APoA+gD6APoAyND2wPVA88DwgPOA9sD3oDDA5iAzQPWAzCB1gPWA68DakP//+ID1gM2Q9oDwAMwQ9oDlAPUA8oD//94Q9oDkQPUA3RDVoB4Q5EDywPiA9oDcoBIQ6yABYEOQxKAB0PKQv2AeoF2Q///jUIcQs1BBYCsgHqBdkMHQ+cD6APoA+cD6APoA+gD6APmA+gD6APnA+gD5wP//+gD5wPoA+gD6APnA+gD6APoA+gD6APoA+gD6APoA+gD6AP//+gD6APoA+gD6APoA+gDWkPaA1aA///aA3RD2gPLAzBDywPaA1FDywORA8sD/kKUA8oD5gPaA71BtAMlgP//MkNDQp0Dy0G9QUqB6YBOQlxD10IxQ8dBnkI/gLQDngPnA+gD6APoA///6APoA+cD6APmA+gD6APoA+gD6APoA+gD5wPoA+cDBIAoQ4CAq4D7gFuAMkIKgHuAIoD//waAhIE6gfOAE4AEgCKAtQNcQiKBqgMUgP//bEPjQY8DmIGOgT6AbEMfQvlCXIAfQpiBPEJcgKoDkANqQ9sD1gPPA6UDzwPbA6dCvwNegP//1gPWA9YD1QPBA6wDp0LbA9YDI4GoA2SBI4EMQxSAfYAygDqAw4BnQ///8kI8gI+BIIAagcOAygPqQiOBqAMKgAqAZ0MjgWND6kIageRBjQP//4BCI4E8gCCAGoEzQsoDakMlQrwDlgM4Q///3EIfQ45COEN4gasDc0OrA6sDc0P9QjhDy0GQAw+AI4GoAzND8kLZQYBCqAP//zNCCoBZgDhDM0M+QwxDKEJkgRGAqAPyQpVC0gMdgP//0gNWgNIDyAMWQ6kD0gMKQ70DhEOpA59CiAOnA94D0gMDgExCA4AJgFaAN4BMQgGATID//wiATIDDgJWAlYAjgAOACICjQi+B5wPoA+gD6AP//+gD6APnA+gD5gPoA+gD6APoA+gD6APoA+cD6APnA8ID5wPhA+ED5gPzQeYD5QPhA+QD5wPhA+UD5QPlA///3APlA+gD5wMqgaoD7UEigURDw0EUgGCAbYEigZADYIAyQ///40FlgO1BmIGqAx9ClwPhA5cDNYHhA8wD3gPdA///zAPhA8sD2QOyA6sDlwPLA9QD5wPhA7oD5APiA+ID3wPiA+QD///eA19DX0PkA+QD5APiA98D0wNfQ+YD4gOwQiKBZUNqQ9YDsELbA8wDPUObA9sD///NA80DtgP4gAZDmwPiA9UDgoFogAVDBUNHQ05CvQNOgAVD//9uQwVDlgOQA5ADVoCCgTGA0wOrAyVCvQOrA2pDyEJqQ70D//8yQw+AEIGrA6sDqwONAzhD00IugL0DlgPoA+gD///nA+gD5wPoA+gD6APoA+gD6APoA+gD6APoA+gD6APoA+gD5wPoA+gD5wPoA+gD6APoA///6APoA+gD6APoA+cD5wPoA+gD6APoA6oD5APiA+ID3wPiA+QD///iA19D3gPiA+ID5APkA9sDywPwQeYD4gNdgNEDiAMdQ8cDLELSA5QDZ0OfQtIDiAOnA7oDqQP//1iAn0LeA8cDd0LJA99Cy0GoA99CKICoA8tB10LJA05CqQP//6+A30JlQ2VD2gPIQucD6APoA+cD6APnA+gD6APmA+cD6APnA+gD5wP//+cD5wPoA+gD6AOZA+QD4gPiA98D4gPkA///4ANpQ+ID5APkA+QD5APfA84DcIHnA+IDZYDBA+5CtULuQkZCM4BlgLVCo4GbA9tCeEMbQxtDmoD4QWWA1wP//8YD5gPfA7gD5APUA+QD4gP//80D5APTA+IDywNcgdQD0wPiA+YD5APnA+gD5wP//+gD6APoA+gD5wPoA+gD6APoA+cD6APnA+gD6APoA+gDRIGVA+SA//+VAxpCS0MTQ+SAz0FxQ89B10IOgM9BTYEkQhpClQNEQ/5C2gOyA/5C2gN0Q9oDywNWgJQD2gOUA8sDUUP//0xDkQOyA+ED1AOoQtQDPID//9QDXYDUA74D7kK+A9QDGEO+A2xDmgOxQnFDrAPfA9QD2gPmA+YD5QPQA+UD5QPOA+UD1ELkA+UD5gPlA+UD5QPjA84D5wP//ziAu0LbgNuA//+MgCJCNoCMgAKAgIHbgNFB0UGEgQeAW4APgMRCCIAigaoD//8KgNVCIoE+QxJDmIEfQqoDH4D5Qh9CH0JggJiBsEKPA0uAlgPnA8gD///kA12B5APkA9UD4APkA9wD4gPgA+IDzwPcA+ID5wPkAw6AMEOcQrlBJoEEgWJDlICvgQSA//+cQpxCnEKSQsyAPoFhgGJDnEKpgDFDSoEdgO9Ck4HvQplC//8DgTFDQYGiQvuABoCpgEGBCEIxQ+9CxoDOAytD///OA/RCzgOxA/FBUEPOA3VDtAMJgRKBvkJ1Q0pD3APCA2SBHYCGQ2pDHYDqQoZDhYFqQxqBpQOGQ4oDigOKA+JC6kJZgKUD//8cgdcDtgM2Q9cD///XA8UDY0OnA9cDpwPQA3BCtgNvgWNDpAPgA9cD5gPoA+cD5wPnA+cD5wPnA+cD5wPnA+cD///nA+cD5wPnA+cD6APnAxmAfIFKgEqAWIBKgAmA//83gAmAD4BHgKCAoIB5gAaAAoABgNtBoIBUgLtCPoGqgCWAqoD//5qAqoBcgGNC1YANQj6B/ICqgASBYIAUQgKAAYAhgS2ALYBVgBiAeIH//xiABoCpgCCAYIBIgAKABoABgAGAk0KbgKSBlwNIQ9ZBSkJ3QpcDVIHWQf//QkNIQ3FD6YA5gL9Cd0JGgDeAlkLHA98D4APcA8cD1wPgA7oD3AOFgdcD4APgA+AD3wPbA9YDqQP//9cDhoDOA3JC/ELCAxmAwQOxA/xCSkPOA3uAsgOxA4ND///0QlBD3APCA///UkPNgA6A1EIOgBlDn4HNgM2AUkMfgIVCN4AzgYSAFIDQgJ4DG0MygDlCUoBwgIyBNIA5QoWAUoAEgDlCBYA5gUKBsoD//wSAUoCWQtxBaUPXA48D///FA7ID4ANxQ48DJYHQA8cD0AOyA8cDtAO0A15D4APXAxyB1wNBQ///1wOAQ9cDxQPRQoRD1wO4A8cDb4FjQxdDnQOAQ+AD0ANKgZcD//8IgJcD6EFwQ0hDVIF3QpcDmoEXQ3aA1kFUgXdCbULBA3FDIkPNA80DzQOdA8IDzQOpgLED//+aA80DzAPMA8wDsQOdA+ZB5APNAwGAOULdgLKA0UFPgJZCMoCFgDKA3EFCgeWAjIFCgTSACYD//5ZC3EHpgJcD//8IgHZD6EFwQ0hDVIF3QpcD6EEXQ9ZBmkJ2gKSBtkLBA3FDA4DKQkGAQYB5gUGAd0IGgEGAKIDIQWKAAoASgRKBA4D//yiATkMpQqgD4APgA9wD1gPcA+ADR0PcA///AkPgA98D3wPbA9wD0ANwQ+QD3ANUgZcDK0I4QpcDD4CXA3ZD//+/QpMDRoB2Q7ZCRoCkgbZCF0PBA3ZDBIAGgf//DoBKgACAkIAJgA+ACYDEgAGALIAsgCGAAIAPgAGAWIFfgG+ANkOhQmBCZoFWQjZDCoARQgaAFICqQvlCYUJhQp2BU4H//zZDqkL//+QD2wPCA9wDugOYQtwD0APOA+AD1gPbA9YD1gOlA80DzQPkA9YDYYASQwiAlIDUQgWA0UJ2QpSAb4EZQwWAj0LiQWWB//8ZgG+BfkPUQuED///mA+cD5wPmA+cD5gPmA+QD5wPnA+cD5wPnA+UD5QPkA+cD5wMmgORBhYDigF+AbYBGQv//roABgLyACYCIgehBPoFpgDmAAYBGQj6BukG0A///F4CdA9VCmgO3QktCS0K0A5JCfkO3QvdCxUHdQpeA3wOyAxmA20EogBmAf4EYgOmAroABgCiAKIEBgHmAAYD//xiAJoB5gNtBKIGwgP//6UHpQQRDbYF0QwKBd4EggAxDOkIMQ89Cx0IbgAKBSYGuA0JDVYH//xpD4EJFQmFCn4AaQ+BCHkK1A+BCnwN/Q1hDboAeQmtCsgM/gLeAeEPUQTOAE0MjQktDgIH//zWADEPKQdhCKYAQgAqBkoE1gHhDTUP//6UDe4EggEiAbEKlAxKBhYEagYgDOENlQ8hCBUMeQh5CMYClA8hCvUHgA9wD0APgA6ID4APWA9YDtAPgA9sD2wPcA9wDcEP//7QD5APgA9lCygMogP//qQM3Q8oDIEPZQvaAvAPKQrwDYUOTA6BCZkMEQ9oDygOJgFJDzYANgBtDTYHUQvRB///NgExDQ4EVgAKBB4CJgBSAzYCeAxtDiYBSQ4GBiYBFQtZBFYAzgYmAFoCPQguBBEMOgP//zYBNgTOBVENDQjKAUkJZgJ6AWYA5gPJBAYBLgAGA//8FgI6BNYGZgDmAZIAFgKlCNYEBgASAaIA5gAOAOYDyQSKACYADgFWBToDHgAmAOYD//wKAA4CpQgmACIATQwiAWoA8gf//DoDYQQ6Av4ATQ5CAtEEkgBiAXoD1gIuAE0MkgN+AMUPyQqJCDoBXQudCo4CiQv//V0IxQypDMUOZQldC40EdgO9CtEEKgHhDCoGcgU1Dt4B1gJNCnIECgXhDHYDPQthCcUL//zOAgIGyAxND0EG4AyiA//+kAyiA94CkAyZCJkO4A69CjAP4gO5CfEImQ1dD0gOGQ///dIEXgAKAA4AXgAKADoAYgA6A44ABgF2ABoA1gASAAYABgNNBWICoA+QD1gOHQ9wD1gPgA7wDuwP//9sD1gPbA/pCzwPQA8IDugPkA9wD2oCNA1lCWUIegAxCiQMjgFlC//82Q8NCY0M2QzBDQYBBgCiAjQMrQuYD6APnA+cD5wPnA+cD5wPnA+cD5wPnA///5wPnA+cD5wPnA+gD5wMDgNpCcIF6geJBBYAgQwmAM4E8gD1C7UE9QopCPUKagM2A//+CQ91CLIGXA3ZDcUOaQktDlwMZgBdDCIDeQnZDdkN2Q0hDv0LzQf//lwN2QzBCuAPmQttBYUJ8Qv//hkPbQeZCuANyQqEDdoBWQjBC7kImQ7gDdoBpQ+ADs0L//9cDgEPXA9ADaUPHA9cDJYHQA7IDuANHQ50DzwPlA9cDnQPXA88D0ANpQ9AD3AM2Q9ADJYHQA88D1wPQA9ADxgPHAxdD4AP//3BCygO8A7sDl4CqA8oD2EG8A0iAlAO8A7wDygO8A60DZkP//8oDUkOoA+AD4APcA9YD3APkAxxC3AP//9wD4APfA+AD3wPcA9cD+kLkA9wDVYEYgBNDGkP5QhmAnAMaQxpDpoG1AxNDhUOFQ4VDdIBrQmtC0AP//zeAwQNxQvdCswPwQcMDFkO3QnFCwwOUgHlDeUNHQ///YoBxQtcDswNpQ9cDsYD//8UDmgPXA3FDaUMlgdADhEPQA7IDxwNuQ7QDXkPgA9cDBoAJgRyAHIABgByACYEAgBOAAYD//xyAPIA8gCqACYAAgAGACYEUgCWB1wPQA8UDwwOnA9cDHIHFA///0APQA9AD0APQA15DnQMcgeAD0AMFgFJCnoBOgPWAOYBSQiGABYAOgP//eYDOgJ6ANoAJgDaAAYA5QzWBAYAvQgSADoDcQf//6UH9gIWAhYA5QgSAQoFCgfmAMoBSgICAlkLcQfiAnQOkQupBekM1QnxDTUIagD6BnQOtQlNDqUJeQn+AuUH//8QDfkOngBNDh0IVQgWAFUITQw+ALYECgP//h0KHQodCqYE2gS2BOoATQ+CAGkK0A1tDEEK0A91CnQN6Q///m0K0A9VCngOSQheAm0LVQltD0AO0A7CAS0PCQcWA40G3QUtDiYEMgAyA40HCQYRCPID///mAt0EpgUtDykK5QZ0DfkPQQvRCHkOdA59CTUIagPRCfkN+Q35D//8eQ4FCXkKdA3xD5gPnA///4wPnA+UD5wPnA+cD5wPnA+UD5wPnA+cD5QPnA+cD6APnAyeAT0IngDSABYA2gAOAPoA0gCWA8kECgAKA//9SgDaAC4AGgE9CPoBIgIxClIFJgQKBAoHfQv//+oAEgDyBRYHiQeJBioGzgAaADIDfQjNCg4BLQ4OAB4ATQ0KBMoDKQn+AOYFLQwGBCEL//76APIABgcJBmgOEQv//nQPqQeBBU0MZgJ0DqULqQepBnQNeQk1DJUMlQxuAK4DgQcQDnQO+gEtDBID//+NBC4ARQ8pC+YDCQUtDKYGEQrdBN0LFgMJBCEJLQzSArIAZQwiAlIAFgMWAMkKUgBqA//92QueAdkLXQd+AlID8gF2A0UIJgLCAeEP//wWAE0MCgUZDE0MKgdRBeEMCgXFCVoDUQR2ACoEYQrIDSEMDgMFCA4BBgJ6AQYDIQWaAioAogBmAZoBOgRKBEoH//3OAC4B0QgmAsIB0QwKBMYBGQwKBFUPHQv//ykFGQ22Bz0IngBCAHID6gMpBdENIQwWAGUO/gPyAioGPgBlDBYDFgDmAMkIRgL1BMkJvgWGACYD//35DgELpgJcDD4D//5cDmoGXA0hDF4ArQpcD1kEXQ8tB1kF2gCtCbULBA5cDAIDEgA6AAIBfgAmAxIAOgA+ACYCWgBSAHIAXgBeAAoAJgP//WIFjgLCAdEPPQopCgkJ3gXRDqoDKQf//bICMQgxDz0LPQjGAAoEbgHBDDEMBgNtBKIA3gEKAAYDpgAKAKIABgAWAAoCagHmAW4AYgP//D4B8gQOABIC9gA6AD4ABgA+AbYAPgA+AAYCWgACALIAsgCGAAIABgA+ACYH//6SBlwO/QkqBlwN3QnRDQkP//ytClwNtQnZDnoAPgKSBbUIXQ8EDdkNhgBlDCIBdgIBCCIAZQyaA//9hgBJD34C9QSSADoCUgPSAGoB+Q9RC//8QgNOAzIAsgQKAfkIEgKOAHIDCQaOABIDCQSyBLYACgASAG0MkQi2ACoD//wGAwkECgBVCDIBKgC+AIEJigA6BeoCjgC+AB4AtgBtDGUIAgN+AEYARgECAAIDfgAGAAIAGgKyAAYAygAOAAYAKgP//A4B/gXOARIDaQv//AoCKQkSA0kHcQUeAI4HaQgaAaoGvgBOAR4ATgN+AWUOBQgKAIEJKgEqATIAvgC6BDIBKgP//uEEHgAaBHYDMgASAL4AEgCBCBoD8gEhDD0MPQxCAfkJCQ7qAfkL//4CAD0MHQw9DvEJ+QrtBIoAtgI2BAoBmgAaAA4AxgASARoAHgACABIAxgAeA//8BgAGAAIAAgAeAxIAdgE5CzgMWgP//sQNOQrEDmQNngNFCsQOOQn5DyUIngFhCDkMGQ84DmQNOQrEDI4D//7EDTkKxA5kDO4HRQrEDjkJ+Q5CAPYBYQg5DTkPOA5kDRYDUQoNCY4EUgWOB1EL//2OBA4DKQXpCg0KDQiVCFIGLgAOADoAdgROAF4ALgAqA+EL//zZDqkKugFKBNkOcgWBCCoEYgJuAaIGSgY8D+EJGgLED0UJOQnhDC0KxAzJC//+1QbEDyUJ+Q05DBkNGgKuBa4DOA5kDw0LFA8UDtgNgQ7YDwwMhQp8D//+zQcUDwwO2A5wDhkOGQyFCxoCGQzCAmwP4Qf//fEOqgZsDUENLgANCmwPMQllDaoBOgLRBUkJ8QsQDfEOrgbED//+aQnhDI4CuA9yADkM8QpgDBkOZA5kDeEOogI5CMkLOA64Di4GbA2KBMIB8Q1JCmwPbQf//YoGbA8NCFUMxQhuAA0JIQk6AxAN8QxOASEPFQiVC4UEbgUhDgIC7QQyAIoDFQsVCxULFQiOBPIH//0hDxULEQtYDOEP//9YDV0PWA9YDekPDA9YDowPOA5YDwwNqgS1DwQPlA9YDN4D//6+ASoAEgG+Av0EvgC+ABIBxgdGAX4EMgAKAU4B0gC+ABIBHgP//34DNA84D1gOWA9YDwwPOA1xD1gPWA80D1gPOA5YDlgN6Q+UD1gPlA+cD4wPkA+cD4QPnA+cD4wPmA+cD///nA+YD5gPgA+MD5gPoA+cDgIBIQzOBwYDFQt2AB0PSQbqAI4FIQ///fkIzgG+BOIATgCOBmAMHQ///xQOCQ/BCxQO6QsUDtgPbgFRDxQOGQ7MDhkMHQ5yA6EJOQ+IDxQPkA+cD5wPnA+YD5wPnA+ED5gP//+cD5wPnA+cD5wPmA+YD4APoA+cDX4F+QzFCDYF+QzFCU0MVQ///4kF+QyZCVUNMgBGAX4EmQuNCtQNVQ///sQMLQmeAmQMLgLEDmQNYQg5DsQPRQnpDJkNTQ2eAq4GaQs4DmQPmA+gD///gA+cD4wPnA+cD5gPnA+cD5QPmA+YD5wPlA+YD5wPoA+cD5gPnA+cD5wPlA+UD5wPnA+cD5QPnA+cD5wPnA+cD5gPmA+YD///kAyyACYACgP//bYECgHdCdoBfgASAu0FIgAiBAIEIgS6AB4AsgBZDHELfA+UD5APfA+UD4gPlA+UD2APiA+UD4gPkA+ID///iA+ID5APnA+UD5gPnA+cD5wPlA+UD5wPnA+cD5QPnA+cD5wPnA+cD5gPmA+YD///kA1qAykJmgD+AeEKKgBlCD0L//4qAykIJgLVBCoAFgFqAroATgU5DeEIMgCiBAYAHgHyAEIABgHyAAoAZgOCADIBOgP//AYAMgCSAJIAogSCAc4A2QxiBIYGqQgaA8EICgd2AroD1Qv//7EHsQZyBE4BugB+AjwPwQttBxANZQ8xCFUMVQ5sDo0L1gF2B//8VQ1lDzEJLgFJCo0IxQsQDfEMdgAdCL4AcgKmBHYBEgc+AAYAcgKmBL4CHgP//AYAdgEGAV4DAQqmBB4B3QlKAVYB2gC6AH4EqgD2ALoC7Qf//CIEIgYOAB4ACgASABYAEgOYD6AP//+AD5wPjA+cD5wPmA+cD5wPlA+YD5gPnA+UD5gPnA+gD5wM/gHdCy0EIgUaACIF3Qv//g4AAgAuAwUHLQW2BCIF+gH6AE4B3QgiBRYDUQmuAQoCDQkWAJUIlQv//kIDMQgmAY4HagAKAEoBngB2BVUODQgCAo4ATgBOAK4AMgN2AB4AMgP//BIATgDyAPIAsgACADIABgCiBfID//35D40ImQuNCoIF+QxmBoEK5gFND40IVQxxDHEMNgROAHoC1A35D3wPlA+UD5AP//+QD5QPiA+QD2APlA+UD5QPlA+UD5APkA98D5QPfAwKAGUKVgF2AcoAfgBlCAoBDgAOADIBygL2AGIGQgAWA//8FgBlCGIERgIiB//8BgDOBEYCsgOeAAoBZgIiBAYB3gDCAMIADgB2AVoBLQiqBAoCVgVCAM4BQgCaAlYEPgCSAC4D//zyAbIBsgE2AG4ABgAGArkLrgP//sQMgQ+9CsQOqgbEDeEMnQ9FCsQMnQ3NDeENTQ4+AMULIQt4DsQOggX5DC4D//35D2EF+QxxDoIEcQ35DjoEcQ3VCHENfgTFCFUPVA35DLIB3QiyA//8cQgKAbYEIgQGAP4C7QQGArYAEgAGALoA/gH6Ad0LLQbRCqQN6Qm1CqQODQosDiwOJgQ5DqQP//44DQUMtgCxC7EJpQ90DpgNegBNDzUIqQhqBtUETQ///KkIFgNFBxULNQs1CeEKqgbmABIATQ81CAoAAgASAAYALgASA//8EgAKAAIATgAeAEYACgACAAIAEgAeAxIAcgLRCpgNkQ9RCekJAQyyAjgNBQ/RCqQMJQ44DQUNBQwdDaUM1Q8oD///lA+cD5wPmA+AD5gPmA+YD5gPkA+cD5wPnA+cD5wPmA+YD5QPnA///YkPgA2JD///WA5YD1gPOAwVDtQPWA7UDzgPgQX5DmQOZA8MD4APWA0WA1EJFgAKAg0K8gBtC1UEGgLaAg0K8gFmB//9CgEWAvIDygFVDg0JOgP//RkJGQgaAeIHuQkuA9UEvgFJCoUKhQqFCO0IvgXiAB4DmQgmBtIA8Q7NCs0IPgLNCNkMKgLNC//8dQjxDNkM8Q6pCs0LMQSiAPEPHQf///UK9QXmBHUKugP1CGIB5gXOAvkLHQWBCakIdQhGACoFugKID/UIagI6BqoCqgCuAaoBEgQKA0YABgJSA6YDhgEeBcYFMgP//AYCOgQKAuIA6Q85Bf4ELgLGAQEPOQX+BsYACQ85BuEJwQmZCEoAXgWGB//+4gK6APEMhgf//x0GcgTxDB4ERgAqAJUIbQrNCUoESgeyApoEKgTxDs0JLgJ1COIE4gVaB0YDuQkuAOIEGgFaBOIHrQUZC9UH//wuAB4DuQkZC4gPmA+UD5QP//+UD5gPhA+UD3QPmA+YD5gPmA+YD5QPlA+ED5gPdAyqBPEMAQ7NCD4AbQjZDGYCzQv//akI8QzZDPEOqQmpCpoFHgBiAeYEAgIiAEoANgBWAAIAAgACACYABgEyADYAcgDGAHIABgP//AYD9gA+ABIDuQjiBOIFGQgGA7kIRgfmAdIDuQvmA60FGQqWB//8LgL6A7kKhQjiAp0JYgAWAVYGdgORBB4AggCCAn4G9gJWB//8EgFiAnYA2gKdC70EhgFFCkICQgBCBWYBRQouAkIADgBCBi4BNgaOBFIEFgP//WYCrQvRBBIDEgACAA4BMgASAIoAvgAKAB4BmgP//HYABgACABIAJgBWAxIAvgCmAW0KQgEGASIEAgAWAloADgP///kGQgJOBoIBmgEGAkIBygFtCoIAEgMSAB4AEgAeACoD//xWAB4AHgGaAB4AAgAGAAIAHgAqACoCNgACAAYDdgCCAE4AfgAGAKIEHgBOA//8CgCCAUYA8gDyAAYAMgAGAH4F8gByAB0IBgP//0YAtgE2BAoABgBKACoEvgLiALYAvgB2AQYARgMBCTYE3gKtCD4H//xCB7oCrQjeAmoADgBaAN4EEQu6A9oCfgHmAN4CrQvRB///dgACADIAggAuA3YABgAGAAYBEgBOAAYAggAuAC4ABgAeAKIF8gLmAekOqgaqBT0M6gX5DHEOqgTFCfkN/QguA40LjQv//MYExQrUDT0OzgAxDBYD//wxDhoAfQm5ChoBegQxDCIAfQlSBGoGLgIuAoIF3Q8VCg4AIQ///BYAIQwGAs0JoQtWAGUIIQwyBGUIvgXeBiIBYgb5BCEMAQx2AB0IvgByATYEdgAqByID//0GAB0IBgLiAPoABgAOAHIBrgMBCqYHGA+ID1QPVA+ID1QPiA98D1QPbA+ID3wP//98D3wPUA9QD3APmA+ID"; ExpectedDecompressedBytes = Convert.FromBase64String(polyPhenBase64); NumExpectedUncompressedBytes = ExpectedDecompressedBytes.Length; } private static MemoryStream GetBlockStream(ICompressionAlgorithm compressionAlgorithm, byte[] bytes, int numBytes, out int copyLength) { var ms = new MemoryStream(); var writeBlock = new Block(compressionAlgorithm); copyLength = writeBlock.CopyTo(bytes, 0, numBytes); writeBlock.Write(ms); writeBlock.WriteEof(ms); return ms; } [Fact] public void QuickLzBlock() { var ms = GetBlockStream(Zstd, ExpectedDecompressedBytes, NumExpectedUncompressedBytes, out var copyLength); ms.Seek(0, SeekOrigin.Begin); var readBlock = new Block(Zstd); readBlock.Read(ms); var observedDecompressedBytes = new byte[NumExpectedUncompressedBytes]; var numObservedDecompressedBytes = readBlock.CopyFrom(observedDecompressedBytes, 0, copyLength); Assert.False(readBlock.IsFull); Assert.False(readBlock.HasMoreData); Assert.Equal(NumExpectedUncompressedBytes, numObservedDecompressedBytes); Assert.Equal(ExpectedDecompressedBytes, observedDecompressedBytes); } [Fact] public void BlockZeroLengthCopy() { var writeBlock = new Block(Zstd); const int expectedCopyLength = 0; Assert.Equal(expectedCopyLength, writeBlock.CopyTo(ExpectedDecompressedBytes, 0, 0)); Assert.Equal(expectedCopyLength, writeBlock.CopyFrom(ExpectedDecompressedBytes, 0, 0)); } [Fact] public void BlockWrongSize() { var ms = GetBlockStream(Zstd, ExpectedDecompressedBytes, NumExpectedUncompressedBytes, out _); ms.Seek(0, SeekOrigin.Begin); var readBlock = new Block(Zstd); using (var updatedMs = new MemoryStream()) { updatedMs.Write(ms.ToArray(), 0, (int)ms.Length); updatedMs.Seek(0, SeekOrigin.Begin); // read the original header var header = new BlockHeader(); header.Read(updatedMs); // change and write the header header.NumUncompressedBytes--; updatedMs.Seek(0, SeekOrigin.Begin); header.Write(updatedMs); // read the updated block updatedMs.Seek(0, SeekOrigin.Begin); // ReSharper disable once AccessToDisposedClosure Assert.Throws(delegate { readBlock.Read(updatedMs); }); } } [Fact] public void BlockTruncation() { var ms = GetBlockStream(Zstd, ExpectedDecompressedBytes, NumExpectedUncompressedBytes, out _); ms.Seek(0, SeekOrigin.Begin); var readBlock = new Block(Zstd); using (var truncatedMs = new MemoryStream()) { truncatedMs.Write(ms.ToArray(), 0, (int)ms.Length - 100); truncatedMs.Seek(0, SeekOrigin.Begin); // ReSharper disable once AccessToDisposedClosure Assert.Throws(delegate { readBlock.Read(truncatedMs); }); } } [Fact] public void UncompressedBlockTruncation() { const int bufferSize = 10000; var buffer = BlockStreamTests.GetRandomBytes(bufferSize); var ms = GetBlockStream(Zstd, buffer, bufferSize, out _); ms.Seek(0, SeekOrigin.Begin); var readBlock = new Block(Zstd); using (var truncatedMs = new MemoryStream()) { truncatedMs.Write(ms.ToArray(), 0, (int)ms.Length - 100); truncatedMs.Seek(0, SeekOrigin.Begin); // ReSharper disable once AccessToDisposedClosure Assert.Throws(delegate { readBlock.Read(truncatedMs); }); } } } } ================================================ FILE: UnitTests/Compression/FileHandling/BgzipTextWriterTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.IO.Compression; using Compression.FileHandling; using Xunit; namespace UnitTests.Compression.FileHandling { public sealed class BgzipTextWriterTests { [Fact] public void BgzipTextWriter_EndToEnd() { var asterisks = new string('*', BlockGZipStream.BlockGZipFormatCommon.BlockSize); var observedLines = new List(); var observedPositions = new List(); using (var ms = new MemoryStream()) { using (var stream = new BlockGZipStream(ms, CompressionMode.Compress, true, 1)) using (var writer = new BgzipTextWriter(stream)) { writer.Flush(); writer.WriteLine("BOB"); writer.WriteLine(); writer.Flush(); writer.Write("AB"); writer.Write(""); writer.Write("C"); writer.Write(" "); writer.WriteLine("123"); writer.WriteLine(asterisks); writer.WriteLine(asterisks); writer.WriteLine(asterisks); } ms.Position = 0; using (var reader = new BgzipTextReader(new BlockGZipStream(ms, CompressionMode.Decompress))) { while (true) { string line = reader.ReadLine(); observedPositions.Add(reader.Position); if (line == null) break; observedLines.Add(line); } } } Assert.Equal(6, observedLines.Count); Assert.Equal("BOB", observedLines[0]); Assert.Equal(0, observedLines[1].Length); Assert.Equal("ABC 123", observedLines[2]); Assert.Equal(asterisks, observedLines[3]); Assert.Equal(2162687, observedPositions[0]); Assert.Equal(2162688, observedPositions[1]); Assert.Equal(2162696, observedPositions[2]); Assert.Equal(45678601, observedPositions[3]); Assert.Equal(88932362, observedPositions[4]); } } } ================================================ FILE: UnitTests/Compression/FileHandling/BlockGZipStreamTests.cs ================================================ using System; using System.IO; using System.IO.Compression; using System.Text; using Compression.FileHandling; using ErrorHandling.Exceptions; using IO; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Compression.FileHandling { public sealed class BlockGZipStreamTests { #region members private readonly byte[] _expectedDecompressedBuffer; #endregion public BlockGZipStreamTests() { // TODO: Fix fragile constructor _expectedDecompressedBuffer = GrabBytes(ResourceUtilities.GetReadStream(Resources.TopPath("HelloWorld_original.dat"))); } [Fact] public void FileIO() { var observedDecompressedBuffer = new byte[_expectedDecompressedBuffer.Length]; string randomPath = RandomPath.GetRandomPath(); // compress the data long observedPosition; using (var writer = new BlockGZipStream(FileUtilities.GetCreateStream(randomPath), CompressionMode.Compress, false, 1)) { writer.Write(_expectedDecompressedBuffer, 0, _expectedDecompressedBuffer.Length); observedPosition = writer.Position; var exception = Record.Exception(() => { var buffer = new byte[10]; // ReSharper disable once AccessToDisposedClosure writer.Read(buffer, 0, 1); }); Assert.NotNull(exception); Assert.IsType(exception); } const long expectedPosition = 979042574; Assert.Equal(expectedPosition, observedPosition); // decompress the data using (var reader = new BlockGZipStream(FileUtilities.GetReadStream(randomPath), CompressionMode.Decompress)) { reader.Read(observedDecompressedBuffer, 0, _expectedDecompressedBuffer.Length); var exception = Record.Exception(() => { var buffer = new byte[10]; // ReSharper disable once AccessToDisposedClosure reader.Write(buffer, 0, 1); }); Assert.NotNull(exception); Assert.IsType(exception); } Assert.Equal(_expectedDecompressedBuffer, observedDecompressedBuffer); } [Fact] public void InvalidHeader() { const string dummyString = "The quick brown fox jumped over the lazy dog."; using (var ms = new MemoryStream()) using (var truncatedMs = new MemoryStream()) { using (var writer = new StreamWriter(ms, Encoding.ASCII, 4096, true)) { writer.WriteLine(dummyString); } var observedCompressedBuffer = ms.ToArray(); truncatedMs.Write(ms.ToArray(), 0, 17); ms.Seek(0, SeekOrigin.Begin); truncatedMs.Seek(0, SeekOrigin.Begin); // attempt to decompress the data Assert.Throws(delegate { using (var reader = new BlockGZipStream(ms, CompressionMode.Decompress, true)) { reader.Read(observedCompressedBuffer, 0, observedCompressedBuffer.Length); } }); Assert.Throws(delegate { using (var reader = new BlockGZipStream(truncatedMs, CompressionMode.Decompress, true)) { reader.Read(observedCompressedBuffer, 0, observedCompressedBuffer.Length); } }); } } [Fact] public void NullStream() { Assert.Throws(delegate { using (new BlockGZipStream(null, CompressionMode.Decompress)) { } }); } [Fact] public void NotImplementedMethods() { using (var ms = new MemoryStream()) { // ReSharper disable AccessToDisposedClosure using (var writer = new BlockGZipStream(ms, CompressionMode.Compress, true)) { Assert.Throws(delegate { // ReSharper disable once UnusedVariable long len = writer.Length; }); Assert.Throws(delegate { writer.SetLength(10); }); Assert.Throws(delegate { writer.Seek(0, SeekOrigin.Begin); }); } // ReSharper restore AccessToDisposedClosure } } [Fact] public void StreamIO() { byte[] observedCompressedBuffer; var observedDecompressedBuffer = new byte[_expectedDecompressedBuffer.Length]; using (var ms = new MemoryStream()) { // compress the data using (var writer = new BlockGZipStream(ms, CompressionMode.Compress, true, 9)) { Assert.Throws(delegate { // ReSharper disable once AccessToDisposedClosure writer.Read(observedDecompressedBuffer, 0, 1); }); Assert.True(writer.CanWrite); Assert.False(writer.CanRead); Assert.False(writer.CanSeek); writer.Write(_expectedDecompressedBuffer, 0, _expectedDecompressedBuffer.Length); } observedCompressedBuffer = ms.ToArray(); ms.Seek(0, SeekOrigin.Begin); // decompress the data using (var reader = new BlockGZipStream(ms, CompressionMode.Decompress)) { Assert.Throws(delegate { // ReSharper disable once AccessToDisposedClosure reader.Write(_expectedDecompressedBuffer, 0, 1); }); Assert.False(reader.CanWrite); Assert.True(reader.CanRead); Assert.True(reader.CanSeek); reader.Read(observedDecompressedBuffer, 0, _expectedDecompressedBuffer.Length); } } Assert.Equal(_expectedDecompressedBuffer, observedDecompressedBuffer); Assert.Equal(9629, observedCompressedBuffer.Length); } [Fact] public void StreamTypeMismatch() { string randomPath = RandomPath.GetRandomPath(); using (var writeStream = new FileStream(randomPath, FileMode.Create, FileAccess.Write)) { Assert.Throws(delegate { // ReSharper disable once AccessToDisposedClosure using (new BlockGZipStream(writeStream, CompressionMode.Decompress)) { } }); } using (var readStream = FileUtilities.GetReadStream(randomPath)) { Assert.Throws(delegate { // ReSharper disable once AccessToDisposedClosure using (new BlockGZipStream(readStream, CompressionMode.Compress)) { } }); } } [Theory] [InlineData(650*1024)] [InlineData(65*1024)] [InlineData(1024)] public void VariableDataLength(int numBytesToBeWritten) { using (var ms = new MemoryStream()) { // compress our data using (var writer = new StreamWriter(new BlockGZipStream(ms, CompressionMode.Compress, true))) { var currentIndex = 1; var numBytes = 0; while (true) { string s = $"Hello World {currentIndex}"; writer.WriteLine(s); currentIndex++; numBytes += s.Length; if (numBytes > numBytesToBeWritten) break; } } ms.Seek(0, SeekOrigin.Begin); // decompress our data using (var reader = FileUtilities.GetStreamReader(new BlockGZipStream(ms, CompressionMode.Decompress))) { var index = 1; while (true) { string expected = $"Hello World {index}"; index++; string observed = reader.ReadLine(); if (observed == null) break; Assert.Equal(expected, observed); } } } } [Fact] public void EndOfFile() { using (var ms = new MemoryStream()) { var writeBuffer = ByteUtilities.GetRandomBytes(100); var readBuffer = new byte[60]; using (var bgzipStream = new BlockGZipStream(ms, CompressionMode.Compress, true)) { bgzipStream.Write(writeBuffer, 0, writeBuffer.Length); } ms.Position = 0; using (var bgzipStream = new BlockGZipStream(ms, CompressionMode.Decompress)) { int numBytesRead = bgzipStream.Read(readBuffer, 0, 0); Assert.Equal(0, numBytesRead); numBytesRead = bgzipStream.Read(readBuffer, 0, readBuffer.Length); Assert.Equal(readBuffer.Length, numBytesRead); numBytesRead = bgzipStream.Read(readBuffer, 0, readBuffer.Length); Assert.Equal(writeBuffer.Length - readBuffer.Length, numBytesRead); numBytesRead = bgzipStream.Read(readBuffer, 0, readBuffer.Length); Assert.Equal(0, numBytesRead); } } } [Fact] public void ReadBlockCorrupted() { using (var ms = new MemoryStream()) using (var truncatedMs = new MemoryStream()) using (var corruptMs = new MemoryStream()) { using (var bgzipStream = new BlockGZipStream(ms, CompressionMode.Compress, true)) using (var writer = new StreamWriter(bgzipStream, Encoding.ASCII, 4096)) { writer.WriteLine("The quick brown fox jumped over the lazy dog."); } var compressedData = ms.ToArray(); truncatedMs.Write(compressedData, 0, compressedData.Length - 10); truncatedMs.Position = 0; corruptMs.Write(compressedData, 0, BlockGZipStream.BlockGZipFormatCommon.BlockHeaderLength); corruptMs.Write(_expectedDecompressedBuffer, 0, _expectedDecompressedBuffer.Length); corruptMs.Position = 0; var readBuffer = new byte[60]; Assert.Throws(delegate { using (var bgzipStream = new BlockGZipStream(truncatedMs, CompressionMode.Decompress)) { bgzipStream.Read(readBuffer, 0, readBuffer.Length); } }); Assert.Throws(delegate { using (var bgzipStream = new BlockGZipStream(corruptMs, CompressionMode.Decompress)) { bgzipStream.Read(readBuffer, 0, readBuffer.Length); } }); } } [Fact] public void DoubleDispose() { using (var ms = new MemoryStream()) { var bgzipStream = new BlockGZipStream(ms, CompressionMode.Compress); bgzipStream.Dispose(); bgzipStream.Dispose(); } } private static byte[] GrabBytes(Stream s) { byte[] buffer; using (var ms = new MemoryStream()) { s.CopyTo(ms); buffer = ms.ToArray(); } return buffer; } } } ================================================ FILE: UnitTests/Compression/FileHandling/BlockHeaderTests.cs ================================================ using System.IO; using Compression.FileHandling; using ErrorHandling.Exceptions; using Xunit; namespace UnitTests.Compression.FileHandling { public sealed class BlockHeaderTests { [Fact] public void ReadAndWrite() { const int expectedNumUncompressedBytes = 100; const int expectedNumCompressedBytes = 50; var header = new BlockHeader { NumUncompressedBytes = expectedNumUncompressedBytes, NumCompressedBytes = expectedNumCompressedBytes }; using (var ms = new MemoryStream()) { header.Write(ms); ms.Seek(0, SeekOrigin.Begin); header.NumUncompressedBytes = -1; header.NumCompressedBytes = -1; header.Read(ms); } Assert.Equal(expectedNumUncompressedBytes, header.NumUncompressedBytes); Assert.Equal(expectedNumCompressedBytes, header.NumCompressedBytes); } [Fact] public void SizeMismatch() { using (var ms = new MemoryStream()) { var array = new byte[10]; ms.Write(array, 0, array.Length); ms.Seek(0, SeekOrigin.Begin); var header = new BlockHeader(); // ReSharper disable once AccessToDisposedClosure Assert.Throws(delegate { header.Read(ms); }); } } [Fact] public void WrongHeaderId() { using (var ms = new MemoryStream()) { var array = new byte[12]; ms.Write(array, 0, array.Length); ms.Seek(0, SeekOrigin.Begin); var header = new BlockHeader(); // ReSharper disable once AccessToDisposedClosure Assert.Throws(delegate { header.Read(ms); }); } } } } ================================================ FILE: UnitTests/Compression/FileHandling/BlockStreamTests.cs ================================================ using System; using System.IO; using System.IO.Compression; using System.Linq; using System.Security.Cryptography; using System.Text; using Compression.Algorithms; using Compression.DataStructures; using Compression.FileHandling; using ErrorHandling.Exceptions; using Genome; using IO; using UnitTests.TestUtilities; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; using Xunit; namespace UnitTests.Compression.FileHandling { public sealed class BlockStreamTests { private const long NumTicks = 3; private const GenomeAssembly ExpectedAssembly = GenomeAssembly.hg19; private const string SmallString = "Testing 123"; private const string FinalString = "Squeamish Ossifrage"; private static readonly Random Random = new Random(10); private static readonly Zstandard Zstd = new Zstandard(1); [Fact] public void BlockStream_EndToEnd() { string expectedString = GetRandomString(Block.DefaultSize + 10000); var customHeader = new DemoCustomHeader(-1, -1); var header = new DemoHeader(CacheConstants.Identifier, CacheConstants.SchemaVersion, CacheConstants.DataVersion, Source.Ensembl, NumTicks, ExpectedAssembly, customHeader); using (var ms = new MemoryStream()) { WriteBlockStream(Zstd, header, customHeader, ms, expectedString); ms.Position = 0; ReadFromBlockStream(Zstd, ms, expectedString); } } private static string GetRandomString(int length) { const string chars = " !\"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; return new string(Enumerable.Repeat(chars, length).Select(s => s[Random.Next(s.Length)]).ToArray()); } public static byte[] GetRandomBytes(int numBytes) { var buffer = new byte[numBytes]; using (var csp = RandomNumberGenerator.Create()) csp.GetBytes(buffer); return buffer; } // ReSharper disable once UnusedParameter.Local private static void ReadFromBlockStream(ICompressionAlgorithm compressionAlgorithm, Stream ms, string expectedRandomString) { // grab the header var header = DemoHeader.Read(ms); Assert.Equal(ExpectedAssembly, header.Assembly); using (var blockStream = new BlockStream(compressionAlgorithm, ms, CompressionMode.Decompress)) using (var reader = new ExtendedBinaryReader(blockStream)) { CheckWriteException(blockStream); // sequential string check CheckString(reader, expectedRandomString); CheckString(reader, SmallString); CheckString(reader, FinalString); // random access string check blockStream.SetBlockPosition(header.Custom.FileOffset, header.Custom.InternalOffset); //reader.Reset(); CheckString(reader, SmallString); } } // ReSharper disable once UnusedParameter.Local // ReSharper disable once ParameterOnlyUsedForPreconditionCheck.Local private static void CheckString(ExtendedBinaryReader reader, string expectedString) { string s = reader.ReadAsciiString(); Assert.NotNull(s); Assert.Equal(expectedString.Length, s.Length); Assert.Equal(expectedString, s); } private static void WriteBlockStream(ICompressionAlgorithm compressionAlgorithm, DemoHeader header, DemoCustomHeader customHeader, Stream ms, string s) { using (var blockStream = new BlockStream(compressionAlgorithm, ms, CompressionMode.Compress, true)) using (var writer = new ExtendedBinaryWriter(blockStream)) { CheckReadException(blockStream); blockStream.WriteHeader(header.Write); writer.WriteOptAscii(s); (customHeader.FileOffset, customHeader.InternalOffset) = blockStream.GetBlockPosition(); Assert.Equal(customHeader.FileOffset, blockStream.Position); writer.WriteOptAscii(SmallString); blockStream.Flush(); // this will be flushed during dispose writer.WriteOptAscii(FinalString); } } private static void CheckReadException(Stream writer) { var exception = Record.Exception(() => { var buffer = new byte[10]; // ReSharper disable once AccessToDisposedClosure writer.Read(buffer, 0, 1); }); Assert.NotNull(exception); Assert.IsType(exception); } private static void CheckWriteException(Stream reader) { var exception = Record.Exception(() => { var buffer = new byte[10]; // ReSharper disable once AccessToDisposedClosure reader.Write(buffer, 0, 1); }); Assert.NotNull(exception); Assert.IsType(exception); } [Fact] public void NullStream() { Assert.Throws(delegate { using (new BlockStream(Zstd, null, CompressionMode.Decompress)) { } }); } [Fact] public void NotImplementedMethods() { using (var ms = new MemoryStream()) { // ReSharper disable AccessToDisposedClosure using (var writer = new BlockStream(Zstd, ms, CompressionMode.Compress, true)) { Assert.Throws(delegate { // ReSharper disable once UnusedVariable long len = writer.Length; }); Assert.Throws(delegate { writer.SetLength(10); }); Assert.Throws(delegate { writer.Seek(0, SeekOrigin.Begin); }); Assert.Throws(delegate { writer.Position = 0; }); } // ReSharper restore AccessToDisposedClosure } } [Fact] public void StreamTypeMismatch() { string randomPath = RandomPath.GetRandomPath(); using (var writeStream = new FileStream(randomPath, FileMode.Create, FileAccess.Write)) { Assert.Throws(delegate { // ReSharper disable once AccessToDisposedClosure using (new BlockStream(Zstd, writeStream, CompressionMode.Decompress)) { } }); } using (var readStream = FileUtilities.GetReadStream(randomPath)) { Assert.Throws(delegate { // ReSharper disable once AccessToDisposedClosure using (new BlockStream(Zstd, readStream, CompressionMode.Compress)) { } }); } } [Fact] public void CanReadWriteSeek() { string randomPath = RandomPath.GetRandomPath(); using (var writeStream = new FileStream(randomPath, FileMode.Create, FileAccess.Write)) using (var blockStream = new BlockStream(Zstd, writeStream, CompressionMode.Compress)) { Assert.False(blockStream.CanRead); Assert.True(blockStream.CanWrite); Assert.True(blockStream.CanSeek); } } [Fact] public void ValidateParameters() { using (var ms = new MemoryStream()) { using (var blockStream = new BlockStream(Zstd, ms, CompressionMode.Compress)) { var buffer = new byte[10]; // ReSharper disable once AssignNullToNotNullAttribute Assert.Throws(delegate { blockStream.Write(null, 10, 10); }); Assert.Throws(delegate { blockStream.Write(buffer, -1, 10); }); Assert.Throws(delegate { blockStream.Write(buffer, 10, -1); }); Assert.Throws(delegate { blockStream.Write(buffer, 5, 10); }); } } } [Fact] public void EndOfFile() { using (var ms = new MemoryStream()) { var writeBuffer = GetRandomBytes(100); var readBuffer = new byte[60]; using (var blockStream = new BlockStream(Zstd, ms, CompressionMode.Compress, true)) { blockStream.Write(writeBuffer, 0, writeBuffer.Length); } ms.Position = 0; using (var blockStream = new BlockStream(Zstd, ms, CompressionMode.Decompress)) { int numBytesRead = blockStream.Read(readBuffer, 0, readBuffer.Length); Assert.Equal(readBuffer.Length, numBytesRead); numBytesRead = blockStream.Read(readBuffer, 0, readBuffer.Length); Assert.Equal(writeBuffer.Length - readBuffer.Length, numBytesRead); numBytesRead = blockStream.Read(readBuffer, 0, readBuffer.Length); Assert.Equal(0, numBytesRead); } } } [Fact] public void DoubleDispose() { using (var ms = new MemoryStream()) { var blockStream = new BlockStream(Zstd, ms, CompressionMode.Compress); blockStream.Dispose(); blockStream.Dispose(); } } } public sealed class DemoHeader : Header { public readonly DemoCustomHeader Custom; public DemoHeader(string identifier, ushort schemaVersion, ushort dataVersion, Source source, long creationTimeTicks, GenomeAssembly genomeAssembly, DemoCustomHeader customHeader) : base( identifier, schemaVersion, dataVersion, source, creationTimeTicks, genomeAssembly) { Custom = customHeader; } public new void Write(BinaryWriter writer) { base.Write(writer); Custom.Write(writer); } public static DemoHeader Read(Stream stream) { DemoHeader header; using (var reader = new BinaryReader(stream, Encoding.Default, true)) { var baseHeader = Read(reader); var customHeader = DemoCustomHeader.Read(reader); header = new DemoHeader(baseHeader.Identifier, baseHeader.SchemaVersion, baseHeader.DataVersion, baseHeader.Source, baseHeader.CreationTimeTicks, baseHeader.Assembly, customHeader); } return header; } } public sealed class DemoCustomHeader { public long FileOffset; public int InternalOffset; public DemoCustomHeader(long fileOffset, int internalOffset) { FileOffset = fileOffset; InternalOffset = internalOffset; } public void Write(BinaryWriter writer) { writer.Write(FileOffset); writer.Write(InternalOffset); } public static DemoCustomHeader Read(BinaryReader reader) { long fileOffset = reader.ReadInt64(); int internalOffset = reader.ReadInt32(); return new DemoCustomHeader(fileOffset, internalOffset); } } } ================================================ FILE: UnitTests/Compression/Utilities/GZipUtilitiesTests.cs ================================================ using System.IO; using System.IO.Compression; using Compression.Utilities; using IO; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Compression.Utilities { public sealed class GZipUtilitiesTests { private const string ExpectedString = "charlie"; [Fact] public void GetAppropriateReadStream_Handle_TextFile() { string randomPath = RandomPath.GetRandomPath(); using (var writer = new StreamWriter(FileUtilities.GetCreateStream(randomPath))) { writer.WriteLine(ExpectedString); } string observedString; using (var reader = GZipUtilities.GetAppropriateStreamReader(randomPath)) { observedString = reader.ReadLine(); } Assert.Equal(ExpectedString, observedString); } [Fact] public void GetAppropriateReadStream_Handle_GZipFile() { string randomPath = RandomPath.GetRandomPath(); using (var writer = new StreamWriter(new GZipStream(FileUtilities.GetCreateStream(randomPath), CompressionMode.Compress))) { writer.WriteLine(ExpectedString); } string observedString; using (var reader = GZipUtilities.GetAppropriateStreamReader(randomPath)) { observedString = reader.ReadLine(); } Assert.Equal(ExpectedString, observedString); } [Fact] public void GetAppropriateReadStream_Handle_BlockGZipFile() { string randomPath = RandomPath.GetRandomPath(); using (var writer = GZipUtilities.GetStreamWriter(randomPath)) { writer.WriteLine(ExpectedString); } string observedString; using (var reader = GZipUtilities.GetAppropriateStreamReader(randomPath)) { observedString = reader.ReadLine(); } Assert.Equal(ExpectedString, observedString); } } } ================================================ FILE: UnitTests/Compression/Utilities/LibraryUtilitiesTests.cs ================================================ using Xunit; using Compression.Utilities; namespace UnitTests.Compression.Utilities { public sealed class LibraryUtilitiesTests { [Fact] public void CheckLibrary_ValidLibrary_NoExceptionThrown() { var ex = Record.Exception(LibraryUtilities.CheckLibrary); Assert.Null(ex); } } } ================================================ FILE: UnitTests/CustomAnnotationLambda/CustomAnnotationConfigTests.cs ================================================ using Cloud.Messages; using Cloud.Messages.Custom; using CustomAnnotationLambda; using ErrorHandling.Exceptions; using Xunit; namespace UnitTests.CustomAnnotationLambda { public sealed class CustomAnnotationConfigTests { [Fact] public void CheckFieldsNotNull_AsExpected() { var config = GetConfig(); config.id = null; var exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("id cannot be null.", exception.Message); config = GetConfig(); config.tsvUrl = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("tsvUrl cannot be null.", exception.Message); config = GetConfig(); config.outputDir = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("outputDir cannot be null.", exception.Message); config = GetConfig(); config.outputDir.bucketName = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("bucketName of outputDir cannot be null.", exception.Message); config = GetConfig(); config.outputDir.path = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("path of outputDir cannot be null.", exception.Message); config = GetConfig(); config.outputDir.region = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("region of outputDir cannot be null.", exception.Message); config = GetConfig(); config.outputDir.accessKey = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("accessKey of outputDir cannot be null.", exception.Message); config = GetConfig(); config.outputDir.secretKey = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("secretKey of outputDir cannot be null.", exception.Message); config = GetConfig(); config.outputDir.sessionToken = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("sessionToken of outputDir cannot be null.", exception.Message); } private static CustomConfig GetConfig() => new CustomConfig { id = "Test", tsvUrl = "https://somewhere.org/input.tsv", outputDir = new S3Path { bucketName = "OutputBucket", path = "/OutputDir/", region = "nowhere", accessKey = "access", secretKey = "show me the money", sessionToken = "314159265" } }; } } ================================================ FILE: UnitTests/Downloader/AnnotationRepositoryTests.cs ================================================ using System.Collections.Generic; using System.IO; using Downloader; using IO; using Moq; using Xunit; namespace UnitTests.Downloader { public class AnnotationRepositoryTests { [Fact] public void DownloadFiles_Nominal() { const ushort dataVersion = CacheConstants.DataVersion; var clientMock = new Mock(); clientMock.Setup(x => x.DownloadFile(It.IsAny())).Returns(true).Verifiable(); var files = new List { new RemoteFile($"remote/{dataVersion}/GRCh37/Both.transcripts.ndb", Path.Combine("local", "GRCh37", "Both.transcripts.ndb"), "Both.transcripts.ndb (GRCh37)"), new RemoteFile($"remote/{dataVersion}/GRCh37/Both.sift.ndb", Path.Combine("local", "GRCh37", "Both.sift.ndb"), "Both.sift.ndb (GRCh37)"), new RemoteFile($"remote/{dataVersion}/GRCh37/Both.polyphen.ndb", Path.Combine("local", "GRCh37", "Both.polyphen.ndb"), "Both.polyphen.ndb (GRCh37)"), new RemoteFile($"remote/{dataVersion}/GRCh38/Both.transcripts.ndb", Path.Combine("local", "GRCh38", "Both.transcripts.ndb"), "Both.transcripts.ndb (GRCh38)"), new RemoteFile($"remote/{dataVersion}/GRCh38/Both.sift.ndb", Path.Combine("local", "GRCh38", "Both.sift.ndb"), "Both.sift.ndb (GRCh38)"), new RemoteFile($"remote/{dataVersion}/GRCh38/Both.polyphen.ndb", Path.Combine("local", "GRCh38", "Both.polyphen.ndb"), "Both.polyphen.ndb (GRCh38)") }; AnnotationRepository.DownloadFiles(clientMock.Object, files); clientMock.Verify(x => x.DownloadFile(It.IsAny()), Times.Exactly(6)); } } } ================================================ FILE: UnitTests/Downloader/ConfigurationTests.cs ================================================ using Downloader; using VariantAnnotation.SA; using Xunit; namespace UnitTests.Downloader { public sealed class ConfigurationTests { [Fact] public void Load_ExpectedResults() { (string hostName, string cacheDir, string referencesDir, string manifestGRCh37, string manifestGRCh38) = Configuration.Load(null, null); Assert.EndsWith("annotations.nirvana.illumina.com", hostName); Assert.StartsWith("/", cacheDir); Assert.EndsWith("Cache", cacheDir); Assert.StartsWith("/", referencesDir); Assert.EndsWith("References", referencesDir); Assert.Contains("GRCh37", manifestGRCh37); Assert.Contains("GRCh38", manifestGRCh38); } [Fact] public void Load_OverrideHostName() { (string hostName, string _, string _, string _, string _) = Configuration.Load("www.illumina.com", null); Assert.Equal("www.illumina.com", hostName); } [Fact] public void Load_OverrideManifest() { var config = new global::Cloud.Configuration(); (string _, string _, string _, string manifestGRCh37, string manifestGRCh38) = Configuration.Load(null, "Schema23"); Assert.Equal($"http://annotations.nirvana.illumina.com/{config.ManifestDirectory}/{SaCommon.SchemaVersion}/Schema23_SA_GRCh37.txt", manifestGRCh37); Assert.Equal($"http://annotations.nirvana.illumina.com/{config.ManifestDirectory}/{SaCommon.SchemaVersion}/Schema23_SA_GRCh38.txt", manifestGRCh38); } } } ================================================ FILE: UnitTests/Downloader/FileExtensions/CacheFileExtensionsTests.cs ================================================ using System.Collections.Generic; using System.IO; using Downloader; using Downloader.FileExtensions; using Genome; using IO; using Xunit; namespace UnitTests.Downloader.FileExtensions { public sealed class CacheFileExtensionsTests { [Fact] public void AddCacheFiles_Nominal() { var comparer = new RemoteFileComparer(); var genomeAssemblies = new List { GenomeAssembly.GRCh37, GenomeAssembly.GRCh38 }; const string remoteCacheDirectory = "remote"; const string cacheDirectory = "local"; const ushort dataVersion = CacheConstants.DataVersion; var expectedFiles = new List { new RemoteFile($"remote/{dataVersion}/GRCh37/Both.transcripts.ndb", Path.Combine("local", "GRCh37", "Both.transcripts.ndb"), "Both.transcripts.ndb (GRCh37)"), new RemoteFile($"remote/{dataVersion}/GRCh37/Both.sift.ndb", Path.Combine("local", "GRCh37", "Both.sift.ndb"), "Both.sift.ndb (GRCh37)"), new RemoteFile($"remote/{dataVersion}/GRCh37/Both.polyphen.ndb", Path.Combine("local", "GRCh37", "Both.polyphen.ndb"), "Both.polyphen.ndb (GRCh37)"), new RemoteFile($"remote/{dataVersion}/GRCh38/Both.transcripts.ndb", Path.Combine("local", "GRCh38", "Both.transcripts.ndb"), "Both.transcripts.ndb (GRCh38)"), new RemoteFile($"remote/{dataVersion}/GRCh38/Both.sift.ndb", Path.Combine("local", "GRCh38", "Both.sift.ndb"), "Both.sift.ndb (GRCh38)"), new RemoteFile($"remote/{dataVersion}/GRCh38/Both.polyphen.ndb", Path.Combine("local", "GRCh38", "Both.polyphen.ndb"), "Both.polyphen.ndb (GRCh38)") }; var files = new List(); files.AddCacheFiles(genomeAssemblies, remoteCacheDirectory, cacheDirectory); Assert.Equal(expectedFiles, files, comparer); } } } ================================================ FILE: UnitTests/Downloader/FileExtensions/ReferencesFileExtensionTests.cs ================================================ using System.Collections.Generic; using System.IO; using Downloader; using Downloader.FileExtensions; using Genome; using ReferenceSequence; using Xunit; namespace UnitTests.Downloader.FileExtensions { public sealed class ReferencesFileExtensionsTests { [Fact] public void AddReferenceFiles_Nominal() { var comparer = new RemoteFileComparer(); var genomeAssemblies = new List { GenomeAssembly.GRCh37, GenomeAssembly.GRCh38 }; const string remoteReferencesDirectory = "remote"; const string referencesDirectory = "local"; const ushort dataVersion = ReferenceSequenceCommon.HeaderVersion; var expectedFiles = new List { new RemoteFile($"remote/{dataVersion}/Homo_sapiens.GRCh37.Nirvana.dat", Path.Combine("local", "Homo_sapiens.GRCh37.Nirvana.dat"), "Homo_sapiens.GRCh37.Nirvana.dat"), new RemoteFile($"remote/{dataVersion}/Homo_sapiens.GRCh38.Nirvana.dat", Path.Combine("local", "Homo_sapiens.GRCh38.Nirvana.dat"), "Homo_sapiens.GRCh38.Nirvana.dat") }; var files = new List(); files.AddReferenceFiles(genomeAssemblies, remoteReferencesDirectory, referencesDirectory); Assert.Equal(expectedFiles, files, comparer); } } } ================================================ FILE: UnitTests/Downloader/FileExtensions/SupplementaryAnnotationFileExtensionsTests.cs ================================================ using System.Collections.Generic; using System.IO; using Downloader; using Downloader.FileExtensions; using Genome; using Xunit; namespace UnitTests.Downloader.FileExtensions { public sealed class SupplementaryAnnotationFileExtensionsTests { [Fact] public void AddSupplementaryAnnotationFiles_Nominal() { var comparer = new RemoteFileComparer(); const string saDirectory = "local"; var remotePaths37 = new List { "/0bf0cb93e64824b20f0b551a629596fd-TopMed/2/GRCh37/TOPMed_freeze_5.nsa" }; var remotePaths38 = new List { "/43cafec8b0624b77663e2ba1dec32883-gnomAD-exome/2/GRCh38/gnomAD_exome_2.0.2.nsa", "/2551e067cb59c540a4da905a99ee5ff4-ClinGen/2/GRCh38/ClinGen_20160414.nsi" }; var remotePathsByGenomeAssembly = new Dictionary> { [GenomeAssembly.GRCh37] = remotePaths37, [GenomeAssembly.GRCh38] = remotePaths38 }; var expectedFiles = new List { new RemoteFile("/0bf0cb93e64824b20f0b551a629596fd-TopMed/2/GRCh37/TOPMed_freeze_5.nsa", Path.Combine("local", "GRCh37", "TOPMed_freeze_5.nsa"), "TOPMed_freeze_5.nsa (GRCh37)"), new RemoteFile("/0bf0cb93e64824b20f0b551a629596fd-TopMed/2/GRCh37/TOPMed_freeze_5.nsa.idx", Path.Combine("local", "GRCh37", "TOPMed_freeze_5.nsa.idx"), "TOPMed_freeze_5.nsa.idx (GRCh37)"), new RemoteFile("/43cafec8b0624b77663e2ba1dec32883-gnomAD-exome/2/GRCh38/gnomAD_exome_2.0.2.nsa", Path.Combine("local", "GRCh38", "gnomAD_exome_2.0.2.nsa"), "gnomAD_exome_2.0.2.nsa (GRCh38)"), new RemoteFile("/43cafec8b0624b77663e2ba1dec32883-gnomAD-exome/2/GRCh38/gnomAD_exome_2.0.2.nsa.idx", Path.Combine("local", "GRCh38", "gnomAD_exome_2.0.2.nsa.idx"), "gnomAD_exome_2.0.2.nsa.idx (GRCh38)"), new RemoteFile("/2551e067cb59c540a4da905a99ee5ff4-ClinGen/2/GRCh38/ClinGen_20160414.nsi", Path.Combine("local", "GRCh38", "ClinGen_20160414.nsi"), "ClinGen_20160414.nsi (GRCh38)") }; var files = new List(); files.AddSupplementaryAnnotationFiles(remotePathsByGenomeAssembly, saDirectory); Assert.Equal(expectedFiles, files, comparer); } } } ================================================ FILE: UnitTests/Downloader/GenomeAssemblyHelperTests.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; using Genome; using Xunit; using du = Downloader.Utilities; namespace UnitTests.Downloader { public sealed class GenomeAssemblyHelperTests { [Fact] public void GetGenomeAssemblies_GRCh37() { List genomeAssemblies = du.GenomeAssemblyHelper.GetGenomeAssemblies("GRCh37"); Assert.Single(genomeAssemblies); Assert.Equal(GenomeAssembly.GRCh37, genomeAssemblies[0]); } [Fact] public void GetGenomeAssemblies_GRCh38() { List genomeAssemblies = du.GenomeAssemblyHelper.GetGenomeAssemblies("GrcH38"); Assert.Single(genomeAssemblies); Assert.Equal(GenomeAssembly.GRCh38, genomeAssemblies[0]); } [Fact] public void GetGenomeAssemblies_Both() { List genomeAssemblies = du.GenomeAssemblyHelper.GetGenomeAssemblies("BoTh"); Assert.Equal(2, genomeAssemblies.Count); Assert.Equal(GenomeAssembly.GRCh37, genomeAssemblies[0]); Assert.Equal(GenomeAssembly.GRCh38, genomeAssemblies[1]); } [Fact] public void GetGenomeAssemblies_Unknown() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable List genomeAssemblies = du.GenomeAssemblyHelper.GetGenomeAssemblies("hg19"); }); } } } ================================================ FILE: UnitTests/Downloader/ManifestTests.cs ================================================ using System.Collections.Generic; using System.Linq; using Downloader; using Genome; using Xunit; namespace UnitTests.Downloader { public sealed class ManifestTests { private const string ManifestGRCh37 = "Manifest_GRCh37"; private const string ManifestGRCh38 = "Manifest_GRCh38"; [Fact] public void CreateGenomeAssemblyPaths_GRCh37() { var genomeAssemblies = new List { GenomeAssembly.GRCh37 }; List<(GenomeAssembly GenomeAssembly, string ManifestPath)> list = Manifest.CreateGenomeAssemblyPaths(ManifestGRCh37, ManifestGRCh38, genomeAssemblies).ToList(); Assert.Single(list); Assert.Equal(GenomeAssembly.GRCh37, list[0].GenomeAssembly); Assert.Equal(ManifestGRCh37, list[0].ManifestPath); } [Fact] public void CreateGenomeAssemblyPaths_GRCh38() { var genomeAssemblies = new List { GenomeAssembly.GRCh38 }; List<(GenomeAssembly GenomeAssembly, string ManifestPath)> list = Manifest.CreateGenomeAssemblyPaths(ManifestGRCh37, ManifestGRCh38, genomeAssemblies).ToList(); Assert.Single(list); Assert.Equal(GenomeAssembly.GRCh38, list[0].GenomeAssembly); Assert.Equal(ManifestGRCh38, list[0].ManifestPath); } [Fact] public void CreateGenomeAssemblyPaths_Both() { var genomeAssemblies = new List { GenomeAssembly.GRCh37, GenomeAssembly.GRCh38 }; List<(GenomeAssembly GenomeAssembly, string ManifestPath)> list = Manifest.CreateGenomeAssemblyPaths(ManifestGRCh37, ManifestGRCh38, genomeAssemblies).ToList(); Assert.Equal(2, list.Count); } [Fact] public void CreateGenomeAssemblyPaths_Unknown() { var genomeAssemblies = new List { GenomeAssembly.hg19 }; List<(GenomeAssembly GenomeAssembly, string ManifestPath)> list = Manifest.CreateGenomeAssemblyPaths(ManifestGRCh37, ManifestGRCh38, genomeAssemblies).ToList(); Assert.Empty(list); } } } ================================================ FILE: UnitTests/Downloader/RemoteFileComparer.cs ================================================ using System.Collections.Generic; using Downloader; namespace UnitTests.Downloader { internal sealed class RemoteFileComparer : EqualityComparer { public override bool Equals(RemoteFile x, RemoteFile y) { return x.LocalPath == y.LocalPath && x.RemotePath == y.RemotePath && x.Description == y.Description; } public override int GetHashCode(RemoteFile obj) { unchecked { int hashCode = obj.RemotePath.GetHashCode(); hashCode = (hashCode * 397) ^ obj.LocalPath.GetHashCode(); hashCode = (hashCode * 397) ^ obj.Description.GetHashCode(); return hashCode; } } } } ================================================ FILE: UnitTests/EndToEndTests.cs ================================================ using System; using OptimizedCore; using UnitTests.TestUtilities; using Xunit; namespace UnitTests { // NOTE: these tests do not include phyloP scores yet // CACHE: v26 (VEP91) // SA: SA38 based on intermediate TSV from /illumina/development/Nirvana/Development/IntermediateTsvs/2017-05/GRCh37 public sealed class EndToEndTests { private readonly string _cacheFilePrefix; public EndToEndTests() => _cacheFilePrefix = Resources.EndToEnd37("chr12_7018490_7086889_Both"); [Fact] public void Annotation_RefMinor_not_annotated_when_no_SA() { const string vcfLine = "chr12 7054859 . G . 100 PASS . . ."; var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, null, vcfLine); var sb = annotatedPosition.GetJsonStringBuilder(); Assert.Null(sb); } [Obsolete("We need to extract the SA files to enable these again.")] [Theory(Skip = "We need to extract the SA files to enable these again.")] [InlineData("chr12 7045879 . C , . PASS SVTYPE=STR;END=7045936;REF=19;RL=57;RU=CAG;REPID=DRPLA GT:SO:CN:CI:AD_SP:AD_FL:AD_IR 1/2:SPANNING/SPANNING:14/22:9/4:19/20:0/0", "{\"chromosome\":\"chr12\",\"position\":7045879,\"repeatUnit\":\"CAG\",\"refRepeatCount\":19,\"svEnd\":7045936,\"refAllele\":\"C\",\"altAlleles\":[\"\",\"\"],\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"genotype\":\"1/2\",\"repeatNumbers\":\"14/22\",\"repeatNumberSpans\":\"9/4\"}],\"variants\":[{\"vid\":\"12:7045880:7045936:CAG:14\",\"chromosome\":\"chr12\",\"begin\":7045880,\"end\":7045936,\"refAllele\":\"C\",\"altAllele\":\"\",\"variantType\":\"short_tandem_repeat_variation\",\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_contraction\"],\"isCanonical\":true,\"proteinId\":\"NP_001007027.1\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_contraction\"],\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_contraction\"],\"proteinId\":\"NP_001931.2\"}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_contraction\"],\"isCanonical\":true,\"proteinId\":\"ENSP00000349076.3\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_contraction\"],\"proteinId\":\"ENSP00000379915.2\"}]}},{\"vid\":\"12:7045880:7045936:CAG:22\",\"chromosome\":\"chr12\",\"begin\":7045880,\"end\":7045936,\"refAllele\":\"C\",\"altAllele\":\"\",\"variantType\":\"short_tandem_repeat_variation\",\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_expansion\"],\"isCanonical\":true,\"proteinId\":\"NP_001007027.1\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_expansion\"],\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_expansion\"],\"proteinId\":\"NP_001931.2\"}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_expansion\"],\"isCanonical\":true,\"proteinId\":\"ENSP00000349076.3\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"exons\":\"5/10\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"short_tandem_repeat_expansion\"],\"proteinId\":\"ENSP00000379915.2\"}]}}]}")] [InlineData("chr12 7048190 . G A 322 PASS SB=0.1234567 . .", "{\"chromosome\":\"chr12\",\"position\":7048190,\"refAllele\":\"G\",\"altAlleles\":[\"A\"],\"quality\":322,\"filters\":[\"PASS\"],\"strandBias\":0.123457,\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7048190:A\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"A\",\"variantType\":\"SNV\",\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001007026.1:c.3064G>A\",\"hgvsp\":\"NP_001007027.1:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001007027.1\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3391\",\"cdsPos\":\"3061\",\"exons\":\"7/10\",\"proteinPos\":\"1021\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"XM_005253672.1:c.3061G>A\",\"hgvsp\":\"XP_005253729.1:p.(Ala1021Thr)\",\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3294\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001940.3:c.3064G>A\",\"hgvsp\":\"NP_001931.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001931.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"XM_005253669.1\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"XP_005253726.1\"},{\"transcript\":\"NM_001301836.1\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288765.1\"},{\"transcript\":\"NM_001301834.1\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"NP_001288763.1\"},{\"transcript\":\"XM_005253670.1\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"XP_005253727.1\"},{\"transcript\":\"NR_023317.1\",\"bioType\":\"snRNA\",\"geneId\":\"100147744\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"NM_001301838.1\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288767.1\"},{\"transcript\":\"NM_001301837.1\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288766.1\"},{\"transcript\":\"NR_126035.1\",\"bioType\":\"misc_RNA\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"NM_138425.3\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_612434.1\"}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000356654.4:c.3064G>A\",\"hgvsp\":\"ENSP00000349076.3:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000349076.3\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3298\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000396684.2:c.3064G>A\",\"hgvsp\":\"ENSP00000379915.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000379915.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000541029.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000537488.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000538392.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000542222.1\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000545581.1\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440602.1\"},{\"transcript\":\"ENST00000607421.1\",\"bioType\":\"antisense\",\"geneId\":\"ENSG00000272173\",\"hgnc\":\"U47924.2\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000458811.1\",\"bioType\":\"snRNA\",\"geneId\":\"ENSG00000238923\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000544681.1\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000475422.1\"},{\"transcript\":\"ENST00000537087.1\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440937.1\"},{\"transcript\":\"ENST00000229281.5\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"ENSP00000229281.5\"}]}}]}")] [InlineData("chr12 7054859 . G . 100 PASS . . .", "{\"chromosome\":\"chr12\",\"position\":7054859,\"refAllele\":\"G\",\"quality\":100,\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7054859:G\",\"chromosome\":\"chr12\",\"begin\":7054859,\"end\":7054859,\"isReferenceMinorAllele\":true,\"refAllele\":\"A\",\"altAllele\":\"G\",\"variantType\":\"SNV\",\"globalAllele\":{\"globalMinorAllele\":\"G\",\"globalMinorAlleleFrequency\":0.003794},\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"NP_001007027.1\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"NP_001931.2\"},{\"transcript\":\"XM_005253669.1\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"XP_005253726.1\"},{\"transcript\":\"NM_001301836.1\",\"bioType\":\"protein_coding\",\"introns\":\"2/2\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"NM_001301836.1:c.191-75A>G\",\"proteinId\":\"NP_001288765.1\"},{\"transcript\":\"NM_001301834.1\",\"bioType\":\"protein_coding\",\"introns\":\"3/3\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"NM_001301834.1:c.230-75A>G\",\"isCanonical\":true,\"proteinId\":\"NP_001288763.1\"},{\"transcript\":\"XM_005253670.1\",\"bioType\":\"protein_coding\",\"introns\":\"2/2\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"XM_005253670.1:c.191-75A>G\",\"proteinId\":\"XP_005253727.1\"},{\"transcript\":\"NR_023317.1\",\"bioType\":\"snRNA\",\"geneId\":\"100147744\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"NM_001301838.1\",\"bioType\":\"protein_coding\",\"introns\":\"2/2\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"NM_001301838.1:c.125-75A>G\",\"proteinId\":\"NP_001288767.1\"},{\"transcript\":\"NM_001301837.1\",\"bioType\":\"protein_coding\",\"introns\":\"2/2\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"NM_001301837.1:c.143-75A>G\",\"proteinId\":\"NP_001288766.1\"},{\"transcript\":\"NR_126035.1\",\"bioType\":\"misc_RNA\",\"introns\":\"2/2\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\",\"non_coding_transcript_variant\"],\"hgvsc\":\"NR_126035.1:n.544-75A>G\"},{\"transcript\":\"NM_138425.3\",\"bioType\":\"protein_coding\",\"introns\":\"2/2\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"NM_138425.3:c.230-75A>G\",\"proteinId\":\"NP_612434.1\"},{\"transcript\":\"NM_138425.2\",\"bioType\":\"protein_coding\",\"introns\":\"2/2\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"NM_138425.2:c.230-75A>G\",\"proteinId\":\"NP_612434.1\"},{\"transcript\":\"NM_080548.4\",\"bioType\":\"protein_coding\",\"geneId\":\"5777\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_536858.1\"},{\"transcript\":\"XM_005253719.1\",\"bioType\":\"protein_coding\",\"geneId\":\"5777\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"XP_005253776.1\"}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"ENSP00000349076.3\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"ENSP00000379915.2\"},{\"transcript\":\"ENST00000537488.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000538392.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000542222.1\",\"bioType\":\"processed_transcript\",\"introns\":\"2/2\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\",\"non_coding_transcript_variant\"],\"hgvsc\":\"ENST00000542222.1:n.408-75A>G\"},{\"transcript\":\"ENST00000545581.1\",\"bioType\":\"protein_coding\",\"introns\":\"3/3\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"ENST00000545581.1:c.230-75A>G\",\"proteinId\":\"ENSP00000440602.1\"},{\"transcript\":\"ENST00000607421.1\",\"bioType\":\"antisense\",\"geneId\":\"ENSG00000272173\",\"hgnc\":\"U47924.2\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000458811.1\",\"bioType\":\"snRNA\",\"geneId\":\"ENSG00000238923\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000544681.1\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"ENSP00000475422.1\"},{\"transcript\":\"ENST00000537087.1\",\"bioType\":\"protein_coding\",\"introns\":\"2/2\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"ENST00000537087.1:c.143-75A>G\",\"proteinId\":\"ENSP00000440937.1\"},{\"transcript\":\"ENST00000229281.5\",\"bioType\":\"protein_coding\",\"introns\":\"2/2\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"ENST00000229281.5:c.230-75A>G\",\"isCanonical\":true,\"proteinId\":\"ENSP00000229281.5\"},{\"transcript\":\"ENST00000540506.2\",\"bioType\":\"protein_coding\",\"introns\":\"2/2\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"ENST00000540506.2:c.125-75A>G\",\"proteinId\":\"ENSP00000475635.1\"},{\"transcript\":\"ENST00000543115.1\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000443393.1\"},{\"transcript\":\"ENST00000542848.1\",\"bioType\":\"nonsense_mediated_decay\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000444805.1\"},{\"transcript\":\"ENST00000543120.1\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000399448.1\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000382376.1\"},{\"transcript\":\"ENST00000534900.1\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000447931.2\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000415979.2\"},{\"transcript\":\"ENST00000538318.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"]}]}}]}")] [InlineData("chr12 7073931 . T 100 PASS SVTYPE=INV;END=7074100 . .", "{\"chromosome\":\"chr12\",\"position\":7073931,\"svEnd\":7074100,\"refAllele\":\"T\",\"altAlleles\":[\"\"],\"quality\":100,\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"clingen\":[{\"chromosome\":\"12\",\"begin\":173786,\"end\":34835837,\"variantType\":\"copy_number_gain\",\"id\":\"nsv995956\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Decreased calvarial ossification\",\"Delayed gross motor development\",\"Feeding difficulties\",\"Frontal bossing\",\"Morphological abnormality of the central nervous system\",\"Patchy alopecia\"],\"phenotypeIds\":[\"HP:0002007\",\"HP:0002011\",\"HP:0002194\",\"HP:0002232\",\"HP:0005474\",\"HP:0011968\",\"MedGen:C0232466\",\"MedGen:C1862862\",\"MedGen:CN001816\",\"MedGen:CN001820\",\"MedGen:CN001989\",\"MedGen:CN004852\"],\"reciprocalOverlap\":0},{\"chromosome\":\"12\",\"begin\":282465,\"end\":7425202,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532325\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\",\"Global developmental delay\"],\"phenotypeIds\":[\"HP:0001263\",\"MedGen:CN001157\"],\"reciprocalOverlap\":0.00002},{\"chromosome\":\"12\",\"begin\":282465,\"end\":8514342,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532326\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00002},{\"chromosome\":\"12\",\"begin\":282465,\"end\":25623263,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532324\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Abnormality of cardiac morphology\",\"Agenesis of corpus callosum\",\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"phenotypeIds\":[\"HP:0001274\",\"HP:0001627\",\"MedGen:C1837248\",\"MedGen:CN001482\"],\"reciprocalOverlap\":0.00001},{\"chromosome\":\"12\",\"begin\":282465,\"end\":28568117,\"variantType\":\"copy_number_loss\",\"id\":\"nsv531493\",\"clinicalInterpretation\":\"pathogenic\",\"observedLosses\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\",\"Global developmental delay\"],\"phenotypeIds\":[\"HP:0001263\",\"MedGen:CN001157\"],\"reciprocalOverlap\":0.00001},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34533111,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532323\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":2,\"phenotypes\":[\"Coarse facial features\",\"Abnormal facial shape\",\"Abnormality of cardiac morphology\",\"Cleft upper lip\",\"Global developmental delay\",\"Hearing impairment\",\"Short stature\"],\"phenotypeIds\":[\"HP:0000280\",\"MedGen:C1854600\",\"HP:0000204\",\"HP:0000365\",\"HP:0001263\",\"HP:0001627\",\"HP:0001999\",\"HP:0004322\",\"MedGen:C0349588\",\"MedGen:C1384666\",\"MedGen:CN000197\",\"MedGen:CN001157\",\"MedGen:CN001482\",\"MedGen:CN001810\"],\"reciprocalOverlap\":0},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34756196,\"variantType\":\"copy_number_gain\",\"id\":\"nsv916406\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Ambiguous genitalia\",\"Delayed fine motor development\",\"Delayed gross motor development\",\"Delayed speech and language development\",\"Developmental delay AND/OR other significant developmental or morphological phenotypes\",\"Intellectual disability\",\"Short stature\"],\"phenotypeIds\":[\"HP:0000062\",\"HP:0000750\",\"HP:0001249\",\"HP:0002194\",\"HP:0004322\",\"HP:0010862\",\"MedGen:C0349588\",\"MedGen:C1843367\",\"MedGen:CN000062\",\"MedGen:CN000706\",\"MedGen:CN001989\",\"MedGen:CN116596\"],\"reciprocalOverlap\":0},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34756209,\"variantType\":\"copy_number_gain\",\"id\":\"nsv533931\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34761006,\"variantType\":\"copy_number_gain\",\"id\":\"nsv917315\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0},{\"chromosome\":\"12\",\"begin\":282465,\"end\":133773393,\"variantType\":\"copy_number_gain\",\"id\":\"nsv917029\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Abnormality of toe\",\"Defect in the atrial septum\",\"Downslanted palpebral fissures\",\"Frontal bossing\",\"Low-set ears\",\"Overlapping fingers\",\"Patent ductus arteriosus\",\"Sacral dimple\",\"Sandal gap\",\"Single transverse palmar crease\"],\"phenotypeIds\":[\"HP:0000369\",\"HP:0000494\",\"HP:0000954\",\"HP:0000960\",\"HP:0001631\",\"HP:0001643\",\"HP:0001780\",\"HP:0001852\",\"HP:0002007\",\"HP:0010557\",\"MedGen:C0426848\",\"MedGen:C1865016\",\"MedGen:C1873502\",\"MedGen:CN000345\",\"MedGen:CN001485\",\"MedGen:CN001496\",\"MedGen:CN001615\",\"MedGen:CN001674\",\"MedGen:CN001816\",\"MedGen:CN009386\"],\"reciprocalOverlap\":0},{\"chromosome\":\"12\",\"begin\":322142,\"end\":34079848,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532328\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00001},{\"chromosome\":\"12\",\"begin\":1367440,\"end\":20810511,\"variantType\":\"copy_number_gain\",\"id\":\"nsv995558\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Feeding difficulties\",\"Laryngomalacia\"],\"phenotypeIds\":[\"HP:0001601\",\"HP:0011968\",\"MedGen:C0232466\",\"MedGen:CN001457\"],\"reciprocalOverlap\":0.00001},{\"chromosome\":\"12\",\"begin\":2980907,\"end\":15140282,\"variantType\":\"copy_number_gain\",\"id\":\"nsv868869\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00001},{\"chromosome\":\"12\",\"begin\":6837831,\"end\":7858216,\"variantType\":\"copy_number_loss\",\"id\":\"nsv531496\",\"clinicalInterpretation\":\"pathogenic\",\"observedLosses\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00017}],\"dgv\":[{\"chromosome\":\"12\",\"begin\":6985480,\"end\":7103003,\"variantType\":\"copy_number_gain\",\"variantFreqAll\":0.00003,\"id\":\"nsv1035811\",\"sampleSize\":29084,\"observedGains\":1,\"reciprocalOverlap\":0.00144},{\"chromosome\":\"12\",\"begin\":7005694,\"end\":7115157,\"variantType\":\"insertion\",\"variantFreqAll\":0.25,\"id\":\"nsv509453\",\"sampleSize\":4,\"observedGains\":1,\"reciprocalOverlap\":0.00154},{\"chromosome\":\"12\",\"begin\":7012055,\"end\":7163058,\"variantType\":\"copy_number_gain\",\"variantFreqAll\":0.00003,\"id\":\"nsv1047373\",\"sampleSize\":29084,\"observedGains\":1,\"reciprocalOverlap\":0.00112}],\"variants\":[{\"vid\":\"12:7073932:7074100:Inverse\",\"chromosome\":\"chr12\",\"begin\":7073932,\"end\":7074100,\"refAllele\":\"T\",\"altAllele\":\"\",\"variantType\":\"inversion\",\"overlappingGenes\":[\"EMG1\"],\"transcripts\":{\"ensembl\":[{\"transcript\":\"ENST00000607161.1\",\"bioType\":\"processed_transcript\",\"introns\":\"1/5\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_variant\"]}]}}]}")] [InlineData("chr12 7040534 . ACATA A 100 PASS . . .", "{\"chromosome\":\"chr12\",\"position\":7040534,\"refAllele\":\"ACATA\",\"altAlleles\":[\"A\"],\"quality\":100,\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7040535:7040538\",\"chromosome\":\"chr12\",\"begin\":7040535,\"end\":7040538,\"refAllele\":\"CATA\",\"altAllele\":\"-\",\"variantType\":\"deletion\",\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"introns\":\"1/9\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"NM_001007026.1:c.-162-2468_-162-2465delCATA\",\"isCanonical\":true,\"proteinId\":\"NP_001007027.1\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"introns\":\"1/9\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"XM_005253672.1:c.-162-2468_-162-2465delCATA\",\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"introns\":\"1/9\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"NM_001940.3:c.-162-2468_-162-2465delCATA\",\"proteinId\":\"NP_001931.2\"}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"introns\":\"1/9\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"ENST00000356654.4:c.-162-2468_-162-2465delCATA\",\"isCanonical\":true,\"proteinId\":\"ENSP00000349076.3\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"introns\":\"1/9\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"intron_variant\"],\"hgvsc\":\"ENST00000396684.2:c.-162-2468_-162-2465delCATA\",\"proteinId\":\"ENSP00000379915.2\"}]}}]}")] [InlineData("chr12 7045274 . T 100 PASS SVTYPE=DEL;END=7084024 . .", "{\"chromosome\":\"chr12\",\"position\":7045274,\"svEnd\":7084024,\"refAllele\":\"T\",\"altAlleles\":[\"\"],\"quality\":100,\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"clingen\":[{\"chromosome\":\"12\",\"begin\":147099,\"end\":7054359,\"variantType\":\"copy_number_gain\",\"id\":\"nsv498529\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00132},{\"chromosome\":\"12\",\"begin\":173786,\"end\":34835837,\"variantType\":\"copy_number_gain\",\"id\":\"nsv995956\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Decreased calvarial ossification\",\"Delayed gross motor development\",\"Feeding difficulties\",\"Frontal bossing\",\"Morphological abnormality of the central nervous system\",\"Patchy alopecia\"],\"phenotypeIds\":[\"HP:0002007\",\"HP:0002011\",\"HP:0002194\",\"HP:0002232\",\"HP:0005474\",\"HP:0011968\",\"MedGen:C0232466\",\"MedGen:C1862862\",\"MedGen:CN001816\",\"MedGen:CN001820\",\"MedGen:CN001989\",\"MedGen:CN004852\"],\"reciprocalOverlap\":0.00112},{\"chromosome\":\"12\",\"begin\":282465,\"end\":7425202,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532325\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\",\"Global developmental delay\"],\"phenotypeIds\":[\"HP:0001263\",\"MedGen:CN001157\"],\"reciprocalOverlap\":0.00543},{\"chromosome\":\"12\",\"begin\":282465,\"end\":8514342,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532326\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00471},{\"chromosome\":\"12\",\"begin\":282465,\"end\":25623263,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532324\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Abnormality of cardiac morphology\",\"Agenesis of corpus callosum\",\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"phenotypeIds\":[\"HP:0001274\",\"HP:0001627\",\"MedGen:C1837248\",\"MedGen:CN001482\"],\"reciprocalOverlap\":0.00153},{\"chromosome\":\"12\",\"begin\":282465,\"end\":28568117,\"variantType\":\"copy_number_loss\",\"id\":\"nsv531493\",\"clinicalInterpretation\":\"pathogenic\",\"observedLosses\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\",\"Global developmental delay\"],\"phenotypeIds\":[\"HP:0001263\",\"MedGen:CN001157\"],\"reciprocalOverlap\":0.00137},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34533111,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532323\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":2,\"phenotypes\":[\"Coarse facial features\",\"Abnormal facial shape\",\"Abnormality of cardiac morphology\",\"Cleft upper lip\",\"Global developmental delay\",\"Hearing impairment\",\"Short stature\"],\"phenotypeIds\":[\"HP:0000280\",\"MedGen:C1854600\",\"HP:0000204\",\"HP:0000365\",\"HP:0001263\",\"HP:0001627\",\"HP:0001999\",\"HP:0004322\",\"MedGen:C0349588\",\"MedGen:C1384666\",\"MedGen:CN000197\",\"MedGen:CN001157\",\"MedGen:CN001482\",\"MedGen:CN001810\"],\"reciprocalOverlap\":0.00113},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34756196,\"variantType\":\"copy_number_gain\",\"id\":\"nsv916406\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Ambiguous genitalia\",\"Delayed fine motor development\",\"Delayed gross motor development\",\"Delayed speech and language development\",\"Developmental delay AND/OR other significant developmental or morphological phenotypes\",\"Intellectual disability\",\"Short stature\"],\"phenotypeIds\":[\"HP:0000062\",\"HP:0000750\",\"HP:0001249\",\"HP:0002194\",\"HP:0004322\",\"HP:0010862\",\"MedGen:C0349588\",\"MedGen:C1843367\",\"MedGen:CN000062\",\"MedGen:CN000706\",\"MedGen:CN001989\",\"MedGen:CN116596\"],\"reciprocalOverlap\":0.00112},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34756209,\"variantType\":\"copy_number_gain\",\"id\":\"nsv533931\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00112},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34761006,\"variantType\":\"copy_number_gain\",\"id\":\"nsv917315\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00112},{\"chromosome\":\"12\",\"begin\":282465,\"end\":133773393,\"variantType\":\"copy_number_gain\",\"id\":\"nsv917029\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Abnormality of toe\",\"Defect in the atrial septum\",\"Downslanted palpebral fissures\",\"Frontal bossing\",\"Low-set ears\",\"Overlapping fingers\",\"Patent ductus arteriosus\",\"Sacral dimple\",\"Sandal gap\",\"Single transverse palmar crease\"],\"phenotypeIds\":[\"HP:0000369\",\"HP:0000494\",\"HP:0000954\",\"HP:0000960\",\"HP:0001631\",\"HP:0001643\",\"HP:0001780\",\"HP:0001852\",\"HP:0002007\",\"HP:0010557\",\"MedGen:C0426848\",\"MedGen:C1865016\",\"MedGen:C1873502\",\"MedGen:CN000345\",\"MedGen:CN001485\",\"MedGen:CN001496\",\"MedGen:CN001615\",\"MedGen:CN001674\",\"MedGen:CN001816\",\"MedGen:CN009386\"],\"reciprocalOverlap\":0.00029},{\"chromosome\":\"12\",\"begin\":322142,\"end\":34079848,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532328\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00115},{\"chromosome\":\"12\",\"begin\":1367440,\"end\":20810511,\"variantType\":\"copy_number_gain\",\"id\":\"nsv995558\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Feeding difficulties\",\"Laryngomalacia\"],\"phenotypeIds\":[\"HP:0001601\",\"HP:0011968\",\"MedGen:C0232466\",\"MedGen:CN001457\"],\"reciprocalOverlap\":0.00199},{\"chromosome\":\"12\",\"begin\":2980907,\"end\":15140282,\"variantType\":\"copy_number_gain\",\"id\":\"nsv868869\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.00319},{\"chromosome\":\"12\",\"begin\":6837831,\"end\":7858216,\"variantType\":\"copy_number_loss\",\"id\":\"nsv531496\",\"clinicalInterpretation\":\"pathogenic\",\"observedLosses\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"reciprocalOverlap\":0.03798}],\"dgv\":[{\"chromosome\":\"12\",\"begin\":6985480,\"end\":7103003,\"variantType\":\"copy_number_gain\",\"variantFreqAll\":0.00003,\"id\":\"nsv1035811\",\"sampleSize\":29084,\"observedGains\":1,\"reciprocalOverlap\":0.32972},{\"chromosome\":\"12\",\"begin\":7005694,\"end\":7115157,\"variantType\":\"insertion\",\"variantFreqAll\":0.25,\"id\":\"nsv509453\",\"sampleSize\":4,\"observedGains\":1,\"reciprocalOverlap\":0.354},{\"chromosome\":\"12\",\"begin\":7012055,\"end\":7163058,\"variantType\":\"copy_number_gain\",\"variantFreqAll\":0.00003,\"id\":\"nsv1047373\",\"sampleSize\":29084,\"observedGains\":1,\"reciprocalOverlap\":0.25662},{\"chromosome\":\"12\",\"begin\":7053085,\"end\":7063682,\"variantType\":\"copy_number_gain\",\"variantFreqAll\":0.00006,\"id\":\"nsv557262\",\"sampleSize\":17421,\"observedGains\":1,\"reciprocalOverlap\":0.2735},{\"chromosome\":\"12\",\"begin\":7054324,\"end\":7054491,\"variantType\":\"copy_number_gain\",\"id\":\"esv5830\",\"sampleSize\":1,\"reciprocalOverlap\":0.00434},{\"chromosome\":\"12\",\"begin\":7054931,\"end\":7065635,\"variantType\":\"copy_number_loss\",\"variantFreqAll\":0.00011,\"id\":\"dgv2324n54\",\"sampleSize\":17421,\"observedLosses\":2,\"reciprocalOverlap\":0.27626},{\"chromosome\":\"12\",\"begin\":7055492,\"end\":7070110,\"variantType\":\"copy_number_loss\",\"variantFreqAll\":0.00017,\"id\":\"dgv2325n54\",\"sampleSize\":17421,\"observedLosses\":3,\"reciprocalOverlap\":0.37726},{\"chromosome\":\"12\",\"begin\":7058967,\"end\":7059349,\"variantType\":\"copy_number_gain\",\"variantFreqAll\":0.00541,\"id\":\"esv3356433\",\"sampleSize\":185,\"observedGains\":1,\"reciprocalOverlap\":0.00988},{\"chromosome\":\"12\",\"begin\":7059100,\"end\":7059327,\"variantType\":\"insertion\",\"variantFreqAll\":1,\"id\":\"nsv513351\",\"sampleSize\":1,\"observedGains\":1,\"reciprocalOverlap\":0.00588},{\"chromosome\":\"12\",\"begin\":7059146,\"end\":7059890,\"variantType\":\"insertion\",\"variantFreqAll\":0.33333,\"id\":\"esv994245\",\"sampleSize\":3,\"observedGains\":1,\"reciprocalOverlap\":0.01923},{\"chromosome\":\"12\",\"begin\":7059221,\"end\":7059221,\"variantType\":\"insertion\",\"variantFreqAll\":0.01622,\"id\":\"esv3381477\",\"sampleSize\":185,\"observedGains\":3},{\"chromosome\":\"12\",\"begin\":7060846,\"end\":7070110,\"variantType\":\"copy_number_loss\",\"variantFreqAll\":0.00011,\"id\":\"dgv2326n54\",\"sampleSize\":17421,\"observedLosses\":2,\"reciprocalOverlap\":0.2391},{\"chromosome\":\"12\",\"begin\":7063701,\"end\":7070500,\"variantType\":\"copy_number_loss\",\"variantFreqAll\":1,\"id\":\"nsv952791\",\"sampleSize\":1,\"observedLosses\":1,\"reciprocalOverlap\":0.17548}],\"variants\":[{\"vid\":\"12:7045275:7084024\",\"chromosome\":\"chr12\",\"begin\":7045275,\"end\":7084024,\"refAllele\":\"T\",\"altAllele\":\"\",\"variantType\":\"deletion\",\"regulatoryRegions\":[{\"id\":\"ENSR00000361210\",\"type\":\"promoter\",\"consequence\":[\"regulatory_region_ablation\",\"regulatory_region_variant\"]},{\"id\":\"ENSR00000361211\",\"type\":\"promoter\",\"consequence\":[\"regulatory_region_ablation\",\"regulatory_region_variant\"]},{\"id\":\"ENSR00000361212\",\"type\":\"CTCF_binding_site\",\"consequence\":[\"regulatory_region_ablation\",\"regulatory_region_variant\"]},{\"id\":\"ENSR00000361213\",\"type\":\"promoter\",\"consequence\":[\"regulatory_region_ablation\",\"regulatory_region_variant\"]},{\"id\":\"ENSR00000361214\",\"type\":\"promoter\",\"consequence\":[\"regulatory_region_ablation\",\"regulatory_region_variant\"]},{\"id\":\"ENSR00000361215\",\"type\":\"promoter\",\"consequence\":[\"regulatory_region_ablation\",\"regulatory_region_variant\"]},{\"id\":\"ENSR00000361216\",\"type\":\"promoter\",\"consequence\":[\"regulatory_region_ablation\",\"regulatory_region_variant\"]}],\"overlappingGenes\":[\"ATN1\",\"C12orf57\",\"U47924.2\",\"RNU7-1\",\"PTPN6\",\"MIR200CHG\",\"EMG1\",\"MIR200C\",\"MIR141\",\"U47924.1\",\"PHB2\",\"SCARNA12\"],\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"exons\":\"5-10/10\",\"introns\":\"5-9/9\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"transcript_truncation\",\"unidirectional_gene_fusion\"],\"geneFusion\":{\"exon\":5,\"fusions\":[{\"hgvsc\":\"ATN1{NM_001007026.1}:c.1_844_EMG1{NM_006331.7}:c.412+170_735\",\"intron\":3},{\"hgvsc\":\"ATN1{NM_001007026.1}:c.1_844_EMG1{NM_001320049.1}:c.409+170_582\",\"intron\":4}]},\"isCanonical\":true,\"proteinId\":\"NP_001007027.1\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"exons\":\"5-10/10\",\"introns\":\"5-9/9\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"transcript_truncation\",\"unidirectional_gene_fusion\"],\"geneFusion\":{\"exon\":5,\"fusions\":[{\"hgvsc\":\"ATN1{XM_005253672.1}:c.1_841_EMG1{NM_006331.7}:c.412+170_735\",\"intron\":3},{\"hgvsc\":\"ATN1{XM_005253672.1}:c.1_841_EMG1{NM_001320049.1}:c.409+170_582\",\"intron\":4}]},\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"exons\":\"5-10/10\",\"introns\":\"5-9/9\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"transcript_truncation\",\"unidirectional_gene_fusion\"],\"geneFusion\":{\"exon\":5,\"fusions\":[{\"hgvsc\":\"ATN1{NM_001940.3}:c.1_844_EMG1{NM_006331.7}:c.412+170_735\",\"intron\":3},{\"hgvsc\":\"ATN1{NM_001940.3}:c.1_844_EMG1{NM_001320049.1}:c.409+170_582\",\"intron\":4}]},\"proteinId\":\"NP_001931.2\"},{\"transcript\":\"NM_006331.7\",\"bioType\":\"protein_coding\",\"exons\":\"1-3/6\",\"introns\":\"1-3/5\",\"geneId\":\"10436\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\",\"unidirectional_gene_fusion\"],\"geneFusion\":{\"intron\":3,\"fusions\":[{\"hgvsc\":\"ATN1{NM_001007026.1}:c.1_844_EMG1{NM_006331.7}:c.412+170_735\",\"exon\":5},{\"hgvsc\":\"ATN1{XM_005253672.1}:c.1_841_EMG1{NM_006331.7}:c.412+170_735\",\"exon\":5},{\"hgvsc\":\"ATN1{NM_001940.3}:c.1_844_EMG1{NM_006331.7}:c.412+170_735\",\"exon\":5}]},\"isCanonical\":true,\"proteinId\":\"NP_006322.4\"},{\"transcript\":\"NM_001320049.1\",\"bioType\":\"protein_coding\",\"exons\":\"1-4/6\",\"introns\":\"1-4/5\",\"geneId\":\"10436\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\",\"unidirectional_gene_fusion\"],\"geneFusion\":{\"intron\":4,\"fusions\":[{\"hgvsc\":\"ATN1{NM_001007026.1}:c.1_844_EMG1{NM_001320049.1}:c.409+170_582\",\"exon\":5},{\"hgvsc\":\"ATN1{XM_005253672.1}:c.1_841_EMG1{NM_001320049.1}:c.409+170_582\",\"exon\":5},{\"hgvsc\":\"ATN1{NM_001940.3}:c.1_844_EMG1{NM_001320049.1}:c.409+170_582\",\"exon\":5}]},\"proteinId\":\"NP_001306978.1\"},{\"transcript\":\"NR_135131.1\",\"bioType\":\"misc_RNA\",\"exons\":\"1-3/8\",\"introns\":\"1-3/7\",\"geneId\":\"10436\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\"]}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"exons\":\"5-10/10\",\"introns\":\"5-9/9\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"transcript_truncation\",\"unidirectional_gene_fusion\"],\"geneFusion\":{\"exon\":5,\"fusions\":[{\"hgvsc\":\"ATN1{ENST00000356654.4}:c.1_844_EMG1{ENST00000261406.6}:c.409+170_732\",\"intron\":4}]},\"isCanonical\":true,\"proteinId\":\"ENSP00000349076.3\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"exons\":\"5-10/10\",\"introns\":\"5-9/9\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"transcript_truncation\",\"unidirectional_gene_fusion\"],\"geneFusion\":{\"exon\":5,\"fusions\":[{\"hgvsc\":\"ATN1{ENST00000396684.2}:c.1_844_EMG1{ENST00000261406.6}:c.409+170_732\",\"intron\":4}]},\"proteinId\":\"ENSP00000379915.2\"},{\"transcript\":\"ENST00000541029.1\",\"bioType\":\"retained_intron\",\"exons\":\"1-2/2\",\"introns\":\"1/1\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"transcript_ablation\"]},{\"transcript\":\"ENST00000537488.1\",\"bioType\":\"retained_intron\",\"exons\":\"1-3/3\",\"introns\":\"1-2/2\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"transcript_ablation\"]},{\"transcript\":\"ENST00000607161.1\",\"bioType\":\"processed_transcript\",\"exons\":\"1-3/6\",\"introns\":\"1-3/5\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\"]},{\"transcript\":\"ENST00000261406.6\",\"bioType\":\"protein_coding\",\"exons\":\"1-4/7\",\"introns\":\"1-4/6\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\",\"unidirectional_gene_fusion\"],\"geneFusion\":{\"intron\":4,\"fusions\":[{\"hgvsc\":\"ATN1{ENST00000356654.4}:c.1_844_EMG1{ENST00000261406.6}:c.409+170_732\",\"exon\":5},{\"hgvsc\":\"ATN1{ENST00000396684.2}:c.1_844_EMG1{ENST00000261406.6}:c.409+170_732\",\"exon\":5}]},\"isCanonical\":true,\"proteinId\":\"ENSP00000476966.1\"},{\"transcript\":\"ENST00000546220.1\",\"bioType\":\"processed_transcript\",\"exons\":\"1-3/6\",\"introns\":\"1-3/5\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\"]},{\"transcript\":\"ENST00000539440.1\",\"bioType\":\"retained_intron\",\"exons\":\"1-3/4\",\"introns\":\"1-2/3\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\"]},{\"transcript\":\"ENST00000564245.1\",\"bioType\":\"processed_transcript\",\"exons\":\"1-3/8\",\"introns\":\"1-3/7\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\"]},{\"transcript\":\"ENST00000451846.2\",\"bioType\":\"retained_intron\",\"exons\":\"1-2/2\",\"introns\":\"1/1\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\"]},{\"transcript\":\"ENST00000539535.2\",\"bioType\":\"processed_transcript\",\"exons\":\"1-3/8\",\"introns\":\"1-3/7\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\"]},{\"transcript\":\"ENST00000541016.1\",\"bioType\":\"processed_transcript\",\"exons\":\"1-3/5\",\"introns\":\"1-3/4\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\"]},{\"transcript\":\"ENST00000539196.1\",\"bioType\":\"processed_transcript\",\"exons\":\"1-3/5\",\"introns\":\"1-3/4\",\"geneId\":\"ENSG00000126749\",\"hgnc\":\"EMG1\",\"consequence\":[\"transcript_truncation\"]}]}}]}")] [InlineData("chr12 7067124 . GGCC ATTG 100 PASS . . .", "{\"chromosome\":\"chr12\",\"position\":7067124,\"refAllele\":\"GGCC\",\"altAlleles\":[\"ATTG\"],\"quality\":100,\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7067124:7067127:ATTG\",\"chromosome\":\"chr12\",\"begin\":7067124,\"end\":7067127,\"refAllele\":\"GGCC\",\"altAllele\":\"ATTG\",\"variantType\":\"MNV\",\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_080548.4\",\"bioType\":\"protein_coding\",\"codons\":\"GGCCcc/ATTGcc\",\"aminoAcids\":\"GP/IA\",\"cdnaPos\":\"1404-1407\",\"cdsPos\":\"1255-1258\",\"exons\":\"11/16\",\"proteinPos\":\"419-420\",\"geneId\":\"5777\",\"hgnc\":\"PTPN6\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_080548.4:c.1255_1258delGGCCinsATTG\",\"hgvsp\":\"NP_536858.1:p.(Gly419_Pro420delinsIleAla)\",\"proteinId\":\"NP_536858.1\"},{\"transcript\":\"XM_005253719.1\",\"bioType\":\"protein_coding\",\"codons\":\"GGCCcc/ATTGcc\",\"aminoAcids\":\"GP/IA\",\"cdnaPos\":\"1248-1251\",\"cdsPos\":\"1132-1135\",\"exons\":\"10/15\",\"proteinPos\":\"378-379\",\"geneId\":\"5777\",\"hgnc\":\"PTPN6\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"XM_005253719.1:c.1132_1135delGGCCinsATTG\",\"hgvsp\":\"XP_005253776.1:p.(Gly378_Pro379delinsIleAla)\",\"proteinId\":\"XP_005253776.1\"},{\"transcript\":\"NM_002831.5\",\"bioType\":\"protein_coding\",\"codons\":\"GGCCcc/ATTGcc\",\"aminoAcids\":\"GP/IA\",\"cdnaPos\":\"1491-1494\",\"cdsPos\":\"1249-1252\",\"exons\":\"11/16\",\"proteinPos\":\"417-418\",\"geneId\":\"5777\",\"hgnc\":\"PTPN6\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_002831.5:c.1249_1252delGGCCinsATTG\",\"hgvsp\":\"NP_002822.2:p.(Gly417_Pro418delinsIleAla)\",\"proteinId\":\"NP_002822.2\"},{\"transcript\":\"NM_080549.3\",\"bioType\":\"protein_coding\",\"codons\":\"GGCCcc/ATTGcc\",\"aminoAcids\":\"GP/IA\",\"cdnaPos\":\"1491-1494\",\"cdsPos\":\"1249-1252\",\"exons\":\"11/16\",\"proteinPos\":\"417-418\",\"geneId\":\"5777\",\"hgnc\":\"PTPN6\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_080549.3:c.1249_1252delGGCCinsATTG\",\"hgvsp\":\"NP_536859.1:p.(Gly417_Pro418delinsIleAla)\",\"isCanonical\":true,\"proteinId\":\"NP_536859.1\"}],\"ensembl\":[{\"transcript\":\"ENST00000542848.1\",\"bioType\":\"nonsense_mediated_decay\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"ENSP00000444805.1\"},{\"transcript\":\"ENST00000543120.1\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000399448.1\",\"bioType\":\"protein_coding\",\"codons\":\"GGCCcc/ATTGcc\",\"aminoAcids\":\"GP/IA\",\"cdnaPos\":\"1404-1407\",\"cdsPos\":\"1255-1258\",\"exons\":\"11/16\",\"proteinPos\":\"419-420\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000399448.1:c.1255_1258delGGCCinsATTG\",\"hgvsp\":\"ENSP00000382376.1:p.(Gly419_Pro420delinsIleAla)\",\"proteinId\":\"ENSP00000382376.1\"},{\"transcript\":\"ENST00000534900.1\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000447931.2\",\"bioType\":\"protein_coding\",\"codons\":\"GGCCcc/ATTGcc\",\"aminoAcids\":\"GP/IA\",\"cdnaPos\":\"1248-1251\",\"cdsPos\":\"1132-1135\",\"exons\":\"10/15\",\"proteinPos\":\"378-379\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000447931.2:c.1132_1135delGGCCinsATTG\",\"hgvsp\":\"ENSP00000415979.2:p.(Gly378_Pro379delinsIleAla)\",\"proteinId\":\"ENSP00000415979.2\"},{\"transcript\":\"ENST00000538318.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000538715.1\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"ENSP00000438740.1\"},{\"transcript\":\"ENST00000318974.9\",\"bioType\":\"protein_coding\",\"codons\":\"GGCCcc/ATTGcc\",\"aminoAcids\":\"GP/IA\",\"cdnaPos\":\"1493-1496\",\"cdsPos\":\"1249-1252\",\"exons\":\"11/16\",\"proteinPos\":\"417-418\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000318974.9:c.1249_1252delGGCCinsATTG\",\"hgvsp\":\"ENSP00000326010.9:p.(Gly417_Pro418delinsIleAla)\",\"proteinId\":\"ENSP00000326010.9\"},{\"transcript\":\"ENST00000456013.1\",\"bioType\":\"protein_coding\",\"codons\":\"GGCCcc/ATTGcc\",\"aminoAcids\":\"GP/IA\",\"cdnaPos\":\"1491-1494\",\"cdsPos\":\"1249-1252\",\"exons\":\"11/16\",\"proteinPos\":\"417-418\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000456013.1:c.1249_1252delGGCCinsATTG\",\"hgvsp\":\"ENSP00000391592.1:p.(Gly417_Pro418delinsIleAla)\",\"isCanonical\":true,\"proteinId\":\"ENSP00000391592.1\"},{\"transcript\":\"ENST00000543744.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000540740.1\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000416215.2\",\"bioType\":\"retained_intron\",\"cdnaPos\":\"1657-1660\",\"exons\":\"10/15\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"non_coding_transcript_exon_variant\"],\"hgvsc\":\"ENST00000416215.2:n.1657_1660delGGCCinsATTG\"},{\"transcript\":\"ENST00000545153.1\",\"bioType\":\"nonsense_mediated_decay\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"ENSP00000476175.1\"},{\"transcript\":\"ENST00000535462.1\",\"bioType\":\"nonsense_mediated_decay\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"ENSP00000441044.1\"},{\"transcript\":\"ENST00000541698.1\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"ENSP00000445646.1\"},{\"transcript\":\"ENST00000542462.1\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"ENSP00000440114.1\"},{\"transcript\":\"ENST00000542277.1\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000536013.1\",\"bioType\":\"nonsense_mediated_decay\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"ENSP00000446345.1\"},{\"transcript\":\"ENST00000539365.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000539029.1\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000542761.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000537533.1\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111679\",\"hgnc\":\"PTPN6\",\"consequence\":[\"upstream_gene_variant\"]}]}}]}")] [InlineData("chr12 7033330 . T 100 PASS SVTYPE=INS;END=7033330;SVLEN=1350 . .", "{\"chromosome\":\"chr12\",\"position\":7033330,\"svEnd\":7033330,\"refAllele\":\"T\",\"altAlleles\":[\"\"],\"quality\":100,\"filters\":[\"PASS\"],\"svLength\":1350,\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"clingen\":[{\"chromosome\":\"12\",\"begin\":147099,\"end\":7054359,\"variantType\":\"copy_number_gain\",\"id\":\"nsv498529\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"]},{\"chromosome\":\"12\",\"begin\":173786,\"end\":34835837,\"variantType\":\"copy_number_gain\",\"id\":\"nsv995956\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Decreased calvarial ossification\",\"Delayed gross motor development\",\"Feeding difficulties\",\"Frontal bossing\",\"Morphological abnormality of the central nervous system\",\"Patchy alopecia\"],\"phenotypeIds\":[\"HP:0002007\",\"HP:0002011\",\"HP:0002194\",\"HP:0002232\",\"HP:0005474\",\"HP:0011968\",\"MedGen:C0232466\",\"MedGen:C1862862\",\"MedGen:CN001816\",\"MedGen:CN001820\",\"MedGen:CN001989\",\"MedGen:CN004852\"]},{\"chromosome\":\"12\",\"begin\":282465,\"end\":7425202,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532325\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\",\"Global developmental delay\"],\"phenotypeIds\":[\"HP:0001263\",\"MedGen:CN001157\"]},{\"chromosome\":\"12\",\"begin\":282465,\"end\":8514342,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532326\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"]},{\"chromosome\":\"12\",\"begin\":282465,\"end\":25623263,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532324\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Abnormality of cardiac morphology\",\"Agenesis of corpus callosum\",\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"phenotypeIds\":[\"HP:0001274\",\"HP:0001627\",\"MedGen:C1837248\",\"MedGen:CN001482\"]},{\"chromosome\":\"12\",\"begin\":282465,\"end\":28568117,\"variantType\":\"copy_number_loss\",\"id\":\"nsv531493\",\"clinicalInterpretation\":\"pathogenic\",\"observedLosses\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\",\"Global developmental delay\"],\"phenotypeIds\":[\"HP:0001263\",\"MedGen:CN001157\"]},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34533111,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532323\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":2,\"phenotypes\":[\"Coarse facial features\",\"Abnormal facial shape\",\"Abnormality of cardiac morphology\",\"Cleft upper lip\",\"Global developmental delay\",\"Hearing impairment\",\"Short stature\"],\"phenotypeIds\":[\"HP:0000280\",\"MedGen:C1854600\",\"HP:0000204\",\"HP:0000365\",\"HP:0001263\",\"HP:0001627\",\"HP:0001999\",\"HP:0004322\",\"MedGen:C0349588\",\"MedGen:C1384666\",\"MedGen:CN000197\",\"MedGen:CN001157\",\"MedGen:CN001482\",\"MedGen:CN001810\"]},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34756196,\"variantType\":\"copy_number_gain\",\"id\":\"nsv916406\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Ambiguous genitalia\",\"Delayed fine motor development\",\"Delayed gross motor development\",\"Delayed speech and language development\",\"Developmental delay AND/OR other significant developmental or morphological phenotypes\",\"Intellectual disability\",\"Short stature\"],\"phenotypeIds\":[\"HP:0000062\",\"HP:0000750\",\"HP:0001249\",\"HP:0002194\",\"HP:0004322\",\"HP:0010862\",\"MedGen:C0349588\",\"MedGen:C1843367\",\"MedGen:CN000062\",\"MedGen:CN000706\",\"MedGen:CN001989\",\"MedGen:CN116596\"]},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34756209,\"variantType\":\"copy_number_gain\",\"id\":\"nsv533931\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"]},{\"chromosome\":\"12\",\"begin\":282465,\"end\":34761006,\"variantType\":\"copy_number_gain\",\"id\":\"nsv917315\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"]},{\"chromosome\":\"12\",\"begin\":282465,\"end\":133773393,\"variantType\":\"copy_number_gain\",\"id\":\"nsv917029\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Abnormality of toe\",\"Defect in the atrial septum\",\"Downslanted palpebral fissures\",\"Frontal bossing\",\"Low-set ears\",\"Overlapping fingers\",\"Patent ductus arteriosus\",\"Sacral dimple\",\"Sandal gap\",\"Single transverse palmar crease\"],\"phenotypeIds\":[\"HP:0000369\",\"HP:0000494\",\"HP:0000954\",\"HP:0000960\",\"HP:0001631\",\"HP:0001643\",\"HP:0001780\",\"HP:0001852\",\"HP:0002007\",\"HP:0010557\",\"MedGen:C0426848\",\"MedGen:C1865016\",\"MedGen:C1873502\",\"MedGen:CN000345\",\"MedGen:CN001485\",\"MedGen:CN001496\",\"MedGen:CN001615\",\"MedGen:CN001674\",\"MedGen:CN001816\",\"MedGen:CN009386\"]},{\"chromosome\":\"12\",\"begin\":322142,\"end\":34079848,\"variantType\":\"copy_number_gain\",\"id\":\"nsv532328\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"]},{\"chromosome\":\"12\",\"begin\":1367440,\"end\":20810511,\"variantType\":\"copy_number_gain\",\"id\":\"nsv995558\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"validated\":true,\"phenotypes\":[\"Feeding difficulties\",\"Laryngomalacia\"],\"phenotypeIds\":[\"HP:0001601\",\"HP:0011968\",\"MedGen:C0232466\",\"MedGen:CN001457\"]},{\"chromosome\":\"12\",\"begin\":2980907,\"end\":15140282,\"variantType\":\"copy_number_gain\",\"id\":\"nsv868869\",\"clinicalInterpretation\":\"pathogenic\",\"observedGains\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"]},{\"chromosome\":\"12\",\"begin\":6837831,\"end\":7858216,\"variantType\":\"copy_number_loss\",\"id\":\"nsv531496\",\"clinicalInterpretation\":\"pathogenic\",\"observedLosses\":1,\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"]}],\"dgv\":[{\"chromosome\":\"12\",\"begin\":6889463,\"end\":7041469,\"variantType\":\"copy_number_loss\",\"variantFreqAll\":0.02105,\"id\":\"nsv832324\",\"sampleSize\":95,\"observedLosses\":2},{\"chromosome\":\"12\",\"begin\":6948468,\"end\":7033823,\"variantType\":\"copy_number_loss\",\"variantFreqAll\":0.00006,\"id\":\"nsv557261\",\"sampleSize\":17421,\"observedLosses\":1},{\"chromosome\":\"12\",\"begin\":6985480,\"end\":7103003,\"variantType\":\"copy_number_gain\",\"variantFreqAll\":0.00003,\"id\":\"nsv1035811\",\"sampleSize\":29084,\"observedGains\":1},{\"chromosome\":\"12\",\"begin\":7005694,\"end\":7115157,\"variantType\":\"insertion\",\"variantFreqAll\":0.25,\"id\":\"nsv509453\",\"sampleSize\":4,\"observedGains\":1},{\"chromosome\":\"12\",\"begin\":7012055,\"end\":7163058,\"variantType\":\"copy_number_gain\",\"variantFreqAll\":0.00003,\"id\":\"nsv1047373\",\"sampleSize\":29084,\"observedGains\":1}],\"variants\":[{\"vid\":\"12:7033331:7033330:INS\",\"chromosome\":\"chr12\",\"begin\":7033331,\"end\":7033330,\"refAllele\":\"T\",\"altAllele\":\"\",\"variantType\":\"insertion\",\"regulatoryRegions\":[{\"id\":\"ENSR00000361206\",\"type\":\"CTCF_binding_site\",\"consequence\":[\"regulatory_region_variant\"]}]}]}")] [InlineData("chr12 7043410 . C CTCC 50 PASS . . .", "{\"chromosome\":\"chr12\",\"position\":7043410,\"refAllele\":\"C\",\"altAlleles\":[\"CTCC\"],\"quality\":50,\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7043411:7043410:TCC\",\"chromosome\":\"chr12\",\"begin\":7043411,\"end\":7043410,\"refAllele\":\"-\",\"altAllele\":\"TCC\",\"variantType\":\"insertion\",\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"codons\":\"-/TCC\",\"aminoAcids\":\"-/S\",\"cdnaPos\":\"336-337\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"inframe_insertion\"],\"hgvsc\":\"NM_001007026.1:c.100_102dupTCC\",\"hgvsp\":\"NP_001007027.1:p.(Ser34dup)\",\"isCanonical\":true,\"proteinId\":\"NP_001007027.1\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"codons\":\"-/TCC\",\"aminoAcids\":\"-/S\",\"cdnaPos\":\"429-430\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"inframe_insertion\"],\"hgvsc\":\"XM_005253672.1:c.100_102dupTCC\",\"hgvsp\":\"XP_005253729.1:p.(Ser34dup)\",\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"codons\":\"-/TCC\",\"aminoAcids\":\"-/S\",\"cdnaPos\":\"329-330\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"inframe_insertion\"],\"hgvsc\":\"NM_001940.3:c.100_102dupTCC\",\"hgvsp\":\"NP_001931.2:p.(Ser34dup)\",\"proteinId\":\"NP_001931.2\"}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"codons\":\"-/TCC\",\"aminoAcids\":\"-/S\",\"cdnaPos\":\"336-337\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"inframe_insertion\"],\"hgvsc\":\"ENST00000356654.4:c.100_102dupTCC\",\"hgvsp\":\"ENSP00000349076.3:p.(Ser34dup)\",\"isCanonical\":true,\"proteinId\":\"ENSP00000349076.3\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"codons\":\"-/TCC\",\"aminoAcids\":\"-/S\",\"cdnaPos\":\"333-334\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"inframe_insertion\"],\"hgvsc\":\"ENST00000396684.2:c.100_102dupTCC\",\"hgvsp\":\"ENSP00000379915.2:p.(Ser34dup)\",\"proteinId\":\"ENSP00000379915.2\"},{\"transcript\":\"ENST00000541029.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000537488.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]}]}}]}")] [InlineData("chr12 7043410 . CT GATG 50 PASS . . .", "{\"chromosome\":\"chr12\",\"position\":7043410,\"refAllele\":\"CT\",\"altAlleles\":[\"GATG\"],\"quality\":50,\"filters\":[\"PASS\"],\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7043410:7043411:GATG\",\"chromosome\":\"chr12\",\"begin\":7043410,\"end\":7043411,\"refAllele\":\"CT\",\"altAllele\":\"GATG\",\"variantType\":\"indel\",\"transcripts\":{\"refSeq\":[{\"transcript\":\"NM_001007026.1\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"336-337\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_001007026.1:c.99_100delCTinsGATG\",\"hgvsp\":\"NP_001007027.1:p.(Ser34MetfsTer27)\",\"isCanonical\":true,\"proteinId\":\"NP_001007027.1\"},{\"transcript\":\"XM_005253672.1\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"429-430\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"XM_005253672.1:c.99_100delCTinsGATG\",\"hgvsp\":\"XP_005253729.1:p.(Ser34MetfsTer27)\",\"proteinId\":\"XP_005253729.1\"},{\"transcript\":\"NM_001940.3\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"329-330\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_001940.3:c.99_100delCTinsGATG\",\"hgvsp\":\"NP_001931.2:p.(Ser34MetfsTer27)\",\"proteinId\":\"NP_001931.2\"}],\"ensembl\":[{\"transcript\":\"ENST00000356654.4\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"336-337\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"ENST00000356654.4:c.99_100delCTinsGATG\",\"hgvsp\":\"ENSP00000349076.3:p.(Ser34MetfsTer27)\",\"isCanonical\":true,\"proteinId\":\"ENSP00000349076.3\"},{\"transcript\":\"ENST00000396684.2\",\"bioType\":\"protein_coding\",\"codons\":\"gcCTcc/gcGATGcc\",\"aminoAcids\":\"AS/AMX\",\"cdnaPos\":\"333-334\",\"cdsPos\":\"99-100\",\"exons\":\"3/10\",\"proteinPos\":\"33-34\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"ENST00000396684.2:c.99_100delCTinsGATG\",\"hgvsp\":\"ENSP00000379915.2:p.(Ser34MetfsTer27)\",\"proteinId\":\"ENSP00000379915.2\"},{\"transcript\":\"ENST00000541029.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000537488.1\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]}]}}]}")] public void Annotate_with_SA(string vcfLine, string expectedResults) { var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, null, vcfLine); var sb = annotatedPosition.GetJsonStringBuilder(); var observedResults = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(expectedResults, observedResults); } #if (NI_ALLELE) [Theory] [InlineData("chr12 7048190 . G A,* 322 PASS SB=0.1234567 . .", "{\"chromosome\":\"chr12\",\"position\":7048190,\"refAllele\":\"G\",\"altAlleles\":[\"A\",\"*\"],\"quality\":322,\"filters\":[\"PASS\"],\"strandBias\":0.123457,\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7048190:A\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"A\",\"variantType\":\"SNV\",\"transcripts\":[{\"transcript\":\"ENST00000356654.4\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000356654.4:c.3064G>A\",\"hgvsp\":\"ENSP00000349076.3:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000349076.3\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"NM_001007026.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001007026.1:c.3064G>A\",\"hgvsp\":\"NP_001007027.1:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001007027.1\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000396684.2\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3298\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000396684.2:c.3064G>A\",\"hgvsp\":\"ENSP00000379915.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000379915.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"NM_001940.3\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3294\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001940.3:c.3064G>A\",\"hgvsp\":\"NP_001931.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001931.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000541029.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000537488.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000538392.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000542222.1\",\"source\":\"Ensembl\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"NM_001301836.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288765.1\"},{\"transcript\":\"ENST00000545581.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440602.1\"},{\"transcript\":\"NM_001301834.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"NP_001288763.1\"},{\"transcript\":\"ENST00000607421.1\",\"source\":\"Ensembl\",\"bioType\":\"antisense\",\"geneId\":\"ENSG00000272173\",\"hgnc\":\"U47924.2\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000458811.1\",\"source\":\"Ensembl\",\"bioType\":\"snRNA\",\"geneId\":\"ENSG00000238923\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"NR_023317.1\",\"source\":\"RefSeq\",\"bioType\":\"snRNA\",\"geneId\":\"100147744\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000544681.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000475422.1\"},{\"transcript\":\"NM_001301838.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288767.1\"},{\"transcript\":\"NM_001301837.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288766.1\"},{\"transcript\":\"NR_126035.1\",\"source\":\"RefSeq\",\"bioType\":\"misc_RNA\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"NM_138425.3\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_612434.1\"},{\"transcript\":\"ENST00000537087.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440937.1\"},{\"transcript\":\"ENST00000229281.5\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"ENSP00000229281.5\"}]},{\"vid\":\"12:7048190:*\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"*\",\"variantType\":\"non_informative_allele\"}]}")] [InlineData("chr12 7048190 . G A,<*> 322 PASS SB=0.1234567 . .", "{\"chromosome\":\"chr12\",\"position\":7048190,\"refAllele\":\"G\",\"altAlleles\":[\"A\",\"<*>\"],\"quality\":322,\"filters\":[\"PASS\"],\"strandBias\":0.123457,\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7048190:A\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"A\",\"variantType\":\"SNV\",\"transcripts\":[{\"transcript\":\"ENST00000356654.4\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000356654.4:c.3064G>A\",\"hgvsp\":\"ENSP00000349076.3:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000349076.3\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"NM_001007026.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001007026.1:c.3064G>A\",\"hgvsp\":\"NP_001007027.1:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001007027.1\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000396684.2\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3298\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000396684.2:c.3064G>A\",\"hgvsp\":\"ENSP00000379915.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000379915.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"NM_001940.3\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3294\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001940.3:c.3064G>A\",\"hgvsp\":\"NP_001931.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001931.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000541029.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000537488.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000538392.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000542222.1\",\"source\":\"Ensembl\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"NM_001301836.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288765.1\"},{\"transcript\":\"ENST00000545581.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440602.1\"},{\"transcript\":\"NM_001301834.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"NP_001288763.1\"},{\"transcript\":\"ENST00000607421.1\",\"source\":\"Ensembl\",\"bioType\":\"antisense\",\"geneId\":\"ENSG00000272173\",\"hgnc\":\"U47924.2\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000458811.1\",\"source\":\"Ensembl\",\"bioType\":\"snRNA\",\"geneId\":\"ENSG00000238923\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"NR_023317.1\",\"source\":\"RefSeq\",\"bioType\":\"snRNA\",\"geneId\":\"100147744\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000544681.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000475422.1\"},{\"transcript\":\"NM_001301838.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288767.1\"},{\"transcript\":\"NM_001301837.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288766.1\"},{\"transcript\":\"NR_126035.1\",\"source\":\"RefSeq\",\"bioType\":\"misc_RNA\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"NM_138425.3\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_612434.1\"},{\"transcript\":\"ENST00000537087.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440937.1\"},{\"transcript\":\"ENST00000229281.5\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"ENSP00000229281.5\"}]},{\"vid\":\"12:7048190:*\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"<*>\",\"variantType\":\"non_informative_allele\"}]}")] [InlineData("chr12 7048190 . G A, 322 PASS SB=0.1234567 . .", "{\"chromosome\":\"chr12\",\"position\":7048190,\"refAllele\":\"G\",\"altAlleles\":[\"A\",\"\"],\"quality\":322,\"filters\":[\"PASS\"],\"strandBias\":0.123457,\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7048190:A\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"A\",\"variantType\":\"SNV\",\"transcripts\":[{\"transcript\":\"ENST00000356654.4\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000356654.4:c.3064G>A\",\"hgvsp\":\"ENSP00000349076.3:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000349076.3\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"NM_001007026.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001007026.1:c.3064G>A\",\"hgvsp\":\"NP_001007027.1:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001007027.1\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000396684.2\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3298\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000396684.2:c.3064G>A\",\"hgvsp\":\"ENSP00000379915.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000379915.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"NM_001940.3\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3294\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001940.3:c.3064G>A\",\"hgvsp\":\"NP_001931.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001931.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000541029.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000537488.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000538392.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000542222.1\",\"source\":\"Ensembl\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"NM_001301836.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288765.1\"},{\"transcript\":\"ENST00000545581.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440602.1\"},{\"transcript\":\"NM_001301834.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"NP_001288763.1\"},{\"transcript\":\"ENST00000607421.1\",\"source\":\"Ensembl\",\"bioType\":\"antisense\",\"geneId\":\"ENSG00000272173\",\"hgnc\":\"U47924.2\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000458811.1\",\"source\":\"Ensembl\",\"bioType\":\"snRNA\",\"geneId\":\"ENSG00000238923\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"NR_023317.1\",\"source\":\"RefSeq\",\"bioType\":\"snRNA\",\"geneId\":\"100147744\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000544681.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000475422.1\"},{\"transcript\":\"NM_001301838.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288767.1\"},{\"transcript\":\"NM_001301837.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288766.1\"},{\"transcript\":\"NR_126035.1\",\"source\":\"RefSeq\",\"bioType\":\"misc_RNA\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"NM_138425.3\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_612434.1\"},{\"transcript\":\"ENST00000537087.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440937.1\"},{\"transcript\":\"ENST00000229281.5\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"ENSP00000229281.5\"}]},{\"vid\":\"12:7048190:*\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"\",\"variantType\":\"non_informative_allele\"}]}")] [InlineData("chr12 7048190 . G A, 322 PASS SB=0.1234567 . .", "{\"chromosome\":\"chr12\",\"position\":7048190,\"refAllele\":\"G\",\"altAlleles\":[\"A\",\"\"],\"quality\":322,\"filters\":[\"PASS\"],\"strandBias\":0.123457,\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7048190:A\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"A\",\"variantType\":\"SNV\",\"transcripts\":[{\"transcript\":\"ENST00000356654.4\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000356654.4:c.3064G>A\",\"hgvsp\":\"ENSP00000349076.3:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000349076.3\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"NM_001007026.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3301\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001007026.1:c.3064G>A\",\"hgvsp\":\"NP_001007027.1:p.(Ala1022Thr)\",\"isCanonical\":true,\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001007027.1\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000396684.2\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3298\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"ENST00000396684.2:c.3064G>A\",\"hgvsp\":\"ENSP00000379915.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"ENSP00000379915.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"NM_001940.3\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"codons\":\"Gca/Aca\",\"aminoAcids\":\"A/T\",\"cdnaPos\":\"3294\",\"cdsPos\":\"3064\",\"exons\":\"7/10\",\"proteinPos\":\"1022\",\"geneId\":\"1822\",\"hgnc\":\"ATN1\",\"consequence\":[\"missense_variant\"],\"hgvsc\":\"NM_001940.3:c.3064G>A\",\"hgvsp\":\"NP_001931.2:p.(Ala1022Thr)\",\"polyPhenScore\":0.36,\"polyPhenPrediction\":\"benign\",\"proteinId\":\"NP_001931.2\",\"siftScore\":0.1,\"siftPrediction\":\"tolerated - low confidence\"},{\"transcript\":\"ENST00000541029.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"downstream_gene_variant\"]},{\"transcript\":\"ENST00000537488.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111676\",\"hgnc\":\"ATN1\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000538392.1\",\"source\":\"Ensembl\",\"bioType\":\"retained_intron\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"ENST00000542222.1\",\"source\":\"Ensembl\",\"bioType\":\"processed_transcript\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"NM_001301836.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288765.1\"},{\"transcript\":\"ENST00000545581.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440602.1\"},{\"transcript\":\"NM_001301834.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"NP_001288763.1\"},{\"transcript\":\"ENST00000607421.1\",\"source\":\"Ensembl\",\"bioType\":\"antisense\",\"geneId\":\"ENSG00000272173\",\"hgnc\":\"U47924.2\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000458811.1\",\"source\":\"Ensembl\",\"bioType\":\"snRNA\",\"geneId\":\"ENSG00000238923\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"NR_023317.1\",\"source\":\"RefSeq\",\"bioType\":\"snRNA\",\"geneId\":\"100147744\",\"hgnc\":\"RNU7-1\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true},{\"transcript\":\"ENST00000544681.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000475422.1\"},{\"transcript\":\"NM_001301838.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288767.1\"},{\"transcript\":\"NM_001301837.1\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_001288766.1\"},{\"transcript\":\"NR_126035.1\",\"source\":\"RefSeq\",\"bioType\":\"misc_RNA\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"]},{\"transcript\":\"NM_138425.3\",\"source\":\"RefSeq\",\"bioType\":\"protein_coding\",\"geneId\":\"113246\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"NP_612434.1\"},{\"transcript\":\"ENST00000537087.1\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"proteinId\":\"ENSP00000440937.1\"},{\"transcript\":\"ENST00000229281.5\",\"source\":\"Ensembl\",\"bioType\":\"protein_coding\",\"geneId\":\"ENSG00000111678\",\"hgnc\":\"C12orf57\",\"consequence\":[\"upstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"ENSP00000229281.5\"}]},{\"vid\":\"12:7048190:*\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"\",\"variantType\":\"non_informative_allele\"}]}")] [InlineData("chr12 7048190 . G * 322 PASS SB=0.1234567 . .", "{\"chromosome\":\"chr12\",\"position\":7048190,\"refAllele\":\"G\",\"altAlleles\":[\"*\"],\"quality\":322,\"filters\":[\"PASS\"],\"strandBias\":0.123457,\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7048190:*\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"*\",\"variantType\":\"non_informative_allele\"}]}")] [InlineData("chr12 7048190 . G <*> 322 PASS SB=0.1234567 . .", "{\"chromosome\":\"chr12\",\"position\":7048190,\"refAllele\":\"G\",\"altAlleles\":[\"<*>\"],\"quality\":322,\"filters\":[\"PASS\"],\"strandBias\":0.123457,\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7048190:*\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"<*>\",\"variantType\":\"non_informative_allele\"}]}")] [InlineData("chr12 7048190 . G 322 PASS SB=0.1234567 . .", "{\"chromosome\":\"chr12\",\"position\":7048190,\"refAllele\":\"G\",\"altAlleles\":[\"\"],\"quality\":322,\"filters\":[\"PASS\"],\"strandBias\":0.123457,\"cytogeneticBand\":\"12p13.31\",\"samples\":[{\"isEmpty\":true}],\"variants\":[{\"vid\":\"12:7048190:*\",\"chromosome\":\"chr12\",\"begin\":7048190,\"end\":7048190,\"refAllele\":\"G\",\"altAllele\":\"\",\"variantType\":\"non_informative_allele\"}]}")] [InlineData("chr12 7048190 . G 322 PASS SB=0.1234567 . .", null)] public void Annotate_NonInformativeAllele_MinimalAnnotation_NoSa_AsExpected(string vcfLine, string expectedResults) { var annotatedPosition = AnnotationUtilities.GetAnnotatedPosition(_cacheFilePrefix, null, vcfLine); string observedResults = annotatedPosition.GetJsonString(); Assert.Equal(expectedResults, observedResults); } #endif } } ================================================ FILE: UnitTests/ErrorHandling/ExceptionUtilitiesTests.cs ================================================ using System; using System.IO; using System.Net; using System.Threading.Tasks; using Xunit; using static ErrorHandling.ExceptionUtilities; namespace UnitTests.ErrorHandling { public sealed class ExceptionUtilitiesTests { private readonly Exception _generalException = new Exception("first level", new Exception("second level", new Exception("third level"))); private readonly Exception _taskCancellation1 = new Exception("first level", new TaskCanceledException("second level", new Exception("third level"))); private readonly Exception _taskCancellation2 = new Exception("first level", new Exception("second level", new TaskCanceledException("third level"))); [Fact] public void HasException_AsExpected() { Assert.False(HasException(_generalException)); Assert.True(HasException(_taskCancellation1)); Assert.True(HasException(_taskCancellation2)); } [Fact] public void GetInnermostException_AsExpected() { Assert.Equal("third level", GetInnermostException(_generalException).Message); Assert.Equal("third level", GetInnermostException(_taskCancellation1).Message); Assert.Equal("third level", GetInnermostException(_taskCancellation2).Message); } } public sealed class MockHttpWebResponse : WebResponse { private readonly Stream _stream; public HttpStatusCode StatusCode { get; } public MockHttpWebResponse(Stream stream, HttpStatusCode statusCode) { _stream = stream; StatusCode = statusCode; } public override Stream GetResponseStream() => _stream; } } ================================================ FILE: UnitTests/ErrorHandling/Exceptions/ExceptionsTests.cs ================================================ using System; using System.Collections; using System.Collections.Generic; using ErrorHandling; using ErrorHandling.Exceptions; using Xunit; namespace UnitTests.ErrorHandling.Exceptions { public sealed class ExceptionsTests { private sealed class ExceptionGenerator : IEnumerable { private readonly List _data = new List { new object[] { new CompressionException("test"), ExitCodes.Compression}, new object[] { new FileNotSortedException("test"), ExitCodes.FileNotSorted}, new object[] { new InvalidFileFormatException("test"), ExitCodes.InvalidFileFormat}, new object[] { new MissingCompressionLibraryException("test"), ExitCodes.MissingCompressionLibrary}, new object[] { new ProcessLockedFileException("test"), ExitCodes.SharingViolation}, new object[] { new UserErrorException("test"), ExitCodes.UserError} }; public IEnumerator GetEnumerator() => _data.GetEnumerator(); IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); } [Theory] [ClassData(typeof(ExceptionGenerator))] public void Check_ExceptionToExitCode_Mapping(Exception ex, ExitCodes expectedExitCode) { ExitCodes observedExitCode = ExitCodeUtilities.GetExitCode(ex.GetType()); Assert.Equal(expectedExitCode, observedExitCode); } } } ================================================ FILE: UnitTests/ErrorHandling/ExitCodeUtilitiesTests.cs ================================================ using System; using System.Threading; using ErrorHandling; using ErrorHandling.Exceptions; using Xunit; namespace UnitTests.ErrorHandling { public sealed class ExitCodeUtilitiesTests { [Fact] public void ShowException_CompressionException_CheckExitCode() { var compressionException = new CompressionException("test"); compressionException.Data[ExitCodeUtilities.VcfLine] = "chr1\t100\tA\tC"; var exitCode = ExitCodeUtilities.ShowException(compressionException); Assert.Equal(ExitCodes.Compression, exitCode); } [Fact] public void ShowException_UnknownException_ExitCode_ShouldBeOne() { var unknownException = new AbandonedMutexException(); var exitCode = ExitCodeUtilities.ShowException(unknownException); Assert.Equal(ExitCodes.InvalidFunction, exitCode); } [Fact] public void ShowException_AggregateException_ExitCode_ShouldBeOne() { // TODO: It would be great to verify which exception was shown var refNullException = new NullReferenceException(); var aggregateException = new AggregateException(refNullException); var exitCode = ExitCodeUtilities.ShowException(aggregateException); Assert.Equal(ExitCodes.InvalidFunction, exitCode); } } } ================================================ FILE: UnitTests/ErrorHandling/ExitCodesTests.cs ================================================ ================================================ FILE: UnitTests/GeneAnnotationLambda/GeneAnnotationLambdaTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using Cloud.Messages.Gene; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.GeneAnnotationLambda { public sealed class GeneAnnotationLambdaTests { private readonly string _manifestPath = Resources.TopPath("manifest.txt"); private readonly string _customNgaPath = Resources.TopPath("custom_gene.nga"); private readonly string _prefix = Resources.Top + Path.DirectorySeparatorChar; [Fact] public void GetNgaFiles_AsExpected() { IEnumerable ngaFiles = global::GeneAnnotationLambda.GeneAnnotationLambda.GetNgaFileList(_manifestPath, _prefix, new[] { _customNgaPath }); IEnumerable expectedFiles = new[] { "ClinGen_Dosage_Sensitivity_Map_20190507.nga", "gnomAD_gene_scores_2.1.nga", "OMIM_20190812.nga", "custom_gene.nga" }.Select(Resources.TopPath); Assert.Equal(expectedFiles, ngaFiles); } [Fact] public void GetGeneAnnotation_AsExpected() { var input = new GeneConfig { id = "test", geneSymbols = new[] { "TP53", "ZIC2", "LOC645752" }, ngaUrls = new[] { _customNgaPath } }; string responseString = global::GeneAnnotationLambda.GeneAnnotationLambda.GetGeneAnnotation(input, _manifestPath, _prefix); Assert.Contains("header", responseString); Assert.Contains("TP53", responseString); Assert.Contains("ZIC2", responseString); Assert.Contains("clingenDosageSensitivityMap", responseString); Assert.Contains("gnomAD", responseString); Assert.Contains("omim", responseString); Assert.Contains("InternalGeneAnnotation", responseString); Assert.DoesNotContain("LOC645752", responseString); } } } ================================================ FILE: UnitTests/GeneAnnotationLambda/GeneConfigTests.cs ================================================ using Cloud.Messages.Gene; using ErrorHandling.Exceptions; using Xunit; namespace UnitTests.GeneAnnotationLambda { public sealed class GeneConfigTests { [Fact] public void Validate_NoId_ThrowException() { var input = new GeneConfig {geneSymbols = new[] {"TP53"}}; Assert.Throws(() => input.Validate()); } [Fact] public void Validate_NoGeneSymbols_ThrowException() { var input = new GeneConfig { id = "test" }; Assert.Throws(() => input.Validate()); } [Fact] public void Validate_EmptyGeneSymbols_ThrowException() { var input = new GeneConfig { id = "test", geneSymbols = new string[]{}}; Assert.Throws(() => input.Validate()); } } } ================================================ FILE: UnitTests/Genome/ChromosomeIntervalTests.cs ================================================ using Genome; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Genome { public sealed class ChromosomeIntervalTests { [Fact] public void ChromosomeInterval_Setup() { const int expectedStart = 100; const int expectedEnd = 200; var observedInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 100, 200); Assert.Equal(ChromosomeUtilities.Chr1, observedInterval.Chromosome); Assert.Equal(expectedStart, observedInterval.Start); Assert.Equal(expectedEnd, observedInterval.End); } } } ================================================ FILE: UnitTests/Genome/CytogeneticBandTests.cs ================================================ using Genome; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Genome { public sealed class CytogeneticBandTests { private static readonly Band[] CytogeneticBands = { new Band(88300001, 92800000, "q14.3"), new Band(92800001, 97200000, "q21") }; [Theory] [InlineData(88400000, 92900000, "11q14.3-q21")] [InlineData(88400000, 92400000, "11q14.3")] [InlineData(92820001, 92900000, "11q21")] [InlineData(92820001, 92820001, "11q21")] [InlineData(1, 1, null)] [InlineData(97000000, 98200000, null)] public void GetCytogeneticBand_Range(int start, int end, string expectedCytogeneticBand) { string observedCytogeneticBand = CytogeneticBands.Find(ChromosomeUtilities.Chr11, start, end); Assert.Equal(expectedCytogeneticBand, observedCytogeneticBand); } [Fact] public void GetCytogeneticBand_UnknownReference_ReturnNull() { string observedCytogeneticBand = CytogeneticBands.Find(ChromosomeUtilities.Chr12, 100, 200); Assert.Null(observedCytogeneticBand); } [Fact] public void GetCytogeneticBand_UnknownReferenceIndex_ReturnNull() { string observedCytogeneticBand = CytogeneticBands.Find(ChromosomeUtilities.Bob, 100, 200); Assert.Null(observedCytogeneticBand); } } } ================================================ FILE: UnitTests/Genome/EmptyChromosomeTests.cs ================================================ using Genome; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Genome { public sealed class EmptyChromosomeTests { private readonly Chromosome _emptyChromosome = Chromosome.GetEmptyChromosome("chr1"); private readonly Chromosome _emptyChromosome2 = Chromosome.GetEmptyChromosome("chr1"); [Fact] public void Equals_True() { Assert.True(_emptyChromosome.Equals(_emptyChromosome2)); } [Fact] public void Equals_False() { Assert.False(_emptyChromosome.Equals(ChromosomeUtilities.Chr1)); Assert.False(ChromosomeUtilities.Chr1.Equals(_emptyChromosome)); } [Fact] public void GetHashCode_True() { Assert.Equal(_emptyChromosome.GetHashCode(), _emptyChromosome2.GetHashCode()); } [Fact] public void GetHashCode_False() { Assert.NotEqual(_emptyChromosome.GetHashCode(), ChromosomeUtilities.Chr1.GetHashCode()); } } } ================================================ FILE: UnitTests/Genome/GenomeAssemblyHelperTests.cs ================================================ using ErrorHandling.Exceptions; using Genome; using Xunit; namespace UnitTests.Genome { public sealed class GenomeAssemblyHelperTests { [Theory] [InlineData("GRCH37", GenomeAssembly.GRCh37)] [InlineData("GRCH38", GenomeAssembly.GRCh38)] [InlineData("HG19", GenomeAssembly.hg19)] [InlineData("", GenomeAssembly.Unknown)] [InlineData("RCRS", GenomeAssembly.rCRS)] public void Convert_GenomeAssemblyExists(string s, GenomeAssembly expectedGenomeAssembly) { var observedResult = GenomeAssemblyHelper.Convert(s); Assert.Equal(expectedGenomeAssembly, observedResult); } [Fact] public void Convert_GenomeAssemblyDoesNotExist() { Assert.Throws(delegate { GenomeAssemblyHelper.Convert("dummy"); }); } } } ================================================ FILE: UnitTests/Genome/ReferenceNameUtilitiesTests.cs ================================================ using System.IO; using Genome; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Genome { public sealed class ReferenceNameUtilitiesTests { [Fact] public void GetChromosome_RefIndex_Exists() { var chromosome = ReferenceNameUtilities.GetChromosome(ChromosomeUtilities.RefIndexToChromosome, 2); Assert.Equal("3", chromosome.EnsemblName); } [Fact] public void GetChromosome_RefIndex_DoesNotExist() { Assert.Throws(delegate { ReferenceNameUtilities.GetChromosome(ChromosomeUtilities.RefIndexToChromosome, 171); }); } [Fact] public void GetChromosome_RefName_Exists() { var chromosome = ReferenceNameUtilities.GetChromosome(ChromosomeUtilities.RefNameToChromosome, "1"); Assert.Equal(0, chromosome.Index); } [Fact] public void GetChromosome_RefName_DoesNotExist() { const string chromosomeName = "dummy"; var chromosome = ReferenceNameUtilities.GetChromosome(ChromosomeUtilities.RefNameToChromosome, chromosomeName); Assert.Equal(chromosomeName, chromosome.EnsemblName); Assert.Equal(chromosomeName, chromosome.UcscName); Assert.True(chromosome.IsEmpty()); } [Fact] public void GetChromosome_RefName_NullName() { var chromosome = ReferenceNameUtilities.GetChromosome(ChromosomeUtilities.RefNameToChromosome, null); Assert.Equal(string.Empty, chromosome.EnsemblName); Assert.Equal(string.Empty, chromosome.UcscName); Assert.True(chromosome.IsEmpty()); } } } ================================================ FILE: UnitTests/Genome/SequenceUtilitiesTests.cs ================================================ using Genome; using Xunit; namespace UnitTests.Genome { public sealed class SequenceUtilitiesTests { [Theory] [InlineData("ACGTTTGA", "TCAAACGT")] [InlineData(null, null)] public void GetReverseComplement(string bases, string expectedResult) { var observedResult = SequenceUtilities.GetReverseComplement(bases); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData("ACGT", false)] [InlineData("ACXT", true)] [InlineData(null, false)] public void HasNonCanonicalBase(string bases, bool expectedResult) { var observedResult = SequenceUtilities.HasNonCanonicalBase(bases); Assert.Equal(expectedResult, observedResult); } } } ================================================ FILE: UnitTests/IO/BufferedBinaryReaderTests.cs ================================================ using System; using System.IO; using System.Text; using IO; using Xunit; namespace UnitTests.IO { public sealed class BufferedBinaryReaderTests { [Fact] public void ReadBoolean() { const bool expectedValue = true; bool observedValue = GetObservedValue(writer => writer.Write(expectedValue), reader => reader.ReadBoolean()); Assert.Equal(expectedValue, observedValue); } [Fact] public void ReadByte() { const byte expectedValue = byte.MaxValue; byte observedValue = GetObservedValue(writer => writer.Write(expectedValue), reader => reader.ReadByte()); Assert.Equal(expectedValue, observedValue); } [Fact] public void ReadBytes() { byte[] expectedValue = Encoding.ASCII.GetBytes("Hello world"); byte[] observedValue = GetObservedValue(writer => writer.Write(expectedValue), reader => reader.ReadBytes(expectedValue.Length)); Assert.Equal(expectedValue, observedValue); } [Fact] public void ReadUInt16() { const ushort expectedValue = ushort.MaxValue; ushort observedValue = GetObservedValue(writer => writer.Write(expectedValue), reader => reader.ReadUInt16()); Assert.Equal(expectedValue, observedValue); } [Fact] public void ReadUInt32() { const uint expectedValue = uint.MaxValue; uint observedValue = GetObservedValue(writer => writer.Write(expectedValue), reader => reader.ReadUInt32()); Assert.Equal(expectedValue, observedValue); } [Fact] public void ReadAsciiString() { const string expectedValue = "Hello world"; string observedValue = GetObservedValue(writer => writer.Write(expectedValue), reader => reader.ReadAsciiString()); Assert.Equal(expectedValue, observedValue); } [Theory] [InlineData(ushort.MaxValue)] [InlineData(3)] [InlineData(0)] public void ReadOptUInt16_HandleExtremeIntegers(ushort expectedValue) { ushort observedValue = GetObservedValue(writer => writer.WriteOpt(expectedValue), reader => reader.ReadOptUInt16()); Assert.Equal(expectedValue, observedValue); } [Theory] [InlineData(int.MaxValue)] [InlineData(-1)] [InlineData(int.MinValue)] public void ReadOptInt32_HandleExtremeIntegers(int expectedValue) { int observedValue = GetObservedValue(writer => writer.WriteOpt(expectedValue), reader => reader.ReadOptInt32()); Assert.Equal(expectedValue, observedValue); } private static T GetObservedValue(Action writeMethod, Func readMethod) { T observedValue; using (var memoryStream = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(memoryStream, Encoding.UTF8, true)) { writeMethod(writer); } memoryStream.Position = 0; using (var reader = new BufferedBinaryReader(memoryStream)) { observedValue = readMethod(reader); } } return observedValue; } } } ================================================ FILE: UnitTests/IO/ExtendedBinaryReaderTests.cs ================================================ using System; using System.IO; using System.Text; using IO; using Xunit; namespace UnitTests.IO { public sealed class ExtendedBinaryReaderTests { [Theory] [InlineData(3)] [InlineData(0)] [InlineData(-2)] public void ReadOptInt32_HandleSmallIntegers(int expectedValue) { int observedValue = GetObservedValue(writer => writer.WriteOpt(expectedValue), reader => reader.ReadOptInt32()); Assert.Equal(expectedValue, observedValue); } [Theory] [InlineData(int.MaxValue)] [InlineData(int.MinValue)] public void ReadOptInt32_HandleExtremeIntegers(int expectedValue) { int observedValue = GetObservedValue(writer => writer.WriteOpt(expectedValue), reader => reader.ReadOptInt32()); Assert.Equal(expectedValue, observedValue); } [Fact] public void ReadOptInt32_ThrowException_WithCorruptData() { Assert.Throws(delegate { using (var ms = new MemoryStream()) { using (var writer = new BinaryWriter(ms, Encoding.UTF8, true)) { const ulong corruptInt = 0xffffffffffffffff; writer.Write(corruptInt); } ms.Position = 0; using (var reader = new ExtendedBinaryReader(ms)) { reader.ReadOptInt32(); } } }); } [Theory] [InlineData(ushort.MaxValue)] [InlineData(ushort.MinValue)] public void ReadOptUInt16_HandleExtremeIntegers(ushort expectedValue) { ushort observedValue = GetObservedValue(writer => writer.WriteOpt(expectedValue), reader => reader.ReadOptUInt16()); Assert.Equal(expectedValue, observedValue); } [Theory] [InlineData(3)] [InlineData(0)] [InlineData(-2)] public void ReadOptInt64_HandleSmallIntegers(long expectedValue) { long observedValue = GetObservedValue(writer => writer.WriteOpt(expectedValue), reader => reader.ReadOptInt64()); Assert.Equal(expectedValue, observedValue); } [Theory] [InlineData(long.MaxValue)] [InlineData(long.MinValue)] public void ReadOptInt64_HandleExtremeIntegers(long expectedValue) { long observedValue = GetObservedValue(writer => writer.WriteOpt(expectedValue), reader => reader.ReadOptInt64()); Assert.Equal(expectedValue, observedValue); } [Fact] public void ReadOptInt64_ThrowException_WithCorruptData() { Assert.Throws(delegate { using (var ms = new MemoryStream()) { using (var writer = new BinaryWriter(ms, Encoding.UTF8, true)) { const ulong corruptData = 0xffffffffffffffff; writer.Write(corruptData); writer.Write(corruptData); } ms.Position = 0; using (var reader = new ExtendedBinaryReader(ms)) { reader.ReadOptInt64(); } } }); } [Fact] public void ReadAsciiString_NullString() { string observedValue = GetObservedValue(writer => writer.WriteOptAscii(null), reader => reader.ReadAsciiString()); Assert.Null(observedValue); } [Fact] public void BufferedBinaryReader_EndToEnd_DoNotLeaveOpen() { const int expectedResult = 5; int observedResult; byte[] data; using (var ms = new MemoryStream()) using (var writer = new ExtendedBinaryWriter(ms)) { writer.Write(expectedResult); data = ms.ToArray(); } using (var ms = new MemoryStream(data)) using (var reader = new ExtendedBinaryReader(ms)) { observedResult = reader.ReadInt32(); } Assert.Equal(expectedResult, observedResult); } private static T GetObservedValue(Action writeMethod, Func readMethod) { T observedValue; using (var memoryStream = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(memoryStream, Encoding.UTF8, true)) { writeMethod(writer); } memoryStream.Position = 0; using (var reader = new ExtendedBinaryReader(memoryStream)) { observedValue = readMethod(reader); } } return observedValue; } } } ================================================ FILE: UnitTests/IO/FilePathUtilitiesTests.cs ================================================ using IO; using Xunit; namespace UnitTests.IO { public sealed class FilePathUtilitiesTests { [InlineData("C:\\Input files\\input.test.mp3", ".mp3", true)] [InlineData("C:\\Input files\\input", "C:\\Input files\\input", true)] [InlineData("\\\\ussd-prd-isi04\\Nirvana\\input.vcf", "vcf", false)] [InlineData("/d/Projects/Nirvana/input.vcf", ".vcf", true)] [InlineData("https://illumina.s3.amazonaws.com/input/Custom_SA/Custom-annotations_short-GRCh37.nsa?AWSAccessKeyId=UUNE5Q&Expires=asdf223&Signature=asdfasd", ".nsa", true)] [InlineData("https://stratus-gds-stage.s3.us-west-2.amazonaws.com/b9077f78-6b4e-4068-b4b2-08d6d80d1d7d/custom-filter-file/custom-annotation/2b8e155e-9046-4ef5-9ec0-374ccc98a93c/2b8e155e-9046-4ef5-9ec0-374ccc98a93c.nsa?X-Amz-Expires=604800&response-content-disposition=attachment%3Bfilename%3D%222b8e155e-9046-4ef5-9ec0-374ccc98a93c.nsa%22&x-userId=086723b2-1e53-32cd-a410-80cb885de66c&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJ7P2VLXQJYGXATTA/20190708/us-west-2/s3/aws4_request&X-Amz-Date=20190708T163940Z&X-Amz-SignedHeaders=host&X-Amz-Signature=d386f9d0aa7aab1a1a67c3ee625a208589924a51e384840ce9159a88b6c8363a", "nsa", false)] [Theory] public void GetFileSuffix_AsExpected(string filePath, string suffix, bool includeDot) { Assert.Equal(suffix, filePath.GetFileSuffix(includeDot)); } } } ================================================ FILE: UnitTests/IO/FileUtilitiesTests.cs ================================================ using System.IO; using IO; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.IO { public sealed class FileUtilitiesTests { [Fact] public void GetReadStream_GetCreateStream_Loopback() { string random = RandomPath.GetRandomPath(); const string expectedString = "charlie"; using (var writer = new StreamWriter(FileUtilities.GetCreateStream(random))) { writer.WriteLine(expectedString); } string observedString; using (var reader = FileUtilities.GetStreamReader(FileUtilities.GetReadStream(random))) { observedString = reader.ReadLine(); } Assert.Equal(expectedString, observedString); } } } ================================================ FILE: UnitTests/IO/HttpUtilitiesTests.cs ================================================ using System; using System.IO; using System.Net; using System.Xml.Linq; using ErrorHandling.Exceptions; using UnitTests.ErrorHandling; using Xunit; using IO; namespace UnitTests.IO { public sealed class HttpUtilitiesTests { [Fact] public void IsWebProtocolErrorException_AsExpected() { Assert.False(HttpUtilities.IsWebProtocolErrorException(new Exception("An exception"))); Assert.False(HttpUtilities.IsWebProtocolErrorException(new WebException("web exception"))); Assert.False(HttpUtilities.IsWebProtocolErrorException(new WebException("web exception", WebExceptionStatus.ConnectFailure))); Assert.True(HttpUtilities.IsWebProtocolErrorException(new WebException("web exception", null, WebExceptionStatus.ProtocolError, new MockHttpWebResponse(null, HttpStatusCode.NotFound)))); Assert.True(HttpUtilities.IsWebProtocolErrorException(new WebException("web exception", null, WebExceptionStatus.ProtocolError, new MockHttpWebResponse(null, HttpStatusCode.Forbidden)))); } [Theory] [InlineData("InvalidAccessKeyId", "The AWS Access Key Id you provided does not exist in our records", "https://unit.test/bob.vcf.gz", "Something wrong.", "Authentication error while reading from URL for bob.vcf.gz.")] [InlineData("AccessDenied", "Request has expired", "https://expired.url/bob.vcf.gz", "Something wrong again.", "The provided URL for bob.vcf.gz has expired.")] public void ProcessHttpRequestForbiddenException_AsExpected(string errorCode, string message, string url, string exceptionMessage, string newErrorMessage) { XElement xmlMessage = new XElement("Root", new XElement("Code", errorCode), new XElement("Message", message)); var stream = new MemoryStream(); xmlMessage.Save(stream); stream.Position = 0; var response = new MockHttpWebResponse(stream, HttpStatusCode.Forbidden); var inputException = new WebException(exceptionMessage, null, WebExceptionStatus.ProtocolError, response); var outputException = HttpUtilities.ProcessHttpRequestWebProtocolErrorException(inputException, url); Assert.IsType(outputException); Assert.Equal(newErrorMessage, outputException.Message); } [Fact] public void ValidateUrl_invalid_user_provided() { Assert.Throws(() => HttpUtilities.ValidateUrl( "https://nirvana-annotations.s3.us-west-2.amazonaws.com/645778a7d475ac437d15765ef3c6f50c-OMIM/0/OMIM_20191004.nga")); } [Fact] public void ValidateUrl_invalid_deployment() { Assert.Throws(() => HttpUtilities.ValidateUrl( "https://nirvana-annotations.s3.us-west-2.amazonaws.com/645778a7d475ac437d15765ef3c6f50c-OMIM/0/OMIM_20191004.nga", false)); } [Fact] public void ValidateUrl_valid() { HttpUtilities.ValidateUrl( "https://nirvana-annotations.s3.us-west-2.amazonaws.com/645778a7d475ac437d15765ef3c6f50c-OMIM/6/OMIM_20191004.nga", false); } } } ================================================ FILE: UnitTests/IO/LengthStreamTests.cs ================================================ using System; using System.IO; using System.Text; using IO; using Xunit; namespace UnitTests.IO { public sealed class LengthStreamTests { [Fact] public void Length_AsExpected() { long trueLength, modifiedLength; using (var memoryStream = new MemoryStream()) { using (var writer = new StreamWriter(memoryStream, Encoding.ASCII, 1024, true)) { writer.Write("The quick brown fox jumps over the lazy dog"); } trueLength = memoryStream.Length; using (var lengthStream = new LengthStream(memoryStream, 3)) { modifiedLength = lengthStream.Length; } } Assert.Equal(43, trueLength); Assert.Equal(3, modifiedLength); } [Fact] public void StreamTests_AsExpected() { using (var memoryStream = new MemoryStream()) { using (var writer = new StreamWriter(memoryStream, Encoding.ASCII, 1024, true)) { writer.Write("The quick brown fox jumps over the lazy dog"); } long expectedPosition = memoryStream.Position; memoryStream.Position = 0; using (var lengthStream = new LengthStream(memoryStream, 3)) using (var reader = new StreamReader(lengthStream)) { reader.ReadToEnd(); Assert.True(lengthStream.CanRead); Assert.True(lengthStream.CanWrite); Assert.True(lengthStream.CanSeek); Assert.Equal(3, lengthStream.Length); Assert.True(lengthStream.Position >= expectedPosition); } } } [Fact] public void StreamTests_Throws_NotSupportedException() { using (var memoryStream = new MemoryStream()) using (var lengthStream = new LengthStream(memoryStream, 3)) { var buffer = new byte[10]; ThrowsNotSupportedException(lengthStream, stream => stream.Position = 5); ThrowsNotSupportedException(lengthStream, stream => stream.Seek(0, SeekOrigin.Begin)); ThrowsNotSupportedException(lengthStream, stream => stream.Write(buffer, 0, buffer.Length)); ThrowsNotSupportedException(lengthStream, stream => stream.SetLength(7)); ThrowsNotSupportedException(lengthStream, stream => stream.Flush()); } } private static void ThrowsNotSupportedException(LengthStream lengthStream, Func exceptionFunc) { Assert.Throws(delegate { // ReSharper disable once UnusedVariable exceptionFunc(lengthStream); }); } private static void ThrowsNotSupportedException(LengthStream lengthStream, Action exceptionAction) { Assert.Throws(delegate { // ReSharper disable once UnusedVariable exceptionAction(lengthStream); }); } } } ================================================ FILE: UnitTests/IO/MD5StreamTests.cs ================================================ using System; using System.IO; using System.Linq; using System.Text; using IO; using Xunit; namespace UnitTests.IO { public sealed class MD5StreamTests { [Fact] public void GetFileMetadata_AsExpected() { FileMetadata observed, cachedObserved; using (var memoryStream = new MemoryStream()) using (var md5Stream = new MD5Stream(memoryStream)) { using (var writer = new StreamWriter(md5Stream, Encoding.ASCII)) { writer.Write("The quick brown fox jumps over the lazy dog"); } observed = md5Stream.GetFileMetadata(); cachedObserved = md5Stream.GetFileMetadata(); } byte[] expectedMd5 = StringToByteArray("9e107d9d372bb6826bd81d3542a419d6"); const int expectedLength = 43; Assert.Equal(expectedMd5, observed.MD5); Assert.Equal(expectedLength, observed.Length); Assert.Equal(expectedMd5, cachedObserved.MD5); Assert.Equal(expectedLength, cachedObserved.Length); } [Fact] public void StreamTests_AsExpected() { using (var memoryStream = new MemoryStream()) using (var md5Stream = new MD5Stream(memoryStream)) { using (var writer = new StreamWriter(md5Stream, Encoding.ASCII)) { writer.Write("The quick brown fox jumps over the lazy dog"); md5Stream.Flush(); } Assert.True(md5Stream.CanRead); Assert.True(md5Stream.CanWrite); Assert.True(md5Stream.CanSeek); Assert.Equal(43, md5Stream.Length); Assert.Equal(43, md5Stream.Position); } } [Fact] public void StreamTests_Throws_NotSupportedException() { using (var memoryStream = new MemoryStream()) using (var md5Stream = new MD5Stream(memoryStream)) { var buffer = new byte[10]; ThrowsNotSupportedException(md5Stream, stream => stream.Read(buffer, 0, buffer.Length)); ThrowsNotSupportedException(md5Stream, stream => stream.Position = 5); ThrowsNotSupportedException(md5Stream, stream => stream.Seek(0, SeekOrigin.Begin)); ThrowsNotSupportedException(md5Stream, stream => stream.SetLength(7)); } } private static void ThrowsNotSupportedException(MD5Stream md5Stream, Func exceptionFunc) { Assert.Throws(delegate { // ReSharper disable once UnusedVariable exceptionFunc(md5Stream); }); } private static void ThrowsNotSupportedException(MD5Stream lengthStream, Action exceptionAction) { Assert.Throws(delegate { // ReSharper disable once UnusedVariable exceptionAction(lengthStream); }); } private static byte[] StringToByteArray(string hex) { return Enumerable.Range(0, hex.Length) .Where(x => x % 2 == 0) .Select(x => Convert.ToByte(hex.Substring(x, 2), 16)) .ToArray(); } } } ================================================ FILE: UnitTests/IO/PersistentStreamTests.cs ================================================ using System.IO; using System.Text; using IO; using Moq; using Xunit; namespace UnitTests.IO { public sealed class PersistentStreamTests { private static Stream GetMockStream() { var memStream = new MemoryStream(); using (var writer = new StreamWriter(memStream, Encoding.Default, 4096, true)) { writer.WriteLine("2551e067cb59c540a4da905a99ee5ff4-ClinGen/2/GRCh37/ClinGen_20160414.nsi"); writer.WriteLine("43321b1a4f1c73724c00223e07d5e812-1kgSv/3/GRCh37/1000_Genomes_Project_Phase_3_v5a.nsi"); writer.WriteLine("929439472713ec609b92b97dc22a2d42-dbSNP/4/GRCh37/dbSNP_151.nsa"); } memStream.Position = 0; return memStream; } private static IConnect GetWebRequest_connect_on_third() { var moqRequest = new Mock(); //Connect succeeds on 3rd attempt moqRequest.SetupSequence(x => x.Connect(0)) .Throws(new IOException()) .Throws(new IOException()) .Returns((null,GetMockStream())); return moqRequest.Object; } private static IConnect GetWebRequest_flaky_stream() { var moqRequest = new Mock(); moqRequest.SetupSequence(x => x.Connect(0)) .Returns((null, null)) .Returns((null, GetMockStream())); return moqRequest.Object; } private static IConnect GetWebRequest_connect_on_seventh() { var moqRequest = new Mock(); //Connect succeeds on 3rd attempt moqRequest.SetupSequence(x => x.Connect(0)) .Throws(new IOException()) .Throws(new IOException()) .Throws(new IOException()) .Throws(new IOException()) .Throws(new IOException()) .Throws(new IOException()) .Returns((null, GetMockStream())); return moqRequest.Object; } [Fact] public void TestFlakyConnection() { // pStream attempts to connect at construction time. It should succeed at the third attempt new PersistentStream(GetWebRequest_connect_on_third(), 0); // no exception thrown means this test succeeded } [Fact] public void FailToConnect() { Assert.Throws(() => new PersistentStream(GetWebRequest_connect_on_seventh(), 0)); } [Fact] public void ReadFlakyStream() { var pStream = new PersistentStream(GetWebRequest_flaky_stream(),0); var buffer = new byte[4096]; Assert.Equal(100, pStream.Read(buffer, 0, 100)); } } } ================================================ FILE: UnitTests/IO/UrlUtilitiesTests.cs ================================================ using IO; using Xunit; namespace UnitTests.IO { public sealed class UrlUtilitiesTests { [Theory] [InlineData("http://www.illumina.com", "bob", "http://www.illumina.com/bob")] [InlineData("http://www.illumina.com/", "bob", "http://www.illumina.com/bob")] [InlineData("http://www.illumina.com", "/bob", "http://www.illumina.com/bob")] [InlineData("http://www.illumina.com/", "/bob", "http://www.illumina.com/bob")] public void Combine_Nominal(string prefix, string suffix, string expected) { string observed = prefix.UrlCombine(suffix); Assert.Equal(expected, observed); } [Fact] public void GetFileName_Nominal() { const string url = "https://illumina-usw2-olympia-dev.s3.amazonaws.com/Annotation/input/Mother.vcf.gz?AWSAccessKeyId=AKIAI774CQHRMUZUNE5Q&Signature=W7Rofh4%2BFXPrPE9ONrdk2iKrGqE%3D&Expires=1561072628"; string observed = UrlUtilities.GetFileName(url); Assert.Equal("Mother.vcf.gz", observed); } } } ================================================ FILE: UnitTests/Intervals/IntervalArrayTests.cs ================================================ using System.Collections.Generic; using System.Linq; using Intervals; using Xunit; namespace UnitTests.Intervals { public sealed class IntervalArrayTests { private readonly IntervalArray _intervalArray; public IntervalArrayTests() { var intervals = new List> { new Interval(10, 20, "bob"), new Interval(5, 7, "mary"), new Interval(7, 9, "jane") }; // interval array expects a sorted array of intervals _intervalArray = new IntervalArray(intervals.OrderBy(x => x.Begin).ThenBy(x => x.End).ToArray()); } [Theory] [InlineData(4, 4, false)] [InlineData(5, 6, true)] [InlineData(7, 11, true)] [InlineData(21, 23, false)] public void OverlapsAny(int begin, int end, bool expectedResult) { Assert.Equal(expectedResult, _intervalArray.OverlapsAny(begin, end)); } [Theory] [InlineData(6, 9, new[] { "mary", "jane" })] [InlineData(8, 10, new[] { "jane", "bob" })] [InlineData(11, 50, new[] { "bob" })] [InlineData(21, 23, null)] public void GetAllOverlappingValues(int begin, int end, string[] expectedValues) { var observedValues = _intervalArray.GetAllOverlappingValues(begin, end); Assert.Equal(expectedValues, observedValues); } } } ================================================ FILE: UnitTests/Intervals/IntervalExtensionsTests.cs ================================================ using Intervals; using Xunit; namespace UnitTests.Intervals { public sealed class IntervalExtensionsTests { [Theory] [InlineData(1, 3, 5, 7, 0, false)] [InlineData(1, 3, 5, 7, 2, true)] public void Overlaps_TwoIntervalsWithFlankingLength(int start1, int end1, int start2, int end2, int flankingLength, bool expectedResult) { var interval = new Interval(start1, end1); var interval2 = new Interval(start2, end2); bool observedResult = interval.Overlaps(interval2, flankingLength); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(5, 7, 1, 3, false)] [InlineData(1, 7, 5, 10, true)] public void Overlaps_IntervalAndCoordinates(int start1, int end1, int start2, int end2, bool expectedResult) { var interval = new Interval(start1, end1); bool observedResult = interval.Overlaps(start2, end2); Assert.Equal(expectedResult, observedResult); } [Fact] public void Contains_TwoIntervals() { var interval1 = new Interval(1, 10); var interval2 = new Interval(5, 6); bool observedResult = interval1.Contains(interval2); Assert.True(observedResult); } [Theory] [InlineData(1, 3, 5, 7, -1, -1)] [InlineData(1, 7, 5, 7, 5, 7)] public void Intersects_TwoIntervals(int start1, int end1, int start2, int end2, int expectedStart, int expectedEnd) { var interval = new Interval(start1, end1); var interval2 = new Interval(start2, end2); var observedInterval = interval.Intersects(interval2); Assert.Equal(expectedStart, observedInterval.Start); Assert.Equal(expectedEnd, observedInterval.End); } } } ================================================ FILE: UnitTests/Intervals/IntervalForestTests.cs ================================================ using System.Collections.Generic; using System.Linq; using Intervals; using Xunit; namespace UnitTests.Intervals { public sealed class IntervalForestTests { private readonly IntervalForest _intervalForest; public IntervalForestTests() { var intervalArraysByRefIndex = new IntervalArray[3]; intervalArraysByRefIndex[0] = GetIntervalArrayRefIndex0(); intervalArraysByRefIndex[1] = GetIntervalArrayRefIndex1(); intervalArraysByRefIndex[2] = GetIntervalArrayRefIndex2(); _intervalForest = new IntervalForest(intervalArraysByRefIndex); } private static IntervalArray GetIntervalArrayRefIndex0() { return GetIntervalArray(new List> { new Interval(10, 20, "bob"), new Interval(5, 7, "mary"), new Interval(7, 9, "jane") }); } private static IntervalArray GetIntervalArrayRefIndex1() { return GetIntervalArray(new List> { new Interval(100, 200, "jones"), new Interval(125, 150, "smith") }); } private static IntervalArray GetIntervalArrayRefIndex2() { return GetIntervalArray(new List> { new Interval(9, 28, "zoe"), new Interval(1, 7, "clive") }); } [Theory] [InlineData(0, 4, 4, false)] [InlineData(0, 5, 6, true)] [InlineData(1, 90, 95, false)] [InlineData(2, 5, 6, true)] public void OverlapsAny(ushort refIndex, int begin, int end, bool expectedResult) { Assert.Equal(expectedResult, _intervalForest.OverlapsAny(refIndex, begin, end)); } [Theory] [InlineData(0, 6, 9, new[] { "mary", "jane" })] [InlineData(1, 180, 190, new[] { "jones" })] [InlineData(2, 6, 10, new[] { "clive", "zoe" })] [InlineData(3, 23, 25, null)] public void GetAllOverlappingValues(ushort refIndex, int begin, int end, string[] expectedValues) { var observedValues = _intervalForest.GetAllOverlappingValues(refIndex, begin, end); Assert.Equal(expectedValues, observedValues); } private static IntervalArray GetIntervalArray(List> intervals) => new IntervalArray(intervals.OrderBy(x => x.Begin).ThenBy(x => x.End).ToArray()); } } ================================================ FILE: UnitTests/Intervals/IntervalOperationsTests.cs ================================================ using Intervals; using Xunit; namespace UnitTests.Intervals { public sealed class IntervalOperationsTests { [Theory] [InlineData(1, 10, 5, 6, true)] [InlineData(5, 6, 1, 10, true)] [InlineData(1, 3, 5, 7, false)] [InlineData(5, 7, 1, 3, false)] [InlineData(1, 7, 5, 10, true)] [InlineData(5, 10, 1, 7, true)] public void Overlaps(int start1, int end1, int start2, int end2, bool expectedResult) { bool observedResult = Utilities.Overlaps(start1, end1, start2, end2); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(1, 10, 5, 6, true)] [InlineData(5, 6, 1, 10, false)] [InlineData(1, 3, 5, 7, false)] [InlineData(5, 7, 1, 3, false)] [InlineData(1, 7, 5, 10, false)] [InlineData(5, 10, 1, 7, false)] public void Contains(int start1, int end1, int start2, int end2, bool expectedResult) { bool observedResult = Utilities.Contains(start1, end1, start2, end2); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(1, 10, 5, 6, 5, 6)] [InlineData(5, 6, 1, 10, 5, 6)] [InlineData(1, 3, 5, 7, -1, -1)] [InlineData(5, 7, 1, 3, -1, -1)] [InlineData(1, 7, 5, 10, 5, 7)] [InlineData(5, 10, 1, 7, 5, 7)] public void Intersects(int start1, int end1, int start2, int end2, int expectedStart, int expectedEnd) { (int observedStart, int observedEnd) = Utilities.Intersects(start1, end1, start2, end2); Assert.Equal(expectedStart, observedStart); Assert.Equal(expectedEnd, observedEnd); } } } ================================================ FILE: UnitTests/Intervals/NullIntervalSearchTests.cs ================================================ using Intervals; using Xunit; namespace UnitTests.Intervals { public sealed class NullIntervalSearchTests { [Fact] public void OverlapsAny_IIntervalForest() { var intervalForest = new NullIntervalSearch(); Assert.False(intervalForest.OverlapsAny(1, 2, 3)); } [Fact] public void GetAllOverlappingValues_IIntervalForest() { var intervalForest = new NullIntervalSearch(); Assert.Null(intervalForest.GetAllOverlappingValues(1, 2, 3)); } [Fact] public void GetAllOverlappingValues_IIntervalSearch() { var intervalSearch = new NullIntervalSearch(); Assert.Null(intervalSearch.GetAllOverlappingValues(1, 2)); } } } ================================================ FILE: UnitTests/Intervals/OverlapTypeTests.cs ================================================ using Intervals; using Xunit; namespace UnitTests.Intervals { public sealed class OverlapTypeTests { // given two intervals T and V, describe how V overlaps T [Theory] [InlineData(400, 500, OverlapType.Partial)] [InlineData(200, 400, OverlapType.CompletelyWithin)] [InlineData(100, 200, OverlapType.Partial)] [InlineData(100, 500, OverlapType.CompletelyOverlaps)] [InlineData(200, 500, OverlapType.CompletelyOverlaps)] [InlineData(100, 400, OverlapType.CompletelyOverlaps)] [InlineData(500, 600, OverlapType.None)] [InlineData(0, 100, OverlapType.None)] public void GetOverlapType(int vStart, int vEnd, OverlapType expectedResults) { const int tStart = 200; const int tEnd = 400; OverlapType observedResults = Utilities.GetOverlapType(tStart, tEnd, vStart, vEnd); Assert.Equal(expectedResults, observedResults); } } } ================================================ FILE: UnitTests/Jasix/IndexTests.cs ================================================ using System.IO; using System.IO.Compression; using System.Text; using Jasix; using Jasix.DataStructures; using Xunit; using UnitTests.TestUtilities; using Compression.FileHandling; using ErrorHandling.Exceptions; namespace UnitTests.Jasix { public sealed class IndexTests { [Fact] public void Query_succeedes_when_it_overlaps_tail_of_previous_bin() { var chrIndex = new JasixChrIndex("chr1"); for (var i = 100; i < 100 + JasixCommons.PreferredNodeCount; i++) { chrIndex.Add(i, i + 5, 100_000 + i); } for (int i = 102 + JasixCommons.PreferredNodeCount; i < 152 + JasixCommons.PreferredNodeCount; i++) { chrIndex.Add(i, i + 5, 100_020 + i); } //close current node chrIndex.Flush(); Assert.Equal(100_100, chrIndex.FindFirstSmallVariant(102, 103)); } [Fact] public void Add_fill_node_and_start_another() { var index = new JasixIndex(); //creating two nodes each containing 50 entries for (var i = 0; i < 2 * JasixCommons.PreferredNodeCount; i++) { index.Add("chr1", 100 + i, 101 + i, 100_000 + i); } index.Add("chr1", 160 + 2 * JasixCommons.PreferredNodeCount, 166 + 2 * JasixCommons.PreferredNodeCount, 200_100); index.Add("chr2", 100, 100, 200_150); index.Add("chr2", 102, 105, 200_200); index.Flush(); Assert.Equal(100_000, index.GetFirstVariantPosition("chr1", 100, 102)); Assert.Equal(100_000 + JasixCommons.PreferredNodeCount, index.GetFirstVariantPosition("chr1", 2 * JasixCommons.PreferredNodeCount + 55, 2 * JasixCommons.PreferredNodeCount + 55)); Assert.Equal(-1, index.GetFirstVariantPosition("chr1", 2 * JasixCommons.PreferredNodeCount + 120, 2 * JasixCommons.PreferredNodeCount + 124)); Assert.Equal(200_100, index.GetFirstVariantPosition("chr1", 2 * JasixCommons.PreferredNodeCount + 158, 2 * JasixCommons.PreferredNodeCount + 160)); Assert.Equal(200_150, index.GetFirstVariantPosition("chr2", 103, 105)); } [Fact] public void GetFirstVariantPosition_multi_chrom_index() { var index = new JasixIndex(); index.Add("chr1", 100, 101, 100000); index.Add("chr1", 105, 109, 100050); index.Add("chr1", 160, 166, 100100); index.Add("chr2", 100, 100, 100150); index.Add("chr2", 102, 105, 100200); index.Flush(); var chrPos = Utilities.ParseQuery("chr1"); Assert.Equal(100000, index.GetFirstVariantPosition(chrPos.Item1, chrPos.Item2, chrPos.Item3)); chrPos = Utilities.ParseQuery("chr2"); Assert.Equal(100150, index.GetFirstVariantPosition(chrPos.Item1, chrPos.Item2, chrPos.Item3)); } [Fact] public void FindLargeVaritants_method_does_not_return_small_variants() { var index = new JasixIndex(); index.Add("chr1", 100, 101, 100_000); index.Add("chr1", 105, 109, 100_050); index.Add("chr1", 160, 166, 100_100); index.Add("chr1", 200, 1000, 100_075);//large variant index.Add("chr2", 100, 100, 100_150); index.Add("chr2", 102, 105, 100_200); index.Flush(); //checking large variants Assert.Null(index.LargeVariantPositions("chr1", 100, 199)); var largeVariants = index.LargeVariantPositions("chr1", 100, 201); Assert.NotNull(largeVariants); Assert.Single(largeVariants); Assert.Equal(100075, largeVariants[0]); } [Fact] public void Write_and_read_back() { var index = new JasixIndex(); index.Add("chr1", 100, 101, 100000,"1"); index.Add("chr1", 105, 109, 100050,"1"); index.Add("chr1", 150, 1000, 100075,"1");//large variant index.Add("chr1", 160, 166, 100100, "1"); index.Add("chr2", 100, 100, 100150, "2"); index.Add("chr2", 102, 105, 100200, "2"); var writeStream = new MemoryStream(); using (writeStream) { index.Write(writeStream); } var readStream= new MemoryStream(writeStream.ToArray()); readStream.Seek(0,SeekOrigin.Begin); JasixIndex readBackIndex; using (readStream) { readBackIndex = new JasixIndex(readStream); } Assert.Equal(100000, readBackIndex.GetFirstVariantPosition("chr1", 100, 102)); Assert.Equal(100000, readBackIndex.GetFirstVariantPosition("chr1", 103, 104)); Assert.Equal(100000, readBackIndex.GetFirstVariantPosition("chr1", 120, 124)); Assert.Equal(100000, readBackIndex.GetFirstVariantPosition("chr1", 158, 160)); Assert.Equal(100150, readBackIndex.GetFirstVariantPosition("chr2", 103, 105)); //checking large variants Assert.Null(readBackIndex.LargeVariantPositions("chr1", 100, 149)); var largeVariants = readBackIndex.LargeVariantPositions("chr1", 100, 201); Assert.NotNull(largeVariants); Assert.Single(largeVariants); Assert.Equal(100075, largeVariants[0]); } [Fact] public void BgzipTestReader_basic() { var stream = ResourceUtilities.GetReadStream(Resources.TopPath("TinyAnnotated.json")); var lineCount = 0; using (var jasixReader = new StreamReader(stream)) { while (jasixReader.ReadLine() != null) { lineCount++; } } Assert.Equal(4, lineCount); } [Fact] public void IndexCreation_multChromosome() { var jsonStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("cosmicv72.indels.json.gz")), CompressionMode.Decompress); var writeStream = new MemoryStream(); using (var indexCreator = new IndexCreator(jsonStream, writeStream)) { indexCreator.CreateIndex(); } JasixIndex readBackIndex; var readStream = new MemoryStream(writeStream.ToArray()); readStream.Seek(0, SeekOrigin.Begin); using (readStream) { readBackIndex = new JasixIndex(readStream); } Assert.Equal(2268, readBackIndex.GetFirstVariantPosition("chr1", 9775924, 9775924)); Assert.Equal(14035925971, readBackIndex.GetFirstVariantPosition("chr2", 16081096, 16081096)); Assert.Equal(433156622693, readBackIndex.GetFirstVariantPosition("chr20", 36026164, 36026164)); Assert.Equal(439602269527, readBackIndex.GetFirstVariantPosition("chrX", 66765044, 66765044)); } [Fact] public void Begin_end_section_and_readback() { var index = new JasixIndex(); const string section = "section1"; index.BeginSection(section, 0); Assert.Throws(() => index.BeginSection(section, 1)); index.EndSection(section, 100); Assert.Throws(() => index.EndSection(section, 101)); Assert.Equal(0, index.GetSectionBegin(section)); Assert.Equal(100, index.GetSectionEnd(section)); } [Fact] public void GetChromosomeList() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz.jsi")); var outStream = new MemoryStream(); using (var writer = new StreamWriter(outStream, Encoding.UTF8, 512, true)) using (var qp = new QueryProcessor(new StreamReader(readStream), indexStream, writer)) { writer.NewLine = "\r\n"; qp.ListChromosomesAndSections(); } Assert.NotEqual(0, outStream.Length); outStream.Position = 0; using (var reader = new StreamReader(outStream)) { string chromList = reader.ReadToEnd(); Assert.Equal("1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n7\r\n8\r\n9\r\n10\r\n11\r\n12\r\n13\r\n14\r\n15\r\n16\r\n17\r\n18\r\n19\r\n20\r\n21\r\nX\r\nY\r\nheader\r\npositions\r\ngenes\r\n", chromList); } } [Fact] public void GetHeaderOnly() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz.jsi")); var outStream = new MemoryStream(); using (var writer = new StreamWriter(outStream, Encoding.UTF8, 512, true)) using (var qp = new QueryProcessor(new StreamReader(readStream), indexStream, writer)) { qp.PrintHeaderOnly(); } Assert.NotEqual(0, outStream.Length); outStream.Position = 0; using (var reader = new StreamReader(outStream)) { string actualHeaderLine = reader.ReadToEnd().Replace("\r\n", "\n"); Assert.Equal( "{\n \"header\": {\n \"annotator\": \"Nirvana 2.0.9.0\",\n \"creationTime\": \"2018-04-30 17:17:23\",\n \"genomeAssembly\": \"GRCh37\",\n \"schemaVersion\": 6,\n \"dataVersion\": \"91.26.45\",\n \"dataSources\": [\n {\n \"name\": \"VEP\",\n \"version\": \"91\",\n \"description\": \"Ensembl\",\n \"releaseDate\": \"2018-03-05\"\n },\n {\n \"name\": \"ClinVar\",\n \"version\": \"20180129\",\n \"description\": \"A freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence\",\n \"releaseDate\": \"2018-01-29\"\n },\n {\n \"name\": \"COSMIC\",\n \"version\": \"84\",\n \"description\": \"somatic mutation and related details and information relating to human cancers\",\n \"releaseDate\": \"2018-02-13\"\n },\n {\n \"name\": \"dbSNP\",\n \"version\": \"150\",\n \"description\": \"Identifiers for observed variants\",\n \"releaseDate\": \"2017-04-03\"\n },\n {\n \"name\": \"gnomAD_exome\",\n \"version\": \"2.0.2\",\n \"description\": \"Exome allele frequencies from Genome Aggregation Database (gnomAD)\",\n \"releaseDate\": \"2017-10-05\"\n },\n {\n \"name\": \"gnomAD\",\n \"version\": \"2.0.2\",\n \"description\": \"Whole genome allele frequencies from Genome Aggregation Database (gnomAD)\",\n \"releaseDate\": \"2017-10-05\"\n },\n {\n \"name\": \"MITOMAP\",\n \"version\": \"20180228\",\n \"description\": \"Small variants in the MITOMAP human mitochondrial genome database\",\n \"releaseDate\": \"2018-02-28\"\n },\n {\n \"name\": \"1000 Genomes Project\",\n \"version\": \"Phase 3 v5a\",\n \"description\": \"A public catalogue of human variation and genotype data\",\n \"releaseDate\": \"2013-05-27\"\n },\n {\n \"name\": \"TOPMed\",\n \"version\": \"freeze_5\",\n \"description\": \"Allele frequencies from TOPMed data lifted over using dbSNP ids.\",\n \"releaseDate\": \"2017-08-28\"\n },\n {\n \"name\": \"ClinGen\",\n \"version\": \"20160414\",\n \"releaseDate\": \"2016-04-14\"\n },\n {\n \"name\": \"DGV\",\n \"version\": \"20160515\",\n \"description\": \"Provides a comprehensive summary of structural variation in the human genome\",\n \"releaseDate\": \"2016-05-15\"\n },\n {\n \"name\": \"MITOMAP\",\n \"version\": \"20180228\",\n \"description\": \"Large structural variants in the MITOMAP human mitochondrial genome database\",\n \"releaseDate\": \"2018-02-28\"\n },\n {\n \"name\": \"ExAC\",\n \"version\": \"0.3.1\",\n \"description\": \"Gene scores from the ExAC project\",\n \"releaseDate\": \"2016-03-16\"\n },\n {\n \"name\": \"OMIM\",\n \"version\": \"20180213\",\n \"description\": \"An Online Catalog of Human Genes and Genetic Disorders\",\n \"releaseDate\": \"2018-02-13\"\n },\n {\n \"name\": \"phyloP\",\n \"version\": \"hg19\",\n \"description\": \"46 way conservation score between humans and 45 other vertebrates\",\n \"releaseDate\": \"2009-11-10\"\n }\n ]\n }\n}", actualHeaderLine); } } [Fact] public void GetGeneSection() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz.jsi")); var outStream = new MemoryStream(); using (var writer = new StreamWriter(outStream, Encoding.UTF8, 512, true)) using (var qp = new QueryProcessor(new StreamReader(readStream), indexStream, writer)) { writer.NewLine = "\r\n"; qp.PrintSection("genes"); } Assert.NotEqual(0, outStream.Length); outStream.Position = 0; using (var reader = new StreamReader(outStream)) { var count = 0; var line = reader.ReadLine(); while (line != null) { count++; line = reader.ReadLine(); } Assert.Equal(4382, count); } } } } ================================================ FILE: UnitTests/Jasix/JasixFunctionalityTests.cs ================================================ using Jasix; using Xunit; namespace UnitTests.Jasix { public sealed class JasixFunctionalityTests { [Fact] public void ParsingDeletionJsonLine() { const string jsonLine = "{\"chromosome\":\"chr1\",\"refAllele\":\"GT\",\"position\":2337967,\"altAlleles\":[\"G\"],\"cyt\r\nogeneticBand\":\"1p36.32\",\"variants\":[{\"altAllele\":\"C\",\"refAllele\":\"-\",\"begin\":2337968,\"chromosome\":\"chr1\",\"dbsnp\":[\"rs797044762\"],\"end\":2337967,\"variantType\":\"insertion\",\"vid\":\"1:2337968:2337967:C\",\"regulatoryRegions\":[{\"id\":\"ENSR00001576444\",\"consequence\":[\"regulatory_region_variant\"]}],\"transcripts\":{\"refSeq\":[{\"transcript\":\"XM_005244712.1\",\"bioType\":\"protein_coding\",\"geneId\":\"11079\",\"hgnc\":\"RER1\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"XP_005244769.1\"},{\"transcript\":\"NM_007033.4\",\"bioType\":\"protein_coding\",\"geneId\":\"11079\",\"hgnc\":\"RER1\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"NP_008964.3\"},{\"transcript\":\"XM_005244713.1\",\"bioType\":\"protein_coding\",\"geneId\":\"11079\",\"hgnc\":\"RER1\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"XP_005244770.1\"},{\"transcript\":\"NM_002617.3\",\"bioType\":\"protein_coding\",\"aminoAcids\":\"-/X\",\"cDnaPos\":\"936-937\",\"codons\":\"-/G\",\"cdsPos\":\"867-868\",\"exons\":\"5/6\",\"geneId\":\"5192\",\"hgnc\":\"PEX10\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_002617.3:c.867_868insG\",\"hgvsp\":\"NP_002608.1:p.His290AlafsTer49\",\"proteinId\":\"NP_002608.1\",\"proteinPos\":\"289-290\"},{\"transcript\":\"NM_153818.1\",\"bioType\":\"protein_coding\",\"aminoAcids\":\"-/X\",\"cDnaPos\":\"996-997\",\"codons\":\"-/G\",\"cdsPos\":\"927-928\",\"exons\":\"5/6\",\"geneId\":\"5192\",\"hgnc\":\"PEX10\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_153818.1:c.927_928insG\",\"hgvsp\":\"NP_722540.1:p.His310AlafsTer49\",\"isCanonical\":true,\"proteinId\":\"NP_722540.1\",\"proteinPos\":\"309-310\"}]}}]}"; var chrPos = IndexCreator.GetChromPosition(jsonLine); Assert.Equal("chr1", chrPos.Item1); Assert.Equal(2337967, chrPos.Item2); Assert.Equal(2337968, chrPos.Item3); } [Fact] public void ParsingSnvJsonLine() { const string jsonLine = "{\"chromosome\":\"chr1\",\"refAllele\":\"G\",\"position\":2337967,\"altAlleles\":[\"C\",\"T\"],\"cyt\r\nogeneticBand\":\"1p36.32\",\"variants\":[{\"altAllele\":\"C\",\"refAllele\":\"-\",\"begin\":2337968,\"chromosome\":\"chr1\",\"dbsnp\":[\"rs797044762\"],\"end\":2337967,\"variantType\":\"insertion\",\"vid\":\"1:2337968:2337967:C\",\"regulatoryRegions\":[{\"id\":\"ENSR00001576444\",\"consequence\":[\"regulatory_region_variant\"]}],\"transcripts\":{\"refSeq\":[{\"transcript\":\"XM_005244712.1\",\"bioType\":\"protein_coding\",\"geneId\":\"11079\",\"hgnc\":\"RER1\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"XP_005244769.1\"},{\"transcript\":\"NM_007033.4\",\"bioType\":\"protein_coding\",\"geneId\":\"11079\",\"hgnc\":\"RER1\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"NP_008964.3\"},{\"transcript\":\"XM_005244713.1\",\"bioType\":\"protein_coding\",\"geneId\":\"11079\",\"hgnc\":\"RER1\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"XP_005244770.1\"},{\"transcript\":\"NM_002617.3\",\"bioType\":\"protein_coding\",\"aminoAcids\":\"-/X\",\"cDnaPos\":\"936-937\",\"codons\":\"-/G\",\"cdsPos\":\"867-868\",\"exons\":\"5/6\",\"geneId\":\"5192\",\"hgnc\":\"PEX10\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_002617.3:c.867_868insG\",\"hgvsp\":\"NP_002608.1:p.His290AlafsTer49\",\"proteinId\":\"NP_002608.1\",\"proteinPos\":\"289-290\"},{\"transcript\":\"NM_153818.1\",\"bioType\":\"protein_coding\",\"aminoAcids\":\"-/X\",\"cDnaPos\":\"996-997\",\"codons\":\"-/G\",\"cdsPos\":\"927-928\",\"exons\":\"5/6\",\"geneId\":\"5192\",\"hgnc\":\"PEX10\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_153818.1:c.927_928insG\",\"hgvsp\":\"NP_722540.1:p.His310AlafsTer49\",\"isCanonical\":true,\"proteinId\":\"NP_722540.1\",\"proteinPos\":\"309-310\"}]}}]}"; var chrPos = IndexCreator.GetChromPosition(jsonLine); Assert.Equal("chr1", chrPos.Item1); Assert.Equal(2337967, chrPos.Item2); Assert.Equal(2337967, chrPos.Item3); } [Fact] public void ParsingJsonInsertionLine() { const string jsonLine = "{\"chromosome\":\"chr1\",\"refAllele\":\"G\",\"position\":2337967,\"altAlleles\":[\"GCC\"],\"cyt\r\nogeneticBand\":\"1p36.32\",\"variants\":[{\"altAllele\":\"C\",\"refAllele\":\"-\",\"begin\":2337968,\"chromosome\":\"chr1\",\"dbsnp\":[\"rs797044762\"],\"end\":2337967,\"variantType\":\"insertion\",\"vid\":\"1:2337968:2337967:C\",\"regulatoryRegions\":[{\"id\":\"ENSR00001576444\",\"consequence\":[\"regulatory_region_variant\"]}],\"transcripts\":{\"refSeq\":[{\"transcript\":\"XM_005244712.1\",\"bioType\":\"protein_coding\",\"geneId\":\"11079\",\"hgnc\":\"RER1\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"XP_005244769.1\"},{\"transcript\":\"NM_007033.4\",\"bioType\":\"protein_coding\",\"geneId\":\"11079\",\"hgnc\":\"RER1\",\"consequence\":[\"downstream_gene_variant\"],\"isCanonical\":true,\"proteinId\":\"NP_008964.3\"},{\"transcript\":\"XM_005244713.1\",\"bioType\":\"protein_coding\",\"geneId\":\"11079\",\"hgnc\":\"RER1\",\"consequence\":[\"downstream_gene_variant\"],\"proteinId\":\"XP_005244770.1\"},{\"transcript\":\"NM_002617.3\",\"bioType\":\"protein_coding\",\"aminoAcids\":\"-/X\",\"cDnaPos\":\"936-937\",\"codons\":\"-/G\",\"cdsPos\":\"867-868\",\"exons\":\"5/6\",\"geneId\":\"5192\",\"hgnc\":\"PEX10\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_002617.3:c.867_868insG\",\"hgvsp\":\"NP_002608.1:p.His290AlafsTer49\",\"proteinId\":\"NP_002608.1\",\"proteinPos\":\"289-290\"},{\"transcript\":\"NM_153818.1\",\"bioType\":\"protein_coding\",\"aminoAcids\":\"-/X\",\"cDnaPos\":\"996-997\",\"codons\":\"-/G\",\"cdsPos\":\"927-928\",\"exons\":\"5/6\",\"geneId\":\"5192\",\"hgnc\":\"PEX10\",\"consequence\":[\"frameshift_variant\"],\"hgvsc\":\"NM_153818.1:c.927_928insG\",\"hgvsp\":\"NP_722540.1:p.His310AlafsTer49\",\"isCanonical\":true,\"proteinId\":\"NP_722540.1\",\"proteinPos\":\"309-310\"}]}}]}"; var chrPos = IndexCreator.GetChromPosition(jsonLine); Assert.Equal("chr1", chrPos.Item1); Assert.Equal(2337967, chrPos.Item2); Assert.Equal(2337968, chrPos.Item3); } [Fact] public void ParseJsonStructuralVariant() { const string jsonLine = "{\"chromosome\":\"chr3\",\"refAllele\":\"A\",\"position\":62431401,\"svEnd\":62431801,\"altAlleles\":[\"\"],\"cytogeneticBand\":\"3p14.2\",\"variants\":[{\"altAllele\":\"\",\"refAllele\":\"A\",\"begin\":62431402,\"chromosome\":\"chr3\",\"end\":62431801,\"variantType\":\"unknown\",\"vid\":\"3:62431402:62431401\",\"globalAllele\":{\"globalMajorAllele\":\"C\",\"globalMajorAlleleFrequency\":0.9856,\"globalMinorAllele\":\"A\",\"globalMinorAlleleFrequency\":0.01438}}]}"; var chrPos = IndexCreator.GetChromPosition(jsonLine); Assert.Equal("chr3", chrPos.Item1); Assert.Equal(62431401, chrPos.Item2); Assert.Equal(62431801, chrPos.Item3); } [Fact] public void ParseJsonBreakEnd() { const string jsonLine = "{\"chromosome\":\"2\",\"refAllele\":\"G\",\"position\":321681,\"quality\":6,\"filters\":[\"PASS\"],\"altAlleles\":[\"G]2:421681]\"],\"cytogeneticBand\":\"2p25.3\",\"oneKg\":[{\"chromosome\":\"2\",\"begin\":314969,\"end\":694521,\"variantType\":\"copy_number_gain\",\"variantFreqAll\":0.0008,\"variantFreqEas\":0.00397,\"id\":\"esv3589600\",\"sampleSize\":2504,\"sampleSizeAfr\":661,\"sampleSizeAmr\":347,\"sampleSizeEas\":504,\"sampleSizeEur\":503,\"sampleSizeSas\":489,\"observedGains\":2}],\"variants\":[{\"altAllele\":\"G]2:421681]\",\"refAllele\":\"G\",\"begin\":321681,\"chromosome\":\"2\",\"end\":321686,\"variantType\":\"translocation_breakend\",\"vid\":\"2:321681:+:2:421681:-\",\"overlappingGenes\":[\"AC079779.6\"]}]}"; var chrPos = IndexCreator.GetChromPosition(jsonLine); Assert.Equal("2", chrPos.Item1); Assert.Equal(321681, chrPos.Item2); Assert.Equal(321681, chrPos.Item3); } } } ================================================ FILE: UnitTests/Jasix/JasixQueryProcessingTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using Jasix; using Jasix.DataStructures; using Xunit; using Compression.FileHandling; using IO; using Newtonsoft.Json.Linq; using UnitTests.TestUtilities; namespace UnitTests.Jasix { public sealed class JasixQueryProcessingTests { [Fact] public void Combination_of_large_and_small_variants() { var index = new JasixIndex(); //query range 10,000- 10,020 index.Add("chr1", 8_000, 9_900, 90_000);//SV not overlapping the query index.Add("chr1", 9_000, 10_005, 90_100);// partially overlapping index.Add("chr1", 9_500, 10_050, 90_200);//completely overlapping index.Add("chr1", 10_000, 10_001, 100_000); index.Add("chr1", 10_004, 10_006, 100_100); index.Add("chr1", 10_009, 10_550, 100_200);//SV starting from the middle of the range index.Add("chr1", 10_008, 10_010, 100_300); index.Add("chr1", 10_011, 10_020, 100_400); index.Add("chr1", 10_039, 10_550, 100_200);//SV past the range index.Flush(); var firstSmallVarLocation = index.GetFirstVariantPosition("chr1", 10_000, 10_020); var largeVariantLocations = index.LargeVariantPositions("chr1", 10_000, 10_020); Assert.Equal(90_000, firstSmallVarLocation); Assert.True(largeVariantLocations.SequenceEqual(new List { 90_100, 90_200, 100_200 })); } [Fact] public void Quiring_large_variants_overlapping_range_but_starting_before() { var index = new JasixIndex(); //query range 10,000- 10,020 index.Add("chr1", 8_000, 10_000, 80_000);//SV ending at the start of query index.Add("chr1", 8_000, 9_900, 90_000);//SV not overlapping the query index.Add("chr1", 9_000, 10_005, 90_100);// partially overlapping index.Add("chr1", 9_500, 10_050, 90_200);//completely overlapping index.Add("chr1", 10_000, 10_001, 100_000); index.Add("chr1", 10_000, 10_701, 100_050);//starting at the begin of query index.Add("chr1", 10_004, 10_006, 100_100); index.Add("chr1", 10_009, 10_550, 100_200);//SV starting from the middle of the range index.Add("chr1", 10_008, 10_010, 100_300); index.Add("chr1", 10_011, 10_020, 100_400); index.Add("chr1", 10_039, 10_550, 100_200);//SV past the range index.Flush(); var largeVariantBefore = index.LargeVariantPositions("chr1", 10_000, 9_999); Assert.True(largeVariantBefore.SequenceEqual(new List { 80_000, 90_100, 90_200 })); } [Fact] public void First_variant_position_when_the_first_variant_is_large() { var index = new JasixIndex(); //query range 10,000- 10,020 index.Add("chr1", 10_000, 10_701, 100_050);//SV at the begin of query index.Add("chr1", 10_004, 10_006, 100_100); index.Add("chr1", 10_009, 10_550, 100_200);//SV starting from the middle of the range index.Add("chr1", 10_008, 10_010, 100_300); index.Add("chr1", 10_011, 10_020, 100_400); index.Add("chr1", 10_039, 10_550, 100_200);//SV past the range index.Flush(); var firstVariantLocation = index.GetFirstVariantPosition("chr1", 10_000, 10_010); Assert.Equal(100_050, firstVariantLocation); } [Fact] public void TestQuerySingle() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("cosmicv72.indels.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("cosmicv72.indels.json.gz.jsi")); using (var qp = new QueryProcessor(FileUtilities.GetStreamReader(readStream), indexStream)) { var header = qp.GetHeader(); Assert.Equal("\"header\":{\"annotator\":\"Nirvana 2.0.9.0\",\"creationTime\":\"2018-04-30 15:44:31\",\"genomeAssembly\":\"GRCh37\",\"schemaVersion\":6,\"dataVersion\":\"91.26.45\",\"dataSources\":[{\"name\":\"VEP\",\"version\":\"91\",\"description\":\"Ensembl\",\"releaseDate\":\"2018-03-05\"},{\"name\":\"ClinVar\",\"version\":\"20180129\",\"description\":\"A freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence\",\"releaseDate\":\"2018-01-29\"},{\"name\":\"COSMIC\",\"version\":\"84\",\"description\":\"somatic mutation and related details and information relating to human cancers\",\"releaseDate\":\"2018-02-13\"},{\"name\":\"dbSNP\",\"version\":\"150\",\"description\":\"Identifiers for observed variants\",\"releaseDate\":\"2017-04-03\"},{\"name\":\"gnomAD_exome\",\"version\":\"2.0.2\",\"description\":\"Exome allele frequencies from Genome Aggregation Database (gnomAD)\",\"releaseDate\":\"2017-10-05\"},{\"name\":\"gnomAD\",\"version\":\"2.0.2\",\"description\":\"Whole genome allele frequencies from Genome Aggregation Database (gnomAD)\",\"releaseDate\":\"2017-10-05\"},{\"name\":\"MITOMAP\",\"version\":\"20180228\",\"description\":\"Small variants in the MITOMAP human mitochondrial genome database\",\"releaseDate\":\"2018-02-28\"},{\"name\":\"1000 Genomes Project\",\"version\":\"Phase 3 v5a\",\"description\":\"A public catalogue of human variation and genotype data\",\"releaseDate\":\"2013-05-27\"},{\"name\":\"TOPMed\",\"version\":\"freeze_5\",\"description\":\"Allele frequencies from TOPMed data lifted over using dbSNP ids.\",\"releaseDate\":\"2017-08-28\"},{\"name\":\"ClinGen\",\"version\":\"20160414\",\"releaseDate\":\"2016-04-14\"},{\"name\":\"DGV\",\"version\":\"20160515\",\"description\":\"Provides a comprehensive summary of structural variation in the human genome\",\"releaseDate\":\"2016-05-15\"},{\"name\":\"MITOMAP\",\"version\":\"20180228\",\"description\":\"Large structural variants in the MITOMAP human mitochondrial genome database\",\"releaseDate\":\"2018-02-28\"},{\"name\":\"ExAC\",\"version\":\"0.3.1\",\"description\":\"Gene scores from the ExAC project\",\"releaseDate\":\"2016-03-16\"},{\"name\":\"OMIM\",\"version\":\"20180213\",\"description\":\"An Online Catalog of Human Genes and Genetic Disorders\",\"releaseDate\":\"2018-02-13\"},{\"name\":\"phyloP\",\"version\":\"hg19\",\"description\":\"46 way conservation score between humans and 45 other vertebrates\",\"releaseDate\":\"2009-11-10\"}]}", header); var results = qp.ReadOverlappingJsonLines(Utilities.ParseQuery("1:9775924")); Assert.Single(results); } } [Fact] public void TestQueryMultiple() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("cosmicv72.indels.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("cosmicv72.indels.json.gz.jsi")); using (var qp = new QueryProcessor(FileUtilities.GetStreamReader(readStream), indexStream)) { var results = qp.ReadOverlappingJsonLines(Utilities.ParseQuery("1:9775924-9778952")); Assert.Equal(3, results.Count()); } } [Fact] public void TestQueryMultipleWithSkippingMiddleOne() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("cosmicv72.indels.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("cosmicv72.indels.json.gz.jsi")); using (var qp = new QueryProcessor(FileUtilities.GetStreamReader(readStream), indexStream)) { var results = qp.ReadOverlappingJsonLines(Utilities.ParseQuery("1:27023180-27023190")); Assert.Equal(2, results.Count()); } } [Fact] public void TestQueryChr1() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("cosmicv72.indels.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("cosmicv72.indels.json.gz.jsi")); using (var qp = new QueryProcessor(FileUtilities.GetStreamReader(readStream), indexStream)) { var results = qp.ReadOverlappingJsonLines(Utilities.ParseQuery("1")); Assert.Equal(422, results.Count()); } } [Fact] public void Query_onthefly_Ensembl_and_Ucsc() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz.jsi")); using (var qp = new QueryProcessor(FileUtilities.GetStreamReader(readStream), indexStream)) { int ucscCount = qp.ProcessQuery(new[] {"chr1"}); int ensemblCount = qp.ProcessQuery(new[] { "1" }); Assert.Equal(13, ucscCount); Assert.Equal(13, ensemblCount); } } [Fact] public void Query_with_header() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("Clinvar20150901.json.gz.jsi")); using( var stream = new MemoryStream()) using (var writer = new StreamWriter(stream)) using (var qp = new QueryProcessor(FileUtilities.GetStreamReader(readStream), indexStream, writer)) { qp.ProcessQuery(new[] {"chr1"}, true); writer.Flush(); var jsonString = System.Text.Encoding.UTF8.GetString(stream.ToArray(), 0, (int) stream.Length); Assert.NotEmpty(jsonString); var jObject = JObject.Parse(jsonString); Assert.NotNull(jObject); } } [Fact] public void Report_overlapping_small_and_extending_large_variants() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("JasixTest.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("JasixTest.json.gz.jsi")); using (var qp = new QueryProcessor(FileUtilities.GetStreamReader(readStream), indexStream)) { var results = qp.ReadOverlappingJsonLines(Utilities.ParseQuery("chr1:16378-17000")); Assert.Equal(3, results.Count()); results = qp.ReadJsonLinesExtendingInto(Utilities.ParseQuery("chr1:16378-17000")); Assert.Single(results); } } [Fact] public void Report_overlapping_small_and_extending_multiple_large_variants() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("JasixTest.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("JasixTest.json.gz.jsi")); using (var qp = new QueryProcessor(FileUtilities.GetStreamReader(readStream), indexStream)) { var results = qp.ReadOverlappingJsonLines(Utilities.ParseQuery("chr1:19004-20000")); Assert.Equal(3, results.Count()); results = qp.ReadJsonLinesExtendingInto(Utilities.ParseQuery("chr1:19004-20000")); Assert.Equal(2, results.Count()); } } [Fact] public void Report_overlapping_small_and_large_variants_starting_at_same_location() { var readStream = new BlockGZipStream(ResourceUtilities.GetReadStream(Resources.TopPath("JasixTest.json.gz")), CompressionMode.Decompress); var indexStream = ResourceUtilities.GetReadStream(Resources.TopPath("JasixTest.json.gz.jsi")); using (var qp = new QueryProcessor(FileUtilities.GetStreamReader(readStream), indexStream)) { var results = qp.ReadOverlappingJsonLines(Utilities.ParseQuery("chr1:46993-50000")); Assert.Equal(5, results.Count()); results = qp.ReadJsonLinesExtendingInto(Utilities.ParseQuery("chr1:46993-50000")); Assert.Empty(results); } } } } ================================================ FILE: UnitTests/Jasix/OtfIndexCreatorTests.cs ================================================ using System.IO; using Jasix; using Jasix.DataStructures; using Moq; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Positions; using Xunit; namespace UnitTests.Jasix { public sealed class OtfIndexCreatorTests { [Fact] public void Add_one_chrom() { var position1 = new Mock(); position1.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); position1.SetupGet(x => x.Start).Returns(100); position1.SetupGet(x => x.RefAllele).Returns("A"); position1.SetupGet(x => x.AltAlleles).Returns(new []{"C"}); var memStream = new MemoryStream(); using (var indexCreator = new OnTheFlyIndexCreator(memStream)) { indexCreator.BeginSection("positions", 100); indexCreator.Add(position1.Object, 2588); indexCreator.EndSection("positions",2699 ); } var readStream = new MemoryStream(memStream.ToArray()); readStream.Seek(0, SeekOrigin.Begin); var index = new JasixIndex(readStream); Assert.Equal(100, index.GetSectionBegin("positions")); Assert.Equal(2588, index.GetFirstVariantPosition("chr1", 100,102)); } } } ================================================ FILE: UnitTests/Jist/JiSTtests.cs ================================================ using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Text; using Compression.FileHandling; using Genome; using Jasix.DataStructures; using Jist; using Moq; using Newtonsoft.Json.Linq; using UnitTests.TestUtilities; using VariantAnnotation.Interface; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.IO; using Xunit; namespace UnitTests.Jist { public sealed class JiSTtests { private const string NirvanaHeader = "{\"header\":\"Jist test header\",\"positions\":[\n"; private const string NirvanaGenes = JsonStitcher.GeneHeaderLine; private const string NirvanaFooter = JsonStitcher.FooterLine; private static (Stream jsonStream, Stream jasixStream) GetJsonStreams(Chromosome chromosome, bool withGenes) { var jsonStream = new MemoryStream(); var jasixStream = new MemoryStream(); var annotationResources = new Mock(); annotationResources.SetupGet(x => x.AnnotatorVersionTag).Returns("NirvanaTest"); annotationResources.SetupGet(x => x.VepDataVersion).Returns("VEPTest"); annotationResources.SetupGet(x => x.DataSourceVersions).Returns(new List()); annotationResources.SetupGet(x => x.SequenceProvider.Assembly).Returns(GenomeAssembly.GRCh38); using (var jsonWriter = new JsonWriter(new BlockGZipStream(jsonStream, CompressionMode.Compress, true), jasixStream, annotationResources.Object, "2020-05-17", null, true)) { var position = new Mock(); position.SetupGet(x => x.Chromosome).Returns(chromosome); for (int i = 100 * (chromosome.Index+1); i < 123 *(chromosome.Index +1); i++) { position.SetupGet(x => x.Start).Returns(i); position.SetupGet(x => x.RefAllele).Returns("A"); position.SetupGet(x => x.AltAlleles).Returns(new []{"T"}); jsonWriter.WritePosition(position.Object, $"{JsonObject.OpenBrace}\"chromosome\":\"{chromosome.UcscName}\",\"position\":{i}{JsonObject.CloseBrace}"); } if (withGenes) { var geneEntries = new string[] { $"{{\"gene{chromosome.EnsemblName}A\":\"gene annotation\"}}", $"{{\"gene{chromosome.EnsemblName}B\":\"gene annotation\"}}" }; jsonWriter.WriteGenes(geneEntries); } } jsonStream.Position = 0; jasixStream.Position = 0; return (jsonStream, jasixStream); } [Fact] public void All_jsons_with_genes() { var jsonStreams = new Stream[3]; var jasixSteams = new Stream[3]; (jsonStreams[0], jasixSteams[0]) = GetJsonStreams(ChromosomeUtilities.Chr1, true); (jsonStreams[1], jasixSteams[1]) = GetJsonStreams(ChromosomeUtilities.Chr2, true); (jsonStreams[2], jasixSteams[2]) = GetJsonStreams(ChromosomeUtilities.Chr3, true); var outStream = new MemoryStream(); using (var stitcher = new JsonStitcher(jsonStreams, jasixSteams, outStream, true)) { stitcher.Stitch(); } outStream.Position = 0; var sb = new StringBuilder(); using (var bgZipStream = new BlockGZipStream(outStream, CompressionMode.Decompress)) using (var reader = new StreamReader(bgZipStream)) { string line; while ((line = reader.ReadLine())!=null) { sb.Append(line+'\n'); } } var fullJson = sb.ToString(); //making sure all the first and last positions are present in the merged JSON Assert.Contains("\"header\":{\"annotator\":\"NirvanaTest\"", fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":100}", fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":122}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":200}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":222}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":300}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":322}", fullJson); //checking if all the genes are there Assert.Contains("gene1A", fullJson); Assert.Contains("gene1B", fullJson); Assert.Contains("gene2A", fullJson); Assert.Contains("gene2B", fullJson); Assert.Contains("gene3A", fullJson); Assert.Contains("gene3B", fullJson); //need to check if this is a valid json var jObject = JObject.Parse(fullJson); Assert.NotNull(jObject); } [Fact] public void Some_with_genes() { var jsonStreams = new Stream[3]; var jasixSteams = new Stream[3]; (jsonStreams[0], jasixSteams[0]) = GetJsonStreams(ChromosomeUtilities.Chr1, true); (jsonStreams[1], jasixSteams[1]) = GetJsonStreams(ChromosomeUtilities.Chr2, false); (jsonStreams[2], jasixSteams[2]) = GetJsonStreams(ChromosomeUtilities.Chr3, true); var outStream = new MemoryStream(); using (var stitcher = new JsonStitcher(jsonStreams, jasixSteams, outStream, true)) { stitcher.Stitch(); } outStream.Position = 0; var sb = new StringBuilder(); using (var bgZipStream = new BlockGZipStream(outStream, CompressionMode.Decompress)) using (var reader = new StreamReader(bgZipStream)) { string line; while ((line = reader.ReadLine())!=null) { sb.Append(line+'\n'); } } var fullJson = sb.ToString(); //making sure all the first and last positions are present in the merged JSON Assert.Contains("\"header\":{\"annotator\":\"NirvanaTest\"", fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":100}", fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":122}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":200}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":222}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":300}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":322}", fullJson); //checking if all the genes are there Assert.Contains("gene1A", fullJson); Assert.Contains("gene1B", fullJson); Assert.DoesNotContain("gene2A", fullJson); Assert.DoesNotContain("gene2B", fullJson); Assert.Contains("gene3A", fullJson); Assert.Contains("gene3B", fullJson); //need to check if this is a valid json var jObject = JObject.Parse(fullJson); Assert.NotNull(jObject); } [Fact] public void All_jsons_without_genes() { var jsonStreams = new Stream[3]; var jasixSteams = new Stream[3]; (jsonStreams[0], jasixSteams[0]) = GetJsonStreams(ChromosomeUtilities.Chr1, false); (jsonStreams[1], jasixSteams[1]) = GetJsonStreams(ChromosomeUtilities.Chr2, false); (jsonStreams[2], jasixSteams[2]) = GetJsonStreams(ChromosomeUtilities.Chr3, false); var outStream = new MemoryStream(); using (var stitcher = new JsonStitcher(jsonStreams, jasixSteams, outStream, true)) { stitcher.Stitch(); } outStream.Position = 0; var sb = new StringBuilder(); using (var bgZipStream = new BlockGZipStream(outStream, CompressionMode.Decompress)) using (var reader = new StreamReader(bgZipStream)) { string line; while ((line = reader.ReadLine())!=null) { sb.Append(line+'\n'); } } var fullJson = sb.ToString(); //making sure all the first and last positions are present in the merged JSON Assert.Contains("\"header\":{\"annotator\":\"NirvanaTest\"", fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":100}", fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":122}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":200}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":222}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":300}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":322}", fullJson); //checking if all the genes are there Assert.DoesNotContain("gene1A", fullJson); Assert.DoesNotContain("gene1B", fullJson); Assert.DoesNotContain("gene2A", fullJson); Assert.DoesNotContain("gene2B", fullJson); Assert.DoesNotContain("gene3A", fullJson); Assert.DoesNotContain("gene3B", fullJson); //need to check if this is a valid json var jObject = JObject.Parse(fullJson); Assert.NotNull(jObject); } //The following tests don't use JsonWriter. They are intended to isolate issues that might be due to some // error in the json writer. The following tests try to create the ideal json output. private static (Stream jsonStream, Stream jasixStream) GetNirvanaJsonStream(int chromNumber) { var jsonStream = new MemoryStream(); var jasixStream = new MemoryStream(); using (var bgZipStream = new BlockGZipStream(jsonStream, CompressionMode.Compress, true)) using (var writer = new BgzipTextWriter(bgZipStream)) using(var jasixIndex = new JasixIndex()) { writer.Write(NirvanaHeader); writer.Flush(); jasixIndex.BeginSection(JasixCommons.PositionsSectionTag, writer.Position); for (int i = 100*chromNumber; i < 123*chromNumber; i++) { writer.WriteLine($"{JsonObject.OpenBrace}\"chromosome\":\"chr{chromNumber}\",\"position\":{i}{JsonObject.CloseBrace},"); if(i%50==0) writer.Flush();//creating another block } writer.WriteLine($"{JsonObject.OpenBrace}\"chromosome\":\"chr{chromNumber}\",\"position\":{100*chromNumber+25}{JsonObject.CloseBrace}"); writer.Flush(); jasixIndex.EndSection(JasixCommons.PositionsSectionTag, writer.Position); writer.Write(NirvanaGenes); writer.Flush(); jasixIndex.BeginSection(JasixCommons.GenesSectionTag, writer.Position); writer.WriteLine($"{{\"gene{chromNumber}A\":\"gene annotation\"}},"); writer.WriteLine($"{{\"gene{chromNumber}B\":\"gene annotation\"}}"); writer.Flush(); jasixIndex.EndSection(JasixCommons.GenesSectionTag, writer.Position); writer.Write(NirvanaFooter); jasixIndex.Write(jasixStream); } jsonStream.Position = 0; jasixStream.Position = 0; return (jsonStream, jasixStream); } [Fact] public void EndToEndStitching() { var jsonStreams = new Stream[3]; var jasixSteams = new Stream[3]; (jsonStreams[0], jasixSteams[0]) = GetNirvanaJsonStream(1); (jsonStreams[1], jasixSteams[1]) = GetNirvanaJsonStream(2); (jsonStreams[2], jasixSteams[2]) = GetNirvanaJsonStream(3); var outStream = new MemoryStream(); using (var stitcher = new JsonStitcher(jsonStreams, jasixSteams, outStream, true)) { stitcher.Stitch(); } outStream.Position = 0; var sb = new StringBuilder(); using (var bgZipStream = new BlockGZipStream(outStream, CompressionMode.Decompress)) using (var reader = new StreamReader(bgZipStream)) { string line; while ((line = reader.ReadLine())!=null) { sb.Append(line+'\n'); } } var fullJson = sb.ToString(); //making sure all the first and last positions are present in the merged JSON Assert.Contains(NirvanaHeader, fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":100}", fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":125}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":200}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":225}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":300}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":325}", fullJson); //checking if all the genes are there Assert.Contains("gene1A", fullJson); Assert.Contains("gene1B", fullJson); Assert.Contains("gene2A", fullJson); Assert.Contains("gene2B", fullJson); Assert.Contains("gene3A", fullJson); Assert.Contains("gene3B", fullJson); //need to check if this is a valid json var jObject = JObject.Parse(fullJson); Assert.NotNull(jObject); } private static (Stream jsonStream, Stream jasixStream) GetNirvanaJsonStreamWithoutGenes(int chromNumber) { var jsonStream = new MemoryStream(); var jasixStream = new MemoryStream(); using (var bgZipStream = new BlockGZipStream(jsonStream, CompressionMode.Compress, true)) using (var writer = new BgzipTextWriter(bgZipStream)) using(var jasixIndex = new JasixIndex()) { writer.Write(NirvanaHeader); writer.Flush(); jasixIndex.BeginSection(JasixCommons.PositionsSectionTag, writer.Position); for (int i = 100 *chromNumber; i < 123 *chromNumber; i++) { writer.WriteLine($"{JsonObject.OpenBrace}\"chromosome\":\"chr{chromNumber}\",\"position\":{i}{JsonObject.CloseBrace},"); if(i %50 ==0) writer.Flush();//creating another block } writer.WriteLine($"{JsonObject.OpenBrace}\"chromosome\":\"chr{chromNumber}\",\"position\":{100 *chromNumber +25}{JsonObject.CloseBrace}"); writer.Flush(); jasixIndex.EndSection(JasixCommons.PositionsSectionTag, writer.Position); writer.Write(NirvanaFooter); jasixIndex.Write(jasixStream); } jsonStream.Position = 0; jasixStream.Position = 0; return (jsonStream, jasixStream); } [Fact] public void StitchingWithoutGenes() { var jsonStreams = new Stream[3]; var jasixSteams = new Stream[3]; (jsonStreams[0], jasixSteams[0]) = GetNirvanaJsonStream(1); (jsonStreams[1], jasixSteams[1]) = GetNirvanaJsonStreamWithoutGenes(2); (jsonStreams[2], jasixSteams[2]) = GetNirvanaJsonStream(3); var outStream = new MemoryStream(); using (var stitcher = new JsonStitcher(jsonStreams, jasixSteams, outStream, true)) { stitcher.Stitch(); } outStream.Position = 0; var sb = new StringBuilder(); using (var bgZipStream = new BlockGZipStream(outStream, CompressionMode.Decompress)) using (var reader = new StreamReader(bgZipStream)) { string line; while ((line = reader.ReadLine())!=null) { sb.Append(line+'\n'); } } var fullJson = sb.ToString(); //making sure all the first and last positions are present in the merged JSON Assert.Contains(NirvanaHeader, fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":100}", fullJson); Assert.Contains("{\"chromosome\":\"chr1\",\"position\":125}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":200}", fullJson); Assert.Contains("{\"chromosome\":\"chr2\",\"position\":225}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":300}", fullJson); Assert.Contains("{\"chromosome\":\"chr3\",\"position\":325}", fullJson); //checking if all the genes are there Assert.Contains("gene1A", fullJson); Assert.Contains("gene1B", fullJson); Assert.Contains("gene3A", fullJson); Assert.Contains("gene3B", fullJson); //need to check if this is a valid json var jObject = JObject.Parse(fullJson); Assert.NotNull(jObject); } } } ================================================ FILE: UnitTests/MitoHeteroplasmy/MitoHeteroplasmyProviderTests.cs ================================================ using MitoHeteroplasmy; using UnitTests.TestUtilities; using VariantAnnotation.Pools; using Variants; using Xunit; namespace UnitTests.MitoHeteroplasmy { public sealed class MitoHeteroplasmyProviderTests { private static MitoHeteroplasmyProvider GetProvider() { var provider = new MitoHeteroplasmyProvider(); provider.Add(1, "C", new[] { 0.123, 0.200, 0.301 }, new[] { 1, 3, 4 }); provider.Add(1, "G", new[] { 0.101, 0.201 }, new[] { 1, 2 }); provider.Add(2, "T", new[] { 0, 0.001, 0.002, 0.003 }, new[] { 134, 1111, 936, 203 }); return provider; } [Fact] public void GetVrfPercentiles_AsExpected() { var provider = GetProvider(); var position = 1; IVariant[] variants = { VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "N", "C", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false), VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "N", "G", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false), VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "N", "T", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false) }; var percentilesSample = provider.GetVrfPercentiles(variants, new[] { 0.2, 0.15, 0.02 }); Assert.Equal(3, percentilesSample.Length); Assert.True(percentilesSample[0].HasValue); Assert.Equal(100 / 8.0, percentilesSample[0].Value, 3); Assert.True(percentilesSample[1].HasValue); Assert.Equal(100 / 3.0, percentilesSample[1].Value, 3); Assert.Null(percentilesSample[2]); foreach (IVariant variant in variants) { VariantPool.Return((Variant) variant); } } [Fact] public void GetVrfPercentiles_NullIfNoValue() { var provider = GetProvider(); var position = 1; IVariant[] variants = { VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "N", "T", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false), VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "N", "ACC", VariantType.insertion, null, false, false, false, null, AnnotationBehavior.SmallVariants, false) }; var percentiles = provider.GetVrfPercentiles(variants, new[] { 0.24, 0.12 }); Assert.Null(percentiles); foreach (IVariant variant in variants) { VariantPool.Return((Variant) variant); } } [Fact] public void GetVrfPercentiles_ProperRounding() { var provider = GetProvider(); var position = 2; IVariant[] variants = { VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "N", "T", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false) }; var percentilesSample = provider.GetVrfPercentiles(variants, new[] { 0.0014 }); Assert.Single(percentilesSample); Assert.True(percentilesSample[0].HasValue); Assert.Equal(52.22, percentilesSample[0].Value, 2); foreach (IVariant variant in variants) { VariantPool.Return((Variant) variant); } } [Fact] public void GetVrfPercentiles_zero() { var provider = GetProvider(); var position = 1; IVariant[] variants = { VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "N", "G", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false) }; var percentilesSample = provider.GetVrfPercentiles(variants, new[] { 0.0034 }); Assert.Single(percentilesSample); Assert.True(percentilesSample[0].HasValue); Assert.Equal(0, percentilesSample[0].Value, 2); foreach (IVariant variant in variants) { VariantPool.Return((Variant) variant); } } [Fact] public void GetVrfPercentiles_100() { var provider = GetProvider(); var position = 2; IVariant[] variants = { VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "N", "T", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false) }; var percentilesSample = provider.GetVrfPercentiles(variants, new[] { 0.0034 }); Assert.Single(percentilesSample); Assert.True(percentilesSample[0].HasValue); Assert.Equal(100, percentilesSample[0].Value, 2); foreach (IVariant variant in variants) { VariantPool.Return((Variant) variant); } } [Fact] public void CapVrf() { var provider = new MitoHeteroplasmyProvider(); provider.Add(750, "G", new[] { 0.0,0.001,0.002,0.991,0.994,0.995,0.996,0.997,0.998,0.999 }, new[] { 24,4,2,3,2,1,1,4,3,2460}); var position = 750; IVariant[] variants = { VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "N", "G", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false) }; var percentilesSample = provider.GetVrfPercentiles(variants, new[] { 1.0 }); Assert.Single(percentilesSample); Assert.True(percentilesSample[0].HasValue); Assert.Equal(1.76, percentilesSample[0].Value, 2); foreach (IVariant variant in variants) { VariantPool.Return((Variant) variant); } } } } ================================================ FILE: UnitTests/MockedData/Genes.cs ================================================ using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; // ReSharper disable InconsistentNaming namespace UnitTests.MockedData { public static class Genes { public static readonly Gene MED8 = new(ChromosomeUtilities.Chr1, 43383908, 43389812, true, "MED8", 19971, CompactId.Convert("112950"), CompactId.Convert("ENSG00000159479")); public static readonly Gene SAMD13 = new(ChromosomeUtilities.Chr1, 84298366, 84389957, false, "SAMD13", 24582, CompactId.Convert("148418"), CompactId.Convert( "ENSG00000203943")); public static readonly Gene POTEI = new(ChromosomeUtilities.Chr2, 130459455, 131626428, true, "POTEI", 37093, CompactId.Convert("653269"), CompactId.Convert( "ENSG00000196834")); public static readonly Gene PTPN18 = new(ChromosomeUtilities.Chr2, 130356007, 130375409, false, "PTPN18", 9649, CompactId.Convert("26469"), CompactId.Convert( "ENSG00000072135")); public static readonly Gene AL078459_1 = new(ChromosomeUtilities.Chr1, 85276715, 85448124, false, "AL078459.1", -1, CompactId.Empty, CompactId.Convert("ENSG00000223653")); } } ================================================ FILE: UnitTests/MockedData/TranscriptRegions.cs ================================================ using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; // ReSharper disable InconsistentNaming namespace UnitTests.MockedData { public static class TranscriptRegions { public static readonly ITranscriptRegion[] ENST00000290663 = { new TranscriptRegion(TranscriptRegionType.Exon, 8, 43383917, 43384552, 848, 1483), new TranscriptRegion(TranscriptRegionType.Intron, 7, 43384553, 43385045, 847, 848), new TranscriptRegion(TranscriptRegionType.Exon, 7, 43385046, 43385106, 787, 847), new TranscriptRegion(TranscriptRegionType.Intron, 6, 43385107, 43385977, 786, 787), new TranscriptRegion(TranscriptRegionType.Exon, 6, 43385978, 43386226, 538, 786), new TranscriptRegion(TranscriptRegionType.Intron, 5, 43386227, 43386588, 537, 538), new TranscriptRegion(TranscriptRegionType.Exon, 5, 43386589, 43386670, 456, 537), new TranscriptRegion(TranscriptRegionType.Intron, 4, 43386671, 43386857, 455, 456), new TranscriptRegion(TranscriptRegionType.Exon, 4, 43386858, 43386998, 315, 455), new TranscriptRegion(TranscriptRegionType.Intron, 3, 43386999, 43387502, 314, 315), new TranscriptRegion(TranscriptRegionType.Exon, 3, 43387503, 43387647, 170, 314), new TranscriptRegion(TranscriptRegionType.Intron, 2, 43387648, 43388309, 169, 170), new TranscriptRegion(TranscriptRegionType.Exon, 2, 43388310, 43388428, 51, 169), new TranscriptRegion(TranscriptRegionType.Intron, 1, 43388429, 43389758, 50, 51), new TranscriptRegion(TranscriptRegionType.Exon, 1, 43389759, 43389808, 1, 50) }; public static readonly ITranscriptRegion[] ENST00000370673 = { new TranscriptRegion(TranscriptRegionType.Exon, 1, 84298366, 84298567, 1, 202), new TranscriptRegion(TranscriptRegionType.Intron, 1, 84298568, 84303202, 202, 203), new TranscriptRegion(TranscriptRegionType.Exon, 2, 84303203, 84303287, 203, 287), new TranscriptRegion(TranscriptRegionType.Intron, 2, 84303288, 84325636, 287, 288), new TranscriptRegion(TranscriptRegionType.Exon, 3, 84325637, 84325748, 288, 399), new TranscriptRegion(TranscriptRegionType.Intron, 3, 84325749, 84349630, 399, 400), new TranscriptRegion(TranscriptRegionType.Exon, 4, 84349631, 84350798, 400, 1567) }; public static readonly ITranscriptRegion[] ENST00000615053 = { new TranscriptRegion(TranscriptRegionType.Exon, 13, 130463799, 130464144, 1581, 1926), new TranscriptRegion(TranscriptRegionType.Intron, 12, 130464145, 130465651, 1580, 1581), new TranscriptRegion(TranscriptRegionType.Exon, 12, 130465652, 130465664, 1568, 1580), new TranscriptRegion(TranscriptRegionType.Intron, 11, 130465665, 130465666, 1567, 1568), new TranscriptRegion(TranscriptRegionType.Exon, 11, 130465667, 130465772, 1462, 1567), new TranscriptRegion(TranscriptRegionType.Intron, 10, 130465773, 130474377, 1461, 1462), new TranscriptRegion(TranscriptRegionType.Exon, 10, 130474378, 130474534, 1305, 1461), new TranscriptRegion(TranscriptRegionType.Intron, 9, 130474535, 130488188, 1304, 1305), new TranscriptRegion(TranscriptRegionType.Exon, 9, 130488189, 130488201, 1292, 1304), new TranscriptRegion(TranscriptRegionType.Intron, 8, 130488202, 130489237, 1291, 1292), new TranscriptRegion(TranscriptRegionType.Exon, 8, 130489238, 130489279, 1250, 1291), new TranscriptRegion(TranscriptRegionType.Intron, 7, 130489280, 130490669, 1249, 1250), new TranscriptRegion(TranscriptRegionType.Exon, 7, 130490670, 130490740, 1179, 1249), new TranscriptRegion(TranscriptRegionType.Intron, 6, 130490741, 130496551, 1178, 1179), new TranscriptRegion(TranscriptRegionType.Exon, 6, 130496552, 130496622, 1108, 1178), new TranscriptRegion(TranscriptRegionType.Intron, 5, 130496623, 130499083, 1107, 1108), new TranscriptRegion(TranscriptRegionType.Exon, 5, 130499084, 130499221, 970, 1107), new TranscriptRegion(TranscriptRegionType.Intron, 4, 130499222, 130500535, 969, 970), new TranscriptRegion(TranscriptRegionType.Exon, 4, 130500536, 130500642, 863, 969), new TranscriptRegion(TranscriptRegionType.Intron, 3, 130500643, 130503445, 862, 863), new TranscriptRegion(TranscriptRegionType.Exon, 3, 130503446, 130503619, 689, 862), new TranscriptRegion(TranscriptRegionType.Intron, 2, 130503620, 130503779, 688, 689), new TranscriptRegion(TranscriptRegionType.Exon, 2, 130503780, 130503894, 574, 688), new TranscriptRegion(TranscriptRegionType.Intron, 1, 130503895, 130508714, 573, 574), new TranscriptRegion(TranscriptRegionType.Exon, 1, 130508715, 130509287, 1, 573) }; public static readonly ITranscriptRegion[] ENST00000347849 = { new TranscriptRegion(TranscriptRegionType.Exon, 1, 130356045, 130356200, 1, 156), new TranscriptRegion(TranscriptRegionType.Intron, 1, 130356201, 130369132, 156, 157), new TranscriptRegion(TranscriptRegionType.Exon, 2, 130369133, 130369201, 157, 225), new TranscriptRegion(TranscriptRegionType.Intron, 2, 130369202, 130369764, 225, 226), new TranscriptRegion(TranscriptRegionType.Exon, 3, 130369765, 130369827, 226, 288), new TranscriptRegion(TranscriptRegionType.Intron, 3, 130369828, 130370047, 288, 289), new TranscriptRegion(TranscriptRegionType.Exon, 4, 130370048, 130370190, 289, 431), new TranscriptRegion(TranscriptRegionType.Intron, 4, 130370191, 130370556, 431, 432), new TranscriptRegion(TranscriptRegionType.Exon, 5, 130370557, 130370623, 432, 498), new TranscriptRegion(TranscriptRegionType.Intron, 5, 130370624, 130370704, 498, 499), new TranscriptRegion(TranscriptRegionType.Exon, 6, 130370705, 130370782, 499, 576), new TranscriptRegion(TranscriptRegionType.Intron, 6, 130370783, 130370874, 576, 577), new TranscriptRegion(TranscriptRegionType.Exon, 7, 130370875, 130370964, 577, 666), new TranscriptRegion(TranscriptRegionType.Intron, 7, 130370965, 130371198, 666, 667), new TranscriptRegion(TranscriptRegionType.Exon, 8, 130371199, 130371287, 667, 755), new TranscriptRegion(TranscriptRegionType.Intron, 8, 130371288, 130372256, 755, 756), new TranscriptRegion(TranscriptRegionType.Exon, 9, 130372257, 130372483, 756, 982), new TranscriptRegion(TranscriptRegionType.Intron, 9, 130372484, 130372872, 982, 983), new TranscriptRegion(TranscriptRegionType.Exon, 10, 130372873, 130372947, 983, 1057), new TranscriptRegion(TranscriptRegionType.Intron, 10, 130372948, 130373156, 1057, 1058), new TranscriptRegion(TranscriptRegionType.Exon, 11, 130373157, 130374571, 1058, 2472) }; public static readonly ITranscriptRegion[] ENST00000427819 = { new TranscriptRegion(TranscriptRegionType.Exon, 1, 85276715, 85276797, 1, 83), new TranscriptRegion(TranscriptRegionType.Intron, 1, 85276798, 85277640, 83, 84), new TranscriptRegion(TranscriptRegionType.Exon, 2, 85277641, 85277738, 84, 181), new TranscriptRegion(TranscriptRegionType.Intron, 2, 85277739, 85376765, 181, 182), new TranscriptRegion(TranscriptRegionType.Exon, 3, 85376766, 85376835, 182, 251), new TranscriptRegion(TranscriptRegionType.Intron, 3, 85376836, 85380373, 251, 252), new TranscriptRegion(TranscriptRegionType.Exon, 4, 85380374, 85380565, 252, 443), new TranscriptRegion(TranscriptRegionType.Intron, 4, 85380566, 85398456, 443, 444), new TranscriptRegion(TranscriptRegionType.Exon, 5, 85398457, 85399963, 444, 1950) }; } } ================================================ FILE: UnitTests/MockedData/Transcripts.cs ================================================ using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; // ReSharper disable InconsistentNaming namespace UnitTests.MockedData { public static class Transcripts { // the following 5 transcripts were chosen to stress test our gene fusions: public static readonly ITranscript ENST00000290663 = new Transcript(ChromosomeUtilities.Chr1, 43383917, 43389808, CompactId.Convert("ENST00000290663", 10), Translations.ENST00000290663, BioType.protein_coding, Genes.MED8, 1483, 0, true, TranscriptRegions.ENST00000290663, 8, null, 0, 0, Source.Ensembl, false, false, null, null); public static readonly ITranscript ENST00000370673 = new Transcript(ChromosomeUtilities.Chr1, 84298366, 84350798, CompactId.Convert("ENST00000370673", 7), Translations.ENST00000370673, BioType.protein_coding, Genes.SAMD13, 1567, 0, false, TranscriptRegions.ENST00000370673, 4, null, 0, 0, Source.Ensembl, false, false, null, null); public static readonly ITranscript ENST00000615053 = new Transcript(ChromosomeUtilities.Chr2, 130463799, 130509287, CompactId.Convert("ENST00000615053", 3), Translations.ENST00000615053, BioType.protein_coding, Genes.POTEI, 1926, 0, false, TranscriptRegions.ENST00000615053, 13, null, 0, 0, Source.Ensembl, false, false, null, null); public static readonly ITranscript ENST00000347849 = new Transcript(ChromosomeUtilities.Chr2, 130356045, 130374571, CompactId.Convert("ENST00000347849", 7), Translations.ENST00000347849, BioType.protein_coding, Genes.PTPN18, 2472, 0, false, TranscriptRegions.ENST00000347849, 11, null, 0, 0, Source.Ensembl, false, false, null, null); // antisense RNA public static readonly ITranscript ENST00000427819 = new Transcript(ChromosomeUtilities.Chr1, 85276715, 85399963, CompactId.Convert("ENST00000427819", 5), null, BioType.antisense_RNA, Genes.AL078459_1, 1950, 0, false, TranscriptRegions.ENST00000427819, 5, null, 0, 0, Source.Ensembl, false, false, null, null); } } ================================================ FILE: UnitTests/MockedData/Translations.cs ================================================ using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; // ReSharper disable InconsistentNaming namespace UnitTests.MockedData { public static class Translations { public static readonly ITranslation ENST00000290663 = new Translation(new CodingRegion(43384450, 43389764, 45, 950, 906), CompactId.Convert("ENSP00000290663", 6), "MQREEKQLEASLDALLSQVADLKNSLGSFICKLENEYGRLTWPSVLDSFALLSGQLNTLNKVLKHEKTPLFRNQVIIPLVLSPDRDEDLMRQTEGRVPVFSHEVVPDHLRTKPDPEVEEQEKQLTTDAARIGADAAQKQIQSLNKMCSNLLEKISKEERESESGGLRPNKQTFNPTDTNALVAAVAFGKGLSNWRPSGSSGPGQAGQPGAGTILAGTSGLQQVQMAGAPSQQQPMLSGVQMAQAGQPGKMPSGIKTNIKSASMHPYQRPSCLGFILAIPLRRKVKKLLGQEGKKNAHLQLW"); public static readonly ITranslation ENST00000370673 = new Translation(new CodingRegion(84298558, 84349774, 193, 543, 351), CompactId.Convert("ENSP00000359707", 3), "MRGVAEVKEPCSLPMLSVDMENKENGSVGVKNSMENGRPPDPADWAVMDVVNYFRTVGFEEQASAFQEQEIDGKSLLLMTRNDVLTGLQLKLGPALKIYEYHVKPLQTKHLKNNSS"); public static readonly ITranslation ENST00000615053 = new Translation(new CodingRegion(130465653, 130509235, 53, 1579, 1527), CompactId.Convert("ENSP00000483193", 1), "MVAEVDSMPAASSVKKPFVLRSKMGKWCRHCFPCCRGSGKSNVGTSGDQDDSTMKTLRSKMGKWCCHCFPCCRGSGKSNVGTSGDHDDSAMKTLRSKMGKWCCHCFPCCRGSGKSNVGAWGDYDDSAFVEPRYHVRREDLDKLHRAAWWGKVARKDLIVMLRDTDVNKQDKQKRTALHLASANGNSGVVKLLLDRRCQLNVLDNKKRTALTKAVQCQEDECALMLLEHGTDPNIPDEYGNTTLHYAIYNEDKLMAKALLLYGADIESKNKHGLTPLLLGVHEQKQQVVKFLIKKKANLNALDRYGRTALILAVCCGSASIVSLLLEQNIDVSSQDLSGQTAREYAVSSHHHVICQLLSDYKEKQMLKISSENSNPEQDLKLTSEEESQRFKGSENSQPEKMSQEPEINKDGDRKVEEEMKKHGSTHVGFPENLTNGATAGNGDDGLIPPRKSRTPESQQFPDTENEEYHSDEQNDTQKQFCEEQNTGILHDEILIHEEKQIEVVENEF"); public static readonly ITranslation ENST00000347849 = new Translation(new CodingRegion(130356108, 130373224, 64, 1125, 1062), CompactId.Convert("ENSP00000310092", 5), "MSRSLDSARSFLERLEARGGREGAVLAGEFSKRCERYWAQEQEPLQTGLFCITLIKEKWLNEDIMLRTLKVTFQKESRSVYQLQYMSWPDRGVPSSPDHMLAMVEEARRLQGSGPEPLCVHCSAGCGRTGVLCTVDYVRQLLLTQMIPPDFSLFDVVLKMRKQRPAAVQTEEQYRFLYHTVAQMFCSTLQNASPHYQNIKENCAPLYDDALFLRTPQALLAIPRPPGGVLRSISVPGSPGHAMADTYAVVQKRGAPAGAGSGTQTGTGTGTGARSAEEAPLYSKVTPRAQRPGAHAEDARGTLPGRVPADQSPAGSGAYEDVAGGAQTGGLGFNLRIGRPKGPRDPPAEWTRV"); } } ================================================ FILE: UnitTests/Nirvana/AnnotationFilesTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using Nirvana; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Nirvana { public sealed class AnnotationFilesTests { [Fact] public void GetFiles_FromDirectory_AsExpected() { var files = new AnnotationFiles(); var saDirectory = Resources.MockSaFiles; files.AddFiles(saDirectory); var expectedNsaFiles = new List<(string, string)> { (Path.Combine(saDirectory, "sa1.nsa"), Path.Combine(saDirectory, "sa1.nsa.idx")), (Path.Combine(saDirectory, "sa2.nsa"), Path.Combine(saDirectory, "sa2.nsa.idx")) }; var expectedNsiFiles = new List { Path.Combine(saDirectory, "sa3.nsi"), Path.Combine(saDirectory, "sa4.nsi") }; var expectedConservationFile = (Path.Combine(saDirectory, "sa5.npd"), Path.Combine(saDirectory, "sa5.npd.idx")); var expectedNgaFiles = new List { Path.Combine(saDirectory, "sa6.nga"), Path.Combine(saDirectory, "sa7.nga") }; var expectedRefMinorFile = (Path.Combine(saDirectory, "sa8.rma"), Path.Combine(saDirectory, "sa8.rma.idx")); Assert.Equal(expectedNsaFiles, files.NsaFiles.OrderBy(x=> x.Nsa)); Assert.Equal(expectedNsiFiles, files.NsiFiles.OrderBy(x=>x)); Assert.Equal(expectedConservationFile, files.PhylopFile); Assert.Equal(expectedNgaFiles, files.NgaFiles.OrderBy(x=>x)); Assert.Equal(expectedRefMinorFile, files.RefMinorFile); } [Fact] public void GetFiles_FromDirectoryNoSa_NoFileAdded() { var files = new AnnotationFiles(); files.AddFiles("."); Assert.Empty(files.NsaFiles); Assert.Empty(files.NsiFiles); Assert.Empty(files.NgaFiles); Assert.Equal(default, files.PhylopFile); Assert.Equal(default, files.RefMinorFile); } } } ================================================ FILE: UnitTests/Nirvana/PreLoadUtilitiesTests.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using Nirvana; using UnitTests.SAUtils.InputFileParsers; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Nirvana { public sealed class PreLoadUtilitiesTests { private static Stream GetVcfStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##dbSNP"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("1\t10019\trs775809821\tTA\tT\t.\t.\tRS=775809821;RSPOS=10020;dbSNPBuildID=144;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP"); writer.WriteLine("1\t10285\trs866375379\tT\tA,C\t.\t.\tRS=866375379;RSPOS=10285;dbSNPBuildID=147;SSR=0;SAO=0;VP=0x050100020005000002000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;SLO;R5;ASP"); writer.WriteLine("1\t10329\trs150969722\tAC\tA\t.\t.\tRS=150969722;RSPOS=10330;dbSNPBuildID=134;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP"); writer.WriteLine("2\t10019\trs775809821\tTA\tT\t.\t.\tRS=775809821;RSPOS=10020;dbSNPBuildID=144;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP"); writer.WriteLine("2\t10285\trs866375379\tT\tA,C\t.\t.\tRS=866375379;RSPOS=10285;dbSNPBuildID=147;SSR=0;SAO=0;VP=0x050100020005000002000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;SLO;R5;ASP"); writer.WriteLine("2\t10329\trs150969722\tAC\tA\t.\t.\tRS=150969722;RSPOS=10330;dbSNPBuildID=134;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetAllPositions() { //we only need the sequence provider for variant rotation. var seqProvider = ParserTestUtils.GetSequenceProvider(10329, "AC", 'A', ChromosomeUtilities.RefNameToChromosome); (var positions, _) = PreLoadUtilities.GetPositions(GetVcfStream(), null, seqProvider, null); Assert.Equal(2, positions.Count); Assert.Equal(4, positions[ChromosomeUtilities.Chr1].Count); Assert.Equal(4, positions[ChromosomeUtilities.Chr2].Count); } [Fact] public void GetPositions_inRange() { var annotationRange = new GenomicRange(new GenomicPosition(ChromosomeUtilities.Chr1, 10019), new GenomicPosition(ChromosomeUtilities.Chr1, 10290)); var seqProvider = ParserTestUtils.GetSequenceProvider(10329, "AC", 'A', ChromosomeUtilities.RefNameToChromosome); (var positions, _) = PreLoadUtilities.GetPositions(GetVcfStream(), annotationRange, seqProvider, null); Assert.Single(positions); Assert.Equal(3, positions[ChromosomeUtilities.Chr1].Count); } private static Stream GetRefMinorVcfStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##dbSNP"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("1\t10019\trs775809821\tTA\tT\t.\t.\tRS=775809821;RSPOS=10020;dbSNPBuildID=144;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP"); //ref minor position writer.WriteLine("1\t10275\trs866375379\tT\t.\t.\t.\tRS=866375379;RSPOS=10285;dbSNPBuildID=147;SSR=0;SAO=0;VP=0x050100020005000002000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;SLO;R5;ASP"); writer.WriteLine("1\t10285\trs866375379\tT\tA,C\t.\t.\tRS=866375379;RSPOS=10285;dbSNPBuildID=147;SSR=0;SAO=0;VP=0x050100020005000002000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;SLO;R5;ASP"); //ref position. not ref minor writer.WriteLine("1\t10289\trs866375379\tT\t.\t.\t.\tRS=866375379;RSPOS=10285;dbSNPBuildID=147;SSR=0;SAO=0;VP=0x050100020005000002000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;SLO;R5;ASP"); writer.WriteLine("1\t10329\trs150969722\tAC\tA\t.\t.\tRS=150969722;RSPOS=10330;dbSNPBuildID=134;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP"); writer.WriteLine("2\t10019\trs775809821\tTA\tT\t.\t.\tRS=775809821;RSPOS=10020;dbSNPBuildID=144;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP"); writer.WriteLine("2\t10285\trs866375379\tT\tA,C\t.\t.\tRS=866375379;RSPOS=10285;dbSNPBuildID=147;SSR=0;SAO=0;VP=0x050100020005000002000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;SLO;R5;ASP"); writer.WriteLine("2\t10329\trs150969722\tAC\tA\t.\t.\tRS=150969722;RSPOS=10330;dbSNPBuildID=134;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetAllPositions_skip_refs() { //we only need the sequence provider for variant rotation. var seqProvider = ParserTestUtils.GetSequenceProvider(10329, "AC", 'A', ChromosomeUtilities.RefNameToChromosome); var refMinorProvider = ParserTestUtils.GetRefMinorProvider( new List<(Chromosome chrom, int position, string globalMinor)> { (ChromosomeUtilities.Chr1, 10275, "A" ) } ); (var positions, _) = PreLoadUtilities.GetPositions(GetRefMinorVcfStream(), null, seqProvider, refMinorProvider); Assert.Equal(2, positions.Count); Assert.Equal(5, positions[ChromosomeUtilities.Chr1].Count); Assert.Equal(4, positions[ChromosomeUtilities.Chr2].Count); } } } ================================================ FILE: UnitTests/Nirvana/ProviderUtilitiesTests.cs ================================================ using Nirvana; using Xunit; namespace UnitTests.Nirvana { public sealed class ProviderUtilitiesTests { [Fact] public void GetNsaProvider_NoSaFile_ReturnNull() { var annotationFiles = new AnnotationFiles(); var nsaProvider = ProviderUtilities.GetNsaProvider(annotationFiles); Assert.Null(nsaProvider); } } } ================================================ FILE: UnitTests/NirvanaLambda/AnnotationJobTests.cs ================================================ using System; using System.IO; using System.Text; using System.Threading.Tasks; using Amazon.Lambda.Model; using ErrorHandling; using NirvanaLambda; using Xunit; namespace UnitTests.NirvanaLambda { public sealed class AnnotationJobTests { [Fact] public void GetResultSummaryFromSuccessInvocation_AsExpected() { const string annotationResult = "{\"id\":\"Test\",\"status\":\"Success\",\"filePath\":\"result/input_00001.json.gz\"}"; var memoryStream = new MemoryStream(Encoding.UTF8.GetBytes(annotationResult)); var processed = AnnotationJob.GetResultSummaryFromSuccessInvocation(memoryStream); Assert.Equal("input_00001.json.gz", processed.FileName); Assert.Null(processed.ErrorMessage); Assert.Null(processed.ErrorCategory); } [Fact] public void GetResultSummaryFromSuccessInvocation_PassFailedStatus_FromAnnotationJob() { const string annotationResult = "{\"id\":\"Test\",\"status\":\"Something Wrong!\",\"filePath\":\"\",\"ErrorCategory\":\"NirvanaError\"}"; var memoryStream = new MemoryStream(Encoding.UTF8.GetBytes(annotationResult)); var processed = AnnotationJob.GetResultSummaryFromSuccessInvocation(memoryStream); Assert.Equal("", processed.FileName); Assert.Equal("Something Wrong!", processed.ErrorMessage); Assert.Equal(ErrorCategory.NirvanaError, processed.ErrorCategory); } [Fact] public void CheckResponse_AsExpected() { Assert.Throws(() => new AnnotationJob(null, 1).CheckResponse(new InvokeResponse {FunctionError = "Unhandled"})); Assert.Throws(() => new AnnotationJob(null, 1).CheckResponse(null)); } [Fact] public void GetResultSummaryFromFailedInvocation_AsExpected() { var job = new AnnotationJob(null, 1); var generalExpection = new Exception("first level exception", new Exception("second level exception", new Exception("third level exception"))); var taskCanceledExpection = new Exception("first level exception", new TaskCanceledException("second level exception", new Exception("third level exception"))); var generalResult = job.GetResultSummaryFromFailedInvocation(generalExpection); var taskCanceledResult = job.GetResultSummaryFromFailedInvocation(taskCanceledExpection); Assert.Equal(ErrorCategory.NirvanaError, generalResult.ErrorCategory); Assert.Equal("Failed job when invoking the annotation job: third level exception.", generalResult.ErrorMessage); Assert.Equal(ErrorCategory.TimeOutError, taskCanceledResult.ErrorCategory); Assert.Equal("Failed job when invoking the annotation job: third level exception. Annotation job was not finished in 0 milliseconds.", taskCanceledResult.ErrorMessage); } } } ================================================ FILE: UnitTests/NirvanaLambda/NirvanaConfigTests.cs ================================================ using Cloud.Messages; using Cloud.Messages.Nirvana; using ErrorHandling.Exceptions; using Xunit; namespace UnitTests.NirvanaLambda { public sealed class NirvanaConfigTests { [Fact] public void CheckFieldsNotNull_AsExpected() { var config = GetConfig(); config.id = null; var exception = Assert.Throws(() =>config.CheckRequiredFieldsNotNull()); Assert.Equal("id cannot be null.", exception.Message); config = GetConfig(); config.genomeAssembly = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("genomeAssembly cannot be null.", exception.Message); config = GetConfig(); config.outputDir = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("outputDir cannot be null.", exception.Message); config = GetConfig(); config.outputDir.bucketName = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("bucketName of outputDir cannot be null.", exception.Message); config = GetConfig(); config.outputDir.path = null; exception = Assert.Throws(() => config.CheckRequiredFieldsNotNull()); Assert.Equal("path of outputDir cannot be null.", exception.Message); } private static NirvanaConfig GetConfig() => new NirvanaConfig { id = "Test", genomeAssembly = "Assembly", vcfUrl = "https://s3.amazonaws.com/input/input.vcf.gz?SomeStuff", tabixUrl = "https://s3.amazonaws.com/input/input.vcf.gz.tbi?SomeStuff", outputDir = new S3Path { bucketName = "OutputBucket", region = "us-west-2", path = "/OutputDir/", accessKey = "1234567", secretKey = "show me the money", sessionToken = "a token" } }; } } ================================================ FILE: UnitTests/NirvanaLambda/NirvanaLambdaTests.cs ================================================ using ErrorHandling; using NL = NirvanaLambda.NirvanaLambda; using Xunit; namespace UnitTests.NirvanaLambda { public sealed class NirvanaLambdaTests { [Theory] [InlineData("/tmp/ada.vcf", 0, "ada_00000")] [InlineData("/ada.vcf", 1, "ada_00001")] [InlineData("ada.vcf", 2, "ada_00002")] [InlineData("ada.vcf.gz", 3, "ada_00003")] [InlineData("ada.vcf.data.vcf.gz", 4, "ada_00004")] [InlineData("https://s3.amazonaws.com/illumina-early-access-zeus/Olympia.vcf.gz?AWSAccessKeyId=AKISKSD87A3C4&Expires=109838429&Signature=s98df7s8df12f2jo4lfjfs9d0fu0sd9f", 5, "Olympia_00005")] [InlineData("https://stratus-gds-stage.s3.us-west-2.amazonaws.com/d3a56bf8-5528-4b4d-b5bb-08d6c9c1c9dd/test-data/vcf/some-chroms/dq/DQ-Strelka-Germline-chr22-hg38.vcf.gz?X-Amz-Expires=604800&response-content-disposition=attachment%3Bfilename%3D%22DQ-Strelka-Germline-chr22-hg38.vcf.gz%22&x-userId=fb2136c7-01c2-32cc-8d53-b78db2c022de&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJ7P2VLXQJYGXATTA/20190516/us-west-2/s3/aws4_request&X-Amz-Date=20190516T160606Z&X-Amz-SignedHeaders=host&X-Amz-Signature=8b2f512998b820e8fb18433b5fd2de1c189c157accff92d5d5316a9fa3684d19", 6, "DQ-Strelka-Germline-chr22-hg38_00006")] public void GetIndexedPrefix_AsExpected(string inputVcfPath, int jobIndex, string expectedPrefix) { Assert.Equal(NL.GetIndexedPrefix(inputVcfPath, jobIndex), expectedPrefix); } [Theory] [InlineData(ErrorCategory.UserError, "Wrong input.", "User error: wrong input.")] [InlineData(ErrorCategory.NirvanaError, null, "Nirvana error: an unexpected annotation error occurred while annotating this VCF.")] [InlineData(ErrorCategory.TimeOutError, null, "Timeout error: annotation of the VCF was not finished on time due to network congestion. Please try again later.")] [InlineData(ErrorCategory.InvocationThrottledError, null, "Invocation throttled error: there are too many lambdas currently running in this account. Please try again later.")] public void GetFailedRunStatus_AsExpected(ErrorCategory errorCategory, string errorMessage, string expectedStatus) { Assert.Equal(expectedStatus, NL.GetFailedRunStatus(errorCategory, errorMessage)); } } } ================================================ FILE: UnitTests/NirvanaLambda/PartitionUtilitiesTests.cs ================================================ using System.Linq; using NirvanaLambda; using Xunit; namespace UnitTests.NirvanaLambda { public sealed class PartitionUtilitiesTests { [Fact] public void FindEqualOrClosestSmallerOffsets_AsExpected() { var sizeBasedOffsets = new long[] { 0, 100, 200, 300, 400 }; var allLinearIndexes = new long[] { 15, 45, 97, 123, 146, 175, 200, 234, 265, 293, 401 }; var blockOffsets = PartitionUtilities.FindEqualOrClosestSmallerOffsets(sizeBasedOffsets, allLinearIndexes); var expected = new long[] { 15, 97, 200, 293 }; Assert.Equal(expected, blockOffsets); } [Fact] public void MergeConsecutiveEqualValues_AsExpected() { var input = new[] { 1, 2, 3, 3, 2, 5, 4, 4 }; var expected = new[] { 1, 2, 3, 2, 5, 4 }; Assert.Equal(expected, PartitionUtilities.MergeConsecutiveEqualValues(input).ToArray()); } [Fact] public void GetEqualSizeOffsets_AsExpected() { const int fileSize = 1001; const int numPartitions = 3; var expected = new long[] { 0, 333, 666 }; Assert.Equal(expected, PartitionUtilities.GetEqualSizeOffsets(fileSize, numPartitions)); } } } ================================================ FILE: UnitTests/OptimizedCore/StringBuilderCacheTests.cs ================================================ using OptimizedCore; using Xunit; namespace UnitTests.OptimizedCore { public sealed class StringBuilderCacheTests { [Fact] public void Acquire_UseAndRelease() { const string expectedString = "ABC123"; const string expectedString2 = "The quick brown fox jumps over the lazy dog."; var sb = StringBuilderPool.Get(); sb.Append(expectedString); Assert.Equal(expectedString, StringBuilderPool.GetStringAndReturn(sb)); // acquire an existing string builder sb = StringBuilderPool.Get(); sb.Append(expectedString2); Assert.Equal(expectedString2, StringBuilderPool.GetStringAndReturn(sb)); } } } ================================================ FILE: UnitTests/OptimizedCore/StringExtensionsTests.cs ================================================ using OptimizedCore; using Xunit; namespace UnitTests.OptimizedCore { public sealed class StringExtensionsTests { [Theory] [InlineData("\tjane\tjim")] [InlineData("bob\tjane\t")] [InlineData("bob\tjane\tjim")] public void OptimizedSplit(string s) { var observedResult = s.OptimizedSplit('\t'); var expectedResult = s.Split('\t'); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(null)] [InlineData("")] [InlineData("0")] [InlineData("123")] [InlineData("-123")] [InlineData("2147483647")] [InlineData("-2147483647")] [InlineData("4444444444")] [InlineData("123.3")] public void OptimizedParseInt32(string s) { var observedResult = s.OptimizedParseInt32(); bool expectedFoundError = !int.TryParse(s, out int expectedResult); Assert.Equal(expectedFoundError, observedResult.FoundError); Assert.Equal(expectedResult, observedResult.Number); } [Theory] [InlineData("#CHROM", '#')] [InlineData("#CHROM", 'L')] public void OptimizedStartsWith(string s, char leadingChar) { bool observedResult = s.OptimizedStartsWith(leadingChar); bool expectedResult = s.StartsWith(leadingChar); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData("END=123")] [InlineData("RECOMPOSED")] public void OptimizedKeyValue(string s) { var observedResult = s.OptimizedKeyValue(); var expectedResult = s.Split('='); Assert.Equal(expectedResult[0], observedResult.Key); if (expectedResult.Length == 1) Assert.Null(observedResult.Value); else Assert.Equal(expectedResult[1], observedResult.Value); } [Theory] [InlineData("", '>')] [InlineData("", 'L')] public void OptimizedEndsWith(string s, char leadingChar) { bool observedResult = s.OptimizedEndsWith(leadingChar); bool expectedResult = s.EndsWith(leadingChar); Assert.Equal(expectedResult, observedResult); } } } ================================================ FILE: UnitTests/RepeatExpansions/MatcherTests.cs ================================================ using System.Text; using Genome; using Intervals; using RepeatExpansions; using UnitTests.TestUtilities; using Variants; using Xunit; namespace UnitTests.RepeatExpansions { public sealed class MatcherTests { private readonly Matcher _matcher; public MatcherTests() { var repeatNumbers = new[] { 7, 8, 9 }; double[] percentiles = { 0, 1, 1.5 }; var classificationRanges = new[] { new Interval(0, 27) }; var classifications = new[] { "Normal" }; var aInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 100, 200); var aPhenotype = new RepeatExpansionPhenotype(aInterval, "A", null, repeatNumbers, percentiles, classifications, classificationRanges); var chr1Phenotypes = new Interval[1]; chr1Phenotypes[0] = new Interval(aInterval.Start, aInterval.End, aPhenotype); var intervalArrays = new IntervalArray[1]; intervalArrays[ChromosomeUtilities.Chr1.Index] = new IntervalArray(chr1Phenotypes); var phenotypeForest = new IntervalForest(intervalArrays); _matcher = new Matcher(phenotypeForest); } [Fact] public void GetMatchingAnnotations_Overlap_ReturnEntry() { var variant = new RepeatExpansion(ChromosomeUtilities.Chr1, 100, 200, null, null, null, 9, 7); var sa = _matcher.GetMatchingAnnotations(variant); var sb = new StringBuilder(); sa.SerializeJson(sb); string observedResult = sb.ToString(); Assert.Contains("{\"phenotype\":\"A\"", observedResult); } [Fact] public void GetMatchingAnnotations_NoOverlap_ReturnNull() { var variant = new RepeatExpansion(ChromosomeUtilities.Chr1, 220, 230, null, null, null, 9, 7); var sa = _matcher.GetMatchingAnnotations(variant); Assert.Null(sa); } } } ================================================ FILE: UnitTests/RepeatExpansions/PercentileUtilitiesTests.cs ================================================ using RepeatExpansions; using Xunit; namespace UnitTests.RepeatExpansions { public sealed class PercentileUtilitiesTests { private readonly int[] _values = { 7, 8, 9, 10, 11, 12, 13, 15 }; private readonly double[] _percentiles = { 0, 1, 1.5, 3.5, 75.5, 86.5, 98.5, 99.5 }; [Fact] public void ComputePercentiles_Nominal() { var repeatNumbers = new[] { 10, 15, 20, 100, 200 }; var alleleCounts = new[] { 550, 34, 78, 30, 45 }; double[] expectedPercentiles = { 0, 550 * 100.0 / 737, (550 + 34) * 100.0 / 737, (550 + 34 + 78) * 100.0 / 737, (550 + 34 + 78 + 30) * 100.0 / 737 }; double[] observedResults = PercentileUtilities.ComputePercentiles(repeatNumbers.Length, alleleCounts); Assert.Equal(expectedPercentiles, observedResults); } [Fact] public void GetPercentile_RepeatNumberInRange_PositiveIndex() { double observedResult = PercentileUtilities.GetPercentile(14, _values, _percentiles); Assert.Equal(99.5, observedResult); } [Fact] public void GetPercentile_RepeatNumberOutOfRange_NegativeIndex() { double observedResult = PercentileUtilities.GetPercentile(20, _values, _percentiles); Assert.Equal(100, observedResult); } } } ================================================ FILE: UnitTests/RepeatExpansions/RepeatExpansionProviderTests.cs ================================================ using Genome; using OptimizedCore; using RepeatExpansions; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO; using VariantAnnotation.Pools; using Variants; using Vcf; using Xunit; namespace UnitTests.RepeatExpansions { public sealed class RepeatExpansionProviderTests { private readonly RepeatExpansionProvider _provider; private const int Start = 63898361; private const int End = 63898390; public RepeatExpansionProviderTests() { _provider = new RepeatExpansionProvider(GenomeAssembly.GRCh37, ChromosomeUtilities.RefNameToChromosome, 23, null); } [Fact] public void Annotate_NotRepeatExpansion_NullPhenotypes() { var variant = VariantPool.Get(ChromosomeUtilities.Chr3, Start, End, "A", "C", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false); var annotatedPosition = GetAnnotatedPosition(variant); _provider.Annotate(annotatedPosition); var firstVariant = annotatedPosition.AnnotatedVariants[0]; Assert.Null(firstVariant.RepeatExpansionPhenotypes); VariantPool.Return(variant); PositionPool.Return((Position)annotatedPosition.Position); AnnotatedVariantPool.Return((AnnotatedVariant)firstVariant); AnnotatedPositionPool.Return((AnnotatedPosition) annotatedPosition); } [Fact] public void Annotate_RepeatExpansion_NotExactMatch_NullPhenotypes() { var variant = new RepeatExpansion(ChromosomeUtilities.Chr3, Start, End + 1, "A", "", null, 10, 5); var annotatedPosition = GetAnnotatedPosition(variant); _provider.Annotate(annotatedPosition); var firstVariant = annotatedPosition.AnnotatedVariants[0]; Assert.Null(firstVariant.RepeatExpansionPhenotypes); PositionPool.Return((Position)annotatedPosition.Position); AnnotatedVariantPool.Return((AnnotatedVariant)firstVariant); AnnotatedPositionPool.Return((AnnotatedPosition) annotatedPosition); } [Fact] public void Annotate_RepeatExpansion_no_refRepeatCount() { var variant = new RepeatExpansion(ChromosomeUtilities.Chr3, Start, End + 1, "A", "", null, 10, null); var annotatedPosition = GetAnnotatedPosition(variant); _provider.Annotate(annotatedPosition); var firstVariant = annotatedPosition.AnnotatedVariants[0]; Assert.NotNull(firstVariant); PositionPool.Return((Position)annotatedPosition.Position); AnnotatedVariantPool.Return((AnnotatedVariant)firstVariant); AnnotatedPositionPool.Return((AnnotatedPosition) annotatedPosition); } [Fact] public void Annotate_RepeatExpansion_ExactMatch_OnePhenotype() { var variant = new RepeatExpansion(ChromosomeUtilities.Chr3, Start, End, "A", "", null, 10, 5); var annotatedPosition = GetAnnotatedPosition(variant); _provider.Annotate(annotatedPosition); var firstVariant = annotatedPosition.AnnotatedVariants[0]; Assert.NotNull(firstVariant.RepeatExpansionPhenotypes); var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); jsonObject.AddObjectValue(firstVariant.RepeatExpansionPhenotypes.JsonKey, firstVariant.RepeatExpansionPhenotypes); const string expectedJson = "\"repeatExpansionPhenotypes\":[{\"phenotype\":\"Spinocerebellar ataxia 7\",\"omimId\":164500,\"classifications\":[\"Normal\"],\"percentile\":6.33}]"; string observedJson = sb.ToString(); Assert.Equal(expectedJson, observedJson); PositionPool.Return((Position)annotatedPosition.Position); AnnotatedVariantPool.Return((AnnotatedVariant)firstVariant); AnnotatedPositionPool.Return((AnnotatedPosition) annotatedPosition); } private static IAnnotatedPosition GetAnnotatedPosition(IVariant variant) { IVariant[] variants = { variant }; var position = PositionPool.Get(ChromosomeUtilities.Chr3, Start, End, null, null, null, null, variants, null, null, null, null, false); var annotatedVariant = AnnotatedVariantPool.Get(variant); IAnnotatedVariant[] annotatedVariants = { annotatedVariant }; return AnnotatedPositionPool.Get(position, annotatedVariants); } } } ================================================ FILE: UnitTests/Resources/COSM18152.tsv ================================================ Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP Resistance Mutation FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample source Tumour origin Age VHL ENST00000256474 642 12687 264 1776734 1680780 kidney NS NS NS carcinoma clear_cell_renal_cell_carcinoma NS NS n COSM18152 c.463G>A p.V155M Substitution - Missense u 37 3:10188320-10188320 + n - PATHOGENIC .97012 Confirmed somatic variant 23036577 surgery fresh/frozen NS VHL ENST00000256474 642 12687 2146326 2146326 2015515 kidney NS NS NS carcinoma clear_cell_renal_cell_carcinoma NS NS n COSM18152 c.463G>A p.V155M Substitution - Missense u 37 3:10188320-10188320 + n - PATHOGENIC .97012 Confirmed somatic variant 24471421 surgery-fixed NS 68 VHL ENST00000256474 642 12687 980518 980518 896240 kidney NS NS NS carcinoma clear_cell_renal_cell_carcinoma NS NS n COSM18152 c.463G>A p.V155M Substitution - Missense u 37 3:10188320-10188320 + n - PATHOGENIC .97012 Confirmed somatic variant 11505222 surgery fresh/frozen NS 56 VHL ENST00000256474 642 12687 PD3476a 1101397 1015974 kidney NS NS NS carcinoma clear_cell_renal_cell_carcinoma NS NS n COSM18152 c.463G>A p.V155M Substitution - Missense het u 37 3:10188320-10188320 + n - PATHOGENIC .97012 Confirmed somatic variant 20054297 255 NS primary 46 VHL ENST00000256474 642 12687 2146325 2146325 2015515 kidney NS NS NS carcinoma clear_cell_renal_cell_carcinoma NS NS n COSM18152 c.463G>A p.V155M Substitution - Missense u 37 3:10188320-10188320 + n - PATHOGENIC .97012 Confirmed somatic variant 24471421 surgery-fixed NS 68 VHL ENST00000256474 642 12687 K112 1692686 1600914 kidney NS NS NS carcinoma clear_cell_renal_cell_carcinoma NS NS n COSM18152 c.463G>A p.V155M Substitution - Missense u 37 3:10188320-10188320 + n - PATHOGENIC .97012 Confirmed somatic variant 22138691 surgery fresh/frozen primary 44 VHL ENST00000256474 642 12687 MEL-JWCI-WGS-12 1838362 1732464 skin upper_arm NS NS malignant_melanoma NS NS NS y COSM18152 c.463G>A p.V155M Substitution - Missense u 37 3:10188320-10188320 + n - PATHOGENIC .97012 Confirmed somatic variant 22817889 NS primary 46 ================================================ FILE: UnitTests/Resources/COSM18152.vcf ================================================ 3 10188320 COSM18152 G A . . GENE=VHL;STRAND=+;CDS=c.463G>A;AA=p.V155M;CNT=7 ================================================ FILE: UnitTests/Resources/COSM983708.tsv ================================================ Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP Resistance Mutation FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample source Tumour origin Age BCL6B ENST00000293805 1440 1002 TCGA-AX-A0J0-01 1783376 1687375 endometrium NS NS NS carcinoma endometrioid_carcinoma NS NS y COSM983708 c.701_702insCAG p.S244_E245insS Insertion - In frame het u 37 17:6928019-6928020 + - Reported in another cancer sample as somatic 419 fresh/frozen - NOS primary 47 ================================================ FILE: UnitTests/Resources/COSM983708.vcf ================================================ 17 6928019 COSM983708 C CCAG . . GENE=BCL6B;STRAND=+;CDS=c.701_702insCAG;AA=p.S244_E245insS;CNT=27 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000000101.xml ================================================ currentNM_000235.4(LIPA):c.894+1G>A AND Wolman diseasecurrentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn 2 sibs with Wolman disease (278000) from a consanguineous family, Aslanidis et al. (1996) detected homozygosity for a G-to-A mutation at position +1 of the splice donor site following exon 8 of the LIPA gene. Both children died within the first year of life. The parents, who were heterozygous for the mutation, had reduced enzymatic activity, while no enzymatic activity was detectable in fibroblasts from the affected children. Although the same donor splice site is involved as in the mutation reported in CESD (934G-A, 613497.0002), the nucleotide at position +1 was changed in the Wolman disease mutation while the nucleotide at position -1 was changed in the CESD mutation. Both mutations result in deletion of the same 24 amino acids (exon 8), but the effects are dramatically different: the -1 mutation allowed some correct splicing (3% of total LIPA RNA), but the +1 splice site mutation, which affects one of the invariable nucleotides of the splice consensus sequences, permits no correct splicing. Aslanidis et al. (1996) suggested that the residual activity in CESD patients compared to Wolman patients may result either from a partially active enzyme with the internal deletion of 24 amino acids (skipping of exon 8) or from the production of low amounts of the full size of the protein due to inefficient exon exclusion from the mutated allele.8617513NM_000235.4(LIPA):c.894+1G>ANM_001288979.1:c.546+1G>ANM_000235.4:c.894+1G>ANM_001127605.3:c.894+1G>ANG_008194.1:g.34394G>ANC_000010.11:g.89222510C>TNC_000010.10:g.90982267C>TNM_000235.3:c.894+1G>Asplice donor variantsplice donor variantsplice donor variantIVS8, G-A, +110q23.31lipase A, lysosomal acid typeLIPAClinGen staff contributed the HGVS expression for this variant.NM_000235.4(LIPA):c.894+1G>AWolman diseasecurrentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn 2 sibs with Wolman disease (278000) from a consanguineous family, Aslanidis et al. (1996) detected homozygosity for a G-to-A mutation at position +1 of the splice donor site following exon 8 of the LIPA gene. Both children died within the first year of life. The parents, who were heterozygous for the mutation, had reduced enzymatic activity, while no enzymatic activity was detectable in fibroblasts from the affected children. Although the same donor splice site is involved as in the mutation reported in CESD (934G-A, 613497.0002), the nucleotide at position +1 was changed in the Wolman disease mutation while the nucleotide at position -1 was changed in the CESD mutation. Both mutations result in deletion of the same 24 amino acids (exon 8), but the effects are dramatically different: the -1 mutation allowed some correct splicing (3% of total LIPA RNA), but the +1 splice site mutation, which affects one of the invariable nucleotides of the splice consensus sequences, permits no correct splicing. Aslanidis et al. (1996) suggested that the residual activity in CESD patients compared to Wolman patients may result either from a partially active enzyme with the internal deletion of 24 amino acids (skipping of exon 8) or from the production of low amounts of the full size of the protein due to inefficient exon exclusion from the mutated allele.8617513LIPA, IVS8, G-A, +1IVS8, G-A, +1LIPAWOLMAN DISEASE ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000000734.xml ================================================ currentNM_000512.4(GALNS):c.413T>C (p.Val138Ala) AND Mucopolysaccharidosis, MPS-IV-Acurrentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn a patient with a severe form of Morquio syndrome A (253000), Tomatsu et al. (1992) identified a homozygous 468T-C transition in the GALNS gene, resulting in an ala138-to-val (A138V) substitution.Tomatsu, S., Fukuda, S., Masue, M., Sukegawa, K., Masuno, M., Orii, T. Mucopolysaccharidosis type IVA: characterization and chromosomal localization of N-acetylgalactosamine-6-sulfate sulfatase gene and genetic heterogeneity. (Abstract) Am. J. Hum. Genet. 51 (suppl.): A178, 1992.NM_000512.4(GALNS):c.413T>C (p.Val138Ala)NM_001323543.2:c.-143T>CNM_000512.5:c.413T>CNM_001323544.2:c.431T>CNG_008667.1:g.20966T>CNC_000016.10:g.88841001A>GNC_000016.9:g.88907409A>GP34059:p.Val138AlaNP_000503.1:p.Val138AlaNP_001310473.1:p.Val144Ala5 prime UTR variantmissense variantmissense variantA138VV138AV144AALA138VAL16q24.3galactosamine (N-acetyl)-6-sulfataseGALNS8651279NCBI staff reviewed the sequence information reported in PubMed 8651279 to confirm this allele on current reference sequence (V138A).NM_000512.4(GALNS):c.413T>C (p.Val138Ala)Mucopolysaccharidosis, MPS-IV-AMPS IVAMucopolysaccharidosis type IV AMorquio syndrome A, mildMucopolysaccharidosis Type IVAMPS4AThe phenotypic spectrum of mucopolysaccharidosis IVA (MPS IVA) is a continuum that ranges from a severe and rapidly progressive early-onset form to a slowly progressive later-onset form. Children with MPS IVA have no distinctive clinical findings at birth. The severe form is usually apparent between ages one and three years, often first manifesting as kyphoscoliosis, knock-knee (genu valgum), and pectus carinatum; the slowly progressive form may not become evident until late childhood or adolescence often first manifesting as hip problems (pain, stiffness, and Legg Perthes disease). Progressive bone and joint involvement leads to short stature, and eventually to disabling pain and arthritis. Involvement of other organ systems can lead to significant morbidity, including respiratory compromise, obstructive sleep apnea, valvular heart disease, hearing impairment, visual impairment from corneal clouding, dental abnormalities, and hepatomegaly. Compression of the spinal cord is a common complication that results in neurologic impairment. Children with MPS IVA have normal intellectual abilities at the outset of the disease.23844448NBK148668currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn a patient with a severe form of Morquio syndrome A (253000), Tomatsu et al. (1992) identified a homozygous 468T-C transition in the GALNS gene, resulting in an ala138-to-val (A138V) substitution.Tomatsu, S., Fukuda, S., Masue, M., Sukegawa, K., Masuno, M., Orii, T. Mucopolysaccharidosis type IVA: characterization and chromosomal localization of N-acetylgalactosamine-6-sulfate sulfatase gene and genetic heterogeneity. (Abstract) Am. J. Hum. Genet. 51 (suppl.): A178, 1992.GALNS, ALA138VALALA138VALGALNSMUCOPOLYSACCHARIDOSIS, TYPE IVA ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000001054.xml ================================================ currentNM_013236.3(ATXN10):c.1173+54822_1173+54826ATTCT(10_32) AND Spinocerebellar ataxia 10currentno assertion criteria providedBenigngermlinehumannot providedliterature onlyIn all affected members of 5 Mexican families with SCA10 (603516), Matsuura et al. (2000) detected expansion of a pentanucleotide (ATTCT) repeat in intron 9 of the ATXN10 gene. There was an inverse correlation between the expansion size, up to 22.5 kb larger than the normal allele, and the age of onset. Analysis of 562 chromosomes from unaffected individuals of various ethnic origins, including 242 chromosomes from Mexicans, showed a range of 10 to 22 ATTCT repeats with no evidence of expansions. The data indicated that the ATXN10 intronic ATTCT pentanucleotide repeat in SCA10 patients is unstable and represented the largest microsatellite expansion found to that time in the human genome.11017075In a multigenerational study, Matsuura et al. (2004) demonstrated that (1) the expanded ATTCT repeats are highly unstable when paternally transmitted, whereas maternal transmission results in significantly smaller changes in repeat size; (2) blood leukocytes, lymphoblastoid cells, buccal cells, and sperm have a variable degree of mosaicism in ATTCT expansion; (3) the length of the expanded repeat was not observed to change in individuals over a 5-year period; and (4) clinically determined anticipation is sometimes associated with intergenerational contraction rather than expansion of the ATTCT repeat.15127363not providedhumannot providedassert pathogenicitycurationnot providedNM_013236.3(ATXN10):c.1173+54822_1173+54826ATTCT(10_32)ATXN10, (ATTCT)n EXPANSIONNM_013236.2:c.1173+54822_1173+54826ATTCT(10_32)22q13.31ataxin 10 repeat instability regionLOC108660404ataxin 10ATXN10origin of replication for ATXN10 repeat regionLOC107181287NM_013236.3(ATXN10):c.1173+54822_1173+54826ATTCT(10_32)Spinocerebellar ataxia 10Spinocerebellar Ataxia Type10SCA10Spinocerebellar ataxia type 10 (SCA10) is characterized by slowly progressive cerebellar ataxia that usually starts as poor balance and unsteady gait, followed by upper-limb ataxia, scanning dysarthria, and dysphagia. Abnormal tracking eye movements are common. Recurrent seizures after the onset of gait ataxia have been reported with variable frequencies among different families. Some individuals have cognitive dysfunction, behavioral disturbances, mood disorders, mild pyramidal signs, and peripheral neuropathy. Age of onset ranges from 12 to 48 years.20301354NBK117520301317NBK113820050888currentno assertion criteria providedBenigngermlinehumannot providedliterature onlyIn all affected members of 5 Mexican families with SCA10 (603516), Matsuura et al. (2000) detected expansion of a pentanucleotide (ATTCT) repeat in intron 9 of the ATXN10 gene. There was an inverse correlation between the expansion size, up to 22.5 kb larger than the normal allele, and the age of onset. Analysis of 562 chromosomes from unaffected individuals of various ethnic origins, including 242 chromosomes from Mexicans, showed a range of 10 to 22 ATTCT repeats with no evidence of expansions. The data indicated that the ATXN10 intronic ATTCT pentanucleotide repeat in SCA10 patients is unstable and represented the largest microsatellite expansion found to that time in the human genome.11017075In a multigenerational study, Matsuura et al. (2004) demonstrated that (1) the expanded ATTCT repeats are highly unstable when paternally transmitted, whereas maternal transmission results in significantly smaller changes in repeat size; (2) blood leukocytes, lymphoblastoid cells, buccal cells, and sperm have a variable degree of mosaicism in ATTCT expansion; (3) the length of the expanded repeat was not observed to change in individuals over a 5-year period; and (4) clinically determined anticipation is sometimes associated with intergenerational contraction rather than expansion of the ATTCT repeat.15127363ATXN10, (ATTCT)n EXPANSION(ATTCT)n EXPANSIONATXN10SPINOCEREBELLAR ATAXIA 10currentno assertion criteria providednon-pathogenicConverted during submission to Benign.not providedhumannot providedAssert pathogenicitycurationnot providedNM_013236.2:c.1173+54822_1173+54826ATTCT(10_32)Spinocerebellar Ataxia Type10 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000001373.xml ================================================ currentNM_001174089.2(SLC4A11):c.2019-16_2019-6delinsGGCCGGCCGG AND Corneal endothelial dystrophycurrentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn a consanguineous family in India, Vithana et al. (2006) found that corneal endothelial dystrophy (CHED; 217700) cosegregated with a deletion-insertion mutation in intron 15 of the SLC4A11 gene that inactivated the splice acceptor site.18024964NM_001174089.2(SLC4A11):c.2019-16_2019-6delinsGGCCGGCCGGNM_001363745.2:c.1905-16_1905-6delinsGGCCGGCCGGNM_001174089.2:c.2019-16_2019-6delinsGGCCGGCCGGNM_001174090.1:c.2148-16_2148-6delinsGGCCGGCCGGNG_017072.1:g.15215_15225delinsGGCCGGCCGGNG_012093.2:g.25151_25161delinsCCGGCCGGCCNC_000020.11:g.3229017_3229027delinsCCGGCCGGCCNC_000020.11:g.3229017_3229027delGCAGACGGGCAinsCCGGCCGGCCNC_000020.10:g.3209663_3209673delinsCCGGCCGGCCintron variantintron variantintron variantIVS15AS, -6, DEL/INS20p13solute carrier family 4 member 11SLC4A11NCBI staff reviewed the sequence information reported in PubMed 16767101 Supplementary Fig. 3 to determine the location of this allele on the current reference sequence.NM_001174089.2(SLC4A11):c.2019-16_2019-6delinsGGCCGGCCGGCorneal endothelial dystrophyCHEDcurrentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn a consanguineous family in India, Vithana et al. (2006) found that corneal endothelial dystrophy (CHED; 217700) cosegregated with a deletion-insertion mutation in intron 15 of the SLC4A11 gene that inactivated the splice acceptor site.18024964SLC4A11, IVS15AS, -6, DEL/INSIVS15AS, -6, DEL/INSSLC4A11CORNEAL ENDOTHELIAL DYSTROPHY ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000001752.xml ================================================ currentNM_000528.4(MAN2B1):c.215A>T (p.His72Leu) AND Deficiency of alpha-mannosidasecurrentno assertion criteria providedConflicting interpretations of pathogenicityPathogenic(1);Uncertain significance(1)germlinehumanyesliterature onlynot providedgermlinehumannot providedliterature onlyIn 2 sibs with alpha-mannosidosis (MANSA; 248500), born of consanguineous parents, Nilssen et al. (1997) identified a homozygous 212A-T transversion in exon 2 of the MANB gene, resulting in a his71-to-leu (H71L) substitution. Residue his71 is conserved among lysosomal alpha-mannosidases from several species. The sibs were thought to be mildly affected and residual acidic alpha-mannosidase activity of 20% of normal was detected in the patient's fibroblasts, according to the report of this family by Bach et al. (1978). Nevertheless, the patients showed vacuolated leukocytes and fibroblasts consistent with the disease phenotype. The authors suggested that mutant mannosidase enzymes, even though containing residual activity upon testing at the appropriate pH, may be mislocalized to nonlysosomal compartments and therefore functionally inactive.7242929158146Gotoda et al. (1998) identified the same mutation, which they designated HIS72LEU in keeping with the codon numbering system of Wakamatsu et al. (1997). The patient, represented by cell line GM2051, was one of the patients reported by Nilssen et al. (1997).915814693703019758606NM_000528.4(MAN2B1):c.215A>T (p.His72Leu)NM_000528.4:c.215A>TNM_001173498.1:c.215A>TNG_015814.1:g.3947T>ANG_008318.1:g.6028A>TNC_000019.10:g.12665750T>ANC_000019.9:g.12776564T>ANM_000528.3:c.215A>TO00754:p.His72LeuNP_000519.2:p.His72LeuNP_000519.2:p.His72LeuNP_000519.2:p.His72LeuNP_001166969.1:p.His72Leumissense variantmissense variantH71LH72LHIS71LEU19p13.13mannosidase alpha class 2B member 1MAN2B1NM_000528.4(MAN2B1):c.215A>T (p.His72Leu)Deficiency of alpha-mannosidaseAlpha-MannosidosisMANSAAlpha-mannosidosis encompasses a continuum of clinical findings from mild to severe. Three major clinical subtypes have been suggested: A mild form recognized after age ten years with absence of skeletal abnormalities, myopathy, and slow progression (type 1). A moderate form recognized before age ten years with presence of skeletal abnormalities, myopathy, and slow progression (type 2). A severe form manifested as prenatal loss or early death from progressive central nervous system involvement or infection (type 3). Individuals with a milder phenotype have mild-to-moderate intellectual disability, impaired hearing, characteristic coarse features, clinical or radiographic skeletal abnormalities, immunodeficiency, and primary central nervous system disease – mainly cerebellar involvement causing ataxia. Periods of psychiatric symptoms are common. Associated medical problems can include corneal opacities, hepatosplenomegaly, aseptic destructive arthritis, and metabolic myopathy. Alpha-mannosidosis is insidiously progressive; some individuals may live into the sixth decade.20301570NBK139621368911currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn 2 sibs with alpha-mannosidosis (MANSA; 248500), born of consanguineous parents, Nilssen et al. (1997) identified a homozygous 212A-T transversion in exon 2 of the MANB gene, resulting in a his71-to-leu (H71L) substitution. Residue his71 is conserved among lysosomal alpha-mannosidases from several species. The sibs were thought to be mildly affected and residual acidic alpha-mannosidase activity of 20% of normal was detected in the patient's fibroblasts, according to the report of this family by Bach et al. (1978). Nevertheless, the patients showed vacuolated leukocytes and fibroblasts consistent with the disease phenotype. The authors suggested that mutant mannosidase enzymes, even though containing residual activity upon testing at the appropriate pH, may be mislocalized to nonlysosomal compartments and therefore functionally inactive.9158146724292Gotoda et al. (1998) identified the same mutation, which they designated HIS72LEU in keeping with the codon numbering system of Wakamatsu et al. (1997). The patient, represented by cell line GM2051, was one of the patients reported by Nilssen et al. (1997).975860693703019158146MAN2B1, HIS71LEUHIS71LEUMAN2B1ALPHA-MANNOSIDOSIScurrentno assertion criteria providedUncertain significance9758606http://web.expasy.org/variant_pages/VAR_003338.htmlgermlinehumanyesliterature onlynot providedp.His72LeuNM_000528.3:c.215A>TMAN2B1Deficiency of alpha-mannosidase ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000003254.xml ================================================ currentNM_144701.3(IL23R):c.1142G>A (p.Arg381Gln) AND Inflammatory bowel disease 17, protection againstcurrentno assertion criteria providedprotectivegermlinehumannot providedliterature onlyInflammatory Bowel DiseaseUsing a large-scale genomewide association study, Duerr et al. (2006) identified an uncommon coding mutation in the IL23R gene, a 1142G-A transition, resulting in an arg381-to-gln (R381Q) substitution (rs11209026), that confers strong protection against Crohn disease (see 612261). This SNP was identified in 1 cohort and replicated in 2 others. The gln381 allele was found in 7% of non-Jewish controls and 1.9% of non-Jewish patients with ileal Crohn disease.17068223Libioulle et al. (2007) performed a genomewide association study with more than 300,000 SNPs in 547 Caucasian patients with Crohn disease from Belgium and 928 controls and found the strongest association (p less than 10(-9)) with markers of the IL23R gene, including rs11209026, which corresponds to the R381Q substitution. The association with R381Q was replicated in 1,255 additional Caucasian CD patients and 550 controls (combined p = 2.2 x 10(-18)).17447842Raelson et al. (2007) analyzed the IL23R region in 477 parent-proband trios with Crohn disease from the Quebec Founder Population and 2 independent German samples involving 521 affected-child trios, 752 cases, and 828 independent controls. The authors found that the R381Q SNP did not occur consistently in all risk and protective haplotypes, and concluded that it is highly unlikely that R381Q fully explains the functional role of this gene in CD etiology.17804789In a Caucasian German IBD cohort that included 833 CD patients, 456 patients with ulcerative colitis (UC), and 1,381 unrelated controls, Glas et al. (2007) confirmed the association between rs11209026 and protection against CD (OR, 0.43; p = 8.04 x 10(-8)) and also found a significant association with protection from UC (OR, 0.70; p = 0.00361).17786191In a study involving 1,841 ulcerative colitis cases and 1,470 controls, Fisher et al. (2008) found the strongest signal at rs11209026 (p = 8.0 x 10(-8); OR, 0.53), with evidence that additional independent IL23R variants also contribute to ulcerative colitis risk.18438406McGovern et al. (2010) combined new data from 2 genomewide association studies of ulcerative colitis involving 266,047 SNPs and performed a metaanalysis with previously published data (Silverberg et al., 2009), thus bringing together a discovery set of 2,693 European UC patients and 6,791 controls; the top results from the metaanalysis were then independently replicated with 2,009 additional European UC cases and 1,580 controls. McGovern et al. (2010) confirmed association with UC at rs11209026 (combined p = 1.9 x 10(-13)).1912266420228799PsoriasisCapon et al. (2007) reported a significant association between the R381Q variant and protection against psoriasis (PSORS7; 605606) among 318 British patients with the disorder. The findings were replicated in a second group of 519 British patients. Together, the association yielded an odds ratio of 0.49 (p = 0.00014), with the gln381 allele offering protection from the disease. Capon et al. (2007) noted that the arg381 residue is highly conserved among higher vertebrates and is located within the binding domain for JAK2 kinase, which is the first mediator of the IL23R signaling cascade (Parham et al., 2002).1202336917587057NM_144701.3(IL23R):c.1142G>A (p.Arg381Gln)NM_144701.3:c.1142G>ANG_011498.1:g.78790G>ANC_000001.11:g.67240275G>ANC_000001.10:g.67705958G>AQ5VWK5:p.Arg381GlnNP_653302.2:p.Arg381Glnmissense variantR381QARG381GLN1p31.3interleukin 23 receptorIL23RNM_144701.3(IL23R):c.1142G>A (p.Arg381Gln)Inflammatory bowel disease 17, protection againstcurrentno assertion criteria providedprotectivegermlinehumannot providedliterature onlyInflammatory Bowel DiseaseUsing a large-scale genomewide association study, Duerr et al. (2006) identified an uncommon coding mutation in the IL23R gene, a 1142G-A transition, resulting in an arg381-to-gln (R381Q) substitution (rs11209026), that confers strong protection against Crohn disease (see 612261). This SNP was identified in 1 cohort and replicated in 2 others. The gln381 allele was found in 7% of non-Jewish controls and 1.9% of non-Jewish patients with ileal Crohn disease.17068223Libioulle et al. (2007) performed a genomewide association study with more than 300,000 SNPs in 547 Caucasian patients with Crohn disease from Belgium and 928 controls and found the strongest association (p less than 10(-9)) with markers of the IL23R gene, including rs11209026, which corresponds to the R381Q substitution. The association with R381Q was replicated in 1,255 additional Caucasian CD patients and 550 controls (combined p = 2.2 x 10(-18)).17447842Raelson et al. (2007) analyzed the IL23R region in 477 parent-proband trios with Crohn disease from the Quebec Founder Population and 2 independent German samples involving 521 affected-child trios, 752 cases, and 828 independent controls. The authors found that the R381Q SNP did not occur consistently in all risk and protective haplotypes, and concluded that it is highly unlikely that R381Q fully explains the functional role of this gene in CD etiology.17804789In a Caucasian German IBD cohort that included 833 CD patients, 456 patients with ulcerative colitis (UC), and 1,381 unrelated controls, Glas et al. (2007) confirmed the association between rs11209026 and protection against CD (OR, 0.43; p = 8.04 x 10(-8)) and also found a significant association with protection from UC (OR, 0.70; p = 0.00361).17786191In a study involving 1,841 ulcerative colitis cases and 1,470 controls, Fisher et al. (2008) found the strongest signal at rs11209026 (p = 8.0 x 10(-8); OR, 0.53), with evidence that additional independent IL23R variants also contribute to ulcerative colitis risk.18438406McGovern et al. (2010) combined new data from 2 genomewide association studies of ulcerative colitis involving 266,047 SNPs and performed a metaanalysis with previously published data (Silverberg et al., 2009), thus bringing together a discovery set of 2,693 European UC patients and 6,791 controls; the top results from the metaanalysis were then independently replicated with 2,009 additional European UC cases and 1,580 controls. McGovern et al. (2010) confirmed association with UC at rs11209026 (combined p = 1.9 x 10(-13)).2022879919122664PsoriasisCapon et al. (2007) reported a significant association between the R381Q variant and protection against psoriasis (PSORS7; 605606) among 318 British patients with the disorder. The findings were replicated in a second group of 519 British patients. Together, the association yielded an odds ratio of 0.49 (p = 0.00014), with the gln381 allele offering protection from the disease. Capon et al. (2007) noted that the arg381 residue is highly conserved among higher vertebrates and is located within the binding domain for JAK2 kinase, which is the first mediator of the IL23R signaling cascade (Parham et al., 2002).1758705712023369IL23R, ARG381GLNARG381GLNIL23RINFLAMMATORY BOWEL DISEASE 17, PROTECTION AGAINST ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000005426.xml ================================================ currentNM_001271604.2(JPH3):c.431_433CTG(6_27) (p.Ala150_Ala157del) AND Huntington disease-like 2currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn affected members of an African American family with Huntington disease-like-2 (HDL2; 606438), Holmes et al. (2001) demonstrated a CAG/CTG repeat expansion of about 40 or more triplets in an alternatively spliced exon of the JPH3 gene. Holmes et al. (2001) found the same mutation in 4 other African American individuals from the southeastern United States, each of whom had a familial Huntington disease-like disorder and had tested negative for the Huntington disease mutation in the IT15 gene (613004).11694876Among 74 patients with an HD-like phenotype but without CAG repeat expansions in the IT15 gene, Stevanin et al. (2002) identified 1 patient with a pure uninterrupted 50 CAG/CTG repeat in the JPH3 gene. The patient was a 44-year-old Moroccan woman with subcortical dementia, mild choreic movements, and atrophy of the cerebral cortex.11914418In 3 members of a family with HLD2, originally reported by Walker et al. (2002) as having choreoacanthocytosis, Walker et al. (2003) identified trinucleotide repeat expansions of 51, 58, and 57 triplets in the JPH3 gene. The authors identified affected members of 2 other families with trinucleotide repeats in the JPH3 gene.1194068814557581NM_001271604.2(JPH3):c.431_433CTG(6_27) (p.Ala150_Ala157del)JPH3, CAG(n) EXPANSIONNM_001271604.2:c.431CTG(>40)16q24.2junctophilin 3 repeat instability regionLOC109029536junctophilin 3JPH3NM_001271604.2(JPH3):c.431_433CTG(6_27) (p.Ala150_Ala157del)Huntington disease-like 2HDL2Huntington disease-like 2 (HDL2) typically presents in midlife with a relentless progressive triad of movement, emotional, and cognitive abnormalities which lead to death within ten to 20 years. HDL2 cannot be differentiated from Huntington disease clinically. Neurologic abnormalities include chorea, hypokinesia (rigidity, bradykinesia), dysarthria, and hyperreflexia in the later stages of the disease. There is a strong correlation between the duration of the disease and the progression of the motor and cognitive disorder.20301701NBK1529currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn affected members of an African American family with Huntington disease-like-2 (HDL2; 606438), Holmes et al. (2001) demonstrated a CAG/CTG repeat expansion of about 40 or more triplets in an alternatively spliced exon of the JPH3 gene. Holmes et al. (2001) found the same mutation in 4 other African American individuals from the southeastern United States, each of whom had a familial Huntington disease-like disorder and had tested negative for the Huntington disease mutation in the IT15 gene (613004).11694876Among 74 patients with an HD-like phenotype but without CAG repeat expansions in the IT15 gene, Stevanin et al. (2002) identified 1 patient with a pure uninterrupted 50 CAG/CTG repeat in the JPH3 gene. The patient was a 44-year-old Moroccan woman with subcortical dementia, mild choreic movements, and atrophy of the cerebral cortex.11914418In 3 members of a family with HLD2, originally reported by Walker et al. (2002) as having choreoacanthocytosis, Walker et al. (2003) identified trinucleotide repeat expansions of 51, 58, and 57 triplets in the JPH3 gene. The authors identified affected members of 2 other families with trinucleotide repeats in the JPH3 gene.1194068814557581JPH3, CAG(n) EXPANSIONCAG(n) EXPANSIONJPH3HUNTINGTON DISEASE-LIKE 2 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000007484.xml ================================================ current NM_007262.4(PARK7):c.[-24+75_-24+92dup;487G>A] AND Parkinson disease 7 current no assertion criteria provided Pathogenic germline human not provided literature only In 3 affected sibs from a consanguineous southern Italian family with early-onset parkinsonism (606324), Annesi et al. (2005) identified double homozygosity for mutations in the DJ1 gene. One was a 3385G-A transition in exon 7, resulting in a glu163-to-lys (E163K) substitution, and the other was an 18-bp duplication (168-185dup) in the promoter region. Age at disease onset was 36, 35, and 24 years, respectively. Severe amyotrophic lateral sclerosis and cognitive impairment were prominent in 1 sib, while the other 2 had prominent parkinsonism and behavioral abnormalities. 16240358 NM_007262.5(PARK7):c.487G>A (p.Glu163Lys) NM_001123377.1:c.487G>A NM_007262.5:c.487G>A NG_008271.1:g.28318G>A NC_000001.11:g.7984971G>A NC_000001.10:g.8045031G>A Q99497:p.Glu163Lys NP_001116849.1:p.Glu163Lys NP_009193.2:p.Glu163Lys missense variant missense variant E163K GLU163LYS 1p36.23 Parkinsonism associated deglycase PARK7 NM_007262.5(PARK7):c.-24+75_-24+92dup NM_001123377.1:c.-24+133_-24+150dup NM_007262.5:c.-24+75_-24+92dup NG_008271.1:g.5215_5232dup NC_000001.11:g.7961868_7961885dup NC_000001.10:g.8021928_8021945dup NM_007262.4:c.-24+75_-24+92dup intron variant intron variant 1p36.23 Parkinsonism associated deglycase PARK7 16240358 NM_007262.4(PARK7):c.[-24+75_-24+92dup;487G>A] PARK7, GLU163LYS AND 18-BP DUP NG_008271.1:g.[5215_5232dup;28318G>A] NM_007262.4:c.[-24+75_-24+92dup;487G>A] Parkinson disease 7 PARKINSON DISEASE 7, AUTOSOMAL RECESSIVE EARLY-ONSET PARK7 Parkinson disease is a progressive disorder of the nervous system. The disorder affects several regions of the brain, especially an area called the substantia nigra that controls balance and movement.Often the first symptom of Parkinson disease is trembling or shaking (tremor) of a limb, especially when the body is at rest. Typically, the tremor begins on one side of the body, usually in one hand. Tremors can also affect the arms, legs, feet, and face. Other characteristic symptoms of Parkinson disease include rigidity or stiffness of the limbs and torso, slow movement (bradykinesia) or an inability to move (akinesia), and impaired balance and coordination (postural instability). These symptoms worsen slowly over time.Parkinson disease can also affect emotions and thinking ability (cognition). Some affected individuals develop psychiatric conditions such as depression and visual hallucinations. People with Parkinson disease also have an increased risk of developing dementia, which is a decline in intellectual functions including judgment and memory.Generally, Parkinson disease that begins after age 50 is called late-onset disease. The condition is described as early-onset disease if signs and symptoms begin before age 50. Early-onset cases that begin before age 20 are sometimes referred to as juvenile-onset Parkinson disease. 20301402 NBK1223 23279440 current no assertion criteria provided Pathogenic germline human not provided literature only In 3 affected sibs from a consanguineous southern Italian family with early-onset parkinsonism (606324), Annesi et al. (2005) identified double homozygosity for mutations in the DJ1 gene. One was a 3385G-A transition in exon 7, resulting in a glu163-to-lys (E163K) substitution, and the other was an 18-bp duplication (168-185dup) in the promoter region. Age at disease onset was 36, 35, and 24 years, respectively. Severe amyotrophic lateral sclerosis and cognitive impairment were prominent in 1 sib, while the other 2 had prominent parkinsonism and behavioral abnormalities. 16240358 PARK7, GLU163LYS AND 18-BP DUP GLU163LYS AND 18-BP DUP PARK7 PARKINSON DISEASE 7, AUTOSOMAL RECESSIVE EARLY-ONSET ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000010551.xml ================================================ current NM_000451.3(SHOX):c.394C>G (p.Leu132Val) AND Leri Weill dyschondrosteosis current no assertion criteria provided Pathogenic germline human not provided literature only In a patient with Leri-Weill dyschondrosteosis (127300), Grigelioniene et al. (2000) identified a 485C-G transversion in the SHOX gene, resulting in a leu132-to-val amino acid substitution. 11030412 NM_000451.3(SHOX):c.394C>G (p.Leu132Val) NM_000451.3:c.394C>G NM_006883.2:c.394C>G NG_009385.2:g.15391C>G NG_046891.1:g.1782C>G NC_000023.11:g.634734C>G NC_000024.10:g.634734C>G NC_000024.9:g.545469C>G NC_000023.10:g.595469C>G O15266:p.Leu132Val NP_000442.1:p.Leu132Val NP_006874.1:p.Leu132Val missense variant missense variant L132V LEU132VAL Xp22.33 Yp11.2 meiotic recombination hotspot SHOX LOC107652445 short stature homeobox SHOX Sufficient evidence for dosage pathogenicity https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=SHOX No evidence available https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=SHOX NM_000451.3(SHOX):c.394C>G (p.Leu132Val) Leri Weill dyschondrosteosis LWD DCO The phenotypic spectrum of SHOX deficiency disorders, caused by haploinsufficiency of the short stature homeobox-containing gene (SHOX), ranges from Leri-Weill dyschondrosteosis (LWD) at the severe end of the spectrum to nonspecific short stature at the mild end of the spectrum. In adults with SHOX deficiency, the proportion of LWD versus short stature without features of LWD is not well defined. In LWD the classic clinical triad is short stature, mesomelia, and Madelung deformity. Mesomelia, in which the middle portion of a limb is shortened in relation to the proximal portion, can be evident first in school-aged children and increases with age in frequency and severity. Madelung deformity (abnormal alignment of the radius, ulna, and carpal bones at the wrist) typically develops in mid-to-late childhood and is more common and severe in females. The phenotype of short stature caused by SHOX deficiency in the absence of mesomelia and Madelung deformity (called SHOX-deficient short stature in this GeneReview) is highly variable, even within the same family. 20301394 NBK1215 current no assertion criteria provided Pathogenic germline human not provided literature only In a patient with Leri-Weill dyschondrosteosis (127300), Grigelioniene et al. (2000) identified a 485C-G transversion in the SHOX gene, resulting in a leu132-to-val amino acid substitution. 11030412 SHOX, LEU132VAL LEU132VAL SHOX LERI-WEILL DYSCHONDROSTEOSIS ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000016673.xml ================================================ current NM_000518.4(HBB):c.126_129delCTTT (p.Phe42fs) AND beta^0^ Thalassemia current no assertion criteria provided Pathogenic germline human not provided literature only Frameshift, -4, codons 41/42, TTCTTT to TT, was found in an Asian Indian with beta-zero-thalassemia (613985) by Kazazian et al. (1984) and in Chinese by Kimura et al. (1983). 6826539 Kazazian, H. H., Jr. Personal Communication. 1982. Baltimore, Md. Lau et al. (1997) found that the deletion of CTTT at codons 41/42 accounted for 40% of all beta-thalassemia alleles in Hong Kong. Chiu et al. (2002) designed allele-specific primers and a fluorescent probe for detection of this mutation in the HBB gene from maternal plasma by real-time PCR. Using this method, they showed that beta-thalassemia major could be excluded from fetal inheritance by demonstrating absence of inheritance of the paternally transmitted mutation. By studying circulating fetal DNA in the maternal plasma for this mutation, Chiu et al. (2002) added beta-thalassemia to the list of disorders that could be prenatally diagnosed using this noninvasive method, which had previously demonstrated usefulness in diagnosing sex-linked diseases (Costa et al., 2002) and fetal rhesus D status (Lo et al., 1998). 12000828 12383672 9113933 9845707 NM_000518.4(HBB):c.126_129delCTTT (p.Phe42fs) 41/42-TTCT NM_000518.4:c.124_127delTTCT NM_000518.5:c.126_129del NG_042296.1:g.296_299del NG_046672.1:g.4700_4703del NG_059281.1:g.5306_5309del NG_000007.3:g.70850_70853del NC_000011.10:g.5226765_5226768del NC_000011.10:g.5226763_5226766del NC_000011.9:g.5247993_5247996del HBB:c.126_129delCTTT NM_000518.4:c.126_129delCTTT p.Phe42Leufs*19 p.Phe42LeufsTer17 NP_000509.1:p.Phe42fs NP_000509.1:p.Phe42fs NP_000509.1:p.Phe42fs NP_000509.1:p.Phe42fs frameshift variant F42fs 11p15.4 HBB recombination region LOC106099062 hemoglobin subunit beta HBB origin of replication at HBB LOC107133510 NM_000518.4(HBB):c.126_129delCTTT (p.Phe42fs) beta^0^ Thalassemia Beta-zero-thalassemia current no assertion criteria provided Pathogenic germline human not provided literature only Frameshift, -4, codons 41/42, TTCTTT to TT, was found in an Asian Indian with beta-zero-thalassemia (613985) by Kazazian et al. (1984) and in Chinese by Kimura et al. (1983). Kazazian, H. H., Jr. Personal Communication. 1982. Baltimore, Md. 6826539 Lau et al. (1997) found that the deletion of CTTT at codons 41/42 accounted for 40% of all beta-thalassemia alleles in Hong Kong. Chiu et al. (2002) designed allele-specific primers and a fluorescent probe for detection of this mutation in the HBB gene from maternal plasma by real-time PCR. Using this method, they showed that beta-thalassemia major could be excluded from fetal inheritance by demonstrating absence of inheritance of the paternally transmitted mutation. By studying circulating fetal DNA in the maternal plasma for this mutation, Chiu et al. (2002) added beta-thalassemia to the list of disorders that could be prenatally diagnosed using this noninvasive method, which had previously demonstrated usefulness in diagnosing sex-linked diseases (Costa et al., 2002) and fetal rhesus D status (Lo et al., 1998). 9113933 12383672 12000828 9845707 HBB, 4-BP DEL, 41/42CTTT 4-BP DEL, 41/42CTTT HBB BETA-ZERO-THALASSEMIA ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000017510.xml ================================================ currentNM_004285.4(H6PD):c.1860_1861insACAGGTGGTTGACCTGTGGCCGGGTCTGA (p.Glu621delinsThrGlyGlyTer) AND Cortisone reductase deficiency 1currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyliterature onlyIn the Scottish female with apparent cortisone reductase deficiency (CORTRD1; 604931) reported by Jamieson et al. (1999), Draper et al. (2003) detected a heterozygous 29-bp insertion between nucleotides 620 and 621 of the H6PD gene. Functional studies in hepatic WRL68 cells demonstrated that the 620ins29bp mutant was devoid of H6PDH activity. The 620_621ins29 mutation was found in none of 100 Scottish controls. In this patient, Draper et al. (2003) also detected homozygosity for a pair of linked intronic mutations in the HSD11B1 gene (600713.0001). In 100 Scottish controls homozygosity for these intronic changes had a frequency of 2%. The patient reported by Jamieson et al. (1999) presented at the age of 36 years with hirsutism, oligomenorrhea, obesity, acne, and infertility, features resembling those of polycystic ovary syndrome (PCOS; 184700).1052299712858176In the Scottish woman with cortisone reductase deficiency reported by Jamieson et al. (1999), Lavery et al. (2008) detected compound heterozygosity for the 620_621ins29 mutation in H6PD and a 960G-A transition in exon 4 (138090.0003). The 29-bp insertion caused a frameshift predicted to result in an in-frame stop codon that truncates the protein by 268 amino acids (Asp620fsTer3). No mutations or sequence variants were detected in the HSD11B1 gene.1052299718628520NM_004285.4(H6PD):c.1860_1861insACAGGTGGTTGACCTGTGGCCGGGTCTGA (p.Glu621delinsThrGlyGlyTer)H6PD, 29-BP INS, NT620NM_004285.4:c.1860_1861insACAGGTGGTTGACCTGTGGCCGGGTCTGANM_001282587.2:c.1893_1894insACAGGTGGTTGACCTGTGGCCGGGTCTGANG_012218.1:g.34550_34551insACAGGTGGTTGACCTGTGGCCGGGTCTGANC_000001.11:g.9264353_9264354insACAGGTGGTTGACCTGTGGCCGGGTCTGANC_000001.10:g.9324412_9324413insACAGGTGGTTGACCTGTGGCCGGGTCTGANM_004285.3:c.1860_1861insACAGGTGGTTGACCTGTGGCCGGGTCTGANP_004276.2:p.Glu621delinsThrGlyGlyTerNP_001269516.1:p.Glu632delinsThrGlyGlyTernonsensenonsense1p36.22hexose-6-phosphate dehydrogenase/glucose 1-dehydrogenaseH6PD12858176NCBI staff reviewed the sequence information reported in PubMed 12858176 Fig. 4a to determine the location of this insertion on the current reference sequence.NM_004285.4(H6PD):c.1860_1861insACAGGTGGTTGACCTGTGGCCGGGTCTGA (p.Glu621delinsThrGlyGlyTer)Cortisone reductase deficiency 1CORTRD1currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn the Scottish female with apparent cortisone reductase deficiency (CORTRD1; 604931) reported by Jamieson et al. (1999), Draper et al. (2003) detected a heterozygous 29-bp insertion between nucleotides 620 and 621 of the H6PD gene. Functional studies in hepatic WRL68 cells demonstrated that the 620ins29bp mutant was devoid of H6PDH activity. The 620_621ins29 mutation was found in none of 100 Scottish controls. In this patient, Draper et al. (2003) also detected homozygosity for a pair of linked intronic mutations in the HSD11B1 gene (600713.0001). In 100 Scottish controls homozygosity for these intronic changes had a frequency of 2%. The patient reported by Jamieson et al. (1999) presented at the age of 36 years with hirsutism, oligomenorrhea, obesity, acne, and infertility, features resembling those of polycystic ovary syndrome (PCOS; 184700).1052299712858176In the Scottish woman with cortisone reductase deficiency reported by Jamieson et al. (1999), Lavery et al. (2008) detected compound heterozygosity for the 620_621ins29 mutation in H6PD and a 960G-A transition in exon 4 (138090.0003). The 29-bp insertion caused a frameshift predicted to result in an in-frame stop codon that truncates the protein by 268 amino acids (Asp620fsTer3). No mutations or sequence variants were detected in the HSD11B1 gene.1052299718628520H6PD, 29-BP INS, NT62029-BP INS, NT620H6PDCORTISONE REDUCTASE DEFICIENCY 1 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000021819.xml ================================================ currentNM_020975.6(RET):c.1896_1900delinsCGTGC (p.Glu632_Cys634delinsAspValArg) AND Multiple endocrine neoplasia, type 2currentno assertion criteria providedPathogenicgermlinehumanyesliterature onlynot providedNM_020975.6(RET):c.1896_1900delinsCGTGC (p.Glu632_Cys634delinsAspValArg)LRG_518t1:c.1896_1900delGCTGTinsCGTGCLRG_518t2:c.1896_1900delinsCGTGCNM_001355216.1:c.1134_1138delinsCGTGCNM_020630.5:c.1896_1900delinsCGTGCNM_020975.6:c.1896_1900delinsCGTGCLRG_518:g.42428_42432delinsCGTGCNG_007489.1:g.42428_42432delinsCGTGCNC_000010.11:g.43114496_43114500delinsCGTGCNC_000010.10:g.43609944_43609948delinsCGTGCNM_020975.4:c.1896_1900delGCTGTinsCGTGCNP_001342145.1:p.Glu378_Cys380delinsAspValArgNP_065681.1:p.Glu632_Cys634delinsAspValArgNP_066124.1:p.Glu632_Cys634delinsAspValArgexon 11missense variantmissense variantmissense variant10q11.21ret proto-oncogeneRETSufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=RETNo evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=RETThis gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons.This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons.8099202NM_020975.6(RET):c.1896_1900delinsCGTGC (p.Glu632_Cys634delinsAspValArg)Multiple endocrine neoplasia, type 2MEN2Multiple endocrine neoplasia type 2 (MEN 2) includes the following phenotypes: MEN 2A, FMTC (familial medullary thyroid carcinoma, which may be a variant of MEN 2A), and MEN 2B. All three phenotypes involve high risk for development of medullary carcinoma of the thyroid (MTC); MEN 2A and MEN 2B involve an increased risk for pheochromocytoma; MEN 2A involves an increased risk for parathyroid adenoma or hyperplasia. Additional features in MEN 2B include mucosal neuromas of the lips and tongue, distinctive facies with enlarged lips, ganglioneuromatosis of the gastrointestinal tract, and a marfanoid habitus. MTC typically occurs in early childhood in MEN 2B, early adulthood in MEN 2A, and middle age in FMTC.gain of function20301434NBK12571173941615604628currentno assertion criteria providedPathogenic809920286124797595167http://www.arup.utah.edu/database/MEN2/MEN2_display.phpThis indel changes three amino acids (ELC>DVR) which results in a p.C634R mutation. In vitro studies: RET activation (PMID 8612479). In the oldest reference, codon 634 was called codon 380. Additional reference: PMID 7595167.germlinehumanyesliterature onlynot providedNM_020975.4:c.1896_1900delGCTGTinsCGTGCRET ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000030349.xml ================================================ currentNM_174936.3(PCSK9):c.2009G>A (p.Gly670Glu) AND Familial hypercholesterolemiacurrentcriteria provided, multiple submitters, no conflictsBenign/Likely benignautosomal unknowngermlinehumanyesresearchassert pathogenicitycurationnot providednot providedgermlinehumanunknownclinical testingresearchclinical testingnot providednot providednot providednot providednot providednot providednot providedNM_174936.3(PCSK9):c.2009G>A (p.Gly670Glu)LRG_275t1:c.2009G>ANM_174936.3:c.2009G>ALRG_275:g.28968G>ANG_009061.1:g.28968G>ANC_000001.11:g.55063514G>ANC_000001.10:g.55529187G>ANR_110451.1:n.1616G>ALRG_275p1:p.Gly670GluQ8NBP7:p.Gly670GluNP_777596.2:p.Gly670Glumissense variantnon-coding transcript variantG670E1p32.3proprotein convertase subtilisin/kexin type 9PCSK9Dosage sensitivity unlikelyhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=PCSK9No evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=PCSK9This gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons.This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons.19191301NM_174936.3(PCSK9):c.2009G>A (p.Gly670Glu)Familial hypercholesterolemiaLDL RECEPTOR DISORDERHyperlipoproteinemia Type IIaHYPER-LOW-DENSITY-LIPOPROTEINEMIAHYPERCHOLESTEROLEMIC XANTHOMATOSIS, FAMILIALFredrickson type IIa hyperlipoproteinemiaHyperlipoproteinemia Type IIHYPERCHOLESTEROLEMIA, FAMILIAL, MODIFIER OFHYPERCHOLESTEROLEMIA, FAMILIAL, 1FHCL1FHFHCFamilial hypercholesterolemia (FH) is characterized by severely elevated LDL cholesterol (LDL-C) levels that lead to atherosclerotic plaque deposition in the coronary arteries and proximal aorta at an early age, leading to an increased risk for cardiovascular disease. Xanthomas (patches of yellowish cholesterol buildup) may worsen with age as a result of extremely high cholesterol levels. Xanthomas can occur around the eyelids and within the tendons of the elbows, hands, knees, and feet. In FH, the more common cardiovascular disease is coronary artery disease (CAD), which may manifest as angina and myocardial infarction; stroke occurs more rarely. Untreated men are at a 50% risk for a fatal or nonfatal coronary event by age 50 years; untreated women are at a 30% risk by age 60 years. An estimated 70%-95% of FH results from a heterozygous pathogenic variant in one of three genes (APOB, LDLR, PCSK9). FH is the most common inherited cardiovascular disease, with a prevalence of 1:200-250. FH likely accounts for 2%-3% of myocardial infarctions in individuals younger than age 60 years. In contrast, homozygous FH (HoFH) results from biallelic (homozygous or compound heterozygous) pathogenic variants in one of these known genes (APOB, LDLR, PCSK9). Most individuals with HoFH experience severe CAD by their mid-20s and the rate of either death or coronary bypass surgery by the teenage years is high. Severe aortic stenosis is also common.23788249http://www.nice.org.uk/guidance/cg71Identification and management of familial hypercholesterolaemia2236483715177124216005252441828924404629NBK17488425404096250536602463617623725921253569652785436010.1038/gim.2016.190currentcriteria provided, single submitterbenignConverted during submission to Benign.autosomal unknownLabCorp Variant Classification Summary - May 2015https://submit.ncbi.nlm.nih.gov/ft/byid/pttb9itm/labcorp_variant_classification_method_-_may_2015.pdfgermlineBloodhumanyesassert pathogenicitycuration1NM_174936.3:c.2009G>Ap.Gly670Glumissense mutationPCSK919191301Familial HypercholesterolemiaFHcurrentcriteria provided, single submitterBenignMAF = 2% in 100 subjects with average plasma cholesterol; 291 hmz(AA) + 28 htz(GA) in 319 normolipidemic individuals; 98(AA)/100 normolipidemic individualsACMG Guidelines, 201525741868germlinehumanyesresearchnot provided%MAF (ExAC):5.67germlineHEKhumanyesresearchnormal PCSK9 processingnot provided153587851589317617170371Heterologous cells (HEK), pulse-chase [S35]Met/Cys assaysNM_174936.3:c.2009G>ANG_009061.1:g.28968G>APCSK9currentcriteria provided, single submitterLikely benignICSL Variant Classification 20161018https://submit.ncbi.nlm.nih.gov/ft/byid/4jQgNGYk/ICSL_Variant_Classification_20161018.pdfICSL_Variant_Classification_20161018.pdfgermlinehumanunknownclinical testingnot providedNM_174936.3:c.2009G>APCSK9Familial Hypercholesterolemiacurrentcriteria provided, single submitterBenignACMG Guidelines, 201525741868germlinehumanunknownresearchnot provided%MAF(ExAC):5.67germlineHEK cellshumanunknownresearchnormal PCSK9 processingnot provided15358785Assay description:Heterologous cells (HEK), pulse-chase [S35]Met/Cys assaysNM_174936.3:c.2009G>ANG_009061.1:g.28968G>APCSK9HipercolBrasilHipercolBrasil is a program that aims to describe the genetic data obtained from the cascade screening applied in a large FH Brazilian cohort since 2011.currentno assertion criteria providedBenigngermlinehumanunknownresearchnot providedNM_174936.3:c.2009G>APCSK9currentcriteria provided, single submitterBenignACMG Guidelines, 201525741868germlinehumanunknownNGSclinical testingnot providedNM_174936.3:c.2009G>APCSK9currentcriteria provided, single submitterBenignACMG Guidelines, 201525741868germlinehumanunknownresearchnot provided%MAF(ExAC):5.67germlineHEK cellshumanunknownresearchnormal PCSK9 processingnot provided15358785Assay Description:Heterologous cells (HEK), pulse-chase [S35]Met/Cys assaysNM_174936.3:c.2009G>ANG_009061.1:g.28968G>APCSK9Variant present in the database from Mexico ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000032548.xml ================================================ current NM_207352.4(CYP4V2):c.802-8_810delinsGC AND Bietti crystalline corneoretinal dystrophy current criteria provided, multiple submitters, no conflicts Pathogenic Autosomal recessive inheritance germline human not provided clinical testing literature only In 7 Japanese patients with Bietti crystalline corneoretinal dystrophy (BCD; 210370), Lin et al. (2005) identified homozygosity for an insertion/deletion mutation (c.802-8_810del17insGC) at intron 6 of the CYP4V2 gene that resulted in the skipping of exon 7. All 7 patients also shared homozygosity for 6 closely linked intragenic polymorphic markers, consistent with a founder effect; however, the authors noted that the founder was probably a very distant ancestor because the region of the conserved linked markers was small (6.7-17.1 kb). In a Chinese BCD patient, Lin et al. (2005) identified compound heterozygosity for this mutation and a c.992A-C transversion in the CYP4V2 gene, resulting in a his-331-to-pro (H331P; 608614.0007) substitution. 15937078 Li et al. (2004) had previously reported the indel mutation as a 15-bp deletion, which they found in homozygosity in 6 Japanese and 2 Chinese families with BCD as well as in 1 Japanese and 2 Chinese sporadic BCD patients. In addition, they identified the indel mutation in compound heterozygous state with the IVS8-2A-G mutation (608614.0005) in a Chinese patient with sporadic BCD and with the H331P mutation in affected individuals from a Chinese family. Screening for these 3 mutations in 50 controls, including 12 of Chinese, 16 of Japanese, and 22 of European origin, detected only 1 heterozygous instance of the H331P mutation in 1 Chinese control sample. 15042513 In 4 Chinese sisters with congenital cataract, high myopia, thin corneas, and a diagnosis of retinitis pigmentosa, Wang et al. (2012) identified compound heterozygosity for the c.802-8_810del17insGC mutation and the IVS8-2A-G mutation in the CYP4V2 gene. The mutations segregated fully with disease in 22 examined members of this 4-generation family. 22693542 In the proband from a Chinese family diagnosed with autosomal recessive RP, Fu et al. (2013) identified compound heterozygosity for the CYP4V2 c.802-8_810del17insGC (c.802-8_810del17insGC, NM_207352.3) and IVS8-2A-G mutations. Both mutations were also present in an affected sib, but mutation status was unknown for their unaffected deceased parents. Clinical reevaluation was not possible in this family, but reexamination of affected individuals in another Chinese RP family with mutations in CYP4V2 resulted in a rediagnosis of their phenotype as BCD (see 608614.0009). 23661369 germline human unknown clinical testing not provided not provided human not provided 1 literature only assert pathogenicity curation not provided not provided NM_207352.4(CYP4V2):c.802-8_810delinsGC NM_207352.3:c.802-8_810del17insGC NM_207352.4:c.802-8_810delinsGC NG_007965.1:g.14630_14646del17insGC NG_007965.1:g.14630_14646del17insGC NG_007965.1:g.14630_14646delinsGC NG_007965.1:g.14630_14646delinsGC NC_000004.12:g.186201149_186201165delinsGC NM_207352.3:c.802-8_810delinsGC p.? NM_207352.3:exon 7 NM_207352.3:intron 6 splice acceptor variant IVS6-8 del/insGC 4q35.2 cytochrome P450 family 4 subfamily V member 2 CYP4V2 15860296 NM_207352.4(CYP4V2):c.802-8_810delinsGC Bietti crystalline corneoretinal dystrophy Bietti Crystalline Dystrophy BCD Bietti crystalline dystrophy (BCD) is a chorioretinal degeneration characterized by the presence of yellow-white crystals and/or complex lipid deposits in the retina and (to a variable degree) the cornea. Progressive atrophy and degeneration of the retinal pigment epithelium (RPE) / choroid lead to symptoms similar to those of other forms of retinal degeneration that fall under the category of retinitis pigmentosa and allied disorders, namely: reduced visual acuity, poor night vision, abnormal retinal electrophysiology, visual field loss, and often impaired color vision. Marked asymmetry between eyes is not uncommon. Onset is typically during the second to third decade of life, but ranges from the early teenage years to beyond the third decade. With time, loss of peripheral visual field, central acuity, or both result in legal blindness in most if not all affected individuals. 22497028 NBK91457 current no assertion criteria provided pathologic Converted during submission to Pathogenic. not provided human not provided Assert pathogenicity curation not provided IVS6-8 del/insGC Bietti Crystalline Dystrophy current no assertion criteria provided pathogenic Converted during submission to Pathogenic. not provided human not provided 1 not provided not provided NM_207352.3:c.802-8_810del17insGC Bietti crystalline corneoretinal dystrophy current no assertion criteria provided Pathogenic germline human not provided literature only In 7 Japanese patients with Bietti crystalline corneoretinal dystrophy (BCD; 210370), Lin et al. (2005) identified homozygosity for an insertion/deletion mutation (c.802-8_810del17insGC) at intron 6 of the CYP4V2 gene that resulted in the skipping of exon 7. All 7 patients also shared homozygosity for 6 closely linked intragenic polymorphic markers, consistent with a founder effect; however, the authors noted that the founder was probably a very distant ancestor because the region of the conserved linked markers was small (6.7-17.1 kb). In a Chinese BCD patient, Lin et al. (2005) identified compound heterozygosity for this mutation and a c.992A-C transversion in the CYP4V2 gene, resulting in a his-331-to-pro (H331P; 608614.0007) substitution. 15937078 Li et al. (2004) had previously reported the indel mutation as a 15-bp deletion, which they found in homozygosity in 6 Japanese and 2 Chinese families with BCD as well as in 1 Japanese and 2 Chinese sporadic BCD patients. In addition, they identified the indel mutation in compound heterozygous state with the IVS8-2A-G mutation (608614.0005) in a Chinese patient with sporadic BCD and with the H331P mutation in affected individuals from a Chinese family. Screening for these 3 mutations in 50 controls, including 12 of Chinese, 16 of Japanese, and 22 of European origin, detected only 1 heterozygous instance of the H331P mutation in 1 Chinese control sample. 15042513 In 4 Chinese sisters with congenital cataract, high myopia, thin corneas, and a diagnosis of retinitis pigmentosa, Wang et al. (2012) identified compound heterozygosity for the c.802-8_810del17insGC mutation and the IVS8-2A-G mutation in the CYP4V2 gene. The mutations segregated fully with disease in 22 examined members of this 4-generation family. 22693542 In the proband from a Chinese family diagnosed with autosomal recessive RP, Fu et al. (2013) identified compound heterozygosity for the CYP4V2 c.802-8_810del17insGC (c.802-8_810del17insGC, NM_207352.3) and IVS8-2A-G mutations. Both mutations were also present in an affected sib, but mutation status was unknown for their unaffected deceased parents. Clinical reevaluation was not possible in this family, but reexamination of affected individuals in another Chinese RP family with mutations in CYP4V2 resulted in a rediagnosis of their phenotype as BCD (see 608614.0009). 23661369 CYP4V2, IVS6AS, 17-BP DEL/2-BP INS IVS6AS, 17-BP DEL/2-BP INS CYP4V2 BIETTI CRYSTALLINE CORNEORETINAL DYSTROPHY current criteria provided, single submitter Pathogenic 17962476 15860296 15042513 26085992 25593508 21565171 24739949 26865810 25629076 22693542 The c.802-8_810delTCATACAGGTCATCGCTinsGC variant, also described in the literature as c.802-8_810delinsG, occurs in a canonical splice site (acceptor) and is therefore predicted to disrupt or distort the normal gene product. The c.802-8_810delTCATACAGGTCATCGCTinsGC variant is the most common variant associated with Bietti crystalline dystrophy in the Japanese and Chinese populations, accounting for up to 83% of disease alleles (Park et al. 2016). The variant has been reported in at least nine studies in which it is found in at least 130 patients including 64 in a homozygous state, 65 in a compound heterozygous state and one individual in a heterozygous state in whom a second allele has not been detected (Li et al. 2004; Wada et al. 2006; Lai et al. 2007; Xiao et al. 2011; Yin et al. 2014; Meng et al. 2014; Tian et al. 2015; Park et al. 2016; Astuti et al. 2016). The variant was absent from 146 controls but is reported at a frequency of 0.00496 in the East Asian population of the 1000 Genomes Project. Due to the potential impact of splice acceptor variants and the supporting evidence from the literature, the c.802-8_810delTCATACAGGTCATCGCTinsGC variant is classified as pathogenic for Bietti crystalline dystrophy. ICSL Variant Classification 20161018 https://submit.ncbi.nlm.nih.gov/ft/byid/4jQgNGYk/ICSL_Variant_Classification_20161018.pdf ICSL_Variant_Classification_20161018.pdf germline human unknown clinical testing not provided NM_207352.3:c.802-8_810delTCATACAGGTCATCGCTinsGC CYP4V2 Bietti Crystalline Dystrophy current criteria provided, single submitter Pathogenic 26085992 17962476 15860296 23793346 15937078 23661369 24739949 21565171 15042513 25629076 25593508 26865810 22693542 19508456 The c.802-8_810delinsGC (NM_207352.3 c.802-8_810delinsGC) variant in CYP4V2 has been reported in over 60 homozygous and compound heterozygous individuals with B ietti crystalline dystrophy and related disorders and is the most common variant associated with this disease in East Asian populations (Wada 2005, Lin 2005, La i 2007, Yokoi 2010, Xiao 2011, Wang 2012, Chung 2013, Fu 2013, Yin 2014, Meng 20 14, Tian 2015, Astuti 2015, and Park 2016). This variant has also been reported as pathogenic in ClinVar (Variation ID#39271). This variant has been identified in 0.2% (16/8520) of East Asian chromosomes by chromosomes by the Exome Aggregat ion Consortium (ExAC, http://exac.broadinstitute.org). This variant alters the c anonical splice site, and therefore is expected to impact splicing and lead to a n absent or truncated protein. In summary, this variant meets criteria to be cla ssified as pathogenic for Bietti crystalline dystrophy and related disorders in an autosomal recessive manner based upon its biallelic occurrence in patients wi th this disease and predicted functional impact. Autosomal recessive inheritance LMM Criteria 24033266 germline human not provided clinical testing NM_207352.3:c.802-8_810delinsGC p.? NM_207352.3:EXON 7 NC_000004.11:g.187122303_187122319delinsGC CYP4V2 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000032707.xml ================================================ current NM_000168.6(GLI3):c.1616_1617del (p.Arg539fs) AND Postaxial polydactyly, type A1/B current no assertion criteria provided Pathogenic germline human not provided literature only In affected members of a 3-generation nonconsanguineous Saudi Arabian family with postaxial polydactyly (174200), Al-Qattan (2012) identified heterozygosity for a 2-bp deletion (1615delGA) in the GLI3 gene, predicted to cause a frameshift resulting in a premature termination codon (R539Tfs*12). Al-Qattan (2012) noted that although this frameshift predicts truncation in the N-terminal part of the gene and a GCPS phenotype would be expected, none of the family members had craniofacial features. 22428873 NM_000168.6(GLI3):c.1616_1617del (p.Arg539fs) NM_000168.6:c.1616_1617del NG_008434.1:g.263390_263391del NC_000007.14:g.41978630_41978631del NC_000007.13:g.42018229_42018230del NM_000168.5:c.1616_1617del NP_000159.3:p.Arg539fs frameshift variant R539fs 7p14.1 GLI family zinc finger 3 GLI3 Sufficient evidence for dosage pathogenicity https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=GLI3 No evidence available https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=GLI3 22428873 NM_000168.6(GLI3):c.1616_1617del (p.Arg539fs) Postaxial polydactyly, type A1/B RECLASSIFIED - VARIANT OF UNKNOWN SIGNIFICANCE current no assertion criteria provided Pathogenic germline human not provided literature only In affected members of a 3-generation nonconsanguineous Saudi Arabian family with postaxial polydactyly (174200), Al-Qattan (2012) identified heterozygosity for a 2-bp deletion (1615delGA) in the GLI3 gene, predicted to cause a frameshift resulting in a premature termination codon (R539Tfs*12). Al-Qattan (2012) noted that although this frameshift predicts truncation in the N-terminal part of the gene and a GCPS phenotype would be expected, none of the family members had craniofacial features. 22428873 GLI3, 2-BP DEL, 1615GA 2-BP DEL, 1615GA GLI3 POSTAXIAL POLYDACTYLY, TYPE A1/B ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000038438.xml ================================================ currentNM_005228.5(EGFR):c.2543C>T (p.Pro848Leu) AND not specifiedcurrentcriteria provided, single submitterLikely benignsomatichumannot providedclinical testingNM_005228.5(EGFR):c.2543C>T (p.Pro848Leu)LRG_304t1:c.2543C>TNM_001346941.2:c.1742C>TNM_001346900.2:c.2384C>TNM_001346897.2:c.2408C>TNM_001346899.1:c.2408C>TNM_001346898.2:c.2543C>TNM_005228.5:c.2543C>TLRG_304:g.177761C>TNG_007726.3:g.177761C>TNC_000007.14:g.55191792C>TNC_000007.13:g.55259485C>Tc.2543C>TNM_005228.3:c.2543C>TNP_001333870.1:p.Pro581LeuNP_001333829.1:p.Pro795LeuNP_001333826.1:p.Pro803LeuNP_001333828.1:p.Pro803LeuNP_001333827.1:p.Pro848LeuNP_005219.2:p.Pro848LeuNM_005228.3:exon 21missense variantmissense variantmissense variantmissense variantmissense variantmissense variantP581LP795LP803LP848L7p11.2epidermal growth factor receptorEGFRNM_005228.5(EGFR):c.2543C>T (p.Pro848Leu)not specifiedAllHighlyPenetrantThe term 'not specified' was created for use in ClinVar so that submitters can convey the concept that a variant is benign, likely benign, or of uncertain significance for an unspecified set of disorders. This usage was introduced in 2014 to replace AllHighlyPenetrant.currentcriteria provided, single submitterLikely benign178778141728573522848293Pro848Leu variant in Exon 21 of EGFR: This variant is not expected to have clinical significance because iIn vitro studies suggest that this variant does not activate EGFR activity and does not render the protein sensitive to tyrosine kinase inhibitors (TKIs) (De Gunst 2007, Han 2011). It has been previously identified in both tumor and normal tissue in an individuals with lung cancer (Sequist 2007), and it has been identified in 0.06% (6/8600) of European American chromosomes by the NHLBI Exome Sequencing Project (http://evs.gs.washington.edu/EVS/; dbSNP rs148934350).LMM Criteria24033266somatichumannot providedclinical testingNM_005228.3:c.2543C>Tp.Pro848LeuNM_005228.3:EXON 21NC_000007.13:g.55259485C>TEGFRNot Specified ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000050055.xml ================================================ current NM_017890.4(VPS13B):c.11825_11827dup (p.Asp3942dup) AND Cohen syndrome current criteria provided, multiple submitters, no conflicts Uncertain significance germline human unknown clinical testing not provided not provided not provided human not provided 1 literature only not provided NM_017890.4(VPS13B):c.11825_11827dup (p.Asp3942dup) NM_017890.4:c.11825_11827dupATG LRG_351t2:c.11750_11752dup LRG_351t1:c.11825_11827dup NM_152564.4:c.11750_11752dup NM_017890.4:c.11825_11827dup LRG_351:g.867157_867159dup NG_007098.2:g.867157_867159dup NC_000008.11:g.99875422_99875424dup NC_000008.10:g.100887650_100887652dup NM_017890.3:c.11825_11827dupATG LRG_351p2:p.Asp3917dup LRG_351p1:p.Asp3942dup NP_689777.3:p.Asp3917dup NP_060360.3:p.Asp3942dup NM_017890.4:exon 62 inframe_insertion inframe_insertion 8q22.2 vacuolar protein sorting 13 homolog B VPS13B 20921020 NM_017890.4(VPS13B):c.11825_11827dup (p.Asp3942dup) Cohen syndrome Cutis verticis gyrata, retinitis pigmentosa, and sensorineural deafness COH1 CHS1 COH Cohen syndrome is characterized by failure to thrive in infancy and childhood; truncal obesity in the teen years; early-onset hypotonia and developmental delays; microcephaly developing during the first year of life; moderate to profound psychomotor retardation; progressive retinochoroidal dystrophy and high myopia; neutropenia in many with recurrent infections and aphthous ulcers in some; a cheerful disposition; joint hypermobility; and characteristic facial features. 20301655 NBK1482 current no assertion criteria provided probable-pathogenic Converted during submission to Likely pathogenic. not provided human not provided 1 not provided not provided NM_017890.4:c.11825_11827dupATG 20921020 Cohen syndrome FinDis database variant: This variant was not found or characterized by our laboratory, data were collected from public sources: see reference current criteria provided, single submitter Uncertain significance VPS13B NM_017890.4 exon 62 p.Asp3942_dup (c.11825_11827dupATG): This variant has been reported in the literature as a compound heterozygote (in trans with a multi-exon deletion of this gene) in 1 individual with a diagnosis of Cohen syndrome (Rivera-Brugues 2011 PMID:20921020, gene identified as alternate name COH1). However, this variant is present in 0.4% (100/24024) of African alleles in the Genome Aggregation Database (http://gnomad.broadinstitute.org/rs558633643). This variant is present in ClinVar (Variation ID:56642). Evolutionary conservation and computational predictive tools for this variant are limited or unavailable. This variant represents a duplication of 1 amino acid at position 3942 and is not predicted to alter the reading frame. However, the effect of this variant on the protein is unclear. In summary, data on this variant is insufficient for disease classification. Therefore, the clinical significance of this variant is uncertain. ACMG Guidelines, 2015 25741868 germline human unknown clinical testing not provided VPS13B current criteria provided, single submitter Uncertain significance This variant, c.11825_11827dupATG, results in the insertion of 1 amino acid(s) to the VPS13B protein (p.Asp3942dup), but otherwise preserves the integrity of the reading frame. This variant is present in population databases (rs558633643, ExAC 0.4%). This variant has been observed in an individual affected with Cohen syndrome (PMID: 20921020). ClinVar contains an entry for this variant (Variation ID: 56642). Experimental studies and prediction algorithms are not available for this variant, and the functional significance of the affected amino acid(s) is currently unknown. In summary, the available evidence is currently insufficient to determine the role of this variant in disease. Therefore, it has been classified as a Variant of Uncertain Significance. Invitae Variant Classification Sherloc (09022015) 28492532 germline human unknown clinical testing not provided NM_017890.4:c.11825_11827dupATG VPS13B Cohen syndrome ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000073701.xml ================================================ currentNM_001127511.3(APC):c.165+21247G>C AND Familial colorectal cancercurrentno assertion criteria providedotherunknownhumannot provided1literature onlynot providedNM_001127511.3(APC):c.165+21247G>CNM_001354895.2:c.-19+21247G>CNM_001127511.3:c.165+21247G>CNM_001354897.2:c.165+21247G>CNM_001354902.2:c.165+21247G>CLRG_130:g.41609G>CNG_008481.4:g.41609G>CNC_000005.10:g.112729129G>CNC_000005.9:g.112064826G>CNC_000005.8:g.112092725G>Cintron variantintron variantintron variantintron variant5q22.2APC regulator of WNT signaling pathwayAPCSufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=APCNo evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=APCThis gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons.This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons.NM_001127511.3(APC):c.165+21247G>CFamilial colorectal cancerCOLORECTAL CANCERCOLON CANCERCRCloss of function2743612http://www.nchpeg.org/documents/crc/11-0456%20Fact%20sheets%20(MSI%20and%20IHC%20testing).pdfcurrentno assertion criteria providedcancerConverted during submission to other.unknownhumannot provided1not providednot providedNC_000005.8:g.112092725G>CFamilial colorectal cancer ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000077146.xml ================================================ current NM_007294.3(BRCA1):c.4357+2T>G AND Breast-ovarian cancer, familial 1 current no assertion criteria provided Pathogenic germline human yes clinical testing germline human not provided 1 clinical testing NM_007294.3(BRCA1):c.4357+2T>G LRG_292t1:c.4357+2T>G NM_007298.3:c.1048+2T>G NM_007299.4:c.1048+2T>G NM_007297.4:c.4216+2T>G NM_007294.3:c.4357+2T>G NM_007300.4:c.4357+2T>G LRG_292:g.135582T>G NG_005905.2:g.135582T>G NC_000017.11:g.43082402A>C NC_000017.10:g.41234419A>C U14680.1:n.4476+2T>G U14680.1:intron 13 splice donor variant splice donor variant splice donor variant splice donor variant splice donor variant IVS13+2T>G 17q21.31 BRCA1 DNA repair associated BRCA1 Sufficient evidence for dosage pathogenicity https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA1 No evidence available https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA1 This gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons. This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons. NM_007294.3(BRCA1):c.4357+2T>G Breast-ovarian cancer, familial 1 BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OVARIAN CANCER, SUSCEPTIBILITY TO BREAST CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 Breast cancer, familial 1 BROVCA1 HBOC BRCA1- and BRCA2-associated hereditary breast and ovarian cancer syndrome (HBOC) is characterized by an increased risk for female and male breast cancer, ovarian cancer (includes fallopian tube and primary peritoneal cancers), and to a lesser extent other cancers such as prostate cancer, pancreatic cancer, and melanoma primarily in individuals with a BRCA2 pathogenic variant. The exact cancer risks differ slightly depending on whether HBOC is caused by a BRCA1 or BRCA2 pathogenic variant. loss of function Neoplasm 20301425 NBK1247 17392385 20065170 15604628 17508274 19305347 23788249 http://www.nccn.org/professionals/physician_gls/pdf/genetics_screening.pdf National Comprehensive Cancer Network practice guidelines in oncology. Genetic/Familial High-Risk Assessment: Breast and Ovarian 23918944 2948529 24493721 24366376 23188549 25394175 24366402 24432435 25356965 27854360 10.1038/gim.2016.190 current no assertion criteria provided Pathogenic germline human not provided 1 clinical testing 1 IVS13+2T>G BRCA1 Breast-ovarian cancer, familial 1 current no assertion criteria provided Pathogenic germline human yes clinical testing IVS13+2T>G U14680.1:intron 13 U14680.1:n.4476+2T>G Breast-ovarian cancer, familial 1 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000080071.xml ================================================ currentRCV000124228NM_003159.2(CDKL5):c.2995G>A (p.Val999Met) AND not specifiedcurrentcriteria provided, multiple submitters, no conflictsBenignX-linked inheritancegermlinehumanyesclinical testingnot providedgermlinehumanunknownclinical testingnot providednot providednot providedmaternalhumannot provided2DHPLC, exons 2-21curationCSGE, MECP2 negativecurationunknownhumannot provided4DHPLC, exons 2-21curationCSGE, MECP2 negativecurationdirect, CDKL5 Exon 2-21curationNM_003159.2(CDKL5):c.2995G>A (p.Val999Met)p.V999M:GTG>ATGLRG_702t1:c.184+3207C>TNM_000330.3:c.184+3207C>TNM_001037343.1:c.2995G>ANM_003159.2:c.2995G>ALRG_702:g.29003C>TNG_008475.1:g.232842G>ANG_008659.3:g.29003C>TNC_000023.11:g.18653446G>ANC_000023.10:g.18671566G>Ap.(Val999Met)O76039:p.Val999MetNP_001032420.1:p.Val999MetNP_003150.1:p.Val999MetNP_003150.1:p.Val999MetNM_003159.2:exon 21intron variantmissense variantmissense variantV999MXp22.13cyclin dependent kinase like 5CDKL5Sufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=CDKL5No evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=CDKL5retinoschisin 1RS1Sufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=RS1No evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=RS1NM_003159.2(CDKL5):c.2995G>A (p.Val999Met)not specifiedAllHighlyPenetrantThe term 'not specified' was created for use in ClinVar so that submitters can convey the concept that a variant is benign, likely benign, or of uncertain significance for an unspecified set of disorders. This usage was introduced in 2014 to replace AllHighlyPenetrant.currentcriteria provided, single submitterBenignEGL_Classification_Definitions_2015https://submit.ncbi.nlm.nih.gov/ft/byid/dn5yhybg/egl_classification_definitions_2015.pdfgermlinehumanunknownmixedclinical testingNM_003159.2:Ex21CDKL5not specifiedhttp://www.egl-eurofins.com/emvclass/emvclass.php?approved_symbol=CDKL5currentcriteria provided, single submitterBenignThis variant is considered likely benign or benign based on one or more of the following criteria: it is a conservative change, it occurs at a poorly conserved position in the protein, it is predicted to be benign by multiple in silico algorithms, and/or has population frequency not consistent with disease.GeneDX Variant Classification (06012015)https://submit.ncbi.nlm.nih.gov/ft/byid/7oynscmk/mdi-5616_26957_genedx_interprules_final_061215.pdfgermlinehumanyesclinical testingnot providedNM_003159.2:c.2995G>ACDKL5not specifiedcurrentno assertion criteria providedBenignFound in unaffected mother with apparent balanced X-chromosome inactivation; in exon 20, affecting only the transcript lowly expressed; In silico prediction: SIFT = tolerated, MutationTaster = polymorphism, PolyPhen2 = benign, AlignGVGD = benign (C0)maternalhumannot provided1femaleDHPLC, exons 2-21curation16813600Rett syndrome - early seizureunknownhumannot provided1femaleDHPLC, exons 2-21curation16813600Unaffected - unaffected family memberunknownhumannot provided1femaledirect, CDKL5 Exon 2-21curation21775177Not Rett syndrome - infantile intractable epilepsymaternalhumannot provided1femaleCSGE, MECP2 negativecuration22867051Not Rett syndrome - epilepsy, Rett-likeunknownhumannot provided1femaleCSGE, MECP2 negativecuration22867051Unaffected - unaffected family memberunknownhumannot provided1femaleCSGE, MECP2 negativecuration22867051Unaffected - non-RTT controlNM_003159.2:c.2995G>ACDKL5Not specifiedcurrentcriteria provided, single submitterBenignX-linked inheritanceACMG Guidelines, 200718414213germlinehumanunknownclinical testingnot providedNM_003159.2:c.2995G>ACDKL5not specifiedcurrentno assertion criteria providedBenigngermlinehumanunknownclinical testingnot providedp.(Val999Met)NM_003159.2:c.2995G>ACDKL5not specifiedcurrentcriteria provided, single submitterBenignACMG Guidelines, 201525741868germlinehumanunknownclinical testingnot providedNM_003159.2:c.2995G>ANOT SPECIFIED ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000083638.xml ================================================ currentNM_000348.4(SRD5A2):c.89_90insT (p.Ser31fs) AND not providedcurrentno assertion providednot providednot providedhumannot provided1literature onlynot providedNM_000348.4(SRD5A2):c.89_90insT (p.Ser31fs)NM_000348.4:c.89_90insTNG_008365.1:g.5160_5161insTNC_000002.12:g.31580811_31580812insANC_000002.11:g.31805880dupNP_000339.2:p.Ser31fsframeshift variant2p23.1steroid 5 alpha-reductase 2SRD5A2NM_000348.4(SRD5A2):c.89_90insT (p.Ser31fs)not providedThe term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified.currentno assertion providednot providednot providedhumannot provided1not providednot providedNG_008365.1:g.5160_5161insTnot provided ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000087262.xml ================================================ currentNM_002296.4(LBR):c.1599_1605delinsCTAGAAG (p.Leu534_Leu535delinsTer) AND Pelger-Huët anomalycurrentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyWaterham et al. (2003) described a fetus, the product of a consanguineous Turkish marriage, who presented with intrauterine growth retardation at 17 weeks' gestation and was found to have severe hydrops and short-limb skeletal dysplasia consistent with thanatophoric dysplasia. Intrauterine death occurred at 18 weeks, and delivery was induced. Fetal examination showed severe hydrops, extremely shortened edematous limbs, and postaxial polydactyly on both hands. Radiographic examination showed severe platyspondyly, short irregular ribs, a 'moth-eaten' aspect of scapular and pelvic bones, and very short tubular bones with angular diaphyses. Histopathology showed almost complete absence of ossification, severe disorganization of cartilage (with nodular calcification deposits), and defective or absent joint formation. On the basis of these findings, the diagnosis of Greenberg dysplasia (215140) was made. Elevated levels of cholesta-8,14-dien-3-beta-ol in cultured skin fibroblasts were consistent with deficiency of 3-beta-hydroxysterol delta(14)-reductase. Sequence analysis of the LBR gene identified a homozygous 7-bp substitution at nucleotide 1599 in exon 13, TCTTCTA-CTAGAAG, which resulted in a truncated protein. The mother showed classic Pelger-Huet anomaly (169400), which represents the heterozygous state of 3-beta-hydroxysterol delta(14)-reductase deficiency.12618959NM_002296.4(LBR):c.1599_1605delinsCTAGAAG (p.Leu534_Leu535delinsTer)NM_194442.2:c.1599_1605delTCTTCTAinsCTAGAAGNM_002296.4:c.1599_1605delinsCTAGAAGNM_194442.2:c.1599_1605delinsCTAGAAGNG_008099.1:g.29326_29332delTCTTCTAinsCTAGAAG12618959NG_008099.1:g.29326_29332delinsCTAGAAGNG_008099.1:g.29326_29332delTCTTCTAinsCTAGAAGNC_000001.11:g.225404486_225404492delinsCTTCTAGNC_000001.10:g.225592188_225592194delinsCTTCTAGNP_002287.2:p.Leu534_Leu535delinsTerNP_919424.1:p.Leu534_Leu535delinsTernonsensenonsense1q42.12lamin B receptorLBRNM_002296.4(LBR):c.1599_1605delinsCTAGAAG (p.Leu534_Leu535delinsTer)Pelger-Huët anomalyPelger-Huet AnomalyPHAcurrentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyWaterham et al. (2003) described a fetus, the product of a consanguineous Turkish marriage, who presented with intrauterine growth retardation at 17 weeks' gestation and was found to have severe hydrops and short-limb skeletal dysplasia consistent with thanatophoric dysplasia. Intrauterine death occurred at 18 weeks, and delivery was induced. Fetal examination showed severe hydrops, extremely shortened edematous limbs, and postaxial polydactyly on both hands. Radiographic examination showed severe platyspondyly, short irregular ribs, a 'moth-eaten' aspect of scapular and pelvic bones, and very short tubular bones with angular diaphyses. Histopathology showed almost complete absence of ossification, severe disorganization of cartilage (with nodular calcification deposits), and defective or absent joint formation. On the basis of these findings, the diagnosis of Greenberg dysplasia (215140) was made. Elevated levels of cholesta-8,14-dien-3-beta-ol in cultured skin fibroblasts were consistent with deficiency of 3-beta-hydroxysterol delta(14)-reductase. Sequence analysis of the LBR gene identified a homozygous 7-bp substitution at nucleotide 1599 in exon 13, TCTTCTA-CTAGAAG, which resulted in a truncated protein. The mother showed classic Pelger-Huet anomaly (169400), which represents the heterozygous state of 3-beta-hydroxysterol delta(14)-reductase deficiency.12618959LBR, 7-BP SUB, NT15997-BP SUB, NT1599LBRPELGER-HUET ANOMALY ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000112977.xml ================================================ currentNM_000059.3(BRCA2):c.-26G>A AND Breast-ovarian cancer, familial 2currentreviewed by expert panelBenigngermlinehumanyesclinical testingnot providednot providednot providedgermlineCaucasianhumanyesclinical testinggermlineCaucasian Southern Africanhumanyesclinical testinggermlineEuropean, Asian, Oceananhumanyesclinical testinggermlineSinhalesehumanyesclinical testinggermlinehumannot provided86clinical testinggermlinehumanunknowncurationnot providednot providedhumanyesclinical testingunknownhumanunknownclinical testingnot providedNM_000059.3(BRCA2):c.-26G>A203G/A203G>A203 G>ALRG_293t1:c.-26G>ANM_000059.3:c.-26G>ALRG_293:g.5956G>ANG_017006.2:g.3929C>TNG_017006.1:g.520C>TNG_012772.3:g.5956G>ANC_000013.11:g.32316435G>ANC_000013.10:g.32890572G>AU43746.1:n.203G>ANM_000059.3:exon 2U43746.1:exon 25 prime UTR variant5'UTR203G>A13q13.1BRCA2 DNA repair associatedBRCA2Sufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA2No evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA2This gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons.This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons.NM_000059.3(BRCA2):c.-26G>ABreast-ovarian cancer, familial 2BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 2BREAST CANCER, FAMILIAL, SUSCEPTIBILITY TO, 2Breast cancer, familial 2BROVCA2HBOCBRCA2BRCA1- and BRCA2-associated hereditary breast and ovarian cancer syndrome (HBOC) is characterized by an increased risk for female and male breast cancer, ovarian cancer (includes fallopian tube and primary peritoneal cancers), and to a lesser extent other cancers such as prostate cancer, pancreatic cancer, and melanoma primarily in individuals with a BRCA2 pathogenic variant. The exact cancer risks differ slightly depending on whether HBOC is caused by a BRCA1 or BRCA2 pathogenic variant.loss of functionNeoplasm20301425NBK1247173923852006517015604628175082741930534723788249http://www.nccn.org/professionals/physician_gls/pdf/genetics_screening.pdfNational Comprehensive Cancer Network practice guidelines in oncology. Genetic/Familial High-Risk Assessment: Breast and Ovarian2391894429485292449372124366376358549223188549253941752436640224432435253569652785436010.1038/gim.2016.190currentno assertion criteria providedBenigngermlinehumanyesclinical testinggermlineAustriahumanyesclinical testinggermlineBelgiumhumanyesclinical testinggermlineSpainhumanyesclinical testinggermlineCaucasianAmericanhumanyesclinical testinggermlineCaucasianGermanyhumanyesclinical testinggermlineCaucasian Southern AfricanSouth Africahumanyesclinical testinggermlineEuropean, Asian, Oceananhumanyesclinical testinggermlineSinhaleseSri Lankahumanyesclinical testingunknownBrazilhumanyesclinical testing203G/A203G>AU43746.1:exon 2U43746.1:n.203G>ABreast-ovarian cancer, familial 2currentno assertion criteria providedBenigngermlinehumannot provided86clinical testing865'UTR203G>ABRCA2Breast-ovarian cancer, familial 2currentcriteria provided, single submitterBenignACMG Guidelines, 201525741868germlineBloodhumanyesclinical testingnot providedNM_000059.3:c.-26G>ABRCA2currentreviewed by expert panelBenignClass 1 not pathogenic based on frequency >1% in an outbred sampleset. Frequency 0.3689 (Asian), 0.04878 (African), 0.2282 (European), derived from 1000 genomes (2012-04-30).ENIGMA BRCA1/2 Classification Criteria (2015)https://submit.ncbi.nlm.nih.gov/ft/byid/hxnfuuxx/enigma_rules_2015-03-26.pdfENIGMA BRCA1/2 Classification Criteria (2015)germlinehumanunknowncurationnot provided203 G>ANM_000059.3:c.-26G>ABRCA2Breast-ovarian cancer, familial 2ENIGMA (Evidence-based Network for the Interpretation of Germline Mutant Alleles) is a consortium focused on determining the clinical significance of variants in BRCA1, BRCA2 and other known/suspected breast cancer genes. http://enigmaconsortium.org/currentcriteria provided, single submitterBenignACMG Guidelines, 201525741868unknownhumanunknownclinical testingnot providedNM_000059.3:c.-26G>ABRCA2currentcriteria provided, single submitterBenignACGS Guidelines, 2013https://submit.ncbi.nlm.nih.gov/ft/byid/yggjhwfz/evaluation_and_reporting_of_sequence_variants_bpgs_june_2013_-_finalpdf.pdfgermlinehumanyesclinical testingnot providedNM_000059.3:c.-26G>ABRCA2VKGL Data-share Consensuscurrentcriteria provided, single submitterBenignACGS Guidelines, 2013https://submit.ncbi.nlm.nih.gov/ft/byid/3yyso7ro/evaluation_and_reporting_of_sequence_variants_bpgs_june_2013_-_finalpdf.pdfgermlinehumanyesclinical testingnot providedNM_000059.3:c.-26G>ABRCA2VKGL Data-share Consensus ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000113363.xml ================================================ currentNM_000059.3(BRCA2):c.4965C>R (p.Tyr1655Ter) AND Breast-ovarian cancer, familial 2currentno assertion criteria providedPathogenicgermlineCaucasian Non Hispanichumanyesclinical testingNM_000059.3(BRCA2):c.4965C>R (p.Tyr1655Ter)LRG_293t1:c.4965C>RNM_000059.3:c.4965C>RLRG_293:g.28841C>RNG_012772.3:g.28841C>RNC_000013.11:g.32339320C>RNC_000013.10:g.32913457C>RU43746.1:n.5193CtoG/ALRG_293p1:p.Tyr1655TerNP_000050.2:p.Tyr1655TerNP_000050.2:p.Tyr1655TerU43746.1:exon 11nonsenseY1655X13q13.1BRCA2 DNA repair associatedBRCA2Sufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA2No evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA2This gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons.This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons.NM_000059.3(BRCA2):c.4965C>R (p.Tyr1655Ter)Breast-ovarian cancer, familial 2BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 2BREAST CANCER, FAMILIAL, SUSCEPTIBILITY TO, 2Breast cancer, familial 2BROVCA2HBOCBRCA2BRCA1- and BRCA2-associated hereditary breast and ovarian cancer syndrome (HBOC) is characterized by an increased risk for female and male breast cancer, ovarian cancer (includes fallopian tube and primary peritoneal cancers), and to a lesser extent other cancers such as prostate cancer, pancreatic cancer, and melanoma primarily in individuals with a BRCA2 pathogenic variant. The exact cancer risks differ slightly depending on whether HBOC is caused by a BRCA1 or BRCA2 pathogenic variant.loss of functionNeoplasm20301425NBK1247173923852006517015604628175082741930534723788249http://www.nccn.org/professionals/physician_gls/pdf/genetics_screening.pdfNational Comprehensive Cancer Network practice guidelines in oncology. Genetic/Familial High-Risk Assessment: Breast and Ovarian2391894429485292449372124366376358549223188549253941752436640224432435253569652785436010.1038/gim.2016.190currentno assertion criteria providedPathogenicgermlineCaucasian Non Hispanichumanyesclinical testingY1655XU43746.1:exon 11U43746.1:n.5193CtoG/ABreast-ovarian cancer, familial 2 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000120902.xml ================================================ currentNM_017709.4(TENT5C):c.201C>G (p.His67Gln) AND not specifiedcurrentno assertion providednot providedgermlineEuropeanhumanunknown331Complete GenomicsDiscoveryreference population0.0544germlineAfricanhumanunknown43Complete GenomicsDiscoveryreference population0.27910000000000001germlineEast_Asianhumanunknown62Complete GenomicsDiscoveryreference population0.1452germlineHispanichumanunknown118Complete GenomicsDiscoveryreference population0.0932germlineWhole_cohorthumanunknown681Complete GenomicsDiscoveryreference population0.0947germlineAfrican_Europeanhumanunknown46Complete GenomicsDiscoveryreference population0.1739germlineCentral_Asianhumanunknown50Complete GenomicsDiscoveryreference population0.07NM_017709.4(TENT5C):c.201C>G (p.His67Gln)NM_017709.4:c.201C>GNC_000001.11:g.117623069C>GNC_000001.10:g.118165691C>GQ5VWP2:p.His67GlnNP_060179.2:p.His67Glnmissense variantH67Q1p12terminal nucleotidyltransferase 5CTENT5CNM_017709.4(TENT5C):c.201C>G (p.His67Gln)not specifiedAllHighlyPenetrantThe term 'not specified' was created for use in ClinVar so that submitters can convey the concept that a variant is benign, likely benign, or of uncertain significance for an unspecified set of disorders. This usage was introduced in 2014 to replace AllHighlyPenetrant.currentno assertion providednot providedgermlineWhole_cohorthumanunknown681Complete Genomicsnext-gen sequencingdiscoveryreference population0.0947germlineAfricanhumanunknown43Complete Genomicsnext-gen sequencingdiscoveryreference population0.27910000000000001germlineAfrican_Europeanhumanunknown46Complete Genomicsnext-gen sequencingdiscoveryreference population0.1739germlineCentral_Asianhumanunknown50Complete Genomicsnext-gen sequencingdiscoveryreference population0.07germlineEast_Asianhumanunknown62Complete Genomicsnext-gen sequencingdiscoveryreference population0.1452germlineEuropeanhumanunknown331Complete Genomicsnext-gen sequencingdiscoveryreference population0.0544germlineHispanichumanunknown118Complete Genomicsnext-gen sequencingdiscoveryreference population0.0932AllHighlyPenetrant24728327Please see associated publication for description of ethnicities ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000124712.xml ================================================ currentNM_004006.2(DMD):c.8810A>G (p.Gln2937Arg) AND not specifiedcurrentcriteria provided, multiple submitters, no conflictsBenigngermlinehumanunknownclinical testingnot providedgermlinehumanyesclinical testingnot providedNM_004006.2(DMD):c.8810A>G (p.Gln2937Arg)p.Q2937R:CAG>CGGLRG_199t1:c.8810A>GNM_004021.3:c.1430=NM_004023.3:c.1430=NM_004013.2:c.1430A>GNM_004020.3:c.1430A>GNM_004022.2:c.1430A>GNM_004012.4:c.4778=NM_004011.4:c.4787=NM_004014.2:c.623A>GNM_004010.3:c.8441A>GNM_000109.4:c.8786=NM_004009.3:c.8798A>GNM_004006.2:c.8810A>GLRG_199:g.1866377A>GNG_012232.1:g.1866377A>GNC_000023.11:g.31478233=NC_000023.10:g.31496350=NM_004018.1:c.-211404A>GLRG_199p1:p.Gln2937ArgNP_004003.2:p.Arg1593=NP_004002.3:p.Arg1596=NP_000100.3:p.Arg2929=NP_004012.2:p.Arg477=NP_004014.2:p.Arg477=NP_004005.1:p.Gln208ArgNP_004001.1:p.Gln2814ArgNP_004000.1:p.Gln2933ArgNP_003997.1:p.Gln2937ArgNP_004004.1:p.Gln477ArgNP_004011.2:p.Gln477ArgNP_004013.1:p.Gln477Argmissense variantmissense variantmissense variantmissense variantmissense variantmissense variantmissense variantno sequence alterationno sequence alterationno sequence alterationno sequence alterationno sequence alterationQ208RQ2814RQ2933RQ2937RQ477RXp21.2dystrophinDMDSufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=DMDNo evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=DMDNM_004006.2(DMD):c.8810A>G (p.Gln2937Arg)not specifiedAllHighlyPenetrantThe term 'not specified' was created for use in ClinVar so that submitters can convey the concept that a variant is benign, likely benign, or of uncertain significance for an unspecified set of disorders. This usage was introduced in 2014 to replace AllHighlyPenetrant.currentcriteria provided, single submitterBenignThis variant is considered likely benign or benign based on one or more of the following criteria: it is a conservative change, it occurs at a poorly conserved position in the protein, it is predicted to be benign by multiple in silico algorithms, and/or has population frequency not consistent with disease.GeneDX Variant Classification (06012015)https://submit.ncbi.nlm.nih.gov/ft/byid/7oynscmk/mdi-5616_26957_genedx_interprules_final_061215.pdfgermlinehumanyesclinical testingnot providedNM_004006.2:c.8810A>GDMDnot specifiedcurrentcriteria provided, single submitterBenignLabCorp Variant Classification Summary - May 2015https://submit.ncbi.nlm.nih.gov/ft/byid/pttb9itm/labcorp_variant_classification_method_-_may_2015.pdfgermlinehumanunknownclinical testingnot providedNM_004006.2:c.8810A>GDMDnot specified ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000144179.xml ================================================ currentNM_004387.4(NKX2-5):c.809G>A (p.Cys270Tyr) AND multiple conditionscurrentno assertion criteria providedLikely benigngermlineCausasianshumanyes188HiSeq/ABI3500researchNoNM_004387.4(NKX2-5):c.809G>A (p.Cys270Tyr)NM_001166176.2:c.*608G>ANM_001166175.2:c.*762G>ANM_004387.4:c.809G>ANG_013340.1:g.7578G>ANC_000005.10:g.173232735C>TNC_000005.9:g.172659738C>TNM_004387.3:c.809G>ANP_004378.1:p.Cys270Tyr3 prime UTR variant3 prime UTR variantmissense variantC270Y5q35.1NK2 homeobox 5NKX2-5Sufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=NKX2-5No evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=NKX2-5NM_004387.4(NKX2-5):c.809G>A (p.Cys270Tyr)Single ventricleCommon ventriclesmall Atrial septal defectcurrentno assertion criteria providedLikely benigngermlineCausasiansLebanonhumanyes188NoHiSeq/ABI3500next-gen sequencing, Sanger sequencingresearchyesLebanese Congenital heart disease population allele frequency 1/188 families and 0 MAF in reported databasesThe number of individuals with the variant reflects the total number of individuals within the family.NC_000005.9:g.172659738C>TNKX2-5Single Ventriclesmall Atrial septal defect ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000152657.xml ================================================ current NM_000551.3(VHL):c.-75_-55del AND Von Hippel-Lindau syndrome current criteria provided, single submitter Likely pathogenic Autosomal dominant inheritance germline human not provided clinical testing NM_000551.3(VHL):c.-75_-55del LRG_322t1:c.-75_-55del NM_000551.3:c.-75_-55del LRG_322:g.5139_5159del NG_008212.3:g.5139_5159del NC_000003.12:g.10141773_10141793del NM_000551.2:c.-75_-55del NM_000551.2:exon 1 NM_000551.3:exon 1 5 prime UTR variant 3p25.3 von Hippel-Lindau tumor suppressor VHL Sufficient evidence for dosage pathogenicity https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=VHL No evidence available https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=VHL This gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons. This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons. NM_000551.3(VHL):c.-75_-55del Von Hippel-Lindau syndrome VHL Von Hippel-Lindau (VHL) syndrome is characterized by hemangioblastomas of the brain, spinal cord, and retina; renal cysts and clear cell renal cell carcinoma; pheochromocytoma, pancreatic cysts, and neuroendocrine tumors; endolymphatic sac tumors; and epididymal and broad ligament cysts. Cerebellar hemangioblastomas may be associated with headache, vomiting, gait disturbances, or ataxia. Spinal hemangioblastomas and related syrinx usually present with pain. Sensory and motor loss may develop with cord compression. Retinal hemangioblastomas may be the initial manifestation of VHL syndrome and can cause vision loss. Renal cell carcinoma occurs in about 70% of individuals with VHL and is the leading cause of mortality. Pheochromocytomas can be asymptomatic but may cause sustained or episodic hypertension. Pancreatic lesions often remain asymptomatic and rarely cause endocrine or exocrine insufficiency. Endolymphatic sac tumors can cause hearing loss of varying severity, which can be a presenting symptom. Cystadenomas of the epididymis are relatively common. They rarely cause problems, unless bilateral, in which case they may result in infertility. loss of function 20301636 NBK1463 15604628 23788249 24893135 3419007 25394175 24319509 25356965 27854360 10.1038/gim.2016.190 current criteria provided, single submitter Likely pathogenic 22357542 12114475 18836774 The c.-75_-55del variant in VHL has been identified by our laboratory in 1 Cauca sian adult with VHL and segregated with disease in at least 5 affected relatives including 1 obligate carrier. This variant is located in the 5' untranslated re gion (UTR), a regulatory region, and may have an effect on translational efficie ncy. The deleted sequence in this variant is highly conserved in evolutionarily distant species and in vitro studies have shown that a deletion of this region r emoves a transcription factor binding site which is predicted to alter VHL trans cription (Zatyka 2002). In summary, although additional studies are required to fully establish its clinical significance, the c.-75_-55del variant is likely pa thogenic. Autosomal dominant inheritance LMM Criteria 24033266 germline human not provided clinical testing NM_000551.3:c.-75_-55del NM_000551.3:EXON 1 NC_000003.11:g.10183457_10183477del VHL ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000153339.xml ================================================ currentNM_000828.4(GRIA3):c.-2G= AND not specifiedcurrentcriteria provided, single submitterBenigngermlinehumanunknownclinical testingNM_000828.4(GRIA3):c.-2G=NM_000828.4:c.-2dupNM_001256743.2:c.-2dupNM_007325.5:c.-2dupNG_009377.2:g.5292dupNC_000023.11:g.123184534dupNM_000828.4:5' UTRNM_000828.4:exon 15 prime UTR variant5 prime UTR variant5 prime UTR variantXq25glutamate ionotropic receptor AMPA type subunit 3GRIA3Some evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=GRIA3No evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=GRIA3The G in position 5292 of NG_009377.2 is in a single base gap when aligned to NC_000023.10 in GRCh37 assembly. Thus what is asserted as an assertion relative to GRCh37 is no change on GRCh38 (NC_000023.11) and on the RefSeqGene.NM_000828.4(GRIA3):c.-2G=not specifiedAllHighlyPenetrantThe term 'not specified' was created for use in ClinVar so that submitters can convey the concept that a variant is benign, likely benign, or of uncertain significance for an unspecified set of disorders. This usage was introduced in 2014 to replace AllHighlyPenetrant.currentcriteria provided, single submitterBenignEGL_Classification_Definitions_2015https://submit.ncbi.nlm.nih.gov/ft/byid/dn5yhybg/egl_classification_definitions_2015.pdfgermlinehumanunknownmixedclinical testingNM_000828.4:Ex1GRIA3not specifiedhttp://www.egl-eurofins.com/emvclass/emvclass.php?approved_symbol=GRIA3 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000167792.xml ================================================ currentNM_007294.3(BRCA1):c.135-18T>G AND Hereditary breast and ovarian cancer syndromecurrentcriteria provided, single submitterUncertain significancegermlinehumanyesresearchnot providedgermlinehumanunknownclinical testingnot providedNM_007294.3(BRCA1):c.135-18T>GIVS4-18T>GLRG_292t1:c.135-18T>GNM_007297.4:c.-7-18T>GNM_007294.3:c.135-18T>GNM_007298.3:c.135-18T>GNM_007299.4:c.135-18T>GNM_007300.4:c.135-18T>GLRG_292:g.111433T>GNG_005905.2:g.111433T>GNC_000017.11:g.43106551A>CNC_000017.10:g.41258568A>CU14680.1:n.254-18T>GNM_007294.3:intron 3U14680.1:intron 4intron variantintron variantintron variantintron variantintron variant17q21.31BRCA1 DNA repair associatedBRCA1Sufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA1No evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA1This gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons.This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons.NM_007294.3(BRCA1):c.135-18T>GHereditary breast and ovarian cancer syndromeHereditary breast and ovarian cancerHBOCBRCA1- and BRCA2-associated hereditary breast and ovarian cancer syndrome (HBOC) is characterized by an increased risk for female and male breast cancer, ovarian cancer (includes fallopian tube and primary peritoneal cancers), and to a lesser extent other cancers such as prostate cancer, pancreatic cancer, and melanoma primarily in individuals with a BRCA2 pathogenic variant. The exact cancer risks differ slightly depending on whether HBOC is caused by a BRCA1 or BRCA2 pathogenic variant.loss of function20301425NBK12471739238520065170126921711560462818163131175082741930534723788249http://www.nccn.org/professionals/physician_gls/pdf/genetics_screening.pdfNational Comprehensive Cancer Network practice guidelines in oncology. Genetic/Familial High-Risk Assessment: Breast and Ovarian2391894429485292449372124366376231885492436640224432435253569652785436010.1038/gim.2016.190currentcriteria provided, single submitterUncertain significance23239986This sequence change falls in intron 3 of the BRCA1 gene. It does not directly change the encoded amino acid sequence of the BRCA1 protein. This variant is present in population databases (rs80358085, ExAC 0.002%). This variant has been reported in an individual with a personal and family history of breast cancer (PMID: 23239986). This variant is also known as IVS4-18T>G in the literature. ClinVar contains an entry for this variant (Variation ID: 54214). Experimental studies have shown that this variant modestly increases skipping of exon 4 (also known as exon 5), but exon 4 skipping has also been observed in the wild-type controls at a low level (PMID: 23239986). In summary, the available evidence is currently insufficient to determine the role of this variant in disease. Therefore, it has been classified as a Variant of Uncertain Significance.Nykamp K et al. (Genet Med 2017)28492532germlinehumanunknownclinical testingnot providedNM_007294.3:c.135-18T>GBRCA1currentno assertion criteria providedUncertain significancegermlinehumanyesfemaleresearchnot providedNM_007294.3:intron 3NM_007294.3:c.135-18T>GBRCA1Hereditary breast and ovarian cancer syndrome30472649 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000169296.xml ================================================ currentNM_000520.6(HEXA):c.986+3A>G AND Tay-Sachs diseasecurrentcriteria provided, multiple submitters, no conflictsPathogenic/Likely pathogenicAutosomal recessive inheritancegermlinehumanunknownclinical testingnot providednot providednot providedmaternalhumanunknown1research15:72641417:T:00364Cunknownhumanunknown1researchliterature onlynot provided15:72641417:T:00364MNM_000520.6(HEXA):c.986+3A>GNM_001318825.2:c.1019+3A>GNM_000520.6:c.986+3A>GNG_009017.1:g.32104A>GNG_009017.2:g.32104A>GNC_000015.10:g.72349076T>CNC_000015.9:g.72641417T>CNM_000520.4:c.986+3A>Gintron variantintron variant15q23hexosaminidase subunit alphaHEXANM_000520.6(HEXA):c.986+3A>GTay-Sachs diseaseTSDHexosaminidase A deficiency results in a group of neurodegenerative disorders caused by intralysosomal storage of the specific glycosphingolipid, GM2 ganglioside. The prototype hexosaminidase A deficiency is Tay-Sachs disease, also known as the acute infantile variant. Tay-Sachs disease is characterized by progressive weakness, loss of motor skills, decreased attentiveness, and increased startle response beginning between ages three and six months with progressive evidence of neurodegeneration including: seizures, blindness, spasticity, eventual total incapacitation, and death, usually before age four years. The juvenile (subacute), chronic, and adult-onset variants of hexosaminidase A deficiency have later onsets, slower progression, and more variable neurologic findings, including: progressive dystonia, spinocerebellar degeneration, motor neuron disease, and, in some individuals with adult-onset disease, a bipolar form of psychosis.20301397NBK12181819705719888064currentcriteria provided, single submitterLikely pathogenic23035047755183024518553Autosomal recessive inheritanceCounsyl Autosomal and X-linked Recessive Disease Classification criteria (2015)https://submit.ncbi.nlm.nih.gov/ft/byid/4yisoce9/mdi-5618_320494_counsyl_autosomal_and_x-linked_recessive_disease_classification_criteria_(2015).pdfCounsyl Autosomal and X-linked Recessive Disease Classification criteria (2015)unknownhumanunknownliterature onlynot providedNM_000520.4:c.986+3A>GHEXATay-Sachs diseasecurrentcriteria provided, single submitterPathogenicHA_assertions_20150911https://submit.ncbi.nlm.nih.gov/ft/byid/suocabs5/ha_assertions_20150911.pdfHA_assertions_20150911.pdfunknownhumanunknown1research15:72641417:T:00364Mmaternalhumanunknown1research15:72641417:T:00364CHEXACSER-HudsonAlphacurrentcriteria provided, single submitterLikely pathogenic755183091501572010046623035047The HEXA c.986+3A>G splice region variant has been reported in at least four studies in which it is found in a total of three patients in a compound heterozygous state and in a heterozygous state in one obligate carrier (Richard et al. 1995; Akerman et al. 1997; Giraud et al. 2010; Saunders et al. 2012). One compound heterozygote presented with the late-infantile onset form of Tay-Sachs disease while another compound heterozygote had onset in adulthood. Control data are unavailable for this variant which is reported at a frequency of 0.000025 in the total population from the Exome Aggregation Consortium. In one study, exons 7 to 9 of the c.986+3A>G variant HEXA were amplified and the resulting mRNA product was shown to be lacking exon 8, suggesting this variant affects splicing (Richard et al. 1995). Based on the evidence, the c.986+3A>G variant is classified as likely pathogenic for hexoaminidase A deficiency. This variant was observed by ICSL as part of a predisposition screen in an ostensibly healthy population.ICSL Variant Classification Criteria 09 May 2019https://submit.ncbi.nlm.nih.gov/ft/byid/thsgk7t4/icsl_variant_classification_criteria_09_may_2019.pdfgermlinehumanunknownclinical testingnot providedNM_000520.4:c.986+3A>GHEXAcurrentcriteria provided, single submitterPathogenic24518553201004669150157755183023035047Variant summary: HEXA c.986+3A>G alters a conserved nucleotide located close to a canonical splice site and therefore could affect mRNA splicing, leading to a significantly altered protein sequence. Several computational tools predict a significant impact on normal splicing: Four predict the variant weakens a 5' donor site. A publication, Richard_1995, functionally assessed the variant and found it to cause exon 8 to be deleted, which is located in the Glycoside hydrolase family 20, catalytic domain (via InterPro). The variant was observed with an allele frequency of 8.1e-06 in 246182 control chromosomes (gnomAD). This frequency is not higher than expected for a pathogenic variant in HEXA causing Tay-Sachs Disease (8.1e-06 vs 0.0014), allowing no conclusion about variant significance. The variant, c.986+3A>G, has been reported in the literature in individuals affected with Tay-Sachs Disease. These data indicate that the variant is likely to be associated with disease. A ClinVar submission from a clinical diagnostic laboratory (evaluation after 2014) cites the variant as "likely pathogenic." Based on the evidence outlined above, the variant was classified as pathogenic.LabCorp Variant Classification Summary - May 2015https://submit.ncbi.nlm.nih.gov/ft/byid/pttb9itm/labcorp_variant_classification_method_-_may_2015.pdfgermlinehumanunknownclinical testingnot providedNM_000520.4:c.986+3A>GHEXATay-Sachs diseasecurrentcriteria provided, single submitterPathogenic75518302451855323035047This sequence change falls in intron 8 of the HEXA gene. It does not directly change the encoded amino acid sequence of the HEXA protein, but it affects a nucleotide within the consensus splice site of the intron. This variant is present in population databases (rs200926928, ExAC 0.02%). This variant has been observed in a patient with Tay-Sachs disease (PMID: 7551830) and on the opposite chromosome (in trans) from a pathogenic variant in an individual affected with Tay-Sachs disease (PMID: 23035047). This finding is consistent with autosomal recessive inheritance, and suggests that this variant contributes to disease. This variant was also identified in a individual with Tay-Sachs disease in whom a second variant was not identified (PMID: 24518553). ClinVar contains an entry for this variant (Variation ID: 188929). Experimental studies have shown that this splice site variant leads to the skipping of exon 8 (PMID: 7551830). Nucleotide substitutions within the consensus splice site are a relatively common cause of aberrant splicing (PMID: 17576681, 9536098). Algorithms developed to predict the effect of sequence changes on RNA splicing suggest that this variant may disrupt the consensus splice site, but this prediction has not been confirmed by published transcriptional studies. For these reasons, this variant has been classified as Pathogenic.Invitae Variant Classification Sherloc (09022015)28492532germlinehumanunknownclinical testingnot providedNM_000520.4:c.986+3A>GHEXATay-Sachs disease ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000170338.xml ================================================ current NM_024537.3(CARS2):c.[649_651delGAG];[752C>T] AND Alpers encephalopathy current criteria provided, single submitter Pathogenic Autosomal recessive inheritance germline human yes 2 HiSeq2000 Trio-based Whole Exome Sequencing using proband and his unaffected parents research No No NM_024537.4(CARS2):c.649_651del (p.Glu217del) NM_001352252.1:c.-138_-136del NM_001352253.2:c.649_651del NM_024537.4:c.649_651del NG_042045.1:g.28124_28126del NG_042045.2:g.35545_35547del NC_000013.11:g.110683055_110683057del NC_000013.10:g.111335402_111335404del NR_147941.1:n.620_622del NM_024537.2:c.649_651delGAG NR_147942.1:n.736_738del NP_001339182.1:p.Glu217del NP_078813.1:p.Glu217del 5 prime UTR variant inframe_deletion inframe_deletion non-coding transcript variant E217del 13q34 cysteinyl-tRNA synthetase 2, mitochondrial CARS2 25787132 NM_024537.2(CARS2):c.649_651delGAG NM_024537.2:c.649_651delGAG NM_024537.4(CARS2):c.752C>T (p.Pro251Leu) NM_001352252.1:c.-35C>T NM_001352253.2:c.752C>T NM_024537.4:c.752C>T NG_042045.1:g.34174C>T NG_042045.2:g.41595C>T NC_000013.11:g.110677007G>A NC_000013.10:g.111329354G>A NR_147941.1:n.723C>T NM_024537.2:c.752C>T NR_147942.1:n.839C>T Q9HA77:p.Pro251Leu NP_001339182.1:p.Pro251Leu NP_078813.1:p.Pro251Leu 5 prime UTR variant missense variant missense variant non-coding transcript variant P251L PRO251LEU 13q34 cysteinyl-tRNA synthetase 2, mitochondrial CARS2 25787132 NM_024537.2(CARS2):c.752C>T NM_024537.2:c.752C>T NM_024537.3(CARS2):c.[649_651delGAG];[752C>T] NM_024537.3:c.[649_651delGAG];[752C>T] Alpers encephalopathy current criteria provided, single submitter Pathogenic 25787132 Autosomal recessive inheritance CARS2 assertion criteria 25787132 germline human yes 1 male No HiSeq2000 Next-gen Whole Exome Sequencing Trio-based Whole Exome Sequencing using proband and his unaffected parents research combined mitochondrial respiratory chain deficiency| neurological regression| complex movement disorder| intractable seizures NM_024537.2:c.649_651delGAG CARS2 NM_024537.2:c.649_651delGAG NM_024537.2:c.752C>T CARS2 NM_024537.2:c.752C>T Alpers encephalopathy 25787132 25787132 The patient has mitochondrial encephalopathy and a combined mitochondrial oxidative phosphorylation deficiency. The child presented with neurological regression, complex movement disorder and intractable seizures. A combined deficiency of mitochondrial complexes I, III, and IV was noted in liver tissue, along with increased mitochondrial DNA content in skeletal muscle. Incomplete assembly of complex V was noted on blue native polyacrylamide gel electrophoretic (BN-PAGE) analysis in skeletal muscle and skin fibroblasts. ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000171474.xml ================================================ current Single allele AND not provided current no assertion criteria provided Likely pathogenic germline human yes research not provided 13q13.3 spartin SPART Single allele not provided The term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified. current no assertion criteria provided Likely pathogenic ACMG Guidelines, 2015 25741868 germline human yes Neurological phenotype research not provided SPG20 Not provided ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000179026.xml ================================================ currentNM_002617.3(PEX10):c.867_868insG (p.His290fs) AND not providedcurrentcriteria provided, single submitterPathogenicgermlinehumanunknownclinical testingNM_002617.3(PEX10):c.867_868insG (p.His290fs)NM_002617.3:c.867_868insGNM_153818.1:c.927_928insGNG_008342.1:g.11043_11044insGNG_016128.1:g.19754_19755insCNC_000001.11:g.2406528_2406529insCNC_000001.10:g.2337967_2337968insCNP_002608.1:p.His290fsNP_722540.1:p.His310fsNM_153818.1:exon 5frameshift variantframeshift variant1p36.32peroxisomal biogenesis factor 10PEX10NM_002617.3(PEX10):c.867_868insG (p.His290fs)not providedThe term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified.currentcriteria provided, single submitterPathogenicEGL_Classification_Definitions_2015https://submit.ncbi.nlm.nih.gov/ft/byid/dn5yhybg/egl_classification_definitions_2015.pdfgermlinehumanunknownmixedclinical testingNM_153818.1:Ex5PEX10not providedhttp://www.egl-eurofins.com/emvclass/emvclass.php?approved_symbol=PEX10 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000194003.xml ================================================ current NM_001080522.2(CC2D2A):c.4179+1del AND Joubert syndrome 9 current criteria provided, multiple submitters, no conflicts Pathogenic germline human yes clinical testing not provided unknown human yes research not provided NM_001080522.2(CC2D2A):c.4179+1del NM_001080522.2:c.4179+1delG NM_001080522.2:c.4179delG LRG_697t1:c.4179+1del NG_013035.1:g.123065del NC_000004.12:g.15587930del NC_000004.11:g.15589553del NM_001080522.2:intron 33 4p15.32 coiled-coil and C2 domain containing 2A CC2D2A 19777577 19466712 NM_001080522.2(CC2D2A):c.4179+1del Joubert syndrome 9 JBTS9 CC2D2A Classic Joubert syndrome (JS) is characterized by three primary findings: A distinctive cerebellar and brain stem malformation called the molar tooth sign (MTS). Hypotonia. Developmental delays. Often these findings are accompanied by episodic tachypnea or apnea and/or atypical eye movements. In general, the breathing abnormalities improve with age, truncal ataxia develops over time, and acquisition of gross motor milestones is delayed. Cognitive abilities are variable, ranging from severe intellectual disability to normal. Additional findings can include retinal dystrophy, renal disease, ocular colobomas, occipital encephalocele, hepatic fibrosis, polydactyly, oral hamartomas, and endocrine abnormalities. Both intra- and interfamilial variation are seen. 20301500 NBK1325 current criteria provided, single submitter Pathogenic ACMG Guidelines, 2015 25741868 germline human yes clinical testing not provided NM_001080522.2:c.4179+1del CC2D2A Joubert syndrome 9 current criteria provided, single submitter Pathogenic 26092869 Submitter's publication 26092869 unknown human yes research not provided NM_001080522.2:c.4179+1delG ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000203290.xml ================================================ currentNM_001127500.3(MET):c.3029C>T (p.Thr1010Ile) AND Congenital diaphragmatic herniacurrentcriteria provided, single submitterUncertain significanceAutosomal dominant inheritancegermlineCaucasianhumanyesresearchyesNM_001127500.3(MET):c.3029C>T (p.Thr1010Ile)LRG_662t1:c.3029C>TNM_001324402.2:c.1685C>TNM_000245.4:c.2975C>TNM_001127500.3:c.3029C>TLRG_662:g.104532C>TNG_008996.1:g.104532C>TNC_000007.14:g.116771936C>TNC_000007.13:g.116411990C>TNM_000245.2:c.2975C>TNM_001127500.1:c.3029C>Tp.T1010ILRG_662p1:p.Thr1010IleNP_001120972.1:p.Thr1010IleNP_001120972.1:p.Thr1010IleNP_001120972.1:p.Thr1010IleNP_001311331.1:p.Thr562IleNP_000236.2:p.Thr992IleNM_001127500.1:exon 14missense variantmissense variantmissense variantT1010IT562IT992I7q31.2MET proto-oncogene, receptor tyrosine kinaseMETNo evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=METNo evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=METNM_001127500.3(MET):c.3029C>T (p.Thr1010Ile)Congenital diaphragmatic herniaDiaphragmatic herniaDIHDIH1CDHHCD20301533NBK1359currentcriteria provided, single submitterUncertain significanceIt is unclear whether these changes, alone or in aggregate, are contributing to the development of CDH in this family.Autosomal dominant inheritanceBeck et al. (Am J Med Genet A 2015)25736269germlineCaucasianhumanyesmaleyesnext-generation sequencing2380608624088041researchNM_000245.2:c.2975C>TMET25736269This case report describes a pathogenic FBN1 variant in a family with recurrent congenital diaphragmatic hernia ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000205418.xml ================================================ current NM_000297.4(PKD2):c.290_292AGG[5] (p.Glu102del) AND not provided current criteria provided, single submitter Benign germline human unknown clinical testing not provided NM_000297.4(PKD2):c.290_292AGG[5] (p.Glu102del) NM_000297.4:c.290_292AGG[5] NG_008604.1:g.5356_5358AGG[5] NC_000004.12:g.88008023_88008025AGG[5] NC_000004.11:g.88929175_88929177AGG[5] NM_000297.3:c.289_291delGAG NR_156488.1:n.377_379AGG[5] NP_000288.1:p.Glu102del inframe_deletion E102del 4q22.1 polycystin 2, transient receptor potential cation channel PKD2 Sufficient evidence for dosage pathogenicity https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=PKD2 No evidence available https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=PKD2 NM_000297.4(PKD2):c.290_292AGG[5] (p.Glu102del) not provided The term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified. current criteria provided, single submitter Benign Nykamp K et al. (Genet Med 2017) 28492532 germline human unknown clinical testing not provided NM_000297.3:c.289_291delGAG PKD2 not provided ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000207071.xml ================================================ currentNM_014704.4(CEP104):c.1328_1329insT (p.Tyr444fs) AND Joubert syndrome 25currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn a 3.5-year-old boy (patient 842629), born of consanguineous Arab-Israeli parents, with Joubert syndrome-25 (JBTS25; 616781), Srour et al. (2015) identified a homozygous 1-bp insertion (c.1328_1329insT, NM_014704.3) in the CEP104 gene, resulting in a frameshift and premature termination (Tyr444fsTer3). The mutation, which was found by exome sequencing and confirmed by Sanger sequencing, segregated with the disorder in the family and was nor found in the dbSNP, 1000 Genomes Project, Exome Variant Server, or ExAC databases, or in 350 in-house ethnically matched exomes. Functional studies and studies on patient cells were not performed.26477546NM_014704.4(CEP104):c.1328_1329insT (p.Tyr444fs)NM_014704.4:c.1328_1329insTNG_046726.1:g.27152_27153insTNC_000001.11:g.3835081_3835082insANC_000001.10:g.3751645_3751646insANM_014704.3:c.1328_1329insTNP_055519.1:p.Tyr444fsframeshift variant1p36.32centrosomal protein 104CEP104NM_014704.4(CEP104):c.1328_1329insT (p.Tyr444fs)Joubert syndrome 25JBTS25Classic Joubert syndrome (JS) is characterized by three primary findings: A distinctive cerebellar and brain stem malformation called the molar tooth sign (MTS). Hypotonia. Developmental delays. Often these findings are accompanied by episodic tachypnea or apnea and/or atypical eye movements. In general, the breathing abnormalities improve with age, truncal ataxia develops over time, and acquisition of gross motor milestones is delayed. Cognitive abilities are variable, ranging from severe intellectual disability to normal. Additional findings can include retinal dystrophy, renal disease, ocular colobomas, occipital encephalocele, hepatic fibrosis, polydactyly, oral hamartomas, and endocrine abnormalities. Both intra- and interfamilial variation are seen.20301500NBK1325currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn a 3.5-year-old boy (patient 842629), born of consanguineous Arab-Israeli parents, with Joubert syndrome-25 (JBTS25; 616781), Srour et al. (2015) identified a homozygous 1-bp insertion (c.1328_1329insT, NM_014704.3) in the CEP104 gene, resulting in a frameshift and premature termination (Tyr444fsTer3). The mutation, which was found by exome sequencing and confirmed by Sanger sequencing, segregated with the disorder in the family and was nor found in the dbSNP, 1000 Genomes Project, Exome Variant Server, or ExAC databases, or in 350 in-house ethnically matched exomes. Functional studies and studies on patient cells were not performed.26477546CEP104, 1-BP INS, 1328T1-BP INS, 1328TCEP104JOUBERT SYNDROME 25 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000207504.xml ================================================ currentNM_005343.4(HRAS):c.37G>T (p.Gly13Cys) AND not providedcurrentcriteria provided, multiple submitters, no conflictsPathogenicunknownhumanyesclinical testinggermlinehumanyesclinical testingnot providedNM_005343.4(HRAS):c.37G>T (p.Gly13Cys)p.G13C:GGT>TGTNM_005343.3(HRAS):c.37G>TNM_001318054.2:c.-283G>TNM_001130442.2:c.37G>TNM_005343.4:c.37G>TNM_176795.4:c.37G>TNG_007666.1:g.6265G>TNC_000011.10:g.534286C>ANC_000011.9:g.534286C>Ac.37G>TNM_005343.2:c.37G>TNM_005343.3:c.37G>TP01112:p.Gly13CysNP_001123914.1:p.Gly13CysNP_005334.1:p.Gly13CysNP_789765.1:p.Gly13CysNM_005343.2:exon 2NM_005343.3:exon 25 prime UTR variantmissense variantmissense variantmissense variantG13CGLY13CYS11p15.5HRas proto-oncogene, GTPaseHRASNo evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=HRASNo evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=HRASleucine rich repeat containing 56LRRC56NM_005343.4(HRAS):c.37G>T (p.Gly13Cys)not providedThe term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified.currentcriteria provided, single submitterPathogenicThe G13C variant in the HRAS gene has been reported previously in multiple unrelated individuals with Costello syndrome and is one of the common HRAS variants associated with this disorder (Estep et al., 2006; Gripp et al., 2006; Gripp et al., 2011). In addition to the classic Costello features, this variant has been described in patients with unique ectodermal findings such as sparse hair in early childhood and long eyelashes, and appears to have a low incidence of papillomata (Gripp et al., 2011). The G13C variant was not observed in approximately 6,500 individuals of European and African American ancestry in the NHLBI Exome Sequencing Project, indicating it is not a common benign variant in these populations. The G13C variant is a non-conservative amino acid substitution, which occurs at a conserved Glycine residue at codon 12; the majority of pathogenic variants in the HRAS gene (>90%) alter the conserved glycine residues at positions 12 and 13 (Aoki et al., 2005; Gripp et al., 2006). Functional studies demonstrate that G13C alters GTP and GDP dissociation rates resulting in increased active GTP-bound HRAS, which upregulates the Ras/MAPK pathway (Wey et al., 2013). Therefore, we interpret G13C as a pathogenic variant.GeneDx Variant Classification (06012015)https://submit.ncbi.nlm.nih.gov/ft/byid/7oynscmk/mdi-5616_26957_genedx_interprules_final_061215.pdfgermlinehumanyesclinical testingnot providedNM_005343.2:c.37G>THRASNot Providedcurrentcriteria provided, single submitterPathogenic16833586319213030214381341637235116329078ACMG Guidelines, 201525741868unknownhuman1019yesfemaleclinical testingp.Gly13CysNM_005343.2:exon 2NM_005343.2:c.37G>THRASnone provided ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000235027.xml ================================================ current RCV000610359 NM_173660.5(DOK7):c.1124_1127dup (p.Ala378fs) AND Congenital myasthenic syndrome current criteria provided, single submitter Pathogenic Autosomal recessive inheritance germline human yes literature only not provided germline human not provided clinical testing NM_173660.5(DOK7):c.1124_1127dup (p.Ala378fs) p.Ala378SerfsX30 NM_001164673.2:c.*345_*348dup NM_001301071.2:c.1124_1127dup NM_173660.5:c.1124_1127dup NM_001256896.1:c.194_197dup NM_001363811.2:c.692_695dup NG_013072.2:g.34805_34808dup NC_000004.12:g.3493110_3493113dup NC_000004.11:g.3494840_3494841insTGCC NC_000004.11:g.3494837_3494840dup NM_173660.4:c.1124_1127dupTGCC NP_775931.3:p.Ala378SerfsTer30 NP_001350740.1:p.Ala234fs NP_001288000.1:p.Ala378fs NP_775931.3:p.Ala378fs NP_001243825.1:p.Ala68fs NM_173660.4:exon 7 3 prime UTR variant frameshift variant frameshift variant frameshift variant frameshift variant 4p16.3 docking protein 7 DOK7 NM_173660.5(DOK7):c.1124_1127dup (p.Ala378fs) Congenital myasthenic syndrome CMS Congenital myasthenic syndromes (designated as CMS throughout this entry) are characterized by fatigable weakness of skeletal muscle (e.g., ocular, bulbar, limb muscles) with onset at or shortly after birth or in early childhood; rarely, symptoms may not manifest until later in childhood. Cardiac and smooth muscle are usually not involved. Severity and course of disease are highly variable, ranging from minor symptoms to progressive disabling weakness. In some subtypes of CMS, myasthenic symptoms may be mild, but sudden severe exacerbations of weakness or even sudden episodes of respiratory insufficiency may be precipitated by fever, infections, or excitement. Major findings of the neonatal-onset subtype include: respiratory insufficiency with sudden apnea and cyanosis; feeding difficulties; poor suck and cry; choking spells; eyelid ptosis; and facial, bulbar, and generalized weakness. Arthrogryposis multiplex congenita may also be present. Stridor in infancy may be an important clue to CMS. Later childhood-onset subtypes show abnormal muscle fatigability with difficulty in activities such as running or climbing stairs; motor milestones may be delayed; fluctuating eyelid ptosis and fixed or fluctuating extraocular muscle weakness are common presentations. 20301347 NBK1168 current no assertion criteria provided Pathogenic 22230109 16917026 http://www.ncbi.nlm.nih.gov/books/NBK1168/ germline human yes literature only not provided NP_775931.3:p.Ala378SerfsTer30 NM_173660.4:c.1124_1127dupTGCC DOK7 current criteria provided, single submitter Pathogenic 19261599 The p.Ala378fs variant in DOK7 has been reported in 17 individuals with congenit al myasthenia syndrome (CMS) with limb-girdle pattern of muscle weakness who wer e either homozygous or compound heterozygous for this variant (Beeson 2006 and L orenzoni 2013). It has been identified in 0.1% (28/24230) of Eurpoean chromosome s by the Exome Aggregation Consortium (ExAC, http://exac.broadinstitute.org), wh ich is consistent with a carrier frequency for this disease. In vitro functional studies also provide some evidence that the Ala378fs variant may impact protein function. This variant is predicted to cause a frameshift, which alters the pro tein?s amino acid sequence beginning at position 378 and leads to a premature te rmination codon 30 amino acids downstream. Loss of function of the DOK7 gene is an established disease mechanism in individuals with CMS. In summary, this varia nt meets our criteria to be classified as pathogenic for CMS in an autosomal rec essive manner based upon its segregation in affected individuals and predicted i mpact on protein function. Autosomal recessive inheritance LMM Criteria 24033266 germline human not provided clinical testing NM_173660.4:c.1124_1127dupTGCC p.Ala378SerfsX30 NM_173660.4:EXON 7 NC_000004.11:g.3494840_3494841insTGCC DOK7 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000267121.xml ================================================ currentNM_198156.3(VHL):c.164_171dup (p.Arg60fs) AND not providedcurrentcriteria provided, single submitterPathogenicgermlinehumanyesclinical testingnot providedNM_198156.3(VHL):c.164_171dup (p.Arg60fs)NM_000551.3:c.164_171dupAGGCCGGGLRG_322t1:c.164_171dupNM_001354723.2:c.164_171dupNM_198156.3:c.164_171dupLRG_322:g.5377_5384dupNG_008212.3:g.5377_5384dupNC_000003.12:g.10142011_10142018dupNC_000003.11:g.10183695_10183702dupLRG_322p1:p.Arg60fsNP_001341652.1:p.Arg60fsNP_937799.1:p.Arg60fsframeshift variantframeshift variant3p25.3von Hippel-Lindau tumor suppressorVHLSufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=VHLNo evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=VHLThis gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons.This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons.NM_198156.3(VHL):c.164_171dup (p.Arg60fs)not providedThe term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified.currentcriteria provided, single submitterPathogenicThe c.164_171dupAGGCCGGG variant in the VHL gene has been reported previously in associationwith von Hippel-Lindau syndrome (Chacon-Camacho et al., 2014). The duplication causes aframeshift starting with codon Arginine 60, changes this amino acid to a Glycine residue and creates apremature Stop codon at position 10 of the new reading frame, denoted p.Arg60GlyfsX10. Thisvariant is predicted to cause loss of normal protein function either through protein truncation ornonsense-mediated mRNA decay. Based on the currently available information, we considerc.164_171dupAGGCCGGG to be pathogenic.GeneDx Variant Classification (06012015)https://submit.ncbi.nlm.nih.gov/ft/byid/7oynscmk/mdi-5616_26957_genedx_interprules_final_061215.pdfgermlinehumanyesclinical testingnot providedNM_000551.3:c.164_171dupAGGCCGGGVHLNot Provided ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000342164.xml ================================================ current NM_000368.4(TSC1):c.2075_2120dup (p.Phe707delinsLeuArgProValAlaPheThrAlaGlnProValThrLeuTer) AND not provided current criteria provided, single submitter Pathogenic germline human yes clinical testing not provided NM_000368.4(TSC1):c.2075_2120dup (p.Phe707delinsLeuArgProValAlaPheThrAlaGlnProValThrLeuTer) NM_000368.4:c.2075_2120dupGAGACCAGTTGCTTTTACTGCACAACCAGTTACTCTATGAGCGTTT LRG_486t1:c.2075_2120dup NM_001362177.2:c.1712_1757dup NM_001162427.2:c.1922_1967dup NM_001162426.2:c.2072_2117dup NM_000368.4:c.2075_2120dup LRG_486:g.45850_45895dup NG_012386.1:g.45850_45895dup NC_000009.12:g.132903739_132903784dup NC_000009.11:g.135779126_135779171dup LRG_486p1:p.Phe707delinsLeuArgProValAlaPheThrAlaGlnProValThrLeuTer NP_001349106.1:p.Phe586delinsLeuArgProValAlaPheThrAlaGlnProValThrLeuTer NP_001155899.1:p.Phe656delinsLeuArgProValAlaPheThrAlaGlnProValThrLeuTer NP_001155898.1:p.Phe706delinsLeuArgProValAlaPheThrAlaGlnProValThrLeuTer NP_000359.1:p.Phe707delinsLeuArgProValAlaPheThrAlaGlnProValThrLeuTer nonsense nonsense nonsense nonsense 9q34.13 TSC complex subunit 1 TSC1 Sufficient evidence for dosage pathogenicity https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=TSC1 No evidence available https://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=TSC1 This gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons. This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons. NM_000368.4(TSC1):c.2075_2120dup (p.Phe707delinsLeuArgProValAlaPheThrAlaGlnProValThrLeuTer) not provided The term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified. current criteria provided, single submitter Pathogenic The c.2075_2120dup46 pathogenic variant in the TSC1 gene causes a frameshift starting with codon Phenylalanine 707, changes this amino acid to a Leucine residue and creates a premature Stop codon at position 14 of the new reading frame, denoted p.Phe707LeufsX14. This pathogenic variant is predicted to cause loss of normal protein function either through protein truncation or nonsense-mediated mRNA decay. Furthermore, it was not observed in approximately 6,500 individuals of European and African American ancestry in the NHLBI Exome Sequencing Project, indicating it is not a common benign variant in these populations. GeneDx Variant Classification (06012015) https://submit.ncbi.nlm.nih.gov/ft/byid/7oynscmk/mdi-5616_26957_genedx_interprules_final_061215.pdf germline human yes clinical testing not provided NM_000368.4:c.2075_2120dup46 TSC1 Not Provided ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000373191.xml ================================================ currentNM_004208.4(AIFM1):c.273T>C (p.Asp91=) AND Combined oxidative phosphorylation deficiencycurrentcriteria provided, single submitterBenigngermlinehumanunknownclinical testingnot providedNM_004208.4(AIFM1):c.273T>C (p.Asp91=)NM_145812.2:c.261T>CNM_001130847.3:c.273T>CNM_004208.4:c.273T>CNG_013217.1:g.21289T>CNC_000023.11:g.130149545A>GNC_000023.10:g.129283520A>GNR_132647.1:n.361T>CNM_004208.3:c.273T>CNP_665811.1:p.Asp87=NP_001124319.1:p.Asp91=NP_004199.1:p.Asp91=non-coding transcript variantsynonymous variantsynonymous variantsynonymous variantXq26.1RAB33A, member RAS oncogene familyRAB33Aapoptosis inducing factor mitochondria associated 1AIFM1NM_004208.4(AIFM1):c.273T>C (p.Asp91=)Combined oxidative phosphorylation deficiencyMitochondrial oxidative phosphorylation disorder due to nuclear DNA anomaliescurrentcriteria provided, single submitterBenignICSL Variant Classification 20161018https://submit.ncbi.nlm.nih.gov/ft/byid/4jQgNGYk/ICSL_Variant_Classification_20161018.pdfICSL_Variant_Classification_20161018.pdfgermlinehumanunknownclinical testingnot providedNM_004208.3:c.273T>CAIFM1Combined Oxidative Phosphorylation Deficiency ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000401212.xml ================================================ currentNM_176824.3(BBS7):c.*690T>C AND Bardet-Biedl syndromecurrentcriteria provided, single submitterBenigngermlinehumanunknownclinical testingnot providedNM_176824.3(BBS7):c.*690T>CNM_176824.3:c.*690T>CNG_052974.1:g.3832T>CNG_009111.1:g.50318T>CNC_000004.12:g.121825170A>GNC_000004.11:g.122746325A>GNM_176824.2:c.*690T>C3 prime UTR variant4q27Bardet-Biedl syndrome 7BBS7NM_176824.3(BBS7):c.*690T>CBardet-Biedl syndromeBBSBardet-Biedl syndrome (BBS) is characterized by rod-cone dystrophy, truncal obesity, postaxial polydactyly, cognitive impairment, male hypogonadotropic hypogonadism, complex female genitourinary malformations, and renal abnormalities. The visual prognosis for children with BBS is poor. Night blindness is usually evident by age seven to eight years; the mean age of legal blindness is 15.5 years. Birth weight is usually normal, but significant weight gain begins within the first year and becomes a lifelong issue for most individuals. A majority of individuals have significant learning difficulties; a minority have severe impairment on IQ testing. Renal disease is a major cause of morbidity and mortality.20301537NBK13633061994currentcriteria provided, single submitterBenignICSL Variant Classification 20161018https://submit.ncbi.nlm.nih.gov/ft/byid/4jQgNGYk/ICSL_Variant_Classification_20161018.pdfICSL_Variant_Classification_20161018.pdfgermlinehumanunknownclinical testingnot providedNM_176824.2:c.*690T>CBBS7Bardet-Biedl Syndrome ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000406351.xml ================================================ current NM_001080522.2(CC2D2A):c.676_678GAA[3] (p.Glu229del) AND Joubert syndrome current criteria provided, single submitter Likely benign germline human unknown clinical testing not provided NM_001080522.2(CC2D2A):c.676_678GAA[3] (p.Glu229del) NM_001080522.2:c.676_678GAA[3] NM_001080522.2:c.685_687del NM_001080522.2:c.685_687delGAA LRG_697t1:c.685_687del LRG_697:g.46517_46519GAA[3] NG_013035.1:g.46517_46519GAA[3] NC_000004.12:g.15511382_15511384GAA[3] NC_000004.11:g.15513005_15513007GAA[3] LRG_697p1:p.Glu229del NP_001073991.2:p.Glu229del NM_001080522.2:exon 9 E229del 4p15.32 coiled-coil and C2 domain containing 2A CC2D2A 21068128 NM_001080522.2(CC2D2A):c.676_678GAA[3] (p.Glu229del) Joubert syndrome CEREBELLOPARENCHYMAL DISORDER IV Familial aplasia of the vermis Agenesis of cerebellar vermis Cerebellar vermis aplasia JBTS CPD4 Classic Joubert syndrome (JS) is characterized by three primary findings: A distinctive cerebellar and brain stem malformation called the molar tooth sign (MTS). Hypotonia. Developmental delays. Often these findings are accompanied by episodic tachypnea or apnea and/or atypical eye movements. In general, the breathing abnormalities improve with age, truncal ataxia develops over time, and acquisition of gross motor milestones is delayed. Cognitive abilities are variable, ranging from severe intellectual disability to normal. Additional findings can include retinal dystrophy, renal disease, ocular colobomas, occipital encephalocele, hepatic fibrosis, polydactyly, oral hamartomas, and endocrine abnormalities. Both intra- and interfamilial variation are seen. 20301500 NBK1325 21448235 current criteria provided, single submitter Likely benign ICSL Variant Classification 20161018 https://submit.ncbi.nlm.nih.gov/ft/byid/4jQgNGYk/ICSL_Variant_Classification_20161018.pdf ICSL_Variant_Classification_20161018.pdf germline human unknown clinical testing not provided NM_001080522.2:c.685_687delGAA CC2D2A Joubert Syndrome ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000435546.xml ================================================ currentNM_021625.4(TRPV4):c.[2481_2484delCCGC;2486T>A] AND Avascular necrosis of femoral head, primary, 2currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn 4 sibs from a Greek family with avascular necrosis of the femoral head (ANFH2; 617383), Mah et al. (2016) identified heterozygosity for a 4-bp deletion (c.2480_2483delCCCG, NM_021625.4) followed by a c.2486T-A transversion (c.2486T-A, NM_021625.4) in a highly conserved region of the TRPV4 gene, causing a frameshift that results in a premature termination codon (Val829TrpfsTer3). The mutation was not found in an unaffected brother, or in the 1000 Genomes or Exome Variant Server databases; parental DNA was unavailable, but the sibs' father reportedly had symptoms of joint pain that were never evaluated. Functional analysis in patient fibroblasts and transduced HEK293 cells indicated that the mutation results in a gain-of-function of TRPV4 channels by impeding channel closure.27330106NM_021625.4(TRPV4):c.2481_2484del (p.Arg828fs)LRG_372t1:c.2481_2484delNM_001177433.1:c.2160_2163delNM_147204.2:c.2301_2304delNM_001177428.1:c.2340_2343delNM_001177431.1:c.2379_2382delNM_021625.4:c.2481_2484delLRG_372:g.54652_54655delNG_017090.1:g.54652_54655delNC_000012.12:g.109783754_109783757delNC_000012.11:g.110221559_110221562delLRG_372p1:p.Arg828fsNP_001170904.1:p.Arg721fsNP_671737.1:p.Arg768fsNP_001170899.1:p.Arg781fsNP_001170902.1:p.Arg794fsNP_067638.3:p.Arg828fsframeshift variantframeshift variantframeshift variantframeshift variantframeshift variantR721fsR768fsR781fsR794fsR828fs12q24.11transient receptor potential cation channel subfamily V member 4TRPV4NM_021625.4(TRPV4):c.2486T>A (p.Val829Glu)LRG_372t1:c.2486T>ANM_001177433.1:c.2165T>ANM_147204.2:c.2306T>ANM_001177428.1:c.2345T>ANM_001177431.1:c.2384T>ANM_021625.4:c.2486T>ALRG_372:g.54657T>ANG_017090.1:g.54657T>ANC_000012.12:g.109783751A>TNC_000012.11:g.110221556A>TLRG_372p1:p.Val829GluNP_001170904.1:p.Val722GluNP_671737.1:p.Val769GluNP_001170899.1:p.Val782GluNP_001170902.1:p.Val795GluNP_067638.3:p.Val829Glumissense variantmissense variantmissense variantmissense variantmissense variantV722EV769EV782EV795EV829E12q24.11transient receptor potential cation channel subfamily V member 4TRPV4NM_021625.4(TRPV4):c.[2481_2484delCCGC;2486T>A]TRPV4, 4-BP DEL, 2480CCCG AND 2486T-ANM_021625.4:c.[2481_2484delCCGC;2486T>A]Avascular necrosis of femoral head, primary, 2ANFH2currentno assertion criteria providedPathogenicgermlinehumannot providedliterature onlyIn 4 sibs from a Greek family with avascular necrosis of the femoral head (ANFH2; 617383), Mah et al. (2016) identified heterozygosity for a 4-bp deletion (c.2480_2483delCCCG, NM_021625.4) followed by a c.2486T-A transversion (c.2486T-A, NM_021625.4) in a highly conserved region of the TRPV4 gene, causing a frameshift that results in a premature termination codon (Val829TrpfsTer3). The mutation was not found in an unaffected brother, or in the 1000 Genomes or Exome Variant Server databases; parental DNA was unavailable, but the sibs' father reportedly had symptoms of joint pain that were never evaluated. Functional analysis in patient fibroblasts and transduced HEK293 cells indicated that the mutation results in a gain-of-function of TRPV4 channels by impeding channel closure.27330106TRPV4, 4-BP DEL, 2480CCCG AND 2486T-A4-BP DEL, 2480CCCG AND 2486T-ATRPV4AVASCULAR NECROSIS OF FEMORAL HEAD, PRIMARY, 2 (1 family) ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000485802.xml ================================================ currentNM_000044.6(AR):c.171_173GCA[36] (p.Gln68_Gln80dup) AND Bulbo-spinal atrophy X-linkedcurrentno assertion criteria providedUncertain significancegermlinehumanunknownliterature onlynot providedNM_000044.6(AR):c.171_173GCA[36] (p.Gln68_Gln80dup)NM_001011645.3:c.-1613_-1611GCA[36]NM_000044.6:c.171_173GCA[36]NM_001348061.1:c.171_173GCA[36]NM_001348063.1:c.171_173GCA[36]NM_001348064.1:c.171_173GCA[36]NG_052629.1:g.101_103GCA[36]NG_009014.2:g.6286_6288GCA[36]NC_000023.11:g.67545317_67545319GCA[36]NC_000023.10:g.66765159_66765161GCA[36]NM_000044.3:c.172_174CAG[35]NP_000035.2:p.Gln68_Gln80dupNP_001334990.1:p.Gln68_Gln80dupNP_001334992.1:p.Gln68_Gln80dupNP_001334993.1:p.Gln68_Gln80dup5 prime UTR variantinframe_insertioninframe_insertioninframe_insertioninframe_insertionXq12androgen receptor repeat instability regionLOC109504725androgen receptorARSufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=ARNo evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=ARNM_000044.6(AR):c.171_173GCA[36] (p.Gln68_Gln80dup)Bulbo-spinal atrophy X-linkedSPINAL AND BULBAR MUSCULAR ATROPHY, X-LINKED 1SMAX1KDSBMAXBSNSpinal and bulbar muscular atrophy (SBMA) is a gradually progressive neuromuscular disorder in which degeneration of lower motor neurons results in muscle weakness, muscle atrophy, and fasciculations. SBMA occurs only in males. Affected individuals often show gynecomastia, testicular atrophy, and reduced fertility as a result of mild androgen insensitivity.20301508NBK1333currentno assertion criteria providedUncertain significancehttps://www.ncbi.nlm.nih.gov/books/NBK1333/germlinehumanunknownliterature onlynot providedNM_000044.3:c.172_174CAG(35)ARCAG 35 repeats ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000537563.xml ================================================ currentNM_005214.5(CTLA4):c.75G>C (p.Leu25=) AND not providedcurrentcriteria provided, single submitterBenigngermlinehumanunknownclinical testingnot providedNM_005214.5(CTLA4):c.75G>C (p.Leu25=)LRG_1220t1:c.75G>CNM_001037631.3:c.75G>CNM_005214.5:c.75G>CLRG_1220:g.5232G>CNG_011502.1:g.5232G>CNC_000002.12:g.203868017G>CNC_000002.11:g.204732740G>CNM_005214.4:c.75G>CLRG_1220p1:p.Leu25=NP_001032720.1:p.Leu25=NP_005205.2:p.Leu25=synonymous variantsynonymous variant2q33.2cytotoxic T-lymphocyte associated protein 4CTLA4NM_005214.5(CTLA4):c.75G>C (p.Leu25=)not providedThe term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified.currentcriteria provided, single submitterBenignNykamp K et al. (Genet Med 2017)28492532germlinehumanunknownclinical testingnot providedNM_005214.4:c.75G>CCTLA4not provided ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/RCV000724338.xml ================================================ currentNM_000059.3(BRCA2):c.156_157insAlu AND not providedcurrentcriteria provided, single submitterPathogenicgermlinehumanunknownclinical testingnot providedNM_000059.3(BRCA2):c.156_157insAlu384insAluU43746.1:n.384_385insAluNM_000059.3:c.156_157insALUNM_000059.3:exon 3U43746.1:exon 313q13.1BRCA2 DNA repair associatedBRCA2Sufficient evidence for dosage pathogenicityhttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA2No evidence availablehttps://www.ncbi.nlm.nih.gov/projects/dbvar/ISCA/isca_gene.cgi?sym=BRCA2This gene is cited in the ACMG recommendations of 2013 (PubMed 23788249) for reporting incidental findings in exons.This gene is cited in the ACMG recommendations of 2016 (PubMed 27854360) for reporting incidental findings in exons.NM_000059.3(BRCA2):c.156_157insAlunot providedThe term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified.currentcriteria provided, single submitterPathogenicEGL_Classification_Definitions_2015https://submit.ncbi.nlm.nih.gov/ft/byid/dn5yhybg/egl_classification_definitions_2015.pdfgermlinehumanunknownmixedclinical testingnot providedNM_000059.3:Ex3BRCA2not providedhttp://www.egl-eurofins.com/emvclass/emvclass.php?approved_symbol=BRCA2 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/Two_RCVs.xml ================================================ current NM_005026.5(PIK3CD):c.231G>A (p.Ala77=) AND not provided current criteria provided, single submitter Likely benign germline human unknown clinical testing not provided NM_005026.5(PIK3CD):c.231G>A (p.Ala77=) NC_000001.11:9715629:G:A LRG_191t1:c.231G>A NM_001350234.2:c.231G>A NM_001350235.1:c.231G>A NM_005026.5:c.231G>A LRG_191:g.68899G>A NG_023434.1:g.68899G>A NC_000001.11:g.9715630G>A NC_000001.10:g.9775688G>A NM_005026.3:c.231G>A p.Ala77Ala NP_001337163.1:p.Ala77= NP_001337164.1:p.Ala77= NP_005017.3:p.Ala77= synonymous variant synonymous variant synonymous variant 1p36.22 phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit delta PIK3CD NM_005026.5(PIK3CD):c.231G>A (p.Ala77=) NM_005026.5(PIK3CD):c.231G>A (p.Ala77=) not provided none provided The term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified. current criteria provided, single submitter Likely benign The c.231G>A variant (rs756139699) does not alter the amino acid sequence of the PIK3CD protein and computational splice site prediction algorithms do not predict a change in the nearest splice site or creation of a cryptic splice site. This variant has not been reported in association with primary antibody deficiency in medical literature or in gene specific variation databases. This variant is listed in the genome Aggregation Database (gnomAD) with an overall population frequency of 0.006 percent (identified on 17 out of 276,374 chromosomes). Based on these observations, the c.231G>A variant is likely to be benign. ARUP Molecular Germline Variant Investigation Process https://submit.ncbi.nlm.nih.gov/ft/byid/w2yp3qyt/arup_molecular_germline_variant_investigation_process.pdf germline human unknown clinical testing not provided p.Ala77Ala NM_005026.3:c.231G>A Not Provided current NM_005026.5(PIK3CD):c.231G>A (p.Ala77=) AND Immunodeficiency 14 current criteria provided, single submitter Likely benign germline human unknown clinical testing not provided NM_005026.5(PIK3CD):c.231G>A (p.Ala77=) NC_000001.11:9715629:G:A LRG_191t1:c.231G>A NM_001350234.2:c.231G>A NM_001350235.1:c.231G>A NM_005026.5:c.231G>A LRG_191:g.68899G>A NG_023434.1:g.68899G>A NC_000001.11:g.9715630G>A NC_000001.10:g.9775688G>A NM_005026.3:c.231G>A p.Ala77Ala NP_001337163.1:p.Ala77= NP_001337164.1:p.Ala77= NP_005017.3:p.Ala77= synonymous variant synonymous variant synonymous variant 1p36.22 phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit delta PIK3CD NM_005026.5(PIK3CD):c.231G>A (p.Ala77=) NM_005026.5(PIK3CD):c.231G>A (p.Ala77=) Immunodeficiency 14 p110-DELTA-ACTIVATING MUTATION CAUSING SENESCENT T CELLS, LYMPHADENOPATHY, AND IMMUNODEFICIENCY IMMUNODEFICIENCY 14A, AUTOSOMAL DOMINANT IMD14A PASLI current criteria provided, single submitter Likely benign Invitae Variant Classification Sherloc (09022015) 28492532 germline human unknown clinical testing not provided NC_000001.10:g.9775688G>A PIK3CD ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/VCVs/TwoRecords.xml ================================================ current Homo sapiens 10q23.31 613497 LIPA, 934G-A single nucleotide variant 10q23.31 934G-A no assertion criteria provided Lysosomal acid lipase deficiency Pathogenic Maslen, C. L., Illingworth, D. R. Molecular genetics of cholesterol ester hydrolase deficiency. (Abstract) Am. J. Hum. Genet. 53 (suppl.): A926, 1993. 7759067 8254026 8598644 8617513 CHOLESTEROL ESTER HYDROLASE DEFICIENCY Lysosomal acid lipase deficiency LAL DEFICIENCY The phenotypic spectrum of lysosomal acid lipase (LAL) deficiency ranges from the infantile-onset form (Wolman disease) to later-onset forms collectively known as cholesterol ester storage disease (CESD). Wolman disease is characterized by infantile-onset malabsorption that results in malnutrition, storage of cholesterol esters and triglycerides in hepatic macrophages that results in hepatomegaly and liver disease, and adrenal gland calcification that results in adrenal cortical insufficiency. Unless successfully treated with hematopoietic stem cell transplantation (HSCT), infants with classic Wolman disease do not survive beyond age one year. CESD may present in childhood in a manner similar to Wolman disease or later in life with such findings as serum lipid abnormalities, hepatosplenomegaly, and/or elevated liver enzymes long before a diagnosis is made. The morbidity of late-onset CESD results from atherosclerosis (coronary artery disease, stroke), liver disease (e.g., altered liver function ± jaundice, steatosis, fibrosis, cirrhosis and related complications of esophageal varices, and/or liver failure), complications of secondary hypersplenism (i.e., anemia and/or thrombocytopenia), and/or malabsorption. Individuals with CESD may have a normal life span depending on the severity of disease manifestations. 26452566 26225414 NBK305870 current no assertion criteria provided Pathogenic variation to disease germline human not provided literature only In a 12-year-old patient with cholesteryl ester storage disease (278000) from a nonconsanguineous Polish-German family, Klima et al. (1993) detected a 72-bp in-frame deletion resulting in the loss of amino acid codons 254 through 277. Analysis of genomic DNA revealed that the 72 bp represented an exon, indicating that the deletion in the mRNA was caused by defective splicing. Sequence analysis of the patient's genomic DNA revealed a G-to-A substitution in the last nucleotide of the 72-bp exon on 1 allele. No normal-sized mRNA was detectable in the propositus even though he was not homozygous for the splice site mutation. Klima et al. (1993) concluded that the patient was compound heterozygous for the splice site mutation and a null allele. The patient showed LIPA activity in cultured skin fibroblasts approximately 9% of normal. Hepatosplenomegaly had been present since age 5 years. 8254026 Aslanidis et al. (1996) restudied the patient of Klima et al. (1993) and defined the splice site mutation as a G-to-A mutation at position -1 of the splice donor site following exon 8, resulting in incorrect splicing and the removal of the 72-bp exon 8 of the LIPA gene. They determined that the other allele of the patient carried a premature termination mutation (613497.0003) as well as the L179P mutation (613497.0001); the LIPA mRNA was rendered unstable by the premature stop codon. Aslanidis et al. (1996) demonstrated that the splice site mutation allowed the production of approximately 3 to 4% of correctly spliced mRNA relative to wildtype. Aslanidis et al. (1996) also identified a mutation at the same splice donor site, and also resulting in deletion of exon 8, in 2 sibs with Wolman disease; that mutation, at the +1 position, allowed no correct splicing, and patient fibroblasts were devoid of enzymatic activity. See 613497.0005. 8617513 8254026 In 2 sibs with CESD, Maslen and Illingworth (1993) and Maslen et al. (1995) identified compound heterozygosity for this splice site mutation in the LIPA gene, inherited from their father, and the L179P mutation (613497.0001). The affected children were a sister and brother who presented with idiopathic hepatomegaly at ages 6 and 8 years, respectively. Subsequent analyses indicated that they also had hypercholesterolemia and a severe reduction in cholesteryl ester hydrolase activity in cultured fibroblasts. Maslen, C. L., Illingworth, D. R. Molecular genetics of cholesterol ester hydrolase deficiency. (Abstract) Am. J. Hum. Genet. 53 (suppl.): A926, 1993. 8598644 Muntoni et al. (1995) observed homozygosity for the splice site mutation (Klima et al., 1993) in a Spanish kindred with cholesterol ester storage disease. Exon 8 of the LIPA gene was deleted. 7759067 8254026 LIPA, 934G-A Variation 934G-A CHOLESTERYL ESTER STORAGE DISEASE current Homo sapiens 10q24.2 613469 HPSE2, EX8-9DEL Deletion 10q23-q24 EX8-9DEL no assertion criteria provided Ochoa syndrome Pathogenic 20560210 Ochoa syndrome Urofacial syndrome 1 HPSE2-Releated Urofacial Syndrome Urofacial Syndrome UFS UFS1 Urofacial syndrome (UFS) is characterized by prenatal or infantile onset of urinary bladder voiding dysfunction, abnormal facial movement with expression (resulting from abnormal co-contraction of the corners of the mouth and eyes), and often bowel dysfunction (constipation and/or encopresis). Bladder voiding dysfunction increases the risk for urinary incontinence, megacystis, vesicoureteric reflux, hydroureteronephrosis, urosepsis, and progressive renal impairment. In rare instances, an individual who has (a) a molecularly confirmed diagnosis and/or (b) an affected relative meeting clinical diagnostic criteria manifests only the characteristic facial features or only the urinary bladder voiding dysfunction (not both). Nocturnal lagophthalmos (incomplete closing of the eyes during sleep) appears to be a common and significant finding. 23967498 NBK154138 current no assertion criteria provided Pathogenic variation to disease germline human not provided literature only By copy number analysis in a consanguineous British Pakistani family with urofacial syndrome (UFS1; 236730), Daly et al. (2010) identified homozygosity for an intragenic deletion encompassing exons 8 and 9 of the HPSE2 gene that segregated with the disease. PCR analysis and DNA sequencing across the breakpoints defined a 10.81-kb deletion and a 23-bp insertion at the breakpoints, predicted to cause an in-frame deletion of exons 8 and 9 and removal of 74 amino acids. The unaffected parents were heterozygous for the mutation, which was not found in 93 Pakistani controls. 20560210 HPSE2, EX8-9DEL Variation EX8-9DEL UROFACIAL SYNDROME 1 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/VCVs/VCV000000081.xml ================================================ current Homo sapiens 10q23.31 613497 NM_000235.4(LIPA):c.894+1G>A single nucleotide variant 10q23.31 IVS8, G-A, +1 NC_000010.10:g.90982267C>T NC_000010.11:g.89222510C>T NM_000235.4:c.894+1G>A NM_001127605.3:c.894+1G>A NM_001288979.1:c.546+1G>A NG_008194.1:g.34394G>A ClinGen staff contributed the HGVS expression for this variant. no assertion criteria provided Wolman disease Pathogenic 8617513 Wolman disease current no assertion criteria provided Pathogenic variation to disease germline human not provided literature only In 2 sibs with Wolman disease (278000) from a consanguineous family, Aslanidis et al. (1996) detected homozygosity for a G-to-A mutation at position +1 of the splice donor site following exon 8 of the LIPA gene. Both children died within the first year of life. The parents, who were heterozygous for the mutation, had reduced enzymatic activity, while no enzymatic activity was detectable in fibroblasts from the affected children. Although the same donor splice site is involved as in the mutation reported in CESD (934G-A, 613497.0002), the nucleotide at position +1 was changed in the Wolman disease mutation while the nucleotide at position -1 was changed in the CESD mutation. Both mutations result in deletion of the same 24 amino acids (exon 8), but the effects are dramatically different: the -1 mutation allowed some correct splicing (3% of total LIPA RNA), but the +1 splice site mutation, which affects one of the invariable nucleotides of the splice consensus sequences, permits no correct splicing. Aslanidis et al. (1996) suggested that the residual activity in CESD patients compared to Wolman patients may result either from a partially active enzyme with the internal deletion of 24 amino acids (skipping of exon 8) or from the production of low amounts of the full size of the protein due to inefficient exon exclusion from the mutated allele. 8617513 LIPA, IVS8, G-A, +1 Variation IVS8, G-A, +1 WOLMAN DISEASE ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/VCVs/VCV000137106.xml ================================================ current Homo sapiens Xp21.2-21.1 300377 Sufficient evidence for dosage pathogenicity No evidence available NM_004006.2(DMD):c.8810A>G (p.Gln2937Arg) single nucleotide variant Xp21.2 p.Q2937R:CAG>CGG Q2937R Q2814R Q477R Q208R Q2933R NC_000023.10:g.31496350= NC_000023.11:g.31478233= NM_000109.4:c.8786= NP_000100.3:p.Arg2929= NM_004006.2:c.8810A>G NP_003997.1:p.Gln2937Arg NM_004009.3:c.8798A>G NP_004000.1:p.Gln2933Arg NM_004010.3:c.8441A>G NP_004001.1:p.Gln2814Arg NM_004011.4:c.4787= NP_004002.3:p.Arg1596= NM_004012.4:c.4778= NP_004003.2:p.Arg1593= NM_004013.2:c.1430A>G NP_004004.1:p.Gln477Arg NM_004014.2:c.623A>G NP_004005.1:p.Gln208Arg NM_004020.3:c.1430A>G NP_004011.2:p.Gln477Arg NM_004021.3:c.1430= NP_004012.2:p.Arg477= NM_004022.2:c.1430A>G NP_004013.1:p.Gln477Arg NM_004023.3:c.1430= NP_004014.2:p.Arg477= LRG_199t1:c.8810A>G LRG_199p1:p.Gln2937Arg LRG_199:g.1866377A>G NG_012232.1:g.1866377A>G criteria provided, multiple submitters, no conflicts not specified not provided Becker muscular dystrophy Duchenne muscular dystrophy Benign not provided The term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified. not specified AllHighlyPenetrant The term 'not specified' was created for use in ClinVar so that submitters can convey the concept that a variant is benign, likely benign, or of uncertain significance for an unspecified set of disorders. This usage was introduced in 2014 to replace AllHighlyPenetrant. Becker muscular dystrophy BMD The dystrophinopathies cover a spectrum of X-linked muscle disease ranging from mild to severe that includes Duchenne muscular dystrophy, Becker muscular dystrophy, and DMD-associated dilated cardiomyopathy (DCM). The mild end of the spectrum includes the phenotypes of asymptomatic increase in serum concentration of creatine phosphokinase (CK) and muscle cramps with myoglobinuria. The severe end of the spectrum includes progressive muscle diseases that are classified as Duchenne/Becker muscular dystrophy when skeletal muscle is primarily affected and as DMD-associated dilated cardiomyopathy (DCM) when the heart is primarily affected. Duchenne muscular dystrophy (DMD) usually presents in early childhood with delayed motor milestones including delays in walking independently and standing up from a supine position. Proximal weakness causes a waddling gait and difficulty climbing stairs, running, jumping, and standing up from a squatting position. DMD is rapidly progressive, with affected children being wheelchair dependent by age 12 years. Cardiomyopathy occurs in almost all individuals with DMD after age 18 years. Few survive beyond the third decade, with respiratory complications and progressive cardiomyopathy being common causes of death. Becker muscular dystrophy (BMD) is characterized by later-onset skeletal muscle weakness. With improved diagnostic techniques, it has been recognized that the mild end of the spectrum includes men with onset of symptoms after age 30 years who remain ambulatory even into their 60s. Despite the milder skeletal muscle involvement, heart failure from DCM is a common cause of morbidity and the most common cause of death in BMD. Mean age of death is in the mid-40s. DMD-associated DCM is characterized by left ventricular dilation and congestive heart failure. Females heterozygous for a DMD pathogenic variant are at increased risk for DCM. 16322188 20301298 NBK1119 25313375 Duchenne muscular dystrophy DMD The dystrophinopathies cover a spectrum of X-linked muscle disease ranging from mild to severe that includes Duchenne muscular dystrophy, Becker muscular dystrophy, and DMD-associated dilated cardiomyopathy (DCM). The mild end of the spectrum includes the phenotypes of asymptomatic increase in serum concentration of creatine phosphokinase (CK) and muscle cramps with myoglobinuria. The severe end of the spectrum includes progressive muscle diseases that are classified as Duchenne/Becker muscular dystrophy when skeletal muscle is primarily affected and as DMD-associated dilated cardiomyopathy (DCM) when the heart is primarily affected. Duchenne muscular dystrophy (DMD) usually presents in early childhood with delayed motor milestones including delays in walking independently and standing up from a supine position. Proximal weakness causes a waddling gait and difficulty climbing stairs, running, jumping, and standing up from a squatting position. DMD is rapidly progressive, with affected children being wheelchair dependent by age 12 years. Cardiomyopathy occurs in almost all individuals with DMD after age 18 years. Few survive beyond the third decade, with respiratory complications and progressive cardiomyopathy being common causes of death. Becker muscular dystrophy (BMD) is characterized by later-onset skeletal muscle weakness. With improved diagnostic techniques, it has been recognized that the mild end of the spectrum includes men with onset of symptoms after age 30 years who remain ambulatory even into their 60s. Despite the milder skeletal muscle involvement, heart failure from DCM is a common cause of morbidity and the most common cause of death in BMD. Mean age of death is in the mid-40s. DMD-associated DCM is characterized by left ventricular dilation and congestive heart failure. Females heterozygous for a DMD pathogenic variant are at increased risk for DCM. 15642897 16322188 18079231 19945913 19945914 20301298 NBK1119 20301604 NBK1431 20597083 https://www.orpha.net/data/patho/Pro/en/Emergency_DuchenneMuscularDystrophy-enPro13913.pdf Orphanet, Duchenne muscular dystrophy, 2013 current criteria provided, single submitter Benign variation to disease Athena Diagnostics Criteria 26467025 germline human unknown clinical testing not provided Variation p.Gln2937Arg NM_004006.2:c.8810A>G Benign_2017 current criteria provided, single submitter Benign This variant is considered likely benign or benign based on one or more of the following criteria: it is a conservative change, it occurs at a poorly conserved position in the protein, it is predicted to be benign by multiple in silico algorithms, and/or has population frequency not consistent with disease. variation to disease GeneDX Variant Classification (06012015) https://submit.ncbi.nlm.nih.gov/ft/byid/7oynscmk/mdi-5616_26957_genedx_interprules_final_061215.pdf germline human yes clinical testing not provided Variation NM_004006.2:c.8810A>G not specified SUB3839901 current criteria provided, single submitter Benign variation to disease LabCorp Variant Classification Summary - May 2015 https://submit.ncbi.nlm.nih.gov/ft/byid/pttb9itm/labcorp_variant_classification_method_-_may_2015.pdf germline human unknown clinical testing not provided Variation NM_004006.2:c.8810A>G not specified SUB5494893 current criteria provided, single submitter Benign variation to disease Nykamp K et al. (Genet Med 2017) 28492532 germline human unknown clinical testing not provided Variation NM_004006.2:c.8810A>G not provided SUB5321749 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/VCVs/VCV000431749.xml ================================================ current Homo sapiens 1p36.31 601142 1p36.31 607215 GRCh37/hg19 1p36.31(chr1:6051187-6158763) copy number gain 1p36.31 no interpretation for the single variant no interpretation for the single variant no interpretation for the single variant ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/VCVs/VCV000476472.xml ================================================ current Homo sapiens 21q22.3 120240 NM_001849.3(COL6A2):c.2697G>A (p.Thr899=) NC_000021.9:46132188:G:A single nucleotide variant 21q22.3 NC_000021.8:g.47552103G>A NC_000021.9:g.46132189G>A NM_001849.3:c.2697G>A NP_001840.3:p.Thr899= LRG_476t1:c.2697G>A LRG_476p1:p.Thr899= NG_008675.1:g.39071G>A LRG_476:g.39071G>A criteria provided, conflicting interpretations Bethlem myopathy 1 not specified Collagen VI-related myopathy Conflicting interpretations of pathogenicity Likely benign(2);Uncertain significance(1) Likely benign(2);Uncertain significance(1) Likely benign Myopathy, benign congenital, with contractures Muscular dystrophy, benign congenital Bethlem myopathy 1 MUSCULAR DYSTROPHY, LIMB-GIRDLE, AUTOSOMAL DOMINANT 5 BTHLM1 LGMDD5 Collagen type VI-related disorders represent a continuum of overlapping phenotypes with Bethlem myopathy at the mild end, Ullrich congenital muscular dystrophy (CMD) at the severe end, and two rare, less well-defined disorders – autosomal dominant limb-girdle muscular dystrophy and autosomal recessive myosclerosis myopathy – in between. Although Bethlem myopathy and Ullrich CMD were defined long before their molecular basis was known, they remain useful for clarification of prognosis and management. Bethlem myopathy, characterized by the combination of proximal muscle weakness and variable contractures, affects most frequently the long finger flexors, elbows, and ankles. Onset may be prenatal (characterized by decreased fetal movements), neonatal (hypotonia or torticollis), in early childhood (delayed motor milestones, muscle weakness, and contractures), or in adulthood (proximal weakness and Achilles tendon or long finger flexor contractures). Because of slow progression, more than two thirds of affected individuals over age 50 years rely on supportive means for outdoor mobility. Respiratory involvement is rare and appears to be related to more severe muscle weakness in later life. Ullrich CMD is characterized by congenital weakness and hypotonia, proximal joint contractures, and striking hyperlaxity of distal joints. Some affected children acquire the ability to walk independently; however, progression of the disease often results in later loss of ambulation. Early and severe respiratory involvement may require ventilatory support in the first or second decade of life. 20301676 NBK1503 21078917 20301468 NBK1291 Collagen VI-related myopathy Collagen 6-related myopathy COL6-RM Collagen type VI-related disorders represent a continuum of overlapping phenotypes with Bethlem myopathy at the mild end, Ullrich congenital muscular dystrophy (CMD) at the severe end, and two rare, less well-defined disorders – autosomal dominant limb-girdle muscular dystrophy and autosomal recessive myosclerosis myopathy – in between. Although Bethlem myopathy and Ullrich CMD were defined long before their molecular basis was known, they remain useful for clarification of prognosis and management. Bethlem myopathy, characterized by the combination of proximal muscle weakness and variable contractures, affects most frequently the long finger flexors, elbows, and ankles. Onset may be prenatal (characterized by decreased fetal movements), neonatal (hypotonia or torticollis), in early childhood (delayed motor milestones, muscle weakness, and contractures), or in adulthood (proximal weakness and Achilles tendon or long finger flexor contractures). Because of slow progression, more than two thirds of affected individuals over age 50 years rely on supportive means for outdoor mobility. Respiratory involvement is rare and appears to be related to more severe muscle weakness in later life. Ullrich CMD is characterized by congenital weakness and hypotonia, proximal joint contractures, and striking hyperlaxity of distal joints. Some affected children acquire the ability to walk independently; however, progression of the disease often results in later loss of ambulation. Early and severe respiratory involvement may require ventilatory support in the first or second decade of life. 20301676 NBK1503 AllHighlyPenetrant not specified The term 'not specified' was created for use in ClinVar so that submitters can convey the concept that a variant is benign, likely benign, or of uncertain significance for an unspecified set of disorders. This usage was introduced in 2014 to replace AllHighlyPenetrant. current criteria provided, single submitter Likely benign This variant is considered likely benign or benign based on one or more of the following criteria: it is a conservative change, it occurs at a poorly conserved position in the protein, it is predicted to be benign by multiple in silico algorithms, and/or has population frequency not consistent with disease. variation to disease GeneDX Variant Classification (06012015) https://submit.ncbi.nlm.nih.gov/ft/byid/7oynscmk/mdi-5616_26957_genedx_interprules_final_061215.pdf germline human yes clinical testing not provided Variation NM_001849.3:c.2697G>A not specified SUB3839901 current criteria provided, single submitter Likely benign variation to disease Invitae Variant Classification Sherloc (09022015) 28492532 germline human unknown clinical testing not provided Variation NM_001849.3:c.2697G>A Bethlem myopathy 1 SUB6897608 current criteria provided, single submitter Uncertain significance This variant was observed in the ICSL laboratory as part of a predisposition screen in an ostensibly healthy population. It had not been previously curated by ICSL or reported in the Human Gene Mutation Database (HGMD: prior to June 1st, 2018), and was therefore a candidate for classification through an automated scoring system. Utilizing variant allele frequency, disease prevalence and penetrance estimates, and inheritance mode, an automated score was calculated to assess if this variant is too frequent to cause the disease. Based on the score, this variant could not be ruled out of causing disease and therefore its association with disease required further investigation. A literature search was performed for the gene, cDNA change, and amino acid change (if applicable). No publications were found based on this search. This variant was therefore classified as a variant of unknown significance for this disease. variation to disease ICSL Variant Classification Criteria 13 December 2019 https://submit.ncbi.nlm.nih.gov/ft/byid/r0x0xrmc/icsl_variant_classification_criteria_13_december_2019.pdf germline human unknown clinical testing not provided Variation NM_001849.3:c.2697G>A Collagen VI-related myopathy SUB6641900 ================================================ FILE: UnitTests/Resources/ClinVarXmlFiles/VCVs/VCV000618791.xml ================================================ currentHomo sapiens1p36.22602839NM_005026.5(PIK3CD):c.231G>A (p.Ala77=)NC_000001.11:9715629:G:Asingle nucleotide variant1p36.22LRG_191t1:c.231G>ALRG_191:g.68899G>ANC_000001.11:g.9715630G>ANM_001350234.2:c.231G>ANP_001337163.1:p.Ala77=NM_001350235.1:c.231G>ANP_001337164.1:p.Ala77=NM_005026.5:c.231G>ANP_005017.3:p.Ala77=NG_023434.1:g.68899G>ANC_000001.10:g.9775688G>Acriteria provided, multiple submitters, no conflictsnot providedImmunodeficiency 14Likely benignnone providednot providedThe term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified.Immunodeficiency 14p110-DELTA-ACTIVATING MUTATION CAUSING SENESCENT T CELLS, LYMPHADENOPATHY, AND IMMUNODEFICIENCYIMMUNODEFICIENCY 14A, AUTOSOMAL DOMINANTIMD14APASLIcurrentcriteria provided, single submitterLikely benignThe c.231G>A variant (rs756139699) does not alter the amino acid sequence of the PIK3CD protein and computational splice site prediction algorithms do not predict a change in the nearest splice site or creation of a cryptic splice site. This variant has not been reported in association with primary antibody deficiency in medical literature or in gene specific variation databases. This variant is listed in the genome Aggregation Database (gnomAD) with an overall population frequency of 0.006 percent (identified on 17 out of 276,374 chromosomes). Based on these observations, the c.231G>A variant is likely to be benign.variation to diseaseARUP Molecular Germline Variant Investigation Processhttps://submit.ncbi.nlm.nih.gov/ft/byid/w2yp3qyt/arup_molecular_germline_variant_investigation_process.pdfgermlinehumanunknownclinical testingnot providedVariationp.Ala77AlaNM_005026.3:c.231G>ANot ProvidedSUB4618058currentcriteria provided, single submitterLikely benignvariation to diseaseInvitae Variant Classification Sherloc (09022015)28492532germlinehumanunknownclinical testingnot providedVariationNC_000001.10:g.9775688G>ASUB8755776 ================================================ FILE: UnitTests/Resources/SA/CosmicCNV.tsv ================================================ CNV_ID ID_GENE gene_name ID_SAMPLE ID_TUMOUR Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 SAMPLE_NAME TOTAL_CN MINOR_ALLELE MUT_TYPE ID_STUDY GRCh Chromosome:G_Start..G_Stop 6119374 68055 LGALS9C 683665 611825 haematopoietic_and_lymphoid_tissue NS NS NS lymphoid_neoplasm plasma_cell_myeloma NS NS MC-CAR 0 0 loss 619 37 17:18358950..18464587 6119374 107031 FAM106A 683665 611825 haematopoietic_and_lymphoid_tissue NS NS NS lymphoid_neoplasm plasma_cell_myeloma NS NS MC-CAR 0 0 loss 619 37 17:18358950..18464587 6128754 69785 DAZ2 683665 611825 haematopoietic_and_lymphoid_tissue NS NS NS lymphoid_neoplasm plasma_cell_myeloma NS NS MC-CAR 7 0 gain 619 37 Y:24624108..26404340 6119398 94344 PCDH11Y_ENST00000215473 683665 611825 haematopoietic_and_lymphoid_tissue NS NS NS lymphoid_neoplasm plasma_cell_myeloma NS NS MC-CAR 0 0 loss 619 37 Y:5532303..5565780 6128754 103307 DAZ1 683665 611825 haematopoietic_and_lymphoid_tissue NS NS NS lymphoid_neoplasm plasma_cell_myeloma NS NS MC-CAR 7 0 gain 619 37 Y:24624108..26404340 6128754 66769 PRY 683665 611825 haematopoietic_and_lymphoid_tissue NS NS NS lymphoid_neoplasm plasma_cell_myeloma NS NS MC-CAR 7 0 gain 619 37 Y:24624108..26404340 6128754 106281 DAZ1_ENST00000382510 683665 611825 haematopoietic_and_lymphoid_tissue NS NS NS lymphoid_neoplasm plasma_cell_myeloma NS NS MC-CAR 7 0 gain 619 37 Y:24624108..26404340 6128756 75765 BPY2B 683665 611825 haematopoietic_and_lymphoid_tissue NS NS NS lymphoid_neoplasm plasma_cell_myeloma NS NS MC-CAR 10 0 gain 619 37 Y:26409790..27684355 6128756 69787 DAZ3 683665 611825 haematopoietic_and_lymphoid_tissue NS NS NS lymphoid_neoplasm plasma_cell_myeloma NS NS MC-CAR 10 0 gain 619 37 Y:26409790..27684355 610835 95782 MT-CYB_ENST00000361789 2384185 2247017 skin NS NS NS carcinoma NS NS NS ML_33_T_01 gain 656 37 25:2..15814 ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/not_sa.txt ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa1.nsa ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa1.nsa.idx ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa2.nsa ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa2.nsa.idx ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa3.nsi ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa4.nsi ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa5.npd ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa5.npd.idx ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa6.nga ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa7.nga ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa8.rma ================================================  ================================================ FILE: UnitTests/Resources/SA/MockSaFiles/sa8.rma.idx ================================================  ================================================ FILE: UnitTests/Resources/TinyAnnotated.json ================================================ {"header":{"annotator":"Illumina Annotation Engine 1.3.3.1633","creationTime":"2016-12-09 09:49:24","genomeAssembly":"GRCh37","schemaVersion":4,"dataVersion":"84.22.36","dataSources":[{"name":"VEP","version":"84","description":"Ensembl","releaseDate":"2016-04-29"},{"name":"phyloP","version":"hg19","description":"46 way conservation score between humans and 45 other vertebrates","releaseDate":"2009-11-10"},{"name":"OMIM","version":"unknown","description":"An Online Catalog of Human Genes and Genetic Disorders","releaseDate":"2016-09-02"},{"name":"dbSNP","version":"147","description":"Identifiers for observed variants","releaseDate":"2016-06-01"},{"name":"COSMIC","version":"78","description":"Somatic mutation and related details and information relating to human cancers","releaseDate":"2016-09-05"},{"name":"1000 Genomes Project","version":"Phase 3 v5a","description":"A public catalogue of human variation and genotype data","releaseDate":"2013-05-27"},{"name":"EVS","version":"2","releaseDate":"2013-11-13"},{"name":"ExAC","version":"0.3.1","description":"Allele frequency data from the ExAC project","releaseDate":"2016-03-16"},{"name":"ClinVar","version":"unknown","description":"A freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence","releaseDate":"2016-09-01"},{"name":"DGV","version":"unknown","description":"Provides a comprehensive summary of structural variation in the human genome","releaseDate":"2016-05-15"},{"name":"ClinGen","version":"unknown","releaseDate":"2016-04-14"}]},"positions":[ {"chromosome":"chr1","refAllele":"TCC","position":9775924,"altAlleles":["TTT"],"cytogeneticBand":"1p36.22","variants":[{"altAllele":"TT","refAllele":"CC","begin":9775925,"chromosome":"chr1","end":9775926,"variantType":"MNV","vid":"1:9775925:9775926:TT","cosmic":[{"id":"COSM4517654","isAlleleSpecific":true,"refAllele":"CC","altAllele":"TT","gene":"PIK3CD","sampleCount":1,"studies":[{"histology":"carcinoma","primarySite":"skin"}]},{"id":"COSM4517655","isAlleleSpecific":true,"refAllele":"CC","altAllele":"TT","gene":"PIK3CD_ENST00000536656","sampleCount":1,"studies":[{"histology":"carcinoma","primarySite":"skin"}]}],"transcripts":{"ensembl":[{"transcript":"ENST00000536656","bioType":"protein_coding","aminoAcids":"S/F","cDnaPos":"597-598","codons":"tCC/tTT","cdsPos":"389-390","exons":"5/24","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["missense_variant"],"hgvsc":"ENST00000536656.1:c.389_390delCCinsTT","hgvsp":"ENSP00000446444.1:p.Ser130Phe","polyPhenScore":0.781,"polyPhenPrediction":"possibly damaging","proteinId":"ENSP00000446444","proteinPos":"130","siftScore":0,"siftPrediction":"deleterious"},{"transcript":"ENST00000377346","bioType":"protein_coding","aminoAcids":"S/F","cDnaPos":"584-585","codons":"tCC/tTT","cdsPos":"389-390","exons":"5/24","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["missense_variant"],"hgvsc":"ENST00000377346.4:c.389_390delCCinsTT","hgvsp":"ENSP00000366563.4:p.Ser130Phe","isCanonical":true,"polyPhenScore":0.786,"polyPhenPrediction":"possibly damaging","proteinId":"ENSP00000366563","proteinPos":"130","siftScore":0,"siftPrediction":"deleterious"},{"transcript":"ENST00000361110","bioType":"protein_coding","aminoAcids":"S/F","cDnaPos":"504-505","codons":"tCC/tTT","cdsPos":"389-390","exons":"4/23","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["missense_variant"],"hgvsc":"ENST00000361110.2:c.389_390delCCinsTT","hgvsp":"ENSP00000354410.2:p.Ser130Phe","polyPhenScore":0.781,"polyPhenPrediction":"possibly damaging","proteinId":"ENSP00000354410","proteinPos":"130","siftScore":0,"siftPrediction":"deleterious"},{"transcript":"ENST00000481137","bioType":"retained_intron","cDnaPos":"90-91","exons":"2/2","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["non_coding_transcript_exon_variant","non_coding_transcript_variant"],"hgvsc":"ENST00000481137.1:n.90_91delCCinsTT"},{"transcript":"ENST00000479223","bioType":"retained_intron","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["upstream_gene_variant"]},{"transcript":"ENST00000543390","bioType":"protein_coding","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["upstream_gene_variant"],"proteinId":"ENSP00000443811"}]}}]}, {"chromosome":"chr1","refAllele":"G","position":9777113,"altAlleles":["GCC"],"cytogeneticBand":"1p36.22","variants":[{"altAllele":"CC","refAllele":"-","begin":9777114,"chromosome":"chr1","end":9777113,"variantType":"insertion","vid":"1:9777114:9777113:CC","regulatoryRegions":[{"id":"ENSR00000530352","consequence":["regulatory_region_variant"]}],"cosmic":[{"id":"COSM1474274","isAlleleSpecific":true,"refAllele":"-","altAllele":"CC","gene":"PIK3CD","sampleCount":1,"studies":[{"id":414,"histology":"carcinoma","primarySite":"breast"}]},{"id":"COSM5832706","refAllele":"-","altAllele":"NN","gene":"PIK3CD_ENST00000536656","sampleCount":1,"studies":[{"id":414,"histology":"carcinoma","primarySite":"breast"}]}],"transcripts":{"ensembl":[{"transcript":"ENST00000536656","bioType":"protein_coding","introns":"6/23","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["intron_variant"],"hgvsc":"ENST00000536656.1:c.781-4_781-3dupCC","proteinId":"ENSP00000446444"},{"transcript":"ENST00000377346","aminoAcids":"A/AX","cDnaPos":"1072-1073","codons":"gcc/gCCcc","cdsPos":"877-878","exons":"7/24","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["frameshift_variant"],"hgvsc":"ENST00000377346.4:c.882_883dupCC","hgvsp":"ENSP00000366563.4:p.Gln295ProfsTer40","isCanonical":true,"proteinId":"ENSP00000366563","proteinPos":"293"},{"transcript":"ENST00000361110","bioType":"protein_coding","introns":"5/22","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["intron_variant"],"hgvsc":"ENST00000361110.2:c.781-4_781-3dupCC","proteinId":"ENSP00000354410"},{"transcript":"ENST00000481137","bioType":"retained_intron","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["downstream_gene_variant"]},{"transcript":"ENST00000479223","bioType":"retained_intron","cDnaPos":"441-442","exons":"2/3","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["non_coding_transcript_exon_variant","non_coding_transcript_variant"],"hgvsc":"ENST00000479223.1:n.446_447dupCC"},{"transcript":"ENST00000543390","bioType":"protein_coding","geneId":"ENSG00000171608","hgnc":"PIK3CD","consequence":["upstream_gene_variant"],"proteinId":"ENSP00000443811"}]}}]} ]} ================================================ FILE: UnitTests/Resources/cosm5428243.tsv ================================================ Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample source Tumour origin Age FAM138A ENST00000417324 258 CN-AML-CR-42-Dx 2340530 2205513 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS y COSM5428243 c.82T>C p.S28P Substitution - Missense u 37 1:35416-35416 - n Confirmed somatic variant 544 blood-bone marrow primary 69 FAM138A ENST00000417324 258 CN-AML-CR-42-Dx 2340530 2205513 haematopoietic;lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS y COSM5428243 c.82T>C p.S28P Substitution - Missense u 37 1:35416-35416 - n Confirmed somatic variant 544 t-bone marrow primary 81 ================================================ FILE: UnitTests/Resources/cosm5428243.vcf ================================================ 1 35416 COSM5428243 A G . . GENE=FAM138A;STRAND=-;CDS=c.82T>C;AA=p.S28P;CNT=1 ================================================ FILE: UnitTests/Resources/dbSNP.version ================================================ NAME=dbSNP VERSION=147 DATE=2016-04-08 DESCRIPTION= ================================================ FILE: UnitTests/Resources/manifest.txt ================================================ ClinGen_Dosage_Sensitivity_Map_20190507.nga not_exist.nsa gnomAD_gene_scores_2.1.nga another_fake_file.nsi OMIM_20190812.nga ================================================ FILE: UnitTests/Resources/mini.WigFix ================================================ fixedStep chrom=chr1 start=100 step=1 0.064 0.058 0.064 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.000 0.000 0.000 0.000 0.000 0.000 0.058 fixedStep chrom=chr1 start=175 step=1 0.064 0.058 0.064 0.058 0.064 0.058 0.058 -2.088 0.064 0.058 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 -2.363 0.064 0.064 0.064 0.064 0.000 0.064 0.064 0.058 0.064 0.064 -2.096 0.064 -2.039 0.064 0.064 0.064 0.064 0.064 -2.363 0.064 -2.381 0.064 0.064 0.064 -2.305 0.064 0.058 0.064 fixedStep chrom=chr1 start=250 step=1 0.058 0.064 0.000 0.064 0.058 -2.305 0.064 0.064 0.064 0.058 0.058 -2.096 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.058 0.064 0.058 0.058 -2.088 0.064 0.058 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 -2.363 0.064 0.064 0.064 0.064 0.000 0.064 0.064 0.058 0.064 0.064 -2.096 0.064 -2.039 0.064 0.064 0.064 0.064 0.064 -2.363 0.064 -2.381 0.064 0.064 0.064 -2.305 0.064 0.058 0.064 0.064 0.058 0.064 0.058 0.064 0.058 0.058 -2.088 0.064 0.058 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 -2.363 0.064 0.064 0.064 0.064 0.000 0.064 0.064 0.058 0.064 0.064 -2.096 0.064 -2.039 0.064 0.064 0.064 0.064 0.064 -2.363 0.064 -2.381 0.064 0.064 0.064 -2.305 0.064 0.058 0.064 0.064 0.058 0.064 0.058 0.064 0.058 0.058 -2.088 0.064 0.058 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 -2.363 0.064 0.064 0.064 0.064 0.000 0.064 0.064 0.058 0.064 0.064 -2.096 0.064 -2.039 0.064 0.064 0.064 0.064 0.064 -2.363 0.064 -2.381 0.064 0.064 0.064 -2.305 0.064 0.058 0.064 fixedStep chrom=chr2 start=100 step=1 0.064 0.058 0.064 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.000 0.000 0.000 0.000 0.000 0.000 0.058 fixedStep chrom=chr2 start=175 step=1 0.064 0.058 0.064 0.058 0.064 0.058 0.058 -2.088 0.064 0.058 0.058 0.064 0.064 0.064 0.064 0.064 0.058 0.064 0.064 -2.363 0.064 0.064 0.064 0.064 0.000 0.064 0.064 0.058 0.064 0.064 -2.096 0.064 -2.039 0.064 0.064 0.064 0.064 0.064 -2.363 0.064 -2.381 0.064 0.064 0.064 -2.305 0.064 0.058 0.064 ================================================ FILE: UnitTests/Resources/testDgvParser.txt ================================================ variantaccession chr start end varianttype variantsubtype reference pubmedid method platform mergedvariants supportingvariants mergedorsample frequency samplesize observedgains observedlosses cohortdescription genes samples nsv945265 1 352306 371739 OTHER complex Sudmant_et_al_2013 23825009 Oligo aCGH,Sequencing nssv1679650,nssv1756446,nssv1677687,nssv1756463,nssv1677742,nssv1679594,nssv1677709,nssv1756442,nssv1677642,nssv1756473,nssv1686360,nssv1756477,nssv1683216,nssv1756482,nssv1679661,nssv1756449,nssv1684709,nssv1756454,nssv1677754,nssv1756483,nssv1679639,nssv1682067,nssv1679628,nssv1756453,nssv1679627,nssv1756467,nssv1679672,nssv1756476,nssv1756443,nssv1680921,nssv1677665,nssv1756451,nssv1677731,nssv1756478,nssv1682237,nssv1679775,nssv1677641,nssv1756468,nssv1677720,nssv1756447,nssv1677753,nssv1756484,nssv1684698,nssv1756440,nssv1679705,nssv1756455,nssv1679716,nssv1756481,nssv1677676,nssv1756437,nssv1756439,nssv1756464,nssv1677698,nssv1679492,nssv1679694,nssv1756457,nssv1679616,nssv1679073,nssv1756444,nssv1756458,nssv1677643,nssv1677927,nssv1677765,nssv1682178,nssv1679738,nssv1756445,nssv1679683,nssv1679761,nssv1677654,nssv1756469,nssv1756462,nssv1679727,nssv1677816,nssv1756460,nssv1756480,nssv1756470,nssv1756461,nssv1756471,nssv1681956,nssv1756474,nssv1756452,nssv1756448,nssv1679739,nssv1756450,nssv1756475,nssv1756459,nssv1756472,nssv1756479,nssv1756456,nssv1678962,nssv1756465,nssv1680810,nssv1756441,nssv1679605,nssv1679750,nssv1756438,nssv1756466 M 97 10 0 OR4F16,OR4F29,OR4F3 HGDP00456,HGDP00521,HGDP00542,HGDP00665,HGDP00778,HGDP00927,HGDP00998,HGDP01029,HGDP01284,HGDP01307 nsv161172 1 88190 89153 CNV deletion Mills_et_al_2006 16902084 Sequencing nssv179750 M 24 nsv951399 1 46501 71800 CNV duplication Dogan_et_al_2014 24416366 Sequencing nssv2997203 M 1 1 0 OR4F5 BILGI_BIOE nsv471522 1 522139 756783 CNV gain Alkan_et_al_2009 19718026 Oligo aCGH,Sequencing nssv547898,nssv547899,nssv547897 M 3 3 0 FAM87B,LOC100133331,LOC100288069,MIR6723,OR4F16,OR4F29,OR4F3 JDW,NA18507,YH nsv10161 1 712111 1708649 CNV gain+loss Perry_et_al_2008 18304495 Oligo aCGH nssv24602,nssv24600,nssv18108,nssv28946,nssv28533,nssv26882,nssv21436,nssv21448,nssv21431,nssv28031,nssv28029,nssv26877,nssv24610,nssv28921,nssv28038,nssv26876,nssv24621,nssv28552,nssv28048,nssv28542,nssv28940,nssv26892,nssv21429,nssv26879,nssv28558,nssv26888 M 31 11 7 ACAP3,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,C1orf170,C1orf233,CCNL2,CDK11A,CDK11B,CPSF3L,DVL1,FAM132A,FAM41C,FAM87B,GLTPD1,HES4,ISG15,KLHL17,LINC00115,LINC01128,LOC100130417,LOC100288069,LOC148413,LOC254099,MIB2,MIR200A,MIR200B,MIR429,MIR6726,MIR6727,MIR6808,MMP23A,MMP23B,MRPL20,MXRA8,NADK,NOC2L,PLEKHN1,PUSL1,RNF223,SAMD11,SCNN1D,SDF4,SLC35E2,SLC35E2B,SSU72,TAS1R3,TMEM240,TMEM88B,TNFRSF18,TNFRSF4,TTLL10,UBE2J2,VWA1 NA07048,NA10839,NA10863,NA12740,NA12872,NA18504,NA18537,NA18552,NA18564,NA18572,NA18972,NA19144,NA19173,NA19221 esv3358119 1 822853 822861 CNV insertion 1000_Genomes_Consortium_Pilot_Project 20981092 Digital array,Oligo aCGH,PCR,Sequencing essv7863668,essv7863667 M 185 2 0 NA12005,NA18953 esv6890 1 17006189 17052558 OTHER inversion Ahn_et_al_2009 19470904 Sequencing essv29331 M 1 0 0 ESPNP,MIR3675 esv6517 1 964760 965579 CNV loss Ahn_et_al_2009 19470904 Sequencing essv28958 M 1 0 0 AGRN esv3310333 1 17441132 17441133 CNV mobile element insertion 1000_Genomes_Consortium_Pilot_Project 20981092 Digital array,Oligo aCGH,PCR,Sequencing essv7837611,essv7836931,essv7838900 M 185 3 0 PADI2 NA19238,NA19239,NA19240 nsv479682 1 3787207 3787207 CNV novel sequence insertion Kidd_et_al_2010 20440878 Oligo aCGH,Sequencing nssv3012592 M 9 0 0 DFFB nsv506926 1 34597680 34603680 OTHER sequence alteration Teague_et_al_2010 20534489 BAC aCGH,Oligo aCGH,Optical mapping,Sequencing nssv619231,nssv617529,nssv623267,nssv620650 M 4 0 0 CSMD2 CHM,NA10860,NA15510,NA18994 esv3302766 1 38583768 38583926 CNV tandem duplication 1000_Genomes_Consortium_Pilot_Project 20981092 Digital array,Oligo aCGH,PCR,Sequencing essv7736661,essv7732953,essv7735590 M 185 0 0 NA18563,NA18577,NA18582 ================================================ FILE: UnitTests/SAUtils/AnnotationItems/CosmicCnvItemTests.cs ================================================ using System.Collections.Generic; using System.IO; using SAUtils.ExtractCosmicSvs; using UnitTests.TestUtilities; using Variants; using Xunit; namespace UnitTests.SAUtils.AnnotationItems { public sealed class CosmicCnvItemTests { [Fact] public void Merge_add_new_items() { var item1 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} }, 1); var item2 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology3", 1}, {"histology4", 2} }, new Dictionary { { "tissue3", 2}, { "tissue4", 1} },2); item1.Merge(item2); Assert.Equal(4, item1.CancerTypeCount); Assert.Equal(4, item1.TissueTypeCount); } [Fact] public void GetJsonString() { var item1 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} },2); Assert.Equal("\"id\":1,\"variantType\":\"copy_number_gain\",\"copyNumber\":3,\"cancerTypes\":[{\"histology1\":1},{\"histology2\":2}],\"tissueTypes\":[{\"tissue1\":2},{\"tissue2\":1}]", item1.GetJsonString()); } [Fact] public void GetJsonString_unspecified_copy_number() { var item1 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, -1, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} }, 2); Assert.Equal("\"id\":1,\"variantType\":\"copy_number_gain\",\"cancerTypes\":[{\"histology1\":1},{\"histology2\":2}],\"tissueTypes\":[{\"tissue1\":2},{\"tissue2\":1}]", item1.GetJsonString()); } [Fact] public void Merge_same_histology_site() { var item1 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} },1); var item2 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} },2); item1.Merge(item2); Assert.Equal(2, item1.CancerTypeCount); Assert.Equal(2, item1.TissueTypeCount); } [Fact] public void Merge_avoid_double_counting() { var item1 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} }, 1); var item2 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} }, 1); item1.Merge(item2); Assert.Equal("\"id\":1,\"variantType\":\"copy_number_gain\",\"copyNumber\":3,\"cancerTypes\":[{\"histology1\":1},{\"histology2\":2}],\"tissueTypes\":[{\"tissue1\":2},{\"tissue2\":1}]", item1.GetJsonString()); } [Fact] public void Merge_check_adjust_counts() { var item1 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} },1); var item2 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} },2); item1.Merge(item2); Assert.Equal("\"id\":1,\"variantType\":\"copy_number_gain\",\"copyNumber\":3,\"cancerTypes\":[{\"histology1\":2},{\"histology2\":4}],\"tissueTypes\":[{\"tissue1\":4},{\"tissue2\":2}]", item1.GetJsonString()); } [Fact] public void Merge_throws_exception_if_cnvs_differ() { var item1 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_loss, 0, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} },1); var item2 = new CosmicCnvItem(1, ChromosomeUtilities.Chr1, 100, 1000, VariantType.copy_number_gain, 3, new Dictionary { {"histology1", 1}, {"histology2", 2} }, new Dictionary { { "tissue1", 2}, { "tissue2", 1} },1); Assert.Throws(()=>item1.Merge(item2)); } } } ================================================ FILE: UnitTests/SAUtils/ClinGen/GeneDiseaseValidityTests.cs ================================================ using System.Collections.Generic; using System.IO; using SAUtils.ClinGen; using Xunit; namespace UnitTests.SAUtils.ClinGen { public sealed class GeneDiseaseValidityTests { private static Stream GetGeneValidityStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("CLINGEN GENE VALIDITY CURATIONS\t\t\t\t"); writer.WriteLine("FILE CREATED: 2019-12-02\t\t\t\t"); writer.WriteLine("WEBPAGE: https://search.clinicalgenome.org/kb/gene-validity \t\t\t\t"); writer.WriteLine("+++++++++++\t++++++++++++++\t+++++++++++++\t++++++++++++++++++\t+++++++++\t++++++++++++++\t+++++++++++++\t+++++++++++++++++++"); writer.WriteLine("GENE SYMBOL\tGENE ID (HGNC)\tDISEASE LABEL\tDISEASE ID (MONDO)\tSOP\tCLASSIFICATION\tONLINE REPORT\tCLASSIFICATION DATE"); writer.WriteLine("+++++++++++\t++++++++++++++\t+++++++++++++\t++++++++++++++++++\t+++++++++\t++++++++++++++\t+++++++++++++\t+++++++++++++++++++"); writer.WriteLine("A2ML1\tHGNC:23336\tNoonan syndrome with multiple lentigines\tMONDO_0007893\tSOP5\tNo Reported Evidence\thttps://search.clinicalgenome.org/kb/gene-validity/59b87033-dd91-4f1e-aec1-c9b1f5124b16--2018-06-07T14:37:47\t2018-06-07T14:37:47.175Z"); writer.WriteLine("A2ML1\tHGNC:23336\tcardiofaciocutaneous syndrome\tMONDO_0015280\tSOP5\tNo Reported Evidence\thttps://search.clinicalgenome.org/kb/gene-validity/fc3c41d8-8497-489b-a350-c9e30016bc6a--2018-06-07T14:31:03\t2018-06-07T14:31:03.696Z"); writer.WriteLine("A2ML1\tHGNC:23336\tCostello syndrome\tMONDO_0009026\tSOP5\tNo Reported Evidence\thttps://search.clinicalgenome.org/kb/gene-validity/ea72ba8d-cf62-44bc-86be-da64e3848eba--2018-06-07T14:34:05\t2018-06-07T14:34:05.324Z"); writer.WriteLine("AARS\tHGNC:20\tundetermined early-onset epileptic encephalopathy\tMONDO_0018614\tSOP6\tLimited\thttps://search.clinicalgenome.org/kb/gene-validity/ac62fe65-ee56-4146-9fe4-00dc1db2d958--2018-11-20T17:00:00\t2018-11-20T17:00:00.000Z"); writer.WriteLine("AASS\tHGNC:17366\thyperlysinemia (disease)\tMONDO_0009388\tSOP6\tModerate\thttps://search.clinicalgenome.org/kb/gene-validity/92e04f9e-f03e-4295-baac-e9fb6b48a258--2019-11-08T17:00:00\t2019-11-08T17:00:00.000Z"); writer.WriteLine("ABCC9\tHGNC:60\thypertrichotic osteochondrodysplasia Cantu type\tMONDO_0009406\tSOP4\tDefinitive\thttps://search.clinicalgenome.org/kb/gene-validity/10028\t2017-09-27T00:00:00"); //duplicate item writer.WriteLine("ABCC9\tHGNC:60\thypertrichotic osteochondrodysplasia Cantu type\tMONDO_0009406\tSOP4\tDefinitive\thttps://search.clinicalgenome.org/kb/gene-validity/10028\t2017-10-27T00:00:00"); writer.Flush(); stream.Position = 0; return stream; } private static Dictionary GetIdToSymbols() { return new Dictionary { { 23336,"A2ML1" }, { 20, "AARS"}, { 60, "ABCC9" } }; } [Fact] public void ParserTest() { var parser = new GeneDiseaseValidityParser(GetGeneValidityStream(), GetIdToSymbols()); var items = parser.GetItems(); Assert.Equal(3, items.Count); var firstGene = items["A2ML1"]; Assert.Equal(3, firstGene.Count); Assert.Equal("{\"diseaseId\":\"MONDO_0007893\",\"disease\":\"Noonan syndrome with multiple lentigines\",\"classification\":\"no reported evidence\",\"classificationDate\":\"2018-06-07\"}", firstGene[0].GetJsonString()); var thirdGene = items["ABCC9"]; Assert.Single(thirdGene); Assert.Equal("{\"diseaseId\":\"MONDO_0009406\",\"disease\":\"hypertrichotic osteochondrodysplasia Cantu type\",\"classification\":\"definitive\",\"classificationDate\":\"2017-10-27\"}", thirdGene[0].GetJsonString()); } } } ================================================ FILE: UnitTests/SAUtils/CosmicGeneFusions/Cache/TranscriptCacheTests.cs ================================================ using System.Collections.Generic; using System.IO; using Intervals; using SAUtils.CosmicGeneFusions.Cache; using UnitTests.MockedData; using UnitTests.TestUtilities; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.SAUtils.CosmicGeneFusions.Cache { public sealed class TranscriptCacheTests { [Theory] [InlineData("ENST00000646891.1", "ENSG00000157764", "BRAF")] [InlineData("ENST00000242365.4", "ENSG00000122778", "KIAA1549")] [InlineData("ENST00000311979.3", "ENSG00000172660", "TAF15")] [InlineData("ENST00000529193.1", "ENSG00000157613", "CREB3L1")] [InlineData("ENST00000312675.4", "ENSG00000145012", "LPP")] [InlineData("ENST00000556625.1", "ENSG00000258389", "DUX4")] public void HandleMissingTranscripts_ExpectedResults(string transcriptId, string expectedGeneId, string expectedGeneSymbol) { (string actualGeneId, string actualGeneSymbol) = TranscriptCache.HandleMissingTranscripts(transcriptId); Assert.Equal(expectedGeneId, actualGeneId); Assert.Equal(expectedGeneSymbol, actualGeneSymbol); } [Fact] public void HandleMissingTranscripts_UnknownTranscriptId_ThrowException() { Assert.Throws(delegate { TranscriptCache.HandleMissingTranscripts("ABC"); }); } [Fact] public void GetTranscriptIdToTranscript() { var chr1 = new IntervalArray(new Interval[] { new(Transcripts.ENST00000290663.Start, Transcripts.ENST00000290663.End, Transcripts.ENST00000290663), new(Transcripts.ENST00000370673.Start, Transcripts.ENST00000370673.End, Transcripts.ENST00000370673), new(Transcripts.ENST00000427819.Start, Transcripts.ENST00000427819.End, Transcripts.ENST00000427819) }); var chr2 = new IntervalArray(new Interval[] { new(Transcripts.ENST00000615053.Start, Transcripts.ENST00000615053.End, Transcripts.ENST00000615053), new(Transcripts.ENST00000347849.Start, Transcripts.ENST00000347849.End, Transcripts.ENST00000347849) }); var transcriptIntervalArrays = new IntervalArray[ChromosomeUtilities.RefIndexToChromosome.Count]; transcriptIntervalArrays[ChromosomeUtilities.Chr1.Index] = chr1; transcriptIntervalArrays[ChromosomeUtilities.Chr2.Index] = chr2; Dictionary idToTranscript = TranscriptCache.GetTranscriptIdToTranscript(transcriptIntervalArrays); Assert.Equal(10, idToTranscript.Count); Assert.True(idToTranscript.ContainsKey("ENST00000290663")); Assert.True(idToTranscript.ContainsKey("ENST00000290663.10")); } } } ================================================ FILE: UnitTests/SAUtils/CosmicGeneFusions/Conversion/CosmicConverterTests.cs ================================================ using System.Collections.Generic; using System.IO; using SAUtils.CosmicGeneFusions.Cache; using SAUtils.CosmicGeneFusions.Conversion; using VariantAnnotation.GeneFusions.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.SAUtils.CosmicGeneFusions.Conversion { public sealed class CosmicConverterTests { [Fact] public void Convert_ExpectedResults() { (TranscriptCache transcriptCache, ITranscript transcript, ITranscript transcript2) = HgvsRnaParserTests.GetTranscriptCache(); Dictionary> fusionIdToEntries = GetFusionIdToEntries(transcript, transcript2); ulong expectedFusionKey = GeneFusionKey.Create( GeneFusionKey.CreateGeneKey(transcript.Gene.EnsemblId.WithoutVersion), GeneFusionKey.CreateGeneKey(transcript2.Gene.EnsemblId.WithoutVersion)); string[] expectedJsonEntries = { "\"id\":\"COSF665\",\"numSamples\":1,\"geneSymbols\":[\"MED8\",\"PTPN18\"],\"hgvsr\":\"ENST00000290663.10(MED8):r.1_3555::ENST00000347849.7(PTPN18):r.2100_3452\",\"histologies\":[{\"name\":\"ductal carcinoma\",\"numSamples\":1}],\"sites\":[{\"name\":\"breast\",\"numSamples\":1}],\"pubMedIds\":[20033038]", "\"id\":\"COSF667\",\"numSamples\":1,\"geneSymbols\":[\"MED8\",\"PTPN18\"],\"hgvsr\":\"ENST00000290663.10(MED8):r.1_1234::ENST00000347849.7(PTPN18):r.5678_6789\",\"histologies\":[{\"name\":\"ductal carcinoma\",\"numSamples\":1}],\"sites\":[{\"name\":\"breast\",\"numSamples\":1}],\"pubMedIds\":[20033038]" }; Dictionary actualFusionKeyToJson = CosmicConverter.Convert(fusionIdToEntries, transcriptCache); Assert.Single(actualFusionKeyToJson); string[] actualJsonEntries = actualFusionKeyToJson[expectedFusionKey]; Assert.NotNull(actualJsonEntries); Assert.Equal(expectedJsonEntries, actualJsonEntries); } private static Dictionary> GetFusionIdToEntries(ITranscript transcript, ITranscript transcript2) { string transcriptId5 = transcript.Id.WithVersion; string geneSymbol5 = transcript.Gene.Symbol; string transcriptId3 = transcript2.Id.WithVersion; string geneSymbol3 = transcript2.Gene.Symbol; var rawGeneFusion = new RawCosmicGeneFusion(749711, 665, "breast", "NS", "carcinoma", "ductal carcinoma", $"{transcriptId5}({geneSymbol5}):r.1_3555_{transcriptId3}({geneSymbol3}):r.2100_3452", 20033038); var rawGeneFusion2 = new RawCosmicGeneFusion(749712, 667, "breast", "NS", "carcinoma", "ductal carcinoma", $"{transcriptId5}({geneSymbol5}):r.1_1234_{transcriptId3}({geneSymbol3}):r.5678_6789", 20033038); return new Dictionary> { [rawGeneFusion.FusionId] = new() {rawGeneFusion}, [rawGeneFusion2.FusionId] = new() {rawGeneFusion2} }; } [Fact] public void ToJsonArray_ExpectedResults() { var geneKeyToJsonList = new Dictionary> { [123] = new() {"A", "B", "C"}, [456] = new() {"A"}, [789] = new() }; Dictionary actualResults = geneKeyToJsonList.ToJsonArray(); Assert.Equal(3, actualResults.Count); Assert.Equal(3, actualResults[123].Length); Assert.Single(actualResults[456]); Assert.Empty(actualResults[789]); } [Fact] public void GetCosmicGeneFusion_NullHgvs_ReturnNull() { const string hgvsNotation = "ENST00000283243.12(PLA2R1):r.1_2802"; var fusionEntries = new HashSet { new(10, 0, null, null, null, null, hgvsNotation, 123) }; const ulong expectedFusionKey = 0; (ulong actualFusionKey, string actualJson) = CosmicConverter.GetCosmicGeneFusion(0, fusionEntries, null); Assert.Equal(expectedFusionKey, actualFusionKey); Assert.Null(actualJson); } [Fact] public void AggregateRawCosmicGeneFusions_ExpectedResults() { const int expectedNumSamples = 4; const int expectedNumPubMedIds = 2; const string hgvsNotation = "ENST00000000123.1(ABC):r.1_1000_ENST00000000456.2(DEF):r.300_2000"; const string expectedHgvsNotation = "ENST00000000123.1(ABC):r.1_1000::ENST00000000456.2(DEF):r.300_2000"; var fusionEntries = new HashSet { new(10, 0, null, null, null, null, hgvsNotation, 123), new(20, 0, null, null, null, null, hgvsNotation, 123), new(30, 0, null, null, null, null, hgvsNotation, 200), new(40, 0, null, null, null, null, hgvsNotation, 123) }; (int[] actualPubMedIds, int actualNumSamples, string actualHgvsNotation) = CosmicConverter.AggregateRawCosmicGeneFusions(fusionEntries); Assert.Equal(expectedNumSamples, actualNumSamples); Assert.Equal(expectedNumPubMedIds, actualPubMedIds.Length); Assert.Equal(expectedHgvsNotation, actualHgvsNotation); } [Fact] public void AggregateRawCosmicGeneFusions_MultipleHgvsStrings_ThrowException() { const string hgvsNotation = "ENST00000000123.1(ABC):r.1_1000_ENST00000000456.2(DEF):r.300_2000"; const string hgvsNotation2 = "ENST00000000789.3(GHI):r.1_1000_ENST00000000456.2(DEF):r.300_2000"; var fusionEntries = new HashSet { new(10, 0, null, null, null, null, hgvsNotation, 123), new(20, 0, null, null, null, null, hgvsNotation, 123), new(30, 0, null, null, null, null, hgvsNotation2, 200), new(40, 0, null, null, null, null, hgvsNotation, 123) }; Assert.Throws(delegate { CosmicConverter.AggregateRawCosmicGeneFusions(fusionEntries); }); } } } ================================================ FILE: UnitTests/SAUtils/CosmicGeneFusions/Conversion/HgvsRnaFixerTests.cs ================================================ using SAUtils.CosmicGeneFusions.Conversion; using Xunit; namespace UnitTests.SAUtils.CosmicGeneFusions.Conversion { public sealed class HgvsRnaFixerTests { [Theory] [InlineData("ENST00000415083.2(SS18):r.1_1286_ENST00000415083.2(SS18):r.1286+683_1286+701_ENST00000336777.5(SSX2):r.351_1410", "ENST00000415083.2(SS18):r.1_1286::ENST00000415083.2(SS18):r.1286+683_1286+701::ENST00000336777.5(SSX2):r.351_1410")] [InlineData("ENST00000397938.2(EWSR1):r.1_1293_ENST00000397938.2(EWSR1):r.1332_1364_ENST00000527786.2(FLI1):r.1079_4127", "ENST00000397938.2(EWSR1):r.1_1293::ENST00000397938.2(EWSR1):r.1332_1364::ENST00000527786.2(FLI1):r.1079_4127")] [InlineData("ENST00000397938.2(EWSR1):r.1_1293_ENST00000397938.2(EWSR1):r.1293+1627_1293+1656_insU_ENST00000310015.6(SP3):r.2389_6359", "ENST00000397938.2(EWSR1):r.1_1293::ENST00000397938.2(EWSR1):r.1293+1627_1293+1656_insU::ENST00000310015.6(SP3):r.2389_6359")] [InlineData("ENST00000397938.2(EWSR1):r.1_545_insAAGGGACCAGUACAG_ENST00000397938.2(EWSR1):r.546_1112_ENST00000332351.3(WT1):r.1535_3122", "ENST00000397938.2(EWSR1):r.1_545_insAAGGGACCAGUACAG::ENST00000397938.2(EWSR1):r.546_1112::ENST00000332351.3(WT1):r.1535_3122")] [InlineData("ENST00000254108.7(FUS):r.1_727_ENST00000254108.7(FUS):r.904+1322_904+1354inv_ENST00000330387.6(CREB3L2):r.936_7412", "ENST00000254108.7(FUS):r.1_727::ENST00000254108.7(FUS):r.904+1322_904+1354inv::ENST00000330387.6(CREB3L2):r.936_7412")] [InlineData("ENST00000429538.3(PAX8):r.?_ENST00000287820.6(PPARG):r.?", "ENST00000429538.3(PAX8):r.?::ENST00000287820.6(PPARG):r.?")] public void CorrectHgvsNotationUsingRegex_ExpectedResults(string originalHgvs, string expectedHgvs) { string actualHgvs = HgvsRnaFixer.Fix(originalHgvs); Assert.Equal(expectedHgvs, actualHgvs); } } } ================================================ FILE: UnitTests/SAUtils/CosmicGeneFusions/Conversion/HgvsRnaParserTests.cs ================================================ using System.Collections.Generic; using System.IO; using SAUtils.CosmicGeneFusions.Cache; using SAUtils.CosmicGeneFusions.Conversion; using VariantAnnotation.GeneFusions.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.SAUtils.CosmicGeneFusions.Conversion { public sealed class HgvsRnaParserTests { [Theory] [InlineData( "ENST00000332149.5(TMPRSS2):r.1_79+?_ENST00000442448.1(ERG):r.312_5034", "ENST00000332149.5", "ENST00000442448.1")] [InlineData( "ENST00000415083.2(SS18):r.1_1286_ENST00000415083.2(SS18):r.1286+683_1286+701_ENST00000336777.5(SSX2):r.351_1410", "ENST00000415083.2", "ENST00000336777.5")] [InlineData( "ENST00000397938.2(EWSR1):r.1_1112_ENST00000527786.2(FLI1):r.1079_1144_ENST00000527786.2(FLI1):r.1145-1478_1145-1410_ENST00000527786.2(FLI1):r.1145_4127", "ENST00000397938.2", "ENST00000527786.2")] [InlineData("ENST00000305877.8(BCR):r.1_2866::ENST00000372348.2(ABL1):r.511-?_511-?::ENST00000318560.5(ABL1):r.461_5766", "ENST00000305877.8", "ENST00000318560.5")] [InlineData( "ENST00000305877.12(BCR):r.1_2866::ENST00000372348.6(ABL1):r.511-?_511-?::ENST00000318560.5(ABL1):r.461_5766", "ENST00000305877.12", "ENST00000318560.5")] public void Parse_ExpectedResults(string hgvsString, string expectedTranscriptId5, string expectedTranscriptId3) { (string actualTranscriptId5, string actualTranscriptId3) = HgvsRnaParser.Parse(hgvsString); Assert.Equal(expectedTranscriptId5, actualTranscriptId5); Assert.Equal(expectedTranscriptId3, actualTranscriptId3); } [Theory] [InlineData("ENST00000305877.8(BCR):r.1_2866")] [InlineData("ENST00000000123.1(ABC):r.1_2866::ENST00000000456.2(ABC):r.511-?_511-?::ENST00000000789.3(ABC):r.461_5766")] public void Parse_UnexpectedTranscriptCount_ThrowException(string hgvsString) { Assert.Throws(delegate { HgvsRnaParser.Parse(hgvsString); }); } [Fact] public void GetTranscripts_ExpectedResults() { (TranscriptCache transcriptCache, ITranscript transcript, ITranscript transcript2) = GetTranscriptCache(); string[] expectedGeneSymbols = {transcript.Gene.Symbol, transcript2.Gene.Symbol}; ulong expectedFusionKey = GeneFusionKey.Create( GeneFusionKey.CreateGeneKey(transcript.Gene.EnsemblId.WithoutVersion), GeneFusionKey.CreateGeneKey(transcript2.Gene.EnsemblId.WithoutVersion)); (string[] actualGeneSymbols, ulong actualFusionKey) = HgvsRnaParser.GetTranscripts("ENST00000290663.10(MED8):r.1_79+?_ENST00000347849.7(ERG):r.312_5034", transcriptCache); Assert.Equal(expectedGeneSymbols, actualGeneSymbols); Assert.Equal(expectedFusionKey, actualFusionKey); } public static (TranscriptCache TranscriptCache, ITranscript Transcript, ITranscript Transcript2) GetTranscriptCache() { ITranscript transcript = MockedData.Transcripts.ENST00000290663; ITranscript transcript2 = MockedData.Transcripts.ENST00000347849; var idToTranscript = new Dictionary { [transcript.Id.WithoutVersion] = transcript, [transcript2.Id.WithoutVersion] = transcript2 }; return (new TranscriptCache(idToTranscript), transcript, transcript2); } } } ================================================ FILE: UnitTests/SAUtils/CosmicGeneFusions/Conversion/HistologyTests.cs ================================================ using System.Collections.Generic; using System.IO; using SAUtils.CosmicGeneFusions.Conversion; using Xunit; namespace UnitTests.SAUtils.CosmicGeneFusions.Conversion { public sealed class HistologyTests { private readonly HashSet _fusionEntries = new() { new RawCosmicGeneFusion(10, 0, null, null, "carcinoma", "ductal carcinoma", null, 0), new RawCosmicGeneFusion(20, 0, null, null, "carcinoma", "ductal carcinoma", null, 0), new RawCosmicGeneFusion(30, 0, null, null, "carcinoma", "NS", null, 0), new RawCosmicGeneFusion(40, 0, null, null, "carcinoma", "signet ring adenocarcinoma", null, 0) }; [Fact] public void GetCounts_ExpectedResults() { const int numSamples = 4; CosmicCount[] actualCounts = Histology.GetCounts(_fusionEntries, numSamples); Assert.Equal(3, actualCounts.Length); CosmicCount actualCount = actualCounts[0]; Assert.Equal("ductal carcinoma", actualCount.name); Assert.Equal(2, actualCount.numSamples); actualCount = actualCounts[1]; Assert.Equal("carcinoma", actualCount.name); Assert.Equal(1, actualCount.numSamples); actualCount = actualCounts[2]; Assert.Equal("signet ring adenocarcinoma", actualCount.name); Assert.Equal(1, actualCount.numSamples); } [Theory] [InlineData(3)] [InlineData(5)] public void GetCounts_WrongSampleCount_ThrowException(int numSamples) { Assert.Throws(delegate { Histology.GetCounts(_fusionEntries, numSamples); }); } } } ================================================ FILE: UnitTests/SAUtils/CosmicGeneFusions/Conversion/SiteTests.cs ================================================ using System.Collections.Generic; using System.IO; using SAUtils.CosmicGeneFusions.Conversion; using Xunit; namespace UnitTests.SAUtils.CosmicGeneFusions.Conversion { public sealed class SiteTests { private readonly HashSet _fusionEntries = new() { new RawCosmicGeneFusion(10, 0, "skin", "ear", null, null, null, 0), new RawCosmicGeneFusion(20, 0, "skin", "NS", null, null, null, 0), new RawCosmicGeneFusion(30, 0, "skin", "ear", null, null, null, 0), new RawCosmicGeneFusion(40, 0, "soft tissue", "blood vessel", null, null, null, 0) }; [Fact] public void GetCounts_ExpectedResults() { const int numSamples = 4; CosmicCount[] actualCounts = Site.GetCounts(_fusionEntries, numSamples); Assert.Equal(3, actualCounts.Length); CosmicCount actualCount = actualCounts[0]; Assert.Equal("skin (ear)", actualCount.name); Assert.Equal(2, actualCount.numSamples); actualCount = actualCounts[1]; Assert.Equal("skin", actualCount.name); Assert.Equal(1, actualCount.numSamples); actualCount = actualCounts[2]; Assert.Equal("soft tissue (blood vessel)", actualCount.name); Assert.Equal(1, actualCount.numSamples); } [Fact] public void GetCounts_TotalSampleCountTooHigh_ThrowException() { const int numSamples = 3; Assert.Throws(delegate { Site.GetCounts(_fusionEntries, numSamples); }); } } } ================================================ FILE: UnitTests/SAUtils/CosmicGeneFusions/CreateCosmicGeneFusionsTests.cs ================================================ using System; using SAUtils.CosmicGeneFusions; using VariantAnnotation.Providers; using Xunit; namespace UnitTests.SAUtils.CosmicGeneFusions { public sealed class CreateCosmicGeneFusionsTests { [Fact] public void CreateDataSourceVersion_ExpectedResults() { const string expectedName = "COSMIC gene fusions"; const string expectedDescription = "manually curated somatic gene fusions"; const string expectedVersion = "94"; const string releaseDate = "2021-05-28"; long expectedReleaseDateTicks = DateTime.Parse(releaseDate).Ticks; DataSourceVersion actualDataSourceVersion = CreateCosmicGeneFusions.CreateDataSourceVersion(expectedVersion, releaseDate); Assert.Equal(expectedName, actualDataSourceVersion.Name); Assert.Equal(expectedDescription, actualDataSourceVersion.Description); Assert.Equal(expectedVersion, actualDataSourceVersion.Version); Assert.Equal(expectedReleaseDateTicks, actualDataSourceVersion.ReleaseDateTicks); } } } ================================================ FILE: UnitTests/SAUtils/CosmicGeneFusions/IO/CosmicGeneFusionParserTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using SAUtils.CosmicGeneFusions.Conversion; using SAUtils.CosmicGeneFusions.IO; using Xunit; namespace UnitTests.SAUtils.CosmicGeneFusions.IO { public sealed class CosmicGeneFusionParserTests { [Fact] public void Parse_ExpectedResults() { var lines = new List { "SAMPLE_ID SAMPLE_NAME PRIMARY_SITE SITE_SUBTYPE_1 SITE_SUBTYPE_2 SITE_SUBTYPE_3 PRIMARY_HISTOLOGY HISTOLOGY_SUBTYPE_1 HISTOLOGY_SUBTYPE_2 HISTOLOGY_SUBTYPE_3 FUSION_ID TRANSLOCATION_NAME 5'_CHROMOSOME 5'_STRAND 5'_GENE_ID 5'_GENE_NAME 5'_LAST_OBSERVED_EXON 5'_GENOME_START_FROM 5'_GENOME_START_TO 5'_GENOME_STOP_FROM 5'_GENOME_STOP_TO 3'_CHROMOSOME 3'_STRAND 3'_GENE_ID 3'_GENE_NAME 3'_FIRST_OBSERVED_EXON 3'_GENOME_START_FROM 3'_GENOME_START_TO 3'_GENOME_STOP_FROM 3'_GENOME_STOP_TO FUSION_TYPE PUBMED_PMID", "749711 HCC1187 breast NS NS NS carcinoma ductal_carcinoma NS NS 665 ENST00000360863.10(RGS22):r.1_3555_ENST00000369518.1(SYCP1):r.2100_3452 8 - 197199 RGS22 22 99981937 99981937 100106116 100106116 1 + 212470 SYCP1_ENST00000369518 24 114944339 114944339 114995367 114995367 Inferred Breakpoint 20033038", "749711 HCC1187 breast NS NS NS carcinoma ductal_carcinoma NS NS 665 ENST00000360863.10(RGS22):r.1_3555_ENST00000369518.1(SYCP1):r.2100_3452 8 - 197199 RGS22 22 99981937 99981937 100106116 100106116 1 + 212470 SYCP1_ENST00000369518 24 114944339 114944339 114995367 114995367 Observed mRNA 20033038", "749712 HCC1395 breast NS NS NS carcinoma ductal_carcinoma NS NS 667 ENST00000395686.7(ERO1A):r.1_658_ENST00000395631.6(FERMT2):r.744_3369 14 - 282967 ERO1A 5 52671795 52671795 52695705 52695705 14 - 268960 FERMT2_ENST00000395631 5 52857268 52857268 52881469 52881469 Inferred Breakpoint 20033038" }; using var ms = new MemoryStream(); StreamReader reader = GetCosmicTestData(ms, lines); Dictionary> actualFusionIdToEntries = CosmicGeneFusionParser.Parse(reader); Assert.Equal(2, actualFusionIdToEntries.Count); HashSet geneFusions = actualFusionIdToEntries[665]; Assert.NotNull(geneFusions); Assert.Single(geneFusions); RawCosmicGeneFusion actualFusion = geneFusions.First(); Assert.Equal(749711, actualFusion.SampleId); Assert.Equal(665, actualFusion.FusionId); Assert.Equal("breast", actualFusion.PrimarySite); Assert.Equal("NS", actualFusion.SiteSubtype1); Assert.Equal("carcinoma", actualFusion.PrimaryHistology); Assert.Equal("ductal carcinoma", actualFusion.HistologySubtype1); Assert.Equal("ENST00000360863.10(RGS22):r.1_3555_ENST00000369518.1(SYCP1):r.2100_3452", actualFusion.HgvsNotation); Assert.Equal(20033038, actualFusion.PubMedId); geneFusions = actualFusionIdToEntries[667]; Assert.NotNull(geneFusions); Assert.Single(geneFusions); actualFusion = geneFusions.First(); Assert.Equal(749712, actualFusion.SampleId); Assert.Equal(667, actualFusion.FusionId); Assert.Equal("breast", actualFusion.PrimarySite); Assert.Equal("NS", actualFusion.SiteSubtype1); Assert.Equal("carcinoma", actualFusion.PrimaryHistology); Assert.Equal("ductal carcinoma", actualFusion.HistologySubtype1); Assert.Equal("ENST00000395686.7(ERO1A):r.1_658_ENST00000395631.6(FERMT2):r.744_3369", actualFusion.HgvsNotation); Assert.Equal(20033038, actualFusion.PubMedId); } [Fact] public void Parse_IncorrectColumnCount_ThrowException() { var lines = new List { "SAMPLE_ID SAMPLE_NAME PRIMARY_SITE SITE_SUBTYPE_1 SITE_SUBTYPE_2 SITE_SUBTYPE_3 PRIMARY_HISTOLOGY HISTOLOGY_SUBTYPE_1 HISTOLOGY_SUBTYPE_2 HISTOLOGY_SUBTYPE_3 FUSION_ID TRANSLOCATION_NAME 5'_CHROMOSOME 5'_STRAND 5'_GENE_ID 5'_GENE_NAME 5'_LAST_OBSERVED_EXON 5'_GENOME_START_FROM 5'_GENOME_START_TO 5'_GENOME_STOP_FROM 5'_GENOME_STOP_TO 3'_CHROMOSOME 3'_STRAND 3'_GENE_ID 3'_GENE_NAME 3'_FIRST_OBSERVED_EXON 3'_GENOME_START_FROM 3'_GENOME_START_TO 3'_GENOME_STOP_FROM 3'_GENOME_STOP_TO FUSION_TYPE PUBMED_PMID", "749711 HCC1187" }; using var ms = new MemoryStream(); StreamReader reader = GetCosmicTestData(ms, lines); Assert.Throws(delegate { CosmicGeneFusionParser.Parse(reader); }); } private static StreamReader GetCosmicTestData(Stream stream, List lines) { using (var writer = new StreamWriter(stream, Encoding.UTF8, 1024, true)) { foreach (string line in lines) writer.WriteLine(line); } stream.Position = 0; return new StreamReader(stream); } [Fact] public void RemoveUnderlines_ExpectedResults() { const string input = "spindle_epithelial_tumour_with_thymus_like_differentiation"; const string expectedResult = "spindle epithelial tumour with thymus like differentiation"; string actualResult = CosmicGeneFusionParser.RemoveUnderlines(input); Assert.Equal(expectedResult, actualResult); } } } ================================================ FILE: UnitTests/SAUtils/CosmicGeneFusions/IO/GeneFusionJsonWriterTests.cs ================================================ using System; using System.Collections.Generic; using System.IO; using SAUtils.CosmicGeneFusions.IO; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using Xunit; namespace UnitTests.SAUtils.CosmicGeneFusions.IO { public sealed class GeneFusionJsonWriterTests { [Fact] public void GeneFusionJsonWriter_ExpectedResults() { Dictionary expectedGeneKeyToJson = GetKeyToJson(); IDataSourceVersion expectedVersion = new DataSourceVersion("COSMIC Gene Fusions", "102", DateTime.Now.Ticks, "COSMIC"); using var ms = new MemoryStream(); using (var writer = new GeneFusionJsonWriter(ms, "cosmicGeneFusions", expectedVersion, true)) { writer.Write(expectedGeneKeyToJson); } ms.Position = 0; Dictionary actualGeneKeyToJson; IDataSourceVersion actualVersion; using (var reader = new GeneFusionJsonReader(ms)) { reader.LoadAnnotations(); actualGeneKeyToJson = reader.FusionKeyToFusions; actualVersion = reader.Version; } Assert.Equal(expectedVersion, actualVersion, new DataSourceVersionComparer()); Assert.Equal(expectedGeneKeyToJson.Count, actualGeneKeyToJson.Count); foreach (ulong geneKey in expectedGeneKeyToJson.Keys) { Assert.Equal(expectedGeneKeyToJson[geneKey], actualGeneKeyToJson[geneKey]); } } private static Dictionary GetKeyToJson() { var geneKeyToFusion = new Dictionary(); var json = "\"id\":\"COSF2245\",\"numSamples\":13,\"geneSymbols\":[\"ETV6\",\"RUNX1\"],\"hgvsr\":\"ENST00000396373.8(ETV6):r.1_1283_ENST00000300305.7(RUNX1):r.504_6222\",\"histologies\":[{\"histology\":\"lymphoid neoplasm\",\"numSamples\":14}],\"sites\":[{\"site\":\"haematopoietic and lymphoid tissue\",\"numSamples\":11}]"; var json2 = "\"id\":\"COSF100\",\"numSamples\":2,\"geneSymbols\":[\"A\",\"B\"],\"hgvsr\":\"ENST00000396373.8(A):r.1_1283_ENST00000300305.7(B):r.504_6222\",\"histologies\":[{\"histology\":\"lymphoid neoplasm\",\"numSamples\":14}]"; var json3 = "\"id\":\"COSF200\",\"numSamples\":7,\"geneSymbols\":[\"C\",\"D\"],\"hgvsr\":\"ENST00000396373.8(C):r.1_1283_ENST00000300305.7(D):r.504_6222\",\"sites\":[{\"site\":\"haematopoietic and lymphoid tissue\",\"numSamples\":11}]"; geneKeyToFusion[1000] = new[] {json, json2}; geneKeyToFusion[2000] = new[] {json3}; return geneKeyToFusion; } } } ================================================ FILE: UnitTests/SAUtils/CustomAnnotations/AllowedValuesTests.cs ================================================ using ErrorHandling.Exceptions; using Xunit; using SAUtils.Custom; namespace UnitTests.SAUtils.CustomAnnotations { public sealed class AllowedValuesTests { [Fact] public void IsEmptyValue_AsExpected() { Assert.True(AllowedValues.IsEmptyValue("")); Assert.True(AllowedValues.IsEmptyValue(".")); Assert.False(AllowedValues.IsEmptyValue("-")); } [Fact] public void ValidatePredictionValue_Pass() { AllowedValues.ValidatePredictionValue("", ""); AllowedValues.ValidatePredictionValue(".", ""); AllowedValues.ValidatePredictionValue("P", ""); AllowedValues.ValidatePredictionValue("Likely Benign", ""); AllowedValues.ValidatePredictionValue("Vus", ""); } [Fact] public void ValidatePredictionValue_ThrowException() { Assert.Throws(() => AllowedValues.ValidatePredictionValue("LikelyBenign", "")); Assert.Throws(() => AllowedValues.ValidatePredictionValue("Likely Benign, LB", "")); } } } ================================================ FILE: UnitTests/SAUtils/CustomAnnotations/GeneAnnotationParserTests.cs ================================================ using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using SAUtils.Custom; using SAUtils.Schema; using VariantAnnotation.SA; using Xunit; namespace UnitTests.SAUtils.CustomAnnotations { public sealed class GeneAnnotationParserTests { private static readonly Dictionary EntrezGeneIdToSymbol = new Dictionary { {"1", "Gene1" }, {"2", "Gene2" } }; private static readonly Dictionary EnsemblIdToSymbol = new Dictionary { {"ENSG1", "Gene1" }, {"ENSG2", "Gene2" } }; private static StreamReader GetReadStream(string text) { byte[] data; using (var memStream = new MemoryStream()) using (var writer = new StreamWriter(memStream)) { writer.Write(text); writer.Flush(); data = memStream.ToArray(); } return new StreamReader(new MemoryStream(data)); } [Fact] public void ParseHeaderLines_AsExpected() { const string headerLines = "#title=InternalGeneAnnotation\n" + "#geneSymbol\tgeneId\tOMIM Description\tIs Oncogene\tphenotype\tmimNumber\tnotes\n" + "#categories\t.\tDescription\tFilter\t\tIdentifier\t.\n" + "#descriptions\t.\tGene description from OMIM\t\tGene phenotype\t\tFree text\n" + "#type\t\tstring\tbool\tstring\tnumber\tstring\n"; using (var parser = new GeneAnnotationsParser(GetReadStream(headerLines), EntrezGeneIdToSymbol, EnsemblIdToSymbol)) { parser.ParseHeaderLines(); var expectedJsonKeys = new[] {"OMIM Description", "Is Oncogene", "phenotype", "mimNumber", "notes"}; var expectedCategories = new[] { CustomAnnotationCategories.Description, CustomAnnotationCategories.Filter, CustomAnnotationCategories.Unknown, CustomAnnotationCategories.Identifier, CustomAnnotationCategories.Unknown }; var expectedDescriptions = new[] { "Gene description from OMIM", null, "Gene phenotype", null, "Free text" }; var expectedTypes = new[] { SaJsonValueType.String, SaJsonValueType.Bool, SaJsonValueType.String, SaJsonValueType.Number, SaJsonValueType.String }; Assert.Equal("InternalGeneAnnotation", parser.JsonTag); Assert.Equal(expectedJsonKeys, parser.JsonKeys); Assert.Equal(expectedCategories, parser.Categories); Assert.Equal(expectedDescriptions, parser.Descriptions); Assert.Equal(expectedTypes, parser.ValueTypes); } } [Fact] public void ParseHeaderLines_version_and_Description() { const string headerLines = "#title=InternalGeneAnnotation\n" + "#version=v1.1\n" + "#description=Internal Gene Annotation\n" + "#geneSymbol\tgeneId\tOMIM Description\tIs Oncogene\tphenotype\tmimNumber\tnotes\n" + "#categories\t.\tDescription\tFilter\t\tIdentifier\t.\n" + "#descriptions\t.\tGene description from OMIM\t\tGene phenotype\t\tFree text\n" + "#type\t\tstring\tbool\tstring\tnumber\tstring\n"; using (var parser = new GeneAnnotationsParser(GetReadStream(headerLines), EntrezGeneIdToSymbol, EnsemblIdToSymbol)) { parser.ParseHeaderLines(); Assert.Equal("v1.1", parser.Version); Assert.Equal("Internal Gene Annotation", parser.DataSourceDescription); } } [Fact] public void ParseHeaderLines_InconsistentFields() { const string invalidHeaderLines = "#title=InternalGeneAnnotation\n" + "#geneSymbol\tgeneId\tphenotype\tmimNumber\tnotes\n" + "#categories\t\t\tstring\tnumber\t.\n" + "#descriptions\t.\t.\t.\t.\tSome\tText\tHere\n" + "#type\t\t\tstring\tnumber\t.\n"; using (var parser = new GeneAnnotationsParser(GetReadStream(invalidHeaderLines), EntrezGeneIdToSymbol, EnsemblIdToSymbol)) { Assert.Throws(() => parser.ParseHeaderLines()); } } [Fact] public void GetItems_UnrecognizedGeneId_ThrowException() { const string lines = "#title=InternalGeneAnnotation\n" + "#geneSymbol\tgeneId\tOMIM Description\tIs Oncogene\tphenotype\tmimNumber\tnotes\n" + "#categories\t.\tDescription\tFilter\t\tIdentifier\t.\n" + "#descriptions\t.\tGene description from OMIM\t\tGene phenotype\t\tFree text\n" + "#type\t\tstring\tbool\tstring\tnumber\tstring\n" + "Abc\t3\tsome text\ttrue\tgood\t234\ttest\n"; using (var parser = GeneAnnotationsParser.Create(GetReadStream(lines), EntrezGeneIdToSymbol, EnsemblIdToSymbol)) { Assert.Throws(() => parser.GetItems()); } } [Fact] public void GetItems_SameGene_MultipleEntries_ThrowException() { const string lines = "#title=InternalGeneAnnotation\n" + "#geneSymbol\tgeneId\tOMIM Description\tIs Oncogene\tphenotype\tmimNumber\tnotes\n" + "#categories\t.\tDescription\tFilter\t\tIdentifier\t.\n" + "#descriptions\t.\tGene description from OMIM\t\tGene phenotype\t\tFree text\n" + "#type\t\tstring\tbool\tstring\tnumber\tstring\n"+ "Abc\t1\tsome text\ttrue\tgood\t234\ttest\n" + "123\tENSG1\tsome other text\tfalse\tbad\t200\ttest2\n"; using (var parser = GeneAnnotationsParser.Create(GetReadStream(lines), EntrezGeneIdToSymbol, EnsemblIdToSymbol)) { Assert.Throws(() => parser.GetItems()); } } [Fact] public void GetItems_EmptyAnnotation_ThrowException() { const string lines = "#title=InternalGeneAnnotation\n" + "#geneSymbol\tgeneId\tOMIM Description\tIs Oncogene\tphenotype\tmimNumber\tnotes\n" + "#categories\t.\tDescription\tFilter\t\tIdentifier\t.\n" + "#descriptions\t.\tGene description from OMIM\t\tGene phenotype\t\tFree text\n" + "#type\t\tstring\tbool\tstring\tnumber\tstring\n" + "Abc\t1\t\t.\t\t.\t\n" + "Abc\tENSG2\tsome other text\tfalse\tbad\t200\ttest2\n"; using (var parser = GeneAnnotationsParser.Create(GetReadStream(lines), EntrezGeneIdToSymbol, EnsemblIdToSymbol)) { Assert.Throws(() => parser.GetItems()); } } [Fact] public void GetItems_AsExpected() { const string lines = "#title=InternalGeneAnnotation\n" + "#geneSymbol\tgeneId\tOMIM Description\tIs Oncogene\tphenotype\tmimNumber\tnotes\n" + "#categories\t.\tDescription\tFilter\t\tIdentifier\t.\n" + "#descriptions\t.\tGene description from OMIM\t\tGene phenotype\t\tFree text\n" + "#type\t\tstring\tbool\tstring\tnumber\tstring\n" + "Abc\t1\tsome text\ttrue\tgood\t234\ttest\n" + "Abc\tENSG2\tsome other text\tfalse\tbad\t200\ttest2\n"; using (var parser = GeneAnnotationsParser.Create(GetReadStream(lines), EntrezGeneIdToSymbol, EnsemblIdToSymbol)) { var geneSymbol2Items = parser.GetItems(); Assert.Equal(2, geneSymbol2Items.Count); Assert.Single(geneSymbol2Items["Gene1"]); Assert.Single(geneSymbol2Items["Gene2"]); Assert.Equal("{\"OMIM Description\":\"some text\",\"Is Oncogene\":true,\"phenotype\":\"good\",\"mimNumber\":234,\"notes\":\"test\"}", geneSymbol2Items["Gene1"][0].GetJsonString()); Assert.Equal("{\"OMIM Description\":\"some other text\",\"phenotype\":\"bad\",\"mimNumber\":200,\"notes\":\"test2\"}", geneSymbol2Items["Gene2"][0].GetJsonString()); } } } } ================================================ FILE: UnitTests/SAUtils/CustomAnnotations/ParserUtilitiesTests.cs ================================================ using System.Collections.Generic; using ErrorHandling.Exceptions; using Genome; using SAUtils.Custom; using Xunit; namespace UnitTests.SAUtils.CustomAnnotations { public sealed class ParserUtilitiesTests { private readonly HashSet _allowedGenomeAssemblies = new HashSet { GenomeAssembly.GRCh37, GenomeAssembly.GRCh38}; [Fact] public void CheckPrefix_InvalidPrefix_ThrowException() { Assert.Throws(() => ParserUtilities.CheckPrefix("invalidPrefix=someValue", "expectedPrefix")); } [Fact] public void ParseTags_LessThanRequiredColumns_ThrowException() { Assert.Throws(() => ParserUtilities.ParseTags("#CHROM\tPOS\tREF", "#CHROM", 4)); } [Theory] [InlineData("String")] [InlineData("NUMBER")] [InlineData("Bool")] public void ParseTypes_ValidType_Pass(string type) { string typeLine = $"#type\t.\t.\t.\t{type}"; ParserUtilities.ParseTypes(typeLine, 4, 1); } [Theory] [InlineData("boolean")] [InlineData("double")] [InlineData("int")] public void ParseTypes_InvalidType_ThrowException(string type) { string typeLine = $"#type\t.\t.\t.\t{type}"; Assert.Throws(() => ParserUtilities.ParseTypes(typeLine, 4, 1)); } [Fact] public void ParseCategories_InvalidValue_ThrowException() { Assert.Throws(() => ParserUtilities.ParseCategories("#categories\tWOW", 1, 1, null)); } } } ================================================ FILE: UnitTests/SAUtils/CustomAnnotations/VariantAnnotationsParserTests.cs ================================================ using System.IO; using System.Linq; using ErrorHandling.Exceptions; using Genome; using Moq; using SAUtils.Custom; using SAUtils.Schema; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.SA; using Xunit; namespace UnitTests.SAUtils.CustomAnnotations { public sealed class VariantAnnotationsParserTests { private static readonly ISequence Sequence = GetMockedSequence(); private static readonly ISequenceProvider SequenceProvider = GetMockedSequenceProvider(); private static StreamReader GetReadStream(string text) { byte[] data; using (var memStream = new MemoryStream()) using (var writer = new StreamWriter(memStream)) { writer.Write(text); writer.Flush(); data = memStream.ToArray(); } return new StreamReader(new MemoryStream(data)); } [Fact] public void CheckPosAndRefColumns_InvalidPosOrRef_ThrowException() { var caParser = new VariantAnnotationsParser(null, null) {Tags = new[] {"#CHROM", "", "REF", "ALT"}}; Assert.Throws(() => caParser.CheckPosAndRefColumns()); caParser.Tags = new[] { "#CHROM", "POS", "REFERENCE", "ALT" }; Assert.Throws(() => caParser.CheckPosAndRefColumns()); } [Fact] public void CheckAltAndEndColumns_NoAltAndEnd_ThrowException() { var caParser = new VariantAnnotationsParser(null, null) {Tags = new[] {"#CHROM", "POS", "REF", "Note"}}; Assert.Throws(() => caParser.CheckAltAndEndColumns()); } [Fact] public void ParseHeaderLines_AsExpected() { const string headerLines = "#title=IcslAlleleFrequencies \n" + "#assembly=GRCh38\t\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tdeNovoQual\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\tScore\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tnumber\tstring"; using (var custParser = new VariantAnnotationsParser(GetReadStream(headerLines), null)) { custParser.ParseHeaderLines(); var expectedJsonKeys = new[] {"refAllele", "altAllele", "allAc", "allAn", "allAf", "failedFilter", "pathogenicity", "deNovoQual", "notes"}; var expectedIntervalJsonKeys = new[] {"start", "end", "allAc", "allAn", "allAf", "failedFilter", "pathogenicity", "deNovoQual", "notes"}; var expectedCategories = new[] { CustomAnnotationCategories.AlleleCount, CustomAnnotationCategories.AlleleNumber, CustomAnnotationCategories.AlleleFrequency, CustomAnnotationCategories.Unknown, CustomAnnotationCategories.Prediction, CustomAnnotationCategories.Score, CustomAnnotationCategories.Unknown }; var expectedDescriptions = new[] { "ALL", "ALL", "ALL", null, null, null, null }; var expectedTypes = new[] { SaJsonValueType.Number, SaJsonValueType.Number, SaJsonValueType.Number, SaJsonValueType.Bool, SaJsonValueType.String, SaJsonValueType.Number, SaJsonValueType.String }; Assert.Equal("IcslAlleleFrequencies", custParser.JsonTag); Assert.Equal(GenomeAssembly.GRCh38, custParser.Assembly); Assert.Equal(expectedJsonKeys, custParser.JsonKeys); Assert.Equal(expectedIntervalJsonKeys, custParser.IntervalJsonKeys); Assert.Equal(expectedCategories, custParser.Categories); Assert.Equal(expectedDescriptions, custParser.Descriptions); Assert.Equal(expectedTypes, custParser.ValueTypes); } } [Fact] public void ParseHeaderLines_matchBy_sv() { const string headerLines = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=sv\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring"; using (var custParser = new VariantAnnotationsParser(GetReadStream(headerLines), null)) { custParser.ParseHeaderLines(); Assert.Equal(ReportFor.StructuralVariants, custParser.ReportFor); } } [Fact] public void ParseHeaderLines_InconsistentFields() { const string invalidHeaderLines = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\tMore\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring"; using (var parser = new VariantAnnotationsParser(GetReadStream(invalidHeaderLines), null)) { Assert.Throws(() => parser.ParseHeaderLines()); } } [Fact] public void GetItems() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\n" + "chr1\t14783\tG\tA\t.\t20\t125568\t0.000159\ttrue\tVUS\t\n" + "chr2\t10302\tC\tA\t.\t53\t8928\t0.001421\tfalse\t.\t\n" + "chr2\t46993\tA\t\t50879\t50\t250\t0.001\tfalse\tbenign\t"; using (var custParser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { var items = custParser.GetItems().ToArray(); Assert.Equal(2, items.Length); Assert.Equal("\"refAllele\":\"G\",\"altAllele\":\"A\",\"allAc\":20,\"allAn\":125568,\"allAf\":0.000159,\"failedFilter\":true,\"pathogenicity\":\"VUS\"", items[0].GetJsonString()); Assert.Equal("\"refAllele\":\"C\",\"altAllele\":\"A\",\"allAc\":53,\"allAn\":8928,\"allAf\":0.001421", items[1].GetJsonString()); } } [Fact] public void GetIntervals_noALT() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tEND\tnotes\n" + "#categories\t.\t.\t.\t.\n" + "#descriptions\t.\t.\t.\t.\n" + "#type\t.\t.\t.\tstring\n" + "chr16\t20000000\tT\t70000000\tLots of false positives in this region"; using (var custParser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { var items = custParser.GetItems().ToArray(); Assert.Empty(items); var intervals = custParser.GetCustomIntervals(); Assert.Single(intervals); Assert.Equal("\"start\":20000000,\"end\":70000000,\"notes\":\"Lots of false positives in this region\"", intervals[0].GetJsonString()); } } [Fact] public void GetIntervals_start() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tnotes\n" + "#categories\t.\t.\t.\t.\t.\n" + "#descriptions\t.\t.\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tstring\n" + "chr21\t10510818\tT\t.\t10699435\tinterval 1\n"+ "chr21\t10510818\tT\t\t10699435\tinterval 2"; using (var custParser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { var items = custParser.GetItems().ToArray(); Assert.Empty(items); var intervals = custParser.GetCustomIntervals(); Assert.Equal(2,intervals.Count); Assert.Equal("\"start\":10510818,\"end\":10699435,\"notes\":\"interval 1\"", intervals[0].GetJsonString()); Assert.Equal("\"start\":10510819,\"end\":10699435,\"notes\":\"interval 2\"", intervals[1].GetJsonString()); } } [Fact] public void GetItems_OnlyAlleleFrequencyTreatedAsDouble_OtherNumbersPrintAsIs() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\tanyNumber\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\tscore\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\tnumber\n" + "chr1\t12783\tG\tA\t.\t20\t125568\t0.000159\ttrue\tVUS\t\t1.000\n" + "chr1\t13302\tC\tA\t.\t53\t8928\t0.001421\tfalse\t.\t\t3\n" + "chr1\t18972\tT\tC\t.\t10\t1000\t0.01\tfalse\t.\t\t100.1234567\n" + "chr1\t46993\tA\t\t50879\t50\t250\t0.001\tfalse\tbenign\t\t3.1415926"; using (var custParser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { var items = custParser.GetItems().ToArray(); Assert.Equal(3, items.Length); Assert.Equal("\"refAllele\":\"G\",\"altAllele\":\"A\",\"allAc\":20,\"allAn\":125568,\"allAf\":0.000159,\"failedFilter\":true,\"pathogenicity\":\"VUS\",\"anyNumber\":1.000", items[0].GetJsonString()); Assert.Equal("\"refAllele\":\"C\",\"altAllele\":\"A\",\"allAc\":53,\"allAn\":8928,\"allAf\":0.001421,\"anyNumber\":3", items[1].GetJsonString()); Assert.Equal("\"refAllele\":\"T\",\"altAllele\":\"C\",\"allAc\":10,\"allAn\":1000,\"allAf\":0.01,\"anyNumber\":100.1234567", items[2].GetJsonString()); } } [Fact] public void GetItems_invalid_scores() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\tanyNumber\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\tscore\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\tnumber\n" + "chr1\t12783\tG\tA\t.\t20\t125568\t0.000159\ttrue\tVUS\t\t1.0\n" + "chr1\t13302\tC\tA\t.\t53\t8928\t0.001421\tfalse\t.\t\t3\n" + "chr1\t18972\tT\tC\t.\t10\t1000\t0.01\tfalse\t.\t\t100.1234567\n" + "chr1\t46993\tA\t\t50879\t50\t250\t0.001\tfalse\tbenign\t\tthree"; using (var parser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { Assert.Throws(()=> parser.GetItems().ToArray()); } } [Fact] public void GetItems_missing_scores() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\tanyNumber\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\tscore\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\tnumber\n" + "chr1\t12783\tG\tA\t.\t20\t125568\t0.000159\ttrue\tVUS\t\t.\n" + "chr1\t13302\tC\tA\t.\t53\t8928\t0.001421\tfalse\t.\t\t3\n" + "chr1\t18972\tT\tC\t.\t10\t1000\t0.01\tfalse\t.\t\t100.1234567\n"; using (var parser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { var items = parser.GetItems().ToArray(); Assert.DoesNotContain("anyNumber", items[0].GetJsonString()); Assert.Contains("anyNumber", items[1].GetJsonString()); } } [Fact] public void GetItems_ExtractCustomFilters() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\tanyNumber\tcustomFilter\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\t.\tFilter\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\tnumber\tstring\n" + "chr1\t12783\tG\tA\t.\t20\t125568\t0.000159\ttrue\tVUS\t\t1.000\tgood variant\n" + "chr1\t13302\tC\tA\t.\t53\t8928\t0.001421\tfalse\t.\t\t3\tbad variant\n" + "chr1\t18972\tT\tC\t.\t10\t1000\t0.01\tfalse\t.\t\t100.1234567\tugly variant\n" + "chr1\t46993\tA\t\t50879\t50\t250\t0.001\tfalse\tbenign\t\t3.1415926\tvery ugly variant"; using (var custParser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { var items = custParser.GetItems().ToArray(); Assert.Equal(3, items.Length); Assert.Equal("\"refAllele\":\"G\",\"altAllele\":\"A\",\"allAc\":20,\"allAn\":125568,\"allAf\":0.000159,\"failedFilter\":true,\"pathogenicity\":\"VUS\",\"anyNumber\":1.000,\"customFilter\":\"good variant\"", items[0].GetJsonString()); Assert.Equal("\"refAllele\":\"C\",\"altAllele\":\"A\",\"allAc\":53,\"allAn\":8928,\"allAf\":0.001421,\"anyNumber\":3,\"customFilter\":\"bad variant\"", items[1].GetJsonString()); Assert.Equal("\"refAllele\":\"T\",\"altAllele\":\"C\",\"allAc\":10,\"allAn\":1000,\"allAf\":0.01,\"anyNumber\":100.1234567,\"customFilter\":\"ugly variant\"", items[2].GetJsonString()); } } [Fact] public void GetItems_missing_filter() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\tanyNumber\tcustomFilter\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\t.\tFilter\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\tnumber\tstring\n" + "chr1\t12783\tG\tA\t.\t20\t125568\t0.000159\ttrue\tVUS\t\t1.000\tgood variant\n" + "chr1\t13302\tC\tA\t.\t53\t8928\t0.001421\tfalse\t.\t\t3\tbad variant\n" + "chr1\t18972\tT\tC\t.\t10\t1000\t0.01\tfalse\t.\t\t100.1234567\tugly variant\n" + "chr1\t46993\tA\tG\t.\t50\t250\t0.001\tfalse\tbenign\t\t3.1415926\t."; using (var custParser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { var items = custParser.GetItems().ToArray(); Assert.Equal(4, items.Length); Assert.Equal("\"refAllele\":\"G\",\"altAllele\":\"A\",\"allAc\":20,\"allAn\":125568,\"allAf\":0.000159,\"failedFilter\":true,\"pathogenicity\":\"VUS\",\"anyNumber\":1.000,\"customFilter\":\"good variant\"", items[0].GetJsonString()); Assert.Equal("\"refAllele\":\"C\",\"altAllele\":\"A\",\"allAc\":53,\"allAn\":8928,\"allAf\":0.001421,\"anyNumber\":3,\"customFilter\":\"bad variant\"", items[1].GetJsonString()); Assert.Equal("\"refAllele\":\"T\",\"altAllele\":\"C\",\"allAc\":10,\"allAn\":1000,\"allAf\":0.01,\"anyNumber\":100.1234567,\"customFilter\":\"ugly variant\"", items[2].GetJsonString()); Assert.DoesNotContain("customFilter",items[3].GetJsonString()); } } [Fact] public void GetItems_ExtractCustomFilters_failsOnLargeText() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\tanyNumber\tcustomFilter\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\t.\tFilter\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\tnumber\tstring\n" + "chr1\t12783\tG\tA\t.\t20\t125568\t0.000159\ttrue\tVUS\t\t1.000\tthe good variant, the bad variant and the ugly variant\n"; using (var custParser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { Assert.Throws(() => custParser.GetItems().ToArray()); } } [Fact] public void GetItems_UnsortedData_ThrowException() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\tanyNumber\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\tnumber\n" + "chr1\t12783\tG\tA\t.\t20\t125568\t0.000159\ttrue\tVUS\t\t1.000\n" + "chr1\t3302\tC\tA\t.\t53\t8928\t0.001421\tfalse\t.\t\t3\n" + "chr1\t18972\tT\tC\t.\t10\t1000\t0.01\tfalse\t.\t\t100.1234567\n" + "chr1\t46993\tA\t\t50879\t50\t250\t0.001\tfalse\tbenign\t\t3.1415926"; using (var caParser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { Assert.Throws(() => caParser.GetItems().ToArray()); } } [Fact] public void GetIntervals() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=sv\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\n" + "chr1\t12783\tG\tA\t.\t20\t125568\t0.000159\ttrue\tVUS\t\n" + "chr1\t13302\tC\tA\t.\t53\t8928\t0.001421\tfalse\t.\t\n" + "chr1\t46993\tA\t\t50879\t50\t250\t0.001\tfalse\tbenign\t"; using (var custParser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { var items = custParser.GetItems().ToArray(); Assert.Equal(ReportFor.StructuralVariants, custParser.ReportFor); Assert.Equal(2, items.Length); var intervals = custParser.GetCustomIntervals(); Assert.Single(intervals); Assert.Equal("\"start\":46994,\"end\":50879,\"allAc\":50,\"allAn\":250,\"allAf\":0.001,\"pathogenicity\":\"benign\"", intervals[0].GetJsonString()); } } [Fact] public void IsValidNucleotideSequence_IsValidSequence_Pass() { Assert.True(VariantAnnotationsParser.IsValidAltAllele("actgnACTGN")); Assert.True(VariantAnnotationsParser.IsValidAltAllele("AAAAAAAAAAAAAAAAAATTAGTCAGGCAC[chr3:153444911[")); Assert.False(VariantAnnotationsParser.IsValidAltAllele("AC-GT")); } [Fact] public void ExtractItems_TrimmedAndLeftShifted() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\n"; using (var parser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { var item = parser.ExtractItems("chr1\t12783\tA\tATA\t.\t20\t125568\t0.000159\ttrue\tVUS\t"); Assert.Equal(12782, item.Position); Assert.Equal("", item.RefAllele); Assert.Equal("TA", item.AltAllele); } } [Fact] public void Extract_symbolic_alleles() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\n"; using (var parser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { parser.ExtractItems("chr1\t12783\tA\t\t24486\t20\t125568\t0.000159\ttrue\tVUS\t"); var intervals = parser.GetCustomIntervals(); Assert.Single(intervals); Assert.Equal(12784, intervals[0].Start); Assert.Equal(24486, intervals[0].End); } } [Fact] public void ParseTitle_Conflict_JsonTag() { const string text = "#title=topmed\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\n"; Assert.Throws(() => VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)); } [Fact] public void ParseTitle_IncorrectFormat() { const string text = "#title:IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\n"; Assert.Throws(() => VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)); } [Fact] public void ParseGenomeAssembly_UnsupportedAssembly_ThrowException() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=hg20\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\n"; Assert.Throws(() => VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)); } [Fact] public void ParseGenomeAssembly_IncorrectFormat_ThrowException() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly-hg20\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\n"; Assert.Throws(() => VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)); } [Fact] public void ParseHeader_version_and_description() { const string text = "#title=IcslAlleleFrequencies\n" + "#assembly=GRCh38\n" + "#version=v4.5\t\n"+ "#description=Internal allele frequencies\t\n" + "#matchVariantsBy=allele\n" + "#CHROM\tPOS\tREF\tALT\tEND\tallAc\tallAn\tallAf\tfailedFilter\tpathogenicity\tnotes\n" + "#categories\t.\t.\t.\t.\tAlleleCount\tAlleleNumber\tAlleleFrequency\t.\tPrediction\t.\n" + "#descriptions\t.\t.\t.\t.\tALL\tALL\tALL\t.\t.\t.\n" + "#type\t.\t.\t.\t.\tnumber\tnumber\tnumber\tbool\tstring\tstring\n"; using (var parser = VariantAnnotationsParser.Create(GetReadStream(text), SequenceProvider)) { Assert.Equal("v4.5", parser.Version); Assert.Equal("Internal allele frequencies", parser.DataSourceDescription); } } private static ISequenceProvider GetMockedSequenceProvider() { var seqProviderMock = new Mock(); seqProviderMock.SetupGet(x => x.RefNameToChromosome).Returns(ChromosomeUtilities.RefNameToChromosome); seqProviderMock.SetupGet(x => x.Sequence).Returns(Sequence); return seqProviderMock.Object; } private static ISequence GetMockedSequence() { var sequenceMock = new Mock(); sequenceMock.Setup(x => x.Substring(12783, 0)).Returns(""); sequenceMock.Setup(x => x.Substring(12733, 50)).Returns("ACGTA"); sequenceMock.Setup(x => x.Substring(12283, 500)).Returns("ACGTA"); return sequenceMock.Object; } } } ================================================ FILE: UnitTests/SAUtils/Dann/DannParserTests.cs ================================================ using System.IO; using System.Linq; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.Dann { public sealed class DannParserTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##DANN"); writer.WriteLine("#chr\tpos\tref\talt\tscore"); writer.WriteLine("1\t10001\t10001\tT\tC\t0.4396994049749739"); writer.WriteLine("1\t10001\t10001\tT\tG\t0.38108629377072734"); writer.WriteLine("1\t10002\t10002\tA\tC\t0.36182020272810128"); writer.WriteLine("1\t10002\t10002\tA\tG\t0.44413258111779291"); writer.WriteLine("1\t10002\t10002\tA\tT\t0.16812846819989813"); writer.WriteLine("1\t10003\t10003\tA\tC\t0.36516159615040267"); writer.WriteLine("1\t10003\t10003\tA\tG\t0.4480978029675266"); writer.WriteLine("1\t10003\t10003\tA\tG\taskdlj"); writer.WriteLine("asd\t10003\t10003\tA\tG\taskdlj"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void DannReader_GetItems_AsExpected() { var nucleotides = new[] {"A", "C", "G", "T"}; var dannParserSettings = new ParserSettings( new ColumnIndex(0, 2, 3, 4, 5, null), nucleotides, GenericScoreParser.MaxRepresentativeScores ); using (var streamReader = new StreamReader(GetStream())) using (var scoreParser = new GenericScoreParser(dannParserSettings, streamReader, ChromosomeUtilities.RefNameToChromosome)) { GenericScoreItem[] dannItems = scoreParser.GetItems().ToArray(); Assert.Equal(7, dannItems.Length); Assert.Equal(10001, dannItems[0].Position); Assert.Equal("T", dannItems[0].RefAllele); Assert.Equal("C", dannItems[0].AltAllele); Assert.Equal(0.4396994049749739, dannItems[0].Score); Assert.Equal(10001, dannItems[1].Position); Assert.Equal("T", dannItems[1].RefAllele); Assert.Equal("G", dannItems[1].AltAllele); Assert.Equal(0.38108629377072734, dannItems[1].Score); Assert.Equal(10002, dannItems[4].Position); Assert.Equal("A", dannItems[4].RefAllele); Assert.Equal("T", dannItems[4].AltAllele); Assert.Equal(0.16812846819989813, dannItems[4].Score); } } } } ================================================ FILE: UnitTests/SAUtils/DataStructures/CounterDictionaryTests.cs ================================================ using System.Collections.Generic; using OptimizedCore; using SAUtils.DataStructures; using Xunit; namespace UnitTests.SAUtils.DataStructures; public sealed class CounterDictionaryTests { [Fact] public void TestCounterDictionary() { var inputData = new[] { "A", "B", "A", "A", "C", "B" }; var counterDict = new CounterDictionary(); foreach (string keys in inputData) { counterDict.Add(keys); } Assert.Equal(6, counterDict.Total); Assert.Equal(3, counterDict["A"]); Assert.Equal(2, counterDict["B"]); Assert.Equal(1, counterDict["C"]); Assert.Equal(0, counterDict.GetValueOrDefault("NOT THERE", 0)); var sb = StringBuilderPool.Get(); counterDict.SerializeJson(sb); Assert.Equal("{\"count\":6,\"A\":3,\"B\":2,\"C\":1}", sb.ToString()); } } ================================================ FILE: UnitTests/SAUtils/DbVar/DosageMapRegionParserTests.cs ================================================ using System.IO; using System.Linq; using SAUtils.ClinGen; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.DbVar { public sealed class DosageMapRegionParserTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("#ClinGen Region Curation Results"); writer.WriteLine("#07 May,2019"); writer.WriteLine("#Genomic Locations are reported on GRCh37 (hg19): GCF_000001405.13"); writer.WriteLine("#https://www.ncbi.nlm.nih.gov/projects/dbvar/clingen"); writer.WriteLine("#to create link: https://www.ncbi.nlm.nih.gov/projects/dbvar/clingen/clingen_region.cgi?id=key"); writer.WriteLine("#ISCA ID\tISCA Region Name\tcytoBand\tGenomic Location\tHaploinsufficiency Score\tHaploinsufficiency Description\tHaploinsufficiency PMID1\tHaploinsufficiency PMID2\tHaploinsufficiency PMID3\tTriplosensitivity Score\tTriplosensitivity Description\tTriplosensitivity PMID1\tTriplosensitivity PMID2\tTriplosensitivity PMID3\tDate Last Evaluated\tLoss phenotype OMIM ID\tTriplosensitive phenotype OMIM ID"); writer.WriteLine("ISCA-46299\tXp11.22 region (includes HUWE1)\tXp11.22\tchrX:53363456-53793054\t0\tNo evidence available\t\t\t\t3\tSufficient evidence for dosage pathogenicity\t22840365\t20655035\t26692240\t2018-11-19"); writer.WriteLine("ISCA-46295\t15q13.3 recurrent region (D-CHRNA7 to BP5) (includes CHRNA7 and OTUD7A)\t15q13.3\tchr15:32019621-32445405\t3\tSufficient evidence for dosage pathogenicity\t19898479\t20236110\t22775350\t40\tDosage sensitivity unlikely\t26968334\t22420048\t\t2018-05-10"); writer.WriteLine("ISCA-46291\t7q11.23 recurrent distal region (includes HIP1, YWHAG)\t7q11.23\tchr7:75158048-76063176\t2\tSome evidence for dosage pathogenicity\t21109226\t16971481\t\t1\tLittle evidence for dosage pathogenicity\t21109226\t27867344\t\t2018-12-31"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void StandardParsing() { using (var dosageMapRegionParser = new DosageMapRegionParser(GetStream(), ChromosomeUtilities.RefNameToChromosome)) { var items = dosageMapRegionParser.GetItems().OrderBy(x => x.Chromosome.Index).ToArray(); Assert.Equal(3, items.Length); Assert.Equal("\"chromosome\":\"7\",\"begin\":75158048,\"end\":76063176,\"haploinsufficiency\":\"emerging evidence suggesting dosage sensitivity is associated with clinical phenotype\",\"triplosensitivity\":\"little evidence suggesting dosage sensitivity is associated with clinical phenotype\"", items[0].GetJsonString()); Assert.Equal("\"chromosome\":\"15\",\"begin\":32019621,\"end\":32445405,\"haploinsufficiency\":\"sufficient evidence suggesting dosage sensitivity is associated with clinical phenotype\",\"triplosensitivity\":\"dosage sensitivity unlikely\"", items[1].GetJsonString()); Assert.Equal("\"chromosome\":\"X\",\"begin\":53363456,\"end\":53793054,\"haploinsufficiency\":\"no evidence to suggest that dosage sensitivity is associated with clinical phenotype\",\"triplosensitivity\":\"sufficient evidence suggesting dosage sensitivity is associated with clinical phenotype\"", items[2].GetJsonString()); } } } } ================================================ FILE: UnitTests/SAUtils/DbVar/DosageSensitivityParserTests.cs ================================================ using System.IO; using SAUtils.ClinGen; using Xunit; namespace UnitTests.SAUtils.DbVar { public sealed class DosageSensitivityParserTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("#ClinGen Gene Curation Results"); writer.WriteLine("#07 May,2019"); writer.WriteLine("#Genomic Locations are reported on GRCh37 (hg19): GCF_000001405.13"); writer.WriteLine("#Gene Symbol\tGene ID\tcytoBand\tGenomic Location\tHaploinsufficiency Score\tHaploinsufficiency Description\tHaploinsufficiency PMID1\tHaploinsufficiency PMID2\tHaploinsufficiency PMID3\tTriplosensitivity Score\tTriplosensitivity Description\tTriplosensitivity PMID1\tTriplosensitivity PMID2\tTriplosensitivity PMID3\tDate Last Evaluated\tLoss phenotype OMIM ID\tTriplosensitive phenotype OMIM ID"); writer.WriteLine("A4GALT\t53947\t22q13.2\tchr22:43088121-43117307\t30\tGene associated with autosomal recessive phenotype\t\t\t\t0\tNo evidence available\t\t\t\t2014-12-11\t111400\t"); writer.WriteLine("AAGAB\t79719\t15q23\tchr15:67493013-67547536\t3\tSufficient evidence for dosage pathogenicity\t23064416\t23000146\t\t0\tNo evidence available\t\t\t\t2013-02-28\t148600\t"); writer.WriteLine("AARS\t16\t16q22.1\tchr16:70286297-70323412\t0\tNo evidence available\t\t\t\t0\tNo evidence available\t\t\t\t2018-01-11\t\t"); writer.WriteLine("AARS2\t57505\t6p21.1\tchr6:44266463-44281063\t30\tGene associated with autosomal recessive phenotype\t\t\t\tNot yet evaluated\tNot yet evaluated\t\t\t\t2016-08-22\t\t"); writer.WriteLine("RSPH1\t89765\t21q22.3\tchr21:42472486-42496246\t30\tGene associated with autosomal recessive phenotype\t\t\t\t\t\t\tNot yet evaluated\tNot yet evaluated\t\t\t\t\t\t\t2016-08-22\t615481\t"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void StandardParsing() { using (var dbVarReader = new DosageSensitivityParser(GetStream())) { var items = dbVarReader.GetItems(); Assert.Equal(5, items.Count); Assert.Equal("{\"haploinsufficiency\":\"gene associated with autosomal recessive phenotype\",\"triplosensitivity\":\"no evidence to suggest that dosage sensitivity is associated with clinical phenotype\"}", items["A4GALT"][0].GetJsonString()); Assert.Equal("{\"haploinsufficiency\":\"sufficient evidence suggesting dosage sensitivity is associated with clinical phenotype\",\"triplosensitivity\":\"no evidence to suggest that dosage sensitivity is associated with clinical phenotype\"}", items["AAGAB"][0].GetJsonString()); Assert.Equal("{\"haploinsufficiency\":\"no evidence to suggest that dosage sensitivity is associated with clinical phenotype\",\"triplosensitivity\":\"no evidence to suggest that dosage sensitivity is associated with clinical phenotype\"}", items["AARS"][0].GetJsonString()); Assert.Equal("{\"haploinsufficiency\":\"gene associated with autosomal recessive phenotype\",\"triplosensitivity\":\"Not yet evaluated\"}", items["AARS2"][0].GetJsonString()); Assert.Equal("{\"haploinsufficiency\":\"gene associated with autosomal recessive phenotype\",\"triplosensitivity\":\"Not yet evaluated\"}", items["RSPH1"][0].GetJsonString()); } } } } ================================================ FILE: UnitTests/SAUtils/FusionCatcher/FusionCatcherDataSourceTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Text; using SAUtils.FusionCatcher; using VariantAnnotation.GeneFusions.SA; using VariantAnnotation.GeneFusions.Utilities; using Xunit; namespace UnitTests.SAUtils.FusionCatcher { public sealed class FusionCatcherDataSourceTests { [Fact] public void Parse_ExpectedResults() { var geneKeyToFusion = new Dictionary(); var knownEnsemblGenes = new HashSet { "ENSG00000035499", "ENSG00000155959" }; using var ms = new MemoryStream(); using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { writer.WriteLine("ENSG00000006210\tENSG00000102962"); writer.WriteLine("ENSG00000006652\tENSG00000181016"); writer.WriteLine("ENSG00000014138\tENSG00000149798"); writer.WriteLine("ENSG00000026297\tENSG00000071242"); writer.WriteLine("ENSG00000035499\tENSG00000155959"); writer.WriteLine("ENSG00000055211\tENSG00000131013"); writer.WriteLine("ENSG00000055332\tENSG00000179915"); writer.WriteLine("ENSG00000062485\tENSG00000257727"); writer.WriteLine("ENSG00000065978\tENSG00000166501"); writer.WriteLine("ENSG00000066044\tENSG00000104980"); } ms.Position = 0; FusionCatcherDataSource.Parse(ms, GeneFusionSource.OneK_Genomes_Project, CollectionType.Germline, geneKeyToFusion, knownEnsemblGenes); Assert.Single(geneKeyToFusion); ulong fusionKey = GeneFusionKey.Create(GeneFusionKey.CreateGeneKey("ENSG00000035499"), GeneFusionKey.CreateGeneKey("ENSG00000155959")); bool hasEntry = geneKeyToFusion.TryGetValue(fusionKey, out GeneFusionSourceBuilder actualBuilder); Assert.True(hasEntry); Assert.False(actualBuilder.IsPseudogenePair); Assert.False(actualBuilder.IsParalogPair); Assert.False(actualBuilder.IsReadthrough); Assert.Single(actualBuilder.GermlineSources); Assert.Empty(actualBuilder.SomaticSources); Assert.Equal(GeneFusionSource.OneK_Genomes_Project, actualBuilder.GermlineSources[0]); } [Fact] public void Parse_IncorrectFileFormat_ThrowException() { var geneKeyToFusion = new Dictionary(); var knownEnsemblGenes = new HashSet(); using var ms = new MemoryStream(); using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { writer.WriteLine("ENSG00000006210\tENSG00000102962\tENSG00000181016"); } ms.Position = 0; Assert.Throws(delegate { FusionCatcherDataSource.Parse(ms, GeneFusionSource.OneK_Genomes_Project, CollectionType.Germline, geneKeyToFusion, knownEnsemblGenes); }); } [Fact] public void Parse_MultipleCollections_ExpectedResults() { var geneKeyToFusion = new Dictionary(); var knownEnsemblGenes = new HashSet { "ENSG00000035499", "ENSG00000155959" }; using var ms = new MemoryStream(); AddData(ms); FusionCatcherDataSource.Parse(ms, GeneFusionSource.Bao_gliomas, CollectionType.Somatic, geneKeyToFusion, knownEnsemblGenes); using var ms2 = new MemoryStream(); AddData(ms2); FusionCatcherDataSource.Parse(ms2, GeneFusionSource.Readthrough, CollectionType.Relationships, geneKeyToFusion, knownEnsemblGenes); Assert.Single(geneKeyToFusion); ulong fusionKey = GeneFusionKey.Create(GeneFusionKey.CreateGeneKey("ENSG00000035499"), GeneFusionKey.CreateGeneKey("ENSG00000155959")); bool hasEntry = geneKeyToFusion.TryGetValue(fusionKey, out GeneFusionSourceBuilder actualBuilder); Assert.True(hasEntry); Assert.False(actualBuilder.IsPseudogenePair); Assert.False(actualBuilder.IsParalogPair); Assert.True(actualBuilder.IsReadthrough); Assert.Empty(actualBuilder.GermlineSources); Assert.Single(actualBuilder.SomaticSources); } private static void AddData(MemoryStream ms) { using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { writer.WriteLine("ENSG00000035499\tENSG00000155959"); } ms.Position = 0; } } } ================================================ FILE: UnitTests/SAUtils/FusionCatcher/GeneFusionSourceWriterTests.cs ================================================ using System; using System.IO; using Genome; using SAUtils.FusionCatcher; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.GeneFusions.SA; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using Xunit; namespace UnitTests.SAUtils.FusionCatcher { public sealed class GeneFusionSourceWriterTests { [Fact] public void GeneFusionSourceWriter_ExpectedResults() { (uint[] expectedOncogeneKeys, GeneFusionSourceCollection[] expectedIndex, GeneFusionIndexEntry[] expectedIndexEntries) = GetKeyToGeneFusion(); IDataSourceVersion expectedVersion = new DataSourceVersion("FusionCatcher", "1.33", DateTime.Now.Ticks, "gene fusions"); const string expectedJsonKey = "fusionCatcher"; using var ms = new MemoryStream(); using (var writer = new GeneFusionSourceWriter(ms, expectedJsonKey, expectedVersion, true)) { writer.Write(expectedOncogeneKeys, expectedIndex, expectedIndexEntries); } ms.Position = 0; uint[] actualOncogeneKeys; GeneFusionSourceCollection[] actualIndex; GeneFusionIndexEntry[] actualIndexEntries; IDataSourceVersion actualVersion; string actualJsonKey; GenomeAssembly actualAssembly; using (var reader = new GeneFusionSourceReader(ms)) { reader.LoadAnnotations(); actualOncogeneKeys = reader.OncogeneKeys; actualIndex = reader.Index; actualIndexEntries = reader.IndexEntries; actualVersion = reader.Version; actualJsonKey = reader.JsonKey; actualAssembly = reader.Assembly; } Assert.Equal(expectedVersion, actualVersion, new DataSourceVersionComparer()); Assert.Equal(expectedJsonKey, actualJsonKey); Assert.Equal(expectedOncogeneKeys, actualOncogeneKeys); Assert.Equal(expectedIndex.Length, actualIndex.Length); Assert.Equal(expectedIndex, actualIndex); Assert.Equal(expectedIndexEntries.Length, actualIndexEntries.Length); Assert.Equal(expectedIndexEntries, actualIndexEntries); Assert.Equal(GenomeAssembly.Unknown, actualAssembly); } internal static (uint[] OncogeneKeys, GeneFusionSourceCollection[] Index, GeneFusionIndexEntry[] IndexEntries) GetKeyToGeneFusion() { uint[] oncogeneKeys = {123}; var index = new GeneFusionSourceCollection[3]; var fusionsWithBothSources = new GeneFusionSourceBuilder {IsParalogPair = true}; fusionsWithBothSources.GermlineSources.Add(GeneFusionSource.OneK_Genomes_Project); fusionsWithBothSources.GermlineSources.Add(GeneFusionSource.Healthy_strong_support); fusionsWithBothSources.GermlineSources.Add(GeneFusionSource.Illumina_BodyMap2); fusionsWithBothSources.SomaticSources.Add(GeneFusionSource.Alaei_Mahabadi_18_Cancers); fusionsWithBothSources.SomaticSources.Add(GeneFusionSource.CCLE); index[0] = fusionsWithBothSources.Create(); var germlineFusions = new GeneFusionSourceBuilder {IsPseudogenePair = true, IsReadthrough = true}; germlineFusions.GermlineSources.Add(GeneFusionSource.CACG); germlineFusions.GermlineSources.Add(GeneFusionSource.ConjoinG); germlineFusions.GermlineSources.Add(GeneFusionSource.Healthy_prefrontal_cortex); germlineFusions.GermlineSources.Add(GeneFusionSource.Duplicated_Genes_Database); index[1] = germlineFusions.Create(); var somaticFusions = new GeneFusionSourceBuilder(); somaticFusions.SomaticSources.Add(GeneFusionSource.CCLE_Vellichirammal); somaticFusions.SomaticSources.Add(GeneFusionSource.Cancer_Genome_Project); index[2] = somaticFusions.Create(); var indexEntries = new GeneFusionIndexEntry[] { new(1000, 0), new(2000, 1), new(3000, 2) }; return (oncogeneKeys, index, indexEntries); } } } ================================================ FILE: UnitTests/SAUtils/FusionCatcher/IndexBuilderTests.cs ================================================ using System.Collections.Generic; using SAUtils.FusionCatcher; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.GeneFusions.SA; using Xunit; namespace UnitTests.SAUtils.FusionCatcher { public sealed class IndexBuilderTests { [Fact] public void Convert_ExpectedResults() { var expectedSourceCollection = new GeneFusionSourceCollection(false, true, false, new[] {GeneFusionSource.OneK_Genomes_Project, GeneFusionSource.Healthy}, new[] {GeneFusionSource.Alaei_Mahabadi_18_Cancers}); var expectedSourceCollection2 = new GeneFusionSourceCollection(false, true, false, null, null); var expectedIndexEntries = new GeneFusionIndexEntry[] { new(1000, 0), new(2000, 0), new(3000, 0), new(4000, 1), }; Dictionary geneKeyToSourceBuilder = GetGeneKeyToSourceBuilder(); (GeneFusionSourceCollection[] actualIndex, GeneFusionIndexEntry[] actualIndexEntries) = IndexBuilder.Convert(geneKeyToSourceBuilder); Assert.Equal(2, actualIndex.Length); Assert.Equal(expectedSourceCollection, actualIndex[0]); // most common entry first Assert.Equal(expectedSourceCollection2, actualIndex[1]); Assert.Equal(4, actualIndexEntries.Length); Assert.Equal(expectedIndexEntries, actualIndexEntries); } private static Dictionary GetGeneKeyToSourceBuilder() { var builder = new GeneFusionSourceBuilder { IsParalogPair = true, GermlineSources = {GeneFusionSource.OneK_Genomes_Project, GeneFusionSource.Healthy}, SomaticSources = {GeneFusionSource.Alaei_Mahabadi_18_Cancers} }; var builder2 = new GeneFusionSourceBuilder { IsParalogPair = true, GermlineSources = {GeneFusionSource.OneK_Genomes_Project, GeneFusionSource.Healthy}, SomaticSources = {GeneFusionSource.Alaei_Mahabadi_18_Cancers} }; var builder3 = new GeneFusionSourceBuilder { IsParalogPair = true, GermlineSources = {GeneFusionSource.OneK_Genomes_Project, GeneFusionSource.Healthy}, SomaticSources = {GeneFusionSource.Alaei_Mahabadi_18_Cancers} }; var builder4 = new GeneFusionSourceBuilder { IsParalogPair = true }; return new Dictionary { [1000] = builder, [2000] = builder2, [3000] = builder3, [4000] = builder4 }; } } } ================================================ FILE: UnitTests/SAUtils/GERP/GerpParserTests.cs ================================================ using System.IO; using System.Linq; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.GERP { public sealed class GerpParserTests { private static Stream GetGerpWigStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("#bedGraph section 1:12646-13697\n" + "1\t12646\t12647\t0.298\n" + "1\t12647\t12648\t2.63\n" + "1\t12648\t12649\t1.87\n" + "1\t12649\t12650\t0.252\n" + "1\t12650\t12651\t-2.06\n" + "1\t12651\t12652\t2.61\n" + "1\t12652\t12653\t3.97\n" + "1\t12653\t12654\t4.9\n" + "1\t12654\t12655\t1.98\n" + "1\t12655\t12656\t4.72"); writer.Flush(); stream.Position = 0; return stream; } private static Stream GetGerpTsvStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("#chrom\tpos\tGERP\n" + "1\t10000\t0\n" + "1\t12596\t-0.159\n" + "1\t12597\t0.848\n" + "1\t12598\t0.848\n" + "1\t12599\t-1.13\n" + "1\t12600\t-0.649\n" + "1\t12601\t0.698\n" + "1\t12602\t-0.194\n" + "1\t12603\t0.848\n" + "1\t12604\t-0.479\n" + "1\t12605\t0.848"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void ReadWigItems() { var nucleotides = new[] {"N"}; var parserSettings = new ParserSettings( new ColumnIndex(0, 2, null, null, 3, null), nucleotides, GenericScoreParser.NonConflictingScore ); using (var streamReader = new StreamReader(GetGerpWigStream())) using (var scoreParser = new GenericScoreParser(parserSettings, streamReader, ChromosomeUtilities.RefNameToChromosome)) { GenericScoreItem[] items = scoreParser.GetItems().ToArray(); Assert.Equal(10, items.Length); } } [Fact] public void ReadTsvItems() { var nucleotides = new[] {"N"}; var parserSettings = new ParserSettings( new ColumnIndex(0, 1, null, null, 2, null), nucleotides, GenericScoreParser.NonConflictingScore ); using (var streamReader = new StreamReader(GetGerpTsvStream())) using (var scoreParser = new GenericScoreParser(parserSettings, streamReader, ChromosomeUtilities.RefNameToChromosome)) { GenericScoreItem[] items = scoreParser.GetItems().ToArray(); Assert.Equal(11, items.Length); } } [Fact] public void TestScientificNotationScore() { var writer = new StreamWriter(new MemoryStream()); writer.WriteLine("#chr\tpos\tscore"); writer.WriteLine("21\t21757144\t-2.57"); writer.WriteLine("21\t21757145\t3.7e-5"); writer.Flush(); writer.BaseStream.Position = 0; var parserSettings = new ParserSettings( new ColumnIndex(0, 1, null, null, 2, null), new[] {"N"}, GenericScoreParser.NonConflictingScore ); using (var streamReader = new StreamReader(writer.BaseStream)) using (var reader = new GenericScoreParser(parserSettings, streamReader, ChromosomeUtilities.RefNameToChromosome)) { GenericScoreItem[] genericScoreItems = reader.GetItems().ToArray(); Assert.Equal(2, genericScoreItems.Length); Assert.Equal(-2.57, genericScoreItems[0].Score); Assert.Equal(0.000037, genericScoreItems[1].Score); } } } } ================================================ FILE: UnitTests/SAUtils/GERP/GerpReaderTests.cs ================================================ using System.IO; using VariantAnnotation.GenericScore; using Xunit; namespace UnitTests.SAUtils.GERP; public sealed class GerpReaderTests { /// /// This test is used to test backward compatibility with reader and writer. /// We do use schema versions to keep them in sync, but if one forgets to update /// the schema version, then the reader will fail. /// There are other tests that consider writing and reading in the same loop, /// however, in that case, a new code tests the writer and reader. /// Contrasting with this case, it will test the backward compatibility of the reader /// in case the reader code has a breaking change that prevents it from reading /// the old score files. /// [Fact] public void TestReadGerpData() { // This is the raw data from the files as byte array generated using wig file with one position // 1 12646 12647 0.298 var indexStreamRaw = new byte[] { 137, 78, 73, 82, 13, 10, 26, 10, 100, 25, 1, 0, 202, 250, 153, 145, 3, 135, 195, 225, 240, 2, 4, 71, 101, 114, 112, 8, 49, 49, 49, 49, 49, 49, 49, 49, 128, 128, 188, 209, 129, 179, 218, 238, 4, 59, 80, 97, 116, 104, 111, 103, 101, 110, 105, 99, 105, 116, 121, 32, 115, 99, 111, 114, 101, 115, 32, 111, 102, 32, 109, 105, 115, 115, 101, 110, 115, 101, 32, 118, 97, 114, 105, 97, 110, 116, 115, 32, 112, 114, 101, 100, 105, 99, 116, 101, 100, 32, 98, 121, 32, 71, 101, 114, 112, 22, 1, 0, 1, 231, 98, 21, 83, 1, 0, 1, 0, 0, 223, 79, 141, 151, 110, 18, 211, 63, 4, 103, 101, 114, 112, 5, 115, 99, 111, 114, 101, 1, 1, 78, 192, 132, 61 }; var dataStreamRaw = new byte[] { 137, 78, 73, 82, 13, 10, 26, 10, 112, 23, 1, 0, 202, 250, 153, 145, 3, 135, 195, 225, 240, 40, 181, 47, 253, 160, 128, 132, 30, 0, 92, 0, 0, 24, 0, 0, 255, 1, 0, 250, 255, 57, 24, 2, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 2, 0, 16, 255, 3, 36, 4, 255, 78, 73, 82, 255 }; using (var dataStream = new MemoryStream(dataStreamRaw)) using (var indexStream = new MemoryStream(indexStreamRaw)) { var scoreReader = ScoreReader.Read(dataStream, indexStream); Assert.Equal(0.298, scoreReader.GetScore(0, 12647, "A")); } } } ================================================ FILE: UnitTests/SAUtils/GeneAnnotationsTest.cs ================================================ using System; using System.Collections.Generic; using System.IO; using SAUtils; using SAUtils.DataStructures; using SAUtils.Omim; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Xunit; namespace UnitTests.SAUtils { public sealed class GeneAnnotationsTest { private static Dictionary> GetGeneAnnotations() { var omimJsonSchema = OmimSchema.Get(); return new Dictionary> { { "gene1", new List { new OmimItem("gene1", "gene name 1 (\'minibrain\', Drosophila, homolog of)", "describing gene 1\n\"some citation\"", 123, new List { new OmimItem.Phenotype(1, "disease 1", "This is disease 1", OmimItem.Mapping.mapping_of_the_wildtype_gene, new [] {OmimItem.Comment.unconfirmed_or_possibly_spurious_mapping}, new HashSet {"autosomal recessive"}, omimJsonSchema.GetSubSchema("phenotypes")) }, omimJsonSchema) } }, { "gene2", new List { new OmimItem("gene2", "gene name 2","", 124, new List { new OmimItem.Phenotype( 2, "disease 2", "COVID-19", OmimItem.Mapping.chromosome_deletion_or_duplication_syndrome, new [] {OmimItem.Comment.nondiseases}, new HashSet {"whatever", "never-ever"}, omimJsonSchema.GetSubSchema("phenotypes")) }, omimJsonSchema) } } }; } [Fact] public void ReadBackGeneAnnotations() { NgaReader reader; var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks); const string jsonKey = "mimo"; const bool isArray = true; using (var ms = new MemoryStream()) { using (var writer = new NgaWriter(ms, version, jsonKey, SaCommon.SchemaVersion, isArray, true)) { writer.Write(GetGeneAnnotations()); } ms.Position = 0; reader = NgaReader.Read(ms); } Assert.NotNull(reader); Assert.Null(reader.GetAnnotation("gene3")); Assert.Equal("[{\"mimNumber\":123,\"geneName\":\"gene name 1 ('minibrain', Drosophila, homolog of)\",\"description\":\"describing gene 1\\n\\\"some citation\\\"\",\"phenotypes\":[{\"phenotype\":\"disease 1\",\"description\":\"This is disease 1\",\"mapping\":\"mapping of the wildtype gene\",\"inheritances\":[\"autosomal recessive\"],\"comments\":[\"unconfirmed or possibly spurious mapping\"]}]}]", reader.GetAnnotation("gene1")); Assert.Equal("[{\"mimNumber\":124,\"geneName\":\"gene name 2\",\"phenotypes\":[{\"phenotype\":\"disease 2\",\"description\":\"COVID-19\",\"mapping\":\"chromosome deletion or duplication syndrome\",\"inheritances\":[\"whatever\",\"never-ever\"],\"comments\":[\"nondiseases\"]}]}]", reader.GetAnnotation("gene2")); } } } ================================================ FILE: UnitTests/SAUtils/GenericScoreParserTests/GenericScoreParserTests.cs ================================================ using System.IO; using System.Linq; using ErrorHandling.Exceptions; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.GenericScoreParserTests { public sealed class GenericScoreParserTests { private ParserSettings _parserSettings = new( new ColumnIndex(0, 2, 3, 4, 5, null), new[] {"A", "C", "G", "T"}, GenericScoreParser.MaxRepresentativeScores ); [Fact] public void TestParserNonNumericValues() { var writer = new StreamWriter(new MemoryStream()); writer.WriteLine("#chr\tpos\tref\talt\tscore"); writer.WriteLine("1\t10003\t10003\tA\tG\taskdlj"); writer.WriteLine("asd\t10003\t10003\tA\tG\taskdlj"); writer.Flush(); writer.BaseStream.Position = 0; using (var streamReader = new StreamReader(writer.BaseStream)) using (var reader = new GenericScoreParser(_parserSettings, streamReader, ChromosomeUtilities.RefNameToChromosome)) { GenericScoreItem[] genericScoreItems = reader.GetItems().ToArray(); Assert.Empty(genericScoreItems); } } [Fact] public void TestMaxScore() { var writer = new StreamWriter(new MemoryStream()); writer.WriteLine("#chr\tpos\tref\talt\tscore"); writer.WriteLine("1\t10003\t10003\tA\tG\t0.1"); writer.WriteLine("1\t10003\t10003\tA\tG\t0.5"); writer.Flush(); writer.BaseStream.Position = 0; _parserSettings = new ParserSettings( new ColumnIndex(0, 2, 3, 4, 5, null), new[] {"A", "C", "G", "T"}, GenericScoreParser.MaxRepresentativeScores ); using (var streamReader = new StreamReader(writer.BaseStream)) using (var reader = new GenericScoreParser(_parserSettings, streamReader, ChromosomeUtilities.RefNameToChromosome)) { GenericScoreItem[] genericScoreItems = reader.GetItems().ToArray(); Assert.Single(genericScoreItems); Assert.Equal(0.5, genericScoreItems[0].Score); } } [Fact] public void TestMinScore() { var writer = new StreamWriter(new MemoryStream()); writer.WriteLine("#chr\tpos\tref\talt\tscore"); writer.WriteLine("1\t10003\t10003\tA\tG\t0.1"); writer.WriteLine("1\t10003\t10003\tA\tG\t0.5"); writer.Flush(); writer.BaseStream.Position = 0; _parserSettings = new ParserSettings( new ColumnIndex(0, 2, 3, 4, 5, null), new[] {"A", "C", "G", "T"}, GenericScoreParser.MinRepresentativeScores ); using (var streamReader = new StreamReader(writer.BaseStream)) using (var reader = new GenericScoreParser(_parserSettings, streamReader, ChromosomeUtilities.RefNameToChromosome)) { GenericScoreItem[] genericScoreItems = reader.GetItems().ToArray(); Assert.Single(genericScoreItems); Assert.Equal(0.1, genericScoreItems[0].Score); } } [Fact] public void TestNonConflictingScores() { var writer = new StreamWriter(new MemoryStream()); writer.WriteLine("#chr\tpos\tref\talt\tscore"); writer.WriteLine("1\t10003\t10003\tA\tG\t0.1"); writer.WriteLine("1\t10003\t10003\tA\tG\t0.5"); writer.Flush(); writer.BaseStream.Position = 0; _parserSettings = new ParserSettings( new ColumnIndex(0, 2, 3, 4, 5, null), new[] {"A", "C", "G", "T"}, GenericScoreParser.NonConflictingScore ); using (var streamReader = new StreamReader(writer.BaseStream)) using (var reader = new GenericScoreParser(_parserSettings, streamReader, ChromosomeUtilities.RefNameToChromosome)) { Assert.Throws(() => reader.GetItems().ToArray()); } } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/AlleleReaderTests.cs ================================================ using System.IO; using System.Linq; using Genome; using SAUtils.InputFileParsers; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using Variants; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class AlleleReaderTests { private static Stream GetAncestralAlleleStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##AncestralAllele"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("1\t13284\trs548333521\tG\tA\t100\tPASS\tAC=7;AF=0.00139776;AN=5008;NS=2504;DP=26384;EAS_AF=0.001;AMR_AF=0;AFR_AF=0.0045;EUR_AF=0;SAS_AF=0;AA=g|||;VT=SNP;EAS_AN=1008;EAS_AC=1;EUR_AN=1006;EUR_AC=0;AFR_AN=1322;AFR_AC=6;AMR_AN=694;AMR_AC=0;SAS_AN=978;SAS_AC=0"); writer.WriteLine("1\t13289\trs568318295\tC\tT\t100\tPASS\tAC=3;AF=0.000599042;AN=5008;NS=2504;DP=25361;EAS_AF=0.003;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=c|||;VT=SNP;EAS_AN=1008;EAS_AC=3;EUR_AN=1006;EUR_AC=0;AFR_AN=1322;AFR_AC=0;AMR_AN=694;AMR_AC=0;SAS_AN=978;SAS_AC=0"); writer.WriteLine("1\t13313\trs527952245\tT\tG\t100\tPASS\tAC=1;AF=0.000199681;AN=5008;NS=2504;DP=20943;EAS_AF=0;AMR_AF=0;AFR_AF=0;EUR_AF=0.001;SAS_AF=0;AA=t|||;VT=SNP;EAS_AN=1008;EAS_AC=0;EUR_AN=1006;EUR_AC=1;AFR_AN=1322;AFR_AC=0;AMR_AN=694;AMR_AC=0;SAS_AN=978;SAS_AC=0"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItems_test() { var sequence = new SimpleSequence(new string('T', VariantUtils.MaxUpstreamLength) + "G" + new string('T', 13289 - 13284) + "C" + new string('T', 13313 - 13289) + "T", 13284 - 1 - VariantUtils.MaxUpstreamLength); var seqProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh37, sequence, ChromosomeUtilities.RefNameToChromosome); var reader = new AncestralAlleleReader(new StreamReader(GetAncestralAlleleStream()), seqProvider); var items = reader.GetItems().ToList(); Assert.Equal(3, items.Count); Assert.Equal("\"g\"", items[0].GetJsonString()); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/ClinGenTests.cs ================================================ using System.IO; using System.Linq; using SAUtils.InputFileParsers.ClinGen; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class ClinGenTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("nsv530705\t1\t564405\t8597804\t0\t1\tcopy_number_loss\tpathogenic\tFalse\tDevelopmental delay AND/OR other significant developmental or morphological phenotypes\t"); writer.WriteLine("nsv530706\t1\t564424\t3262790\t0\t1\tcopy_number_loss\tpathogenic\tFalse\tAbnormal facial shape,Abnormality of cardiac morphology,Global developmental delay,Muscular hypotonia\tHP:0001252,HP:0001263,HP:0001627,HP:0001999,MedGen:CN001147,MedGen:CN001157,MedGen:CN001482,MedGen:CN001810"); writer.WriteLine("nsv530300\t1\t728138\t5066371\t1\t0\tcopy_number_gain\tpathogenic\tFalse\tAbnormality of cardiac morphology,Cleft palate,Global developmental delay\tHP:0000175,HP:0001263,HP:0001627,MedGen:C2240378,MedGen:CN001157,MedGen:CN001482"); writer.WriteLine("nsv530780\t1\t807685\t2574042\t1\t1\tcopy_number_variation\tpathogenic\tFalse\tDevelopmental delay AND/OR other significant developmental or morphological phenotypes,Global developmental delay,Hirsutism,Obesity,Seizure,Short stature\tHP:0001007,HP:0001250,HP:0001263,HP:0001513,HP:0004322,MedGen:C0019572,MedGen:C0349588,MedGen:C1959629,MedGen:C1963185,MedGen:CN001157"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItems() { using (var reader = new ClinGenReader(new StreamReader(GetStream()), ChromosomeUtilities.RefNameToChromosome)) { var items = reader.GetItems().ToList(); Assert.Equal(4, items.Count); Assert.Equal("\"chromosome\":\"1\",\"begin\":564405,\"end\":8597804,\"variantType\":\"copy_number_loss\",\"id\":\"nsv530705\",\"clinicalInterpretation\":\"pathogenic\",\"phenotypes\":[\"Developmental delay AND/OR other significant developmental or morphological phenotypes\"],\"observedLosses\":1", items[0].GetJsonString()); } } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/ClinVarXmlReaderTests.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using IO; using Moq; using SAUtils.DataStructures; using SAUtils.InputFileParsers.ClinVar; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Providers; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class ClinVarXmlReaderTests { private static ISequenceProvider GetSequenceProvider(GenomeAssembly assembly, int start, string refSequence) { var seqProvider = new Mock(); seqProvider.Setup(x => x.RefNameToChromosome).Returns(ChromosomeUtilities.RefNameToChromosome); seqProvider.Setup(x => x.Assembly).Returns(assembly); seqProvider.Setup(x => x.Sequence).Returns(new SimpleSequence(refSequence, start - 1)); return seqProvider.Object; } [Fact] public void BasicReadTest() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 41234419, "A"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000077146.xml")),Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); var clinVarItem = items.First(); Assert.Equal("RCV000077146.3", clinVarItem.Id); Assert.Equal("17", clinVarItem.Chromosome.EnsemblName); Assert.Equal(41234419, clinVarItem.Position); Assert.Equal("A", clinVarItem.RefAllele); Assert.Equal("C", clinVarItem.AltAllele); Assert.Equal("2019-12-15", new DateTime(clinVarItem.LastUpdatedDate).ToString("yyyy-MM-dd")); Assert.Equal(clinVarItem.AlleleOrigins, new List { "germline" }); Assert.Equal("C2676676", clinVarItem.MedGenIds.First()); Assert.Equal("145", clinVarItem.OrphanetIds.First()); Assert.Equal("604370", clinVarItem.OmimIds.First()); Assert.Equal("Breast-ovarian cancer, familial 1", clinVarItem.Phenotypes.First()); } [Fact] public void RCV000001373_NoExtraOmimId() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 3209662, "AGCAGACGGGCA"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000001373.xml")), Stream.Null, sequenceProvider); var clinVarItems = reader.GetRcvItems().ToArray(); Assert.Single(clinVarItems); var clinVarItem = clinVarItems[0]; Assert.Equal("RCV000001373.3", clinVarItem.Id); var omimIds = clinVarItem.OmimIds; Assert.Single(omimIds); Assert.Equal("610206.0007", omimIds.First()); } [Fact] public void RCV000435546_NotMissing() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 110221557, "CGCGG"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000435546.xml")), Stream.Null, sequenceProvider); var clinVarItems = reader.GetRcvItems(); Assert.True(clinVarItems.Any()); } [Fact] public void MissingAltAllele() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 118165691, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000120902.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal("C", clinVarItem.RefAllele); Assert.Equal("G", clinVarItem.AltAllele); } } [Fact] public void NonEnglishChars() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 225592188, "TAGAAGA"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000087262.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal("Pelger-Huët anomaly", clinVarItem.Phenotypes.First()); } } [Fact] public void WrongPosition() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 112064826, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000073701.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { switch (clinVarItem.Position) { case 112064826: Assert.Equal("G", clinVarItem.RefAllele); Assert.Equal("C", clinVarItem.AltAllele); break; default: throw new InvalidDataException($"Unexpected clinvar item start point : {clinVarItem.Position}"); } } } [Fact] public void PubmedTest1() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 10183453, "AGCGCGCACGCAGCTCCGCCCC"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000152657.xml")),Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal( new List { 12114475, 18836774, 22357542, 24033266 }, clinVarItem.PubmedIds); } } [Fact] public void PubmedTest2() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 5247992, "CAAAG"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000016673.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(new List { 6826539, 9113933, 9845707, 12000828, 12383672 }, clinVarItem.PubmedIds); } } [Fact] public void PubmedTest3() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 55259485, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000038438.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal( new List { 17285735, 17877814, 22848293, 24033266 }, clinVarItem.PubmedIds); } } [Fact] public void PubmedTest4() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 43609944, "GCTGT"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000021819.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal( new List { 7595167, 8099202, 8612479 }, clinVarItem.PubmedIds); } } [Fact] public void PubmedTest5() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 88907409, "A"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000000734.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal("699", clinVarItem.VariationId); Assert.Null(clinVarItem.PubmedIds); Assert.Contains("\"variationId\":\"699\"", clinVarItem.GetJsonString()); } } [Fact] public void PubmedTest6() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 118165691, "C"); //extracting from SCV record var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000120902.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(clinVarItem.PubmedIds, new List { 24728327 }); } } [Fact] public void PubmedTest7_comma_trimming() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 41258568, "A"); //extracting from SCV record var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000167792.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(clinVarItem.PubmedIds, new List { 23239986, 28492532, 30472649 }); } } [Fact] public void MultiScvPubmed() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 15589551, "AG"); //extracting from SCV record var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000194003.xml")) , Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(clinVarItem.PubmedIds, new List {25741868, 26092869}); } } [Fact] public void NoClinVarItem_due_to_ref_mismatch() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 90982267, "A"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000000101.xml")), Stream.Null, sequenceProvider); Assert.False(reader.GetRcvItems().Any()); } [Fact] public void ClinVarForRef() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 31496350, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000124712.xml")), Stream.Null, sequenceProvider); var clinVarList = new List(); foreach (var clinVarItem in reader.GetRcvItems()) { clinVarList.Add(clinVarItem); Assert.Equal(clinVarItem.RefAllele, clinVarItem.AltAllele); } Assert.Single(clinVarList); } [Fact] public void MultiplePhenotypes() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 172659738, "C"); //no citations show up for this RCV in the website. But the XML has these pubmed ids under fields that we parse pubmed ids from var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000144179.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { var expectedPhenotypes = new List { "Single ventricle", "small Atrial septal defect" }; Assert.Equal(expectedPhenotypes, clinVarItem.Phenotypes); } } [Fact] public void MultipleOrigins() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 18671566, "G"); //no citations show up for this RCV in the website. But the XML has these pubmed ids under fields that we parse pubmed ids from var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000080071.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { var expectedOrigins = new List { "germline", "maternal", "unknown" }; Assert.Equal(expectedOrigins, clinVarItem.AlleleOrigins); } } [Fact] public void SkipGeneralCitations() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 67705958, "G"); //no citations show up for this RCV in the website. But the XML has these pubmed ids under fields that we parse pubmed ids from var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000003254.xml")), Stream.Null, sequenceProvider); foreach (var clinVarItem in reader.GetRcvItems()) { Assert.Equal(clinVarItem.PubmedIds, new List { 12023369, 17068223, 17447842, 17587057, 17786191, 17804789, 18438406, 19122664, 20228799 }); } } [Fact] public void IndelTest() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 187122303, "TCATACAGGTCATCGCT"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000032548.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal("RCV000032548.8", clinVarItem.Id); switch (clinVarItem.Id) { case "RCV000032548.8": Assert.Equal("4", clinVarItem.Chromosome.EnsemblName); Assert.Equal(187122303, clinVarItem.Position); Assert.Equal(17, clinVarItem.RefAllele.Length); Assert.Equal("GC", clinVarItem.AltAllele); break; } } } [Fact] [Trait("jira", "NIR-2034")] public void MultiScvPubmeds() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 116411990, "C"); //extracting from SCV record var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000203290.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(clinVarItem.PubmedIds, new List { 23806086, 24088041, 25736269 }); } } [Fact] [Trait("jira", "NIR-2034")] public void MultipleAlleleOrigins() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 32890572, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000112977.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(2, clinVarItem.AlleleOrigins.Count()); Assert.NotEqual(clinVarItem.AlleleOrigins.First(), clinVarItem.AlleleOrigins.Last()); foreach (var origin in clinVarItem.AlleleOrigins) { Assert.True(origin == "unknown" || origin == "germline"); } } } [Fact] [Trait("jira", "NIR-2748")] public void Discard_entries_with_unknown_variant_type() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 66765160, "CAG"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000485802.xml")), Stream.Null, sequenceProvider); Assert.False(reader.GetRcvItems().Any()); } [Fact] [Trait("jira", "NIR-2035")] public void EmptyRefAndAlt() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 31805881, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000083638.xml")), Stream.Null, sequenceProvider); Assert.Empty(reader.GetRcvItems()); } [Fact] [Trait("jira", "NIR-2036")] public void SkipMicrosattelite() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 87637894, "CTG"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000005426.xml")), Stream.Null, sequenceProvider); Assert.Empty(reader.GetRcvItems()); } [Fact] public void SkipAlus() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 32893302, "TAAA"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000724338.xml")), Stream.Null, sequenceProvider); Assert.Empty(reader.GetRcvItems()); } [Fact] [Trait("jira", "NIR-2072")] public void MissingClinvarInsertion() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 2337967, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000179026.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(2337968, clinVarItem.Position); } } [Fact] [Trait("jira", "NIR-2072")] public void MissingClinvarInsertionShift() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 3751645, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000207071.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(3751646, clinVarItem.Position); } } [Fact] [Trait("jira", "NIR-2072")] public void MissingClinvarInsertionShift2() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 9324412, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000017510.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(9324413, clinVarItem.Position); } } [Fact] [Trait("jira", "NIR-2045")] public void AlternatePhenotype() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 42018227, "GTC"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000032707.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.NotNull(clinVarItem.Phenotypes); } } [Fact] [Trait("jira", "NIR-2072")] public void IupacBases() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh38, 32339320, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000113363.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); var altAlleles = new List(); foreach (var clinVarItem in items) { altAlleles.Add(clinVarItem.AltAllele); Assert.Equal(new[] {"pathogenic"}, clinVarItem.Significances); } Assert.Equal(2, altAlleles.Count); } [Fact] [Trait("jira", "NIR-2072")] public void OmitOmimFromAltPhenotypes() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 55529187, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000030349.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Single(clinVarItem.OmimIds); } } [Fact] [Trait("jira", "NIR-2072")] public void TrimSpaceFromOmimIds() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 129283520, "A"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000373191.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Single(clinVarItem.OmimIds); Assert.Equal("609060", clinVarItem.OmimIds.FirstOrDefault()); } } [Fact] [Trait("jira", "NIR-2099")] public void ClinvarInsertion() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 122318386, "A"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000153339.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(122318387, clinVarItem.Position); } } [Fact] public void Remove9DigitsPubmedId() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 534286, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000207504.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(clinVarItem.PubmedIds, new List { 16329078, 16372351, 19213030, 21438134, 25741868 }); } } [Fact] public void CaptureGeneOmimId() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 3494833, "A"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000235027.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(clinVarItem.OmimIds, new List { "601462", "610285.0001" }); } } [Fact] public void CapturePhenotypicSeriesOmimIDandUniq() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 122746325, "A"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000401212.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(clinVarItem.OmimIds, new List { "209900" }); } } [Fact] public void CapturePhenotypeSeriesOmimId() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 15513004, "GGAA"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000406351.xml")), Stream.Null, sequenceProvider); var items = reader.GetRcvItems(); Assert.True(items.Any()); foreach (var clinVarItem in items) { Assert.Equal(clinVarItem.OmimIds, new List { "213300" }); } } [Fact] public void RemoveDuplicationWithWrongRefSequence() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 10183702, "GCGGCCGCGGCCCG"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000267121.xml")), Stream.Null, sequenceProvider); Assert.False(reader.GetRcvItems().Any()); } [Fact] [Trait("jira", "NIR-2372")] public void AllelicOmimIdsForSnvs() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 111329354, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000170338.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Single(clinvarItems); var clinvarItem = clinvarItems[0]; Assert.Single(clinvarItem.OmimIds); Assert.Equal("612800.0003", clinvarItem.OmimIds.First()); } [Fact] [Trait("jira", "NIR-2372")] public void AllelicOmimIdsForDeletions() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 111335401, "GCTC"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000170338.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Single(clinvarItems); var clinvarItem = clinvarItems[0]; Assert.Single(clinvarItem.OmimIds); Assert.Equal("612800.0002", clinvarItem.OmimIds.First()); } [Fact] [Trait("jira", "NIR-2372")] public void ExcludeAllelicOmimIdsFromTraits() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 100887648, "AGAT"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000050055.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Single(clinvarItems); var clinvarItem = clinvarItems[0]; Assert.Single(clinvarItem.OmimIds); Assert.Equal("216550", clinvarItem.OmimIds.First()); } [Fact] [Trait("jira", "NIR-2372")] public void AllelicOmimIdsFromAttributeSetChrX() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 595469, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000010551.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Single(clinvarItems); foreach (var clinVarItem in clinvarItems) { Assert.Equal(2, clinVarItem.OmimIds.Count()); } } [Fact] [Trait("jira", "NIR-2372")] public void AllelicOmimIdsFromAttributeSetChrY() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 545469, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000010551.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Single(clinvarItems); foreach (var clinVarItem in clinvarItems) { Assert.Equal(2, clinVarItem.OmimIds.Count()); } } [Fact] [Trait("jira", "NIR-2372")] public void MultipleEntryRecordVariant1() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 8045031, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000007484.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Single(clinvarItems); } [Fact] [Trait("jira", "NIR-2372")] public void MultipleEntryRecordVariant2() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 8021910, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000007484.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Single(clinvarItems); } [Fact] [Trait("jira", "NIR-2372")] public void SkipMicrosatellitesWithoutAltAllele() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 46191240, "ATTCT"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000001054.xml")), Stream.Null, sequenceProvider); Assert.False(reader.GetRcvItems().Any()); } [Fact] [Trait("jira", "NIR-2029")] public void MissingClinvarInsertion2() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh38, 132903738, "A"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000342164.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Single(clinvarItems); } [Fact] public void Skip_entries_with_inconsistant_start_end() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 132903739, "AAACGCTCATAGAGTAACTGGTTGTGCAGTAAAAGCAACTGGTCTCAAACGCTCATAGAGTAACTGGTTGTGCAGTAAAAGCAACTGGTCTC"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000342164.xml")), Stream.Null, sequenceProvider); Assert.False(reader.GetRcvItems().Any()); } [Fact] public void Alternate_phenotypes() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 204732740, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000537563.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Single(clinvarItems[0].Phenotypes); } [Fact] public void Mising_entry() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 36888396, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000171474.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Equal("",clinvarItems[0].RefAllele); } [Fact] public void Multiple_significance() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh38, 72349076, "T"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000169296.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Equal(new[]{ "pathogenic", "likely pathogenic" }, clinvarItems[0].Significances); } [Fact] public void Multiple_significance_from_explanation() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh38, 12665750, "T"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000001752.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems().ToList(); Assert.Equal(new[] { "pathogenic", "uncertain significance" }, clinvarItems[0].Significances); } [Fact] public void Override_microsatellite_type() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 88929173, "CGAG"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000205418.xml")), Stream.Null, sequenceProvider); var clinvarItems = reader.GetRcvItems(); Assert.Single(clinvarItems); } [Fact] public void OneRcv_oneVcv() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 31496350, "C"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("RCV000124712.xml")), FileUtilities.GetReadStream(Resources.VcvXmlFiles("VCV000137106.xml")), sequenceProvider); var items = reader.GetItems().ToArray(); Assert.Equal(2,items.Length); var rcvJson = items[1].GetJsonString(); Assert.Contains("VCV000137106.3", rcvJson); } [Fact] public void TwoRcv_oneVcv() { var sequenceProvider = GetSequenceProvider(GenomeAssembly.GRCh37, 9775688, "G"); var reader = new ClinVarParser(FileUtilities.GetReadStream(Resources.ClinvarXmlFiles("Two_RCVs.xml")), FileUtilities.GetReadStream(Resources.VcvXmlFiles("VCV000618791.xml")), sequenceProvider); var items = reader.GetItems().ToArray(); Assert.Equal(3, items.Length); var rcvJson = items[1].GetJsonString(); Assert.Contains("VCV000618791.5", rcvJson); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/ClinvarVariationParserTests.cs ================================================ using System.Linq; using IO; using SAUtils.InputFileParsers.ClinVar; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class ClinvarVariationParserTests { [Fact] public void InterpretedRecordsTest() { using (var reader = new ClinVarVariationReader(FileUtilities.GetReadStream(Resources.VcvXmlFiles("TwoRecords.xml")))) { var items = reader.GetItems().ToArray(); Assert.Equal(2, items.Length); Assert.Equal(79, items[0].VariantId); Assert.Equal(ClinVarCommon.ReviewStatus.no_criteria, items[0].ReviewStatus); Assert.Equal(new []{"pathogenic"}, items[0].Significances); Assert.Equal(86, items[1].VariantId); } } [Fact] public void IncludedRecordTest() { using (var reader = new ClinVarVariationReader(FileUtilities.GetReadStream(Resources.VcvXmlFiles("VCV000431749.xml")))) { var items = reader.GetItems().ToArray(); Assert.Equal(ClinVarCommon.ReviewStatus.no_interpretation_single, items[0].ReviewStatus); Assert.Equal(new []{"no interpretation for the single variant"}, items[0].Significances); } } [Fact] public void SignificanceTest() { using (var reader = new ClinVarVariationReader(FileUtilities.GetReadStream(Resources.VcvXmlFiles("VCV000476472.xml")))) { var items = reader.GetItems().ToArray(); Assert.Equal(new []{"likely benign","uncertain significance"}, items[0].Significances); } } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/CosmicCnvReaderTests.cs ================================================ using System.IO; using System.Linq; using Genome; using SAUtils.ExtractCosmicSvs; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class CosmicCnvReaderTests { [Fact] public void GetColumnIndices_valid_header() { const string header = @"CNV_ID ID_GENE gene_name ID_SAMPLE ID_TUMOUR Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 SAMPLE_NAME TOTAL_CN MINOR_ALLELE MUT_TYPE ID_STUDY GRCh Chromosome:G_Start..G_Stop"; var readStream = ResourceUtilities.GetReadStream(Resources.SaPath("CosmicCNV.tsv")); var cnvReader = new CosmicCnvReader(readStream, ChromosomeUtilities.RefNameToChromosome, GenomeAssembly.GRCh37); cnvReader.GetColumnIndices(header); //we do not need an assert because not getting an exception in the last line means pass } [Fact] public void GetColumnIndices_missing_column() { const string header = @"CNV_ID ID_GENE gene_name ID_SAMPLE ID_TUMOUR Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 SAMPLE_NAME TOTAL_CN MINOR_ALLELE MUT_TYPE ID_STUDY Chromosome:G_Start..G_Stop"; var readStream = ResourceUtilities.GetReadStream(Resources.SaPath("CosmicCNV.tsv")); var cnvReader = new CosmicCnvReader(readStream, ChromosomeUtilities.RefNameToChromosome, GenomeAssembly.GRCh37); Assert.Throws(()=>cnvReader.GetColumnIndices(header)); } [Fact] public void GetEntries() { var readStream = ResourceUtilities.GetReadStream(Resources.SaPath("CosmicCNV.tsv")); var cnvReader = new CosmicCnvReader(readStream, ChromosomeUtilities.RefNameToChromosome, GenomeAssembly.GRCh37); var cnvItems = cnvReader.GetEntries(); Assert.Equal(5, cnvItems.Count()); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/CosmicItemTests.cs ================================================ using System.Collections.Generic; using SAUtils.DataStructures; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class CosmicItemTests { [Fact] public void GetCancerSiteCount_same_study() { var cosmicItem = new CosmicItem(ChromosomeUtilities.Chr1, 100, "rs101", "A", "C", "GENE0", new HashSet { new CosmicItem.CosmicStudy("100", new []{"primary histology 0", "histology subtype 1"}, new []{"primarySite 0", "site subtype 1"}), new CosmicItem.CosmicStudy("100", new []{"primary histology 0", "histology subtype 1"}, new []{"primarySite 0", "site subtype 1"}) }, 1); var counts = cosmicItem.GetTissueCounts(); Assert.Equal(2, counts.Count); Assert.Equal(1, counts["primarySite 0"]); Assert.Equal(1, counts["site subtype 1"]); } [Fact] public void GetTissueCount_different_studies() { var cosmicItem = new CosmicItem(ChromosomeUtilities.Chr1, 100, "rs101", "A", "C", "GENE0", new HashSet { new CosmicItem.CosmicStudy("100", new []{"primary histology 0", "histology subtype 1"}, new []{"primarySite 0", "site subtype 2"}), new CosmicItem.CosmicStudy("110", new []{"primary histology 0", "histology subtype 1"}, new []{"primarySite 0", "site subtype 1"}) }, 1); var counts = cosmicItem.GetTissueCounts(); Assert.Equal(3, counts.Count); Assert.Equal(2, counts["primarySite 0"]); Assert.Equal(1, counts["site subtype 1"]); Assert.Equal(1, counts["site subtype 2"]); } [Fact] public void GetCancerTypeCount_same_study() { var cosmicItem = new CosmicItem(ChromosomeUtilities.Chr1, 100, "rs101", "A", "C", "GENE0", new HashSet { new CosmicItem.CosmicStudy("100", new []{"primary histology 0", "histology subtype 1"}, new []{"primarySite 0", "site subtype 1"}), new CosmicItem.CosmicStudy("100", new []{"primary histology 0", "histology subtype 1"}, new []{"primarySite 0", "site subtype 1"}) }, 1); var cancerTypeCounts = cosmicItem.GetCancerTypeCounts(); Assert.Equal(2, cancerTypeCounts.Count); Assert.Equal(1, cancerTypeCounts["primary histology 0"]); Assert.Equal(1, cancerTypeCounts["histology subtype 1"]); } [Fact] public void GetCancerTypeCount_different_studies() { var cosmicItem = new CosmicItem(ChromosomeUtilities.Chr1, 100, "rs101", "A", "C", "GENE0", new HashSet { new CosmicItem.CosmicStudy("100", new []{"primary histology 0", "histology subtype 1"}, new []{"primarySite 0", "site subtype 1"}), new CosmicItem.CosmicStudy("101", new []{"primary histology 0", "histology subtype 2"}, new []{"primarySite 0", "site subtype 1"}) }, 1); var cancerTypeCounts = cosmicItem.GetCancerTypeCounts(); Assert.Equal(3, cancerTypeCounts.Count); Assert.Equal(2, cancerTypeCounts["primary histology 0"]); Assert.Equal(1, cancerTypeCounts["histology subtype 1"]); Assert.Equal(1, cancerTypeCounts["histology subtype 2"]); } [Fact] public void GetJsonString() { var cosmicItem = new CosmicItem(ChromosomeUtilities.Chr1, 100, "rs101", "A", "C", "GENE0", new HashSet { new CosmicItem.CosmicStudy("100", new []{"primary histology 0", "histology subtype 1"}, new []{"primarySite 0", "site subtype 1"}), new CosmicItem.CosmicStudy("101", new []{"primary histology 0", "histology subtype 2"}, new []{"primarySite 0", "site subtype 1"}) }, 1); Assert.Equal("\"id\":\"rs101\",\"refAllele\":\"A\",\"altAllele\":\"C\",\"gene\":\"GENE0\",\"sampleCount\":1,\"cancerTypesAndCounts\":[{\"cancerType\":\"primary histology 0\",\"count\":2},{\"cancerType\":\"histology subtype 1\",\"count\":1},{\"cancerType\":\"histology subtype 2\",\"count\":1}],\"cancerSitesAndCounts\":[{\"cancerSite\":\"primarySite 0\",\"count\":2},{\"cancerSite\":\"site subtype 1\",\"count\":2}]", cosmicItem.GetJsonString()); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/DataSourceVersionTests.cs ================================================ using System; using IO; using OptimizedCore; using SAUtils.InputFileParsers; using UnitTests.TestUtilities; using VariantAnnotation.Providers; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class DataSourceVersionTests { [Fact] public void ReadDataVersionFromFile() { DataSourceVersion version; using (var reader = new DataSourceVersionReader(FileUtilities.GetReadStream(Resources.TopPath("dbSNP.version")))) { version = reader.GetVersion(); } Assert.Equal("dbSNP", version.Name); Assert.Equal("147", version.Version); Assert.Equal(DateTime.Parse("2016-04-08").Ticks, version.ReleaseDateTicks); Assert.True(string.IsNullOrEmpty(version.Description)); Assert.Contains("dataSource=dbSNP", version.ToString());//vcf output var sb = StringBuilderPool.Get(); version.SerializeJson(sb); Assert.Contains("name\":\"dbSNP", StringBuilderPool.GetStringAndReturn(sb));//json output } [Fact] public void GetSourceVersionTest() { var versionPath = Resources.TopPath("dbSNP.version"); var version = DataSourceVersionReader.GetSourceVersion(versionPath); Assert.Equal("dbSNP", version.Name); Assert.Equal("147", version.Version); Assert.Equal(DateTime.Parse("2016-04-08").Ticks, version.ReleaseDateTicks); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/DbSnpReaderTests.cs ================================================ using System; using System.IO; using System.Linq; using Genome; using SAUtils.InputFileParsers.DbSnp; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using Variants; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class DbSnpReaderTests { [Fact] public void MissingEntry() { const string vcfLine = "1 241369 rs11490246 C T . . RS=11490246;RSPOS=241369;dbSNPBuildID=120;SSR=0;SAO=0;VP=0x050000000005000126000100;WGT=1;VC=SNV;ASP;GNO;KGPhase3;CAF=0,1;COMMON=0"; var sequenceProvider = ParserTestUtils.GetSequenceProvider(241369, "C", 'A', ChromosomeUtilities.RefNameToChromosome); var dbsnpReader = new DbSnpReader(null, sequenceProvider); var dbSnpEntry = dbsnpReader.ExtractItem(vcfLine).First(); Assert.Equal(11490246, dbSnpEntry.RsId); } [Fact] public void MissingEntry2() { const string vcfLine = "17 828 rs62053745 T C . . RS=62053745;RSPOS=828;dbSNPBuildID=129;SSR=0;SAO=0;VP=0x050100080005140136000100;WGT=1;VC=SNV;SLO;INT;ASP;VLD;GNO;KGPhase1;KGPhase3;CAF=0.2576,0.7424;COMMON=1"; var sequenceProvider = ParserTestUtils.GetSequenceProvider(828, "T", 'A', ChromosomeUtilities.RefNameToChromosome); var dbsnpReader = new DbSnpReader(null, sequenceProvider); var dbSnpEntry = dbsnpReader.ExtractItem(vcfLine).First(); Assert.Equal(62053745, dbSnpEntry.RsId); } [Fact] public void MissingDbsnpId() { const string vcfLine = "X 21505833 rs12395602 G A,C,T . . RS=12395602;RSPOS=21505833;dbSNPBuildID=120;SSR=0;SAO=0;VP=0x05010008000505051f000101;WGT=1;VC=SNV;SLO;INT;ASP;VLD;G5;HD;GNO;KGPhase1"; var sequenceProvider = ParserTestUtils.GetSequenceProvider(21505833, "G", 'G', ChromosomeUtilities.RefNameToChromosome); var dbsnpReader = new DbSnpReader(null, sequenceProvider); var dbSnpEntries = dbsnpReader.ExtractItem(vcfLine).ToList(); Assert.Equal(3, dbSnpEntries.Count); Assert.Equal("A", dbSnpEntries[0].AltAllele); Assert.Equal(12395602, dbSnpEntries[0].RsId); Assert.Equal("C", dbSnpEntries[1].AltAllele); Assert.Equal(12395602, dbSnpEntries[1].RsId); Assert.Equal("T", dbSnpEntries[2].AltAllele); Assert.Equal(12395602, dbSnpEntries[2].RsId); } [Obsolete("We should not have skipped unit tests.")] [Fact(Skip = "redo test with AlleleFrequency object")] public void NoMinorAllele() { const string vcfLine = "17 828 rs62053745 T C . . RS=62053745;RSPOS=828;dbSNPBuildID=129;SSR=0;SAO=0;VP=0x050100080005140136000100;WGT=1;VC=SNV;SLO;INT;ASP;VLD;GNO;KGPhase1;KGPhase3;CAF=.,0.7424;COMMON=1"; var sequenceProvider = ParserTestUtils.GetSequenceProvider(828, "T", 'G', ChromosomeUtilities.RefNameToChromosome); var dbsnpReader = new DbSnpReader(null, sequenceProvider); var dbSnpEntry = dbsnpReader.ExtractItem(vcfLine).First(); Assert.Equal("C", dbSnpEntry.AltAllele); } [Obsolete("We should not have skipped unit tests.")] [Fact(Skip = "redo test with AlleleFrequency object")] public void DisregardZeroFreq() { const string vcfLine = "1 241369 rs11490246 C T . . RS=11490246;RSPOS=241369;dbSNPBuildID=120;SSR=0;SAO=0;VP=0x050100000005000126000100;WGT=1;VC=SNV;SLO;ASP;GNO;KGPhase3;CAF=0,1;COMMON=0"; var sequenceProvider = ParserTestUtils.GetSequenceProvider(241369, "C", 'G', ChromosomeUtilities.RefNameToChromosome); var dbsnpReader = new DbSnpReader(null, sequenceProvider); var dbSnpEntry = dbsnpReader.ExtractItem(vcfLine).First(); Assert.Equal("T", dbSnpEntry.AltAllele); } private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##dbSNP"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("1\t10285\trs866375379\tT\tA,C\t.\t.\tRS=866375379;RSPOS=10285;dbSNPBuildID=147;SSR=0;SAO=0;VP=0x050100020005000002000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;SLO;R5;ASP"); writer.WriteLine("1\t10329\trs150969722\tAC\tA\t.\t.\tRS=150969722;RSPOS=10330;dbSNPBuildID=134;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItems_test() { var sequence = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "T" + new string('G', 10329 - 10285) + "AC", 10284 - VariantUtils.MaxUpstreamLength); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh37, sequence, ChromosomeUtilities.RefNameToChromosome); var reader = new DbSnpReader(GetStream(), sequenceProvider); var items = reader.GetItems().ToList(); Assert.Equal(3, items.Count); Assert.Equal("\"rs866375379\"", items[0].GetJsonString()); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/DecipherReaderTest.cs ================================================ using System.IO; using System.Linq; using SAUtils.InputFileParsers.Decipher; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class DecipherTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); // file has been modified to 7 columns writer.WriteLine("#population_cnv_id\tchr\tstart\tend\tdeletion_observations\tdeletion_frequency\tdeletion_standard_error\tduplication_observations\tduplication_frequency\tduplication_standard_error\tobservations\tfrequency\tstandard_error\ttype\tsample_size\tstudy"); writer.WriteLine("1\t1\t10529\t177368\t0\t0\t1\t3\t0.075\t0.555277708\t3\t0.075\t0.555277708\t1\t40\t42M calls"); writer.WriteLine("2\t1\t13516\t91073\t0\t0\t1\t27\t0.675\t0.109713431\t27\t0.675\t0.109713431\t1\t40\t42M call"); writer.WriteLine("3\t1\t18888\t35451\t0\t0\t1\t2\t0.002366864\t0.706269473\t2\t0.002366864\t0.706269473\t1\t845\tDDD"); writer.WriteLine("4\t1\t23946\t88271\t27\t0.031952663\t0.189350482\t21\t0.024852071\t0.215489247\t48\t0.056804734\t0.140178106\t0\t845\tDDD"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItemsTest() { var decipherReader = new DecipherParser(new StreamReader(GetStream()), ChromosomeUtilities.RefNameToChromosome); var items = decipherReader.GetItems().ToList(); Assert.Equal(4, items.Count); Assert.Equal("\"chromosome\":\"1\",\"begin\":10529,\"end\":177368,\"numDeletions\":0,\"deletionFrequency\":0,\"numDuplications\":3,\"duplicationFrequency\":0.075,\"sampleSize\":40", items[0].GetJsonString()); Assert.Equal("\"chromosome\":\"1\",\"begin\":13516,\"end\":91073,\"numDeletions\":0,\"deletionFrequency\":0,\"numDuplications\":27,\"duplicationFrequency\":0.675,\"sampleSize\":40", items[1].GetJsonString()); Assert.Equal("\"chromosome\":\"1\",\"begin\":18888,\"end\":35451,\"numDeletions\":0,\"deletionFrequency\":0,\"numDuplications\":2,\"duplicationFrequency\":0.002367,\"sampleSize\":845", items[2].GetJsonString()); Assert.Equal("\"chromosome\":\"1\",\"begin\":23946,\"end\":88271,\"numDeletions\":27,\"deletionFrequency\":0.031953,\"numDuplications\":21,\"duplicationFrequency\":0.024852,\"sampleSize\":845", items[3].GetJsonString()); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/DgvReaderTests.cs ================================================ using System.Collections.Generic; using System.Linq; using Compression.Utilities; using SAUtils.DataStructures; using SAUtils.InputFileParsers.DGV; using UnitTests.TestUtilities; using Variants; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class DgvReaderTests { private static readonly string TestDgvFile = Resources.TopPath("testDgvParser.txt"); private static IEnumerable CreateTruthDgvItemSequence() { yield return new DgvItem("nsv945265", ChromosomeUtilities.Chr1, 352306, 371739, 97, 10, 0, VariantType.complex_structural_alteration); yield return new DgvItem("nsv161172", ChromosomeUtilities.Chr1, 88190, 89153, 24, 0, 0, VariantType.copy_number_loss); yield return new DgvItem("nsv951399", ChromosomeUtilities.Chr1, 46501, 71800, 1, 1, 0, VariantType.copy_number_gain); yield return new DgvItem("nsv471522", ChromosomeUtilities.Chr1, 522139, 756783, 3, 3, 0, VariantType.copy_number_gain); yield return new DgvItem("nsv10161", ChromosomeUtilities.Chr1, 712111, 1708649, 31, 11, 7, VariantType.copy_number_variation); yield return new DgvItem("esv3358119", ChromosomeUtilities.Chr1, 822853, 822861, 185, 2, 0, VariantType.insertion); yield return new DgvItem("esv6890", ChromosomeUtilities.Chr1, 17006189, 17052558, 1, 0, 0, VariantType.inversion); yield return new DgvItem("esv6517", ChromosomeUtilities.Chr1, 964760, 965579, 1, 0, 0, VariantType.copy_number_loss); yield return new DgvItem("esv3310333", ChromosomeUtilities.Chr1, 17441132, 17441133, 185, 3, 0, VariantType.mobile_element_insertion); yield return new DgvItem("nsv479682", ChromosomeUtilities.Chr1, 3787207, 3787207, 9, 0, 0, VariantType.novel_sequence_insertion); yield return new DgvItem("nsv506926", ChromosomeUtilities.Chr1, 34597680, 34603680, 4, 0, 0, VariantType.structural_alteration); yield return new DgvItem("esv3302766", ChromosomeUtilities.Chr1, 38583768, 38583926, 185, 0, 0, VariantType.tandem_duplication); } [Fact] public void TestDbSnpReader() { using (var dgvReader = new DgvReader(GZipUtilities.GetAppropriateStreamReader(TestDgvFile), ChromosomeUtilities.RefNameToChromosome)) { Assert.True(dgvReader.GetItems().SequenceEqual(CreateTruthDgvItemSequence())); } } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/DgvTests.cs ================================================ using System.Collections.Generic; using SAUtils.DataStructures; using SAUtils.InputFileParsers.DGV; using UnitTests.TestUtilities; using Variants; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class DgvTests { [Fact] public void ExtractDgvCnv() { const string dgvLine = "nsv482937 1 1 2300000 CNV loss Iafrate_et_al_2004 15286789 BAC aCGH,FISH nssv2995976 M 39 0 1 ACAP3,AGRN,WASH7P "; var dgvItem = DgvReader.ExtractDgvItem(dgvLine, ChromosomeUtilities.RefNameToChromosome); var jsonString = dgvItem.GetJsonString(); Assert.Equal("\"chromosome\":\"1\",\"begin\":1,\"end\":2300000,\"variantType\":\"copy_number_loss\",\"id\":\"nsv482937\",\"sampleSize\":39,\"observedLosses\":1,\"variantFreqAll\":0.02564", jsonString ); } [Fact] public void ExtractDgvComplex() { const string dgvLine = "esv2421662 1 12841928 12971833 OTHER complex Altshuler_et_al_2010 20811451 SNP array essv5038349,essv5012238 M 1184 20 70 HNRNPCL1,LOC649330,PRAMEF1,PRAMEF10,PRAMEF11,PRAMEF2,PRAMEF4 NA10838,NA10847"; var dgvItem = DgvReader.ExtractDgvItem(dgvLine, ChromosomeUtilities.RefNameToChromosome); var jsonString = dgvItem.GetJsonString(); Assert.Equal("\"chromosome\":\"1\",\"begin\":12841928,\"end\":12971833,\"variantType\":\"complex_structural_alteration\",\"id\":\"esv2421662\",\"sampleSize\":1184,\"observedGains\":20,\"observedLosses\":70,\"variantFreqAll\":0.07601", jsonString); } [Fact] public void EmptyObservedLossesAndGains() { const string dgvLine = "nsv161172 1 88190 89153 CNV deletion Mills_et_al_2006 16902084 Sequencing nssv179750 M 24 "; var dgvItem = DgvReader.ExtractDgvItem(dgvLine, ChromosomeUtilities.RefNameToChromosome); var jsonString = dgvItem.GetJsonString(); Assert.Equal("\"chromosome\":\"1\",\"begin\":88190,\"end\":89153,\"variantType\":\"copy_number_loss\",\"id\":\"nsv161172\",\"sampleSize\":24", jsonString); //Assert.Equal("1", dgvInterval.Chromosome.EnsemblName); //Assert.Equal(88190, dgvInterval.Start); //Assert.Equal(89153, dgvInterval.End); //Assert.Equal("copy_number_loss", dgvInterval.VariantType.ToString()); //Assert.Equal("dgv", dgvInterval.Source); //Assert.Equal("nsv161172", dgvInterval.StringValues["id"]); //Assert.Equal(24, dgvInterval.IntValues["sampleSize"]); //Assert.False(dgvInterval.IntValues.ContainsKey("observedGains")); //Assert.False(dgvInterval.IntValues.ContainsKey("observedLosses")); //Assert.False(dgvInterval.PopulationFrequencies.ContainsKey("variantFreqAll")); } [Fact] public void EqualityAndHash() { var dgvItem = new DgvItem("dgv101", ChromosomeUtilities.Chr1, 100, 200, 123, 34, 32, VariantType.complex_structural_alteration); var dgvHash = new HashSet { dgvItem }; Assert.Single(dgvHash); Assert.Contains(dgvItem, dgvHash); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/GlobalMinorReaderTests.cs ================================================ using System.IO; using System.Linq; using SAUtils.DataStructures; using SAUtils.InputFileParsers.DbSnp; using UnitTests.TestUtilities; using VariantAnnotation.Interface.SA; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class GlobalMinorReaderTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##dbSNP"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("1\t15274\trs2758118\tA\tG,T\t.\t.\tRS=2758118;RSPOS=15274;RV;dbSNPBuildID=111;SSR=0;SAO=0;VP=0x050000080005000126000100;GENEINFO=WASH7P:653635;WGT=1;VC=SNV;INT;ASP;GNO;KGPhase3;CAF=0.01178,0.3472,0.641;COMMON=1"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItems_test() { var reader = new GlobalMinorReader(GetStream(), ChromosomeUtilities.RefNameToChromosome); var items = reader.GetItems().Cast().ToList(); var globalMinor = SuppDataUtilities.GetPositionalAnnotation(items); Assert.Equal("{\"globalMinorAllele\":\"G\",\"globalMinorAlleleFrequency\":0.3472}", globalMinor.GetJsonString()); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/GmeReaderTests.cs ================================================ using System.IO; using System.Linq; using Genome; using SAUtils.InputFileParsers.Gme; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using Variants; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class GmeTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); // file has been modified to 7 columns writer.WriteLine("#chrom\tpos\tref\talt\tfilter\tGME_GC\tGME_AC\tGME_AF"); writer.WriteLine("1\t69134\tA\tG\tVQSRTrancheSNP99.90to100.00\t10,192\t0.04950495049504951"); writer.WriteLine("1\t69270\tA\tG\tPASS\t518,224\t0.6981132075471698"); writer.WriteLine("1\t69428\tT\tG\tPASS\t74,1396\t0.050340136054421766"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItems_test() { var sequence = new SimpleSequence(new string('T', VariantUtils.MaxUpstreamLength) + "A" +new string('T', 69270- 69134) + "A" +new string('T', 69428- 69270-1)+ "T", 69134 - 1 - VariantUtils.MaxUpstreamLength); var seqProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh37, sequence, ChromosomeUtilities.RefNameToChromosome); var gmeReader = new GmeParser(new StreamReader(GetStream()), seqProvider); var items = gmeReader.GetItems().ToList(); Assert.Equal(3, items.Count); Assert.Equal("\"allAc\":10,\"allAn\":202,\"allAf\":0.0495,\"failedFilter\":true", items[0].GetJsonString()); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/MergedCosmicReaderTests.cs ================================================ using System.Collections; using System.Linq; using SAUtils.InputFileParsers.Cosmic; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class MergedCosmicReaderTests { [Fact] public void TwoStudyCosmicCoding() { var seqProvider = ParserTestUtils.GetSequenceProvider(35416, "A", 'C', ChromosomeUtilities.RefNameToChromosome); var cosmicReader = new MergedCosmicReader(Resources.TopPath("cosm5428243.vcf"), Resources.TopPath("cosm5428243.tsv"), seqProvider); var cosmicItem = cosmicReader.GetItems().ToList()[0]; var studies = cosmicItem.Studies.ToList(); Assert.Equal("544", studies[0].Id); Assert.Equal(new[] { "haematopoietic and lymphoid tissue" }, studies[0].Sites); Assert.Equal(new[] { "haematopoietic neoplasm" }, studies[0].Histologies); //Assert.Equal(new [] { "haematopoietic neoplasm", "acute myeloid leukaemia" }, study.Histologies); Assert.Equal("544", studies[1].Id); Assert.Equal(new[] { "haematopoietic;lymphoid tissue" }, studies[1].Sites); Assert.Equal(new[] { "haematopoietic neoplasm" }, studies[1].Histologies); //Assert.Equal(new[] { "haematopoietic_neoplasm", "acute_myeloid_leukaemia" }, study.Histologies); } [Fact] public void IndelWithNoLeadingBase() { var seqProvider = ParserTestUtils.GetSequenceProvider(10188320, "GGTACTGAC", 'A', ChromosomeUtilities.RefNameToChromosome); //the files provided are just for the sake of construction. The main aim is to test the VCF line parsing capabilities var cosmicReader = new MergedCosmicReader(Resources.TopPath("cosm5428243.vcf"), Resources.TopPath("cosm5428243.tsv"), seqProvider); const string vcfLine1 = "3 10188320 COSM14426 GGTACTGAC A . . GENE=VHL;STRAND=+;CDS=c.463G>A;AA=p.?;CNT=2"; const string vcfLine2 = "3 10188320 COSM18152 G A . . GENE=VHL;STRAND=+;CDS=c.463G>A;AA=p.V155M;CNT=7"; var items = cosmicReader.ExtractCosmicItems(vcfLine1); Assert.Equal("GGTACTGAC", items[0].RefAllele); Assert.Equal("A", items[0].AltAllele); Assert.Equal(10188320, items[0].Position); var items2 = cosmicReader.ExtractCosmicItems(vcfLine2); Assert.Equal("G", items2[0].RefAllele); Assert.Equal("A", items2[0].AltAllele); Assert.Equal(10188320, items2[0].Position); } /// /// testing if cosmic alternate allele is correctly output /// [Fact] public void CosmicAltAllele() { var seqProvider = ParserTestUtils.GetSequenceProvider(6928019, "C", 'A', ChromosomeUtilities.RefNameToChromosome); var cosmicReader = new MergedCosmicReader(Resources.TopPath("COSM983708.vcf"), Resources.TopPath("COSM983708.tsv"), seqProvider); var items = cosmicReader.GetItems().ToList(); Assert.Single((IEnumerable) items); Assert.Contains("\"refAllele\":\"-\"", items[0].GetJsonString()); } [Fact] public void CosmicAlleleSpecificIndel() { //10188320 var seqProvider = ParserTestUtils.GetSequenceProvider(10188320, "G", 'A', ChromosomeUtilities.RefNameToChromosome); var cosmicReader = new MergedCosmicReader(Resources.TopPath("COSM18152.vcf"), Resources.TopPath("COSM18152.tsv"), seqProvider); var items = cosmicReader.GetItems(); Assert.Single(items); } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/OneKGenTests.cs ================================================ using System.IO; using System.Linq; using System.Text.RegularExpressions; using SAUtils.InputFileParsers.OneKGen; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class OneKGenTests { private static string GetAlleleFrequency(string jsonString, string description) { var regexMatch = Regex.Match(jsonString, $"\"{description}\":([0|1]\\.?\\d+)?"); return regexMatch.Success ? regexMatch.Groups[1].ToString() : null; } [Fact] public void AlleleFrequencyTest() { const string vcfLine = "1 10352 rs555500075 T TA 100 PAS AC=2191;AF=0.4375;AN=5008;NS=2504;DP=88915;EAS_AF=0.4306;AMR_AF=0.4107;AFR_AF=0.4788;EUR_AF=0.4264;SAS_AF=0.4192;AA=|||unknown(NO_COVERAGE); VT=INDEL;EAS_AN=1008;EAS_AC=434;EUR_AN=1006;EUR_AC=429;AFR_AN=1322;AFR_AC=633;AMR_AN=694;AMR_AC=285;SAS_AN=978;SAS_AC=410"; var oneKGenReader = new OneKGenReader(null, ParserTestUtils.GetSequenceProvider(10352,"T",'C', ChromosomeUtilities.RefNameToChromosome)); var oneKItem = oneKGenReader.ExtractItems(vcfLine).First().GetJsonString(); Assert.Equal("0.4375", GetAlleleFrequency(oneKItem, "allAf")); Assert.Equal("0.47882", GetAlleleFrequency(oneKItem, "afrAf")); Assert.Equal("0.410663", GetAlleleFrequency(oneKItem, "amrAf")); Assert.Equal("0.430556", GetAlleleFrequency(oneKItem, "easAf")); Assert.Equal("0.426441", GetAlleleFrequency(oneKItem, "eurAf")); Assert.Equal("0.419223", GetAlleleFrequency(oneKItem, "sasAf")); Assert.DoesNotContain("ancestralAllele", oneKItem); } [Fact] public void MultiAltAlleleTest() { const string vcfLine = "1 15274 rs62636497 A G,T 100 PASS AC=1739,3210;AF=0.347244,0.640974;AN=5008;NS=2504;DP=23255;EAS_AF=0.4812,0.5188;AMR_AF=0.2752,0.7205;AFR_AF=0.323,0.6369;EUR_AF=0.2922,0.7078;SAS_AF=0.3497,0.6472;AA=g|||;VT=SNP;MULTI_ALLELIC;EAS_AN=1008;EAS_AC=485,523;EUR_AN=1006;EUR_AC=294,712;AFR_AN=1322;AFR_AC=427,842;AMR_AN=694;AMR_AC=191,500;SAS_AN=978;SAS_AC=342,633"; var oneKGenReader = new OneKGenReader(null, ParserTestUtils.GetSequenceProvider(15274, "A", 'C', ChromosomeUtilities.RefNameToChromosome)); var oneKGenItems = oneKGenReader.ExtractItems(vcfLine).ToList(); Assert.Equal(2, oneKGenItems.Count); var json1 = oneKGenItems[0].GetJsonString(); var json2 = oneKGenItems[1].GetJsonString(); Assert.Equal("0.347244", GetAlleleFrequency(json1, "allAf")); Assert.Equal("0.322995", GetAlleleFrequency(json1, "afrAf")); Assert.Equal("0.275216", GetAlleleFrequency(json1, "amrAf")); Assert.Equal("0.481151", GetAlleleFrequency(json1, "easAf")); Assert.Equal("0.292247", GetAlleleFrequency(json1, "eurAf")); Assert.Equal("0.349693", GetAlleleFrequency(json1, "sasAf")); Assert.Equal("0.640974", GetAlleleFrequency(json2, "allAf")); Assert.Equal("0.636914", GetAlleleFrequency(json2, "afrAf")); Assert.Equal("0.720461", GetAlleleFrequency(json2, "amrAf")); Assert.Equal("0.518849", GetAlleleFrequency(json2, "easAf")); Assert.Equal("0.707753", GetAlleleFrequency(json2, "eurAf")); //double check this one: 0.7077535 Assert.Equal("0.647239", GetAlleleFrequency(json2, "sasAf")); } [Fact] public void PrioritizingSymbolicAllele4Svs() { const string vcfLine = "X 101155257 rs373174489 GTGCAAAAGCTCTTTAGTTTAATTAGGTCTCAGCTATTTATCTTTGTTCTTAT G 100 PASS AN=3775;AC=1723;AF=0.456424;AA=;EAS_AN=764;EAS_AC=90;EAS_AF=0.1178;EUR_AN=766;EUR_AC=439;EUR_AF=0.5731;AFR_AN=1003;AFR_AC=839;AFR_AF=0.8365;AMR_AN=524;AMR_AC=180;AMR_AF=0.3435;SAS_AN=718;SAS_AC=175;SAS_AF=0.2437"; var oneKGenReader = new OneKGenReader(null, ParserTestUtils.GetSequenceProvider(101155257, "GTGCAAAAGCTCTTTAGTTTAATTAGGTCTCAGCTATTTATCTTTGTTCTTAT", 'C', ChromosomeUtilities.RefNameToChromosome)); var oneKItems = oneKGenReader.ExtractItems(vcfLine); var json1 = oneKItems.First().GetJsonString(); Assert.Equal("0.456424", GetAlleleFrequency(json1, "allAf")); Assert.Equal("0.836491", GetAlleleFrequency(json1, "afrAf")); Assert.Equal("0.343511", GetAlleleFrequency(json1, "amrAf")); Assert.Equal("0.117801", GetAlleleFrequency(json1, "easAf")); Assert.Equal("0.573107", GetAlleleFrequency(json1, "eurAf")); Assert.Equal("0.243733", GetAlleleFrequency(json1, "sasAf")); } [Fact] public void MissingSubPopulationFrequencies() { const string vcfLine = "1\t10616\trs376342519\tCCGCCGTTGCAAAGGCGCGCCG\tC\t100\tPASS\tAN=5008;AC=4973;AF=0.993011;AA=;EAS_AN=1008;EAS_AC=999;EAS_AF=0.9911;EUR_AN=1006;EUR_AC=1000;EUR_AF=0.994;AFR_AN=1322;AFR_AC=1308;AFR_AF=0.9894;AMR_AN=694;AMR_AC=691;AMR_AF=0.9957;SAS_AN=978;SAS_AC=975;SAS_AF=0.9969"; var oneKGenReader = new OneKGenReader(null, ParserTestUtils.GetSequenceProvider(10616, "CCGCCGTTGCAAAGGCGCGCCG", 'C', ChromosomeUtilities.RefNameToChromosome)); var items = oneKGenReader.ExtractItems(vcfLine).ToList(); Assert.Single(items); Assert.Equal("\"allAf\":0.993011,\"afrAf\":0.98941,\"amrAf\":0.995677,\"easAf\":0.991071,\"eurAf\":0.994036,\"sasAf\":0.996933,\"allAn\":5008,\"afrAn\":1322,\"amrAn\":694,\"easAn\":1008,\"eurAn\":1006,\"sasAn\":978,\"allAc\":4973,\"afrAc\":1308,\"amrAc\":691,\"easAc\":999,\"eurAc\":1000,\"sasAc\":975", items[0].GetJsonString()); } private static Stream GetOneKgSvStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("1\t668630\t850204\tesv3584976\t\tAC=64;AF=0.0127796;AN=5008;CIEND=-150,150;CIPOS=-150,150;CS=DUP_delly;END=850204;NS=2504;SVTYPE=DUP;IMPRECISE;DP=22135;EAS_AF=0.0595;AMR_AF=0;AFR_AF=0.0015;EUR_AF=0.001;SAS_AF=0.001;VT=SV;EX_TARGET"); writer.WriteLine("1\t713044\t755966\tesv3584977;esv3584978\t,\tAC=3,206;AF=0.000599042,0.0411342;AN=5008;CS=DUP_gs;END=755966;NS=2504;SVTYPE=CNV;DP=20698;EAS_AF=0.001,0.0615;AMR_AF=0.0014,0.0259;AFR_AF=0,0.0303;EUR_AF=0.001,0.0417;SAS_AF=0,0.045;VT=SV;EX_TARGET"); writer.WriteLine("1\t738570\t742020\tesv3584979\t\tAC=1;AF=0.000199681;AN=5008;CIEND=0,354;CIPOS=-348,0;CS=DEL_union;END=742020;NS=2504;SVTYPE=DEL;DP=19859;EAS_AF=0.001;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;VT=SV;EX_TARGET"); writer.WriteLine("1\t645710\t699999\tesv3584975\t\tAC=35;AF=0.00698882;AN=5008;CS=ALU_umary;MEINFO=AluYa4_5,1,223,-;NS=2504;SVLEN=222;SVTYPE=ALU;TSD=null;DP=12290;EAS_AF=0.0069;AMR_AF=0.0072;AFR_AF=0;EUR_AF=0.0189;SAS_AF=0.0041;VT=SV"); writer.WriteLine("1\t812283\t876543\tesv3584985\t\tAC=58;AF=0.0115815;AN=5008;CS=L1_umary;MEINFO=LINE1,2926,3363,+;NS=2504;SVLEN=437;SVTYPE=LINE1;TSD=null;DP=19016;EAS_AF=0.0109;AMR_AF=0.0187;AFR_AF=0.0098;EUR_AF=0.0179;SAS_AF=0.0031;VT=SV"); writer.WriteLine("1\t2397655\t2401469\t.;esv3585028\t,\tAC=0,96;AF=0,0.0191693;AN=5008;CS=DUP_gs;END=2401469;NS=2504;SVTYPE=DUP;DP=16784;EAS_AF=0,0.0248;AMR_AF=0,0.0216;AFR_AF=0,0.0287;EUR_AF=0,0.0119;SAS_AF=0,0.0061;VT=SV"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void OnekGenSvReader() { using (var reader = new StreamReader(GetOneKgSvStream())) { var svReader = new OneKGenSvReader(reader, ChromosomeUtilities.RefNameToChromosome); var svItemList = svReader.GetItems().ToList(); Assert.Equal(4, svItemList.Count); Assert.Equal("\"chromosome\":\"1\",\"begin\":668631,\"end\":850204,\"variantType\":\"copy_number_gain\",\"id\":\"esv3584976\",\"allAn\":5008,\"allAc\":64,\"allAf\":0.01278,\"afrAf\":0.0015,\"amrAf\":0,\"eurAf\":0.001,\"easAf\":0.0595,\"sasAf\":0.001", svItemList[0].GetJsonString()); Assert.Equal("\"chromosome\":\"1\",\"begin\":713045,\"end\":755966,\"variantType\":\"copy_number_variation\",\"id\":\"esv3584977;esv3584978\",\"allAn\":5008,\"allAc\":209,\"allAf\":0.041733,\"afrAf\":0.0303,\"amrAf\":0.0273,\"eurAf\":0.0427,\"easAf\":0.0625,\"sasAf\":0.045", svItemList[1].GetJsonString()); Assert.Equal("\"chromosome\":\"1\",\"begin\":738571,\"end\":742020,\"variantType\":\"copy_number_loss\",\"id\":\"esv3584979\",\"allAn\":5008,\"allAc\":1,\"allAf\":0.0002,\"afrAf\":0,\"amrAf\":0,\"eurAf\":0,\"easAf\":0.001,\"sasAf\":0", svItemList[2].GetJsonString()); Assert.Equal("\"chromosome\":\"1\",\"begin\":2397656,\"end\":2401469,\"variantType\":\"copy_number_gain\",\"id\":\"esv3585028\",\"allAn\":5008,\"allAc\":96,\"allAf\":0.019169,\"afrAf\":0.0287,\"amrAf\":0.0216,\"eurAf\":0.0119,\"easAf\":0.0248,\"sasAf\":0.0061", svItemList[3].GetJsonString()); } } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/ParserTestUtils.cs ================================================ using System.Collections.Generic; using Genome; using Moq; using UnitTests.TestDataStructures; using VariantAnnotation.Interface.Providers; using Variants; namespace UnitTests.SAUtils.InputFileParsers { public static class ParserTestUtils { public static ISequenceProvider GetSequenceProvider(int position, string refAllele, char upstreamBase, Dictionary refChromDict) { var sequence = new SimpleSequence(new string(upstreamBase, VariantUtils.MaxUpstreamLength) + refAllele, position - 1 - VariantUtils.MaxUpstreamLength); return new SimpleSequenceProvider(GenomeAssembly.GRCh37, sequence, refChromDict); } public static IRefMinorProvider GetRefMinorProvider(List<(Chromosome chrom, int position, string globalMinor)> refMinors) { var refMinorProvider = new Mock(); foreach (var (chrom, position, globalMinor) in refMinors) { refMinorProvider.Setup(x => x.GetGlobalMajorAllele(chrom, position)).Returns(globalMinor); } return refMinorProvider.Object; } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/RefMinorTests.cs ================================================ using System; using System.IO; using System.Linq; using Genome; using IO; using Moq; using SAUtils.InputFileParsers.OneKGen; using SAUtils.RefMinorDb; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Providers; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class RefMinorTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##1000Genomes"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("1\t15274\trs62636497\tA\tG,T\t100\tPASS\tAC=1739,3210;AF=0.347244,0.640974;AN=5008;NS=2504;DP=23255;EAS_AF=0.4812,0.5188;AMR_AF=0.2752,0.7205;AFR_AF=0.323,0.6369;EUR_AF=0.2922,0.7078;SAS_AF=0.3497,0.6472;AA=g|||;VT=SNP;MULTI_ALLELIC;EAS_AN=1008;EAS_AC=485,523;EUR_AN=1006;EUR_AC=294,712;AFR_AN=1322;AFR_AC=427,842;AMR_AN=694;AMR_AC=191,500;SAS_AN=978;SAS_AC=342,633"); writer.WriteLine("1\t241369\trs11490246\tC\tT\t100\tPASS\tAC=5008;AF=1;AN=5008;NS=2504;DP=8951;EAS_AF=1;AMR_AF=1;AFR_AF=1;EUR_AF=1;SAS_AF=1;AA=.|||;VT=SNP;EAS_AN=1008;EAS_AC=1008;EUR_AN=1006;EUR_AC=1006;AFR_AN=1322;AFR_AC=1322;AMR_AN=694;AMR_AC=694;SAS_AN=978;SAS_AC=978"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItems() { using (var reader = new RefMinorReader(new StreamReader(GetStream()), GetSequenceProvider())) { var items = reader.GetItems().ToList(); Assert.Equal(3, items.Count); } } private static ISequenceProvider GetSequenceProvider() { var seqProvider = new Mock(); seqProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); seqProvider.SetupGet(x => x.RefNameToChromosome).Returns(ChromosomeUtilities.RefNameToChromosome); seqProvider.Setup(x => x.Sequence.Substring(15274 -1, 1)).Returns("A"); seqProvider.Setup(x => x.Sequence.Substring(241369-1, 1)).Returns("C"); return seqProvider.Object; } [Fact] public void LoopBack() { var version = new DataSourceVersion("onekgen", "v0.3", DateTime.Now.Ticks); using (var reader = new RefMinorReader(new StreamReader(GetStream()), GetSequenceProvider())) using (var stream = new MemoryStream()) using (var indexStream = new MemoryStream()) using (var writer = new RefMinorDbWriter(new ExtendedBinaryWriter(stream), new ExtendedBinaryWriter(indexStream), version, GetSequenceProvider(), SaCommon.SchemaVersion)) { writer.Write(reader.GetItems()); stream.Position = 0; indexStream.Position = 0; using (var dbReader = new RefMinorDbReader(stream, indexStream)) { Assert.Equal("T", dbReader.GetGlobalMajorAllele(ChromosomeUtilities.Chr1, 15274)); Assert.Null(dbReader.GetGlobalMajorAllele(ChromosomeUtilities.Chr1, 1524)); } } } } } ================================================ FILE: UnitTests/SAUtils/InputFileParsers/TopMedReaderTests.cs ================================================ using System.IO; using System.Linq; using Genome; using SAUtils.InputFileParsers.TOPMed; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using Variants; using Xunit; namespace UnitTests.SAUtils.InputFileParsers { public sealed class TopMedReaderTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##TopMED"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("chr1\t10128\trs796688738\tA\tAC\t255\tSVM;DISC\tVRT=2;NS=62784;AN=125568;AC=334;AF=0.00265991;Het=334;Hom=0\tNA:FRQ\t125568:0.00265991"); writer.WriteLine("chr1\t10146\trs779258992\tAC\tA\t255\tSVM;DISC;EXHET\tVRT=2;NS=62784;AN=125568;AC=2897;AF=0.0230712;Het=2897;Hom=0\tNA:FRQ\t125568:0.0230712"); writer.WriteLine("chr1\t10177\trs201752861\tA\tC\t255\tSVM;DISC\tVRT=1;NS=62784;AN=125568;AC=488;AF=0.00388634;Het=488;Hom=0\tNA:FRQ\t125568:0.00388634"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItems_test() { var sequence = new SimpleSequence(new string('T', VariantUtils.MaxUpstreamLength) + "A" +new string('T', 10146- 10128) + "AC" +new string('T', 10177- 10146-1)+"A", 10128 - 1 - VariantUtils.MaxUpstreamLength); var seqProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh37, sequence, ChromosomeUtilities.RefNameToChromosome); var gnomadReader = new TopMedReader(new StreamReader(GetStream()), seqProvider); var items = gnomadReader.GetItems().ToList(); Assert.Equal(3, items.Count); Assert.Equal("\"allAf\":0.00266,\"allAn\":125568,\"allAc\":334,\"allHc\":0,\"failedFilter\":true", items[0].GetJsonString()); } } } ================================================ FILE: UnitTests/SAUtils/MitoHeteroplasmy/MitoHeteroplasmyTests.cs ================================================ using System.IO; using System.Linq; using Moq; using SAUtils.MitoHeteroplasmy; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Providers; using Xunit; namespace UnitTests.SAUtils.MitoHeteroplasmy { public sealed class MitoHeteroplasmyTests { private static ISequenceProvider GetSequenceProvider() { var mockProvider = new Mock(); mockProvider.SetupGet(x => x.RefNameToChromosome).Returns(ChromosomeUtilities.RefNameToChromosome); mockProvider.SetupGet(x => x.RefIndexToChromosome).Returns(ChromosomeUtilities.RefIndexToChromosome); return mockProvider.Object; } private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("## num_samples=246"); writer.WriteLine("MT\t4\t5\t{}");// 0 items writer.WriteLine("MT\t5\t6\t{\"C:A\":{\"ad\":[1],\"allele_type\":\"alt\",\"vrf\":[0.006329113924050633],\"vrf_stats\":{\"kurtosis\":241.00408163265314,\"max\":0.0063291139240506328,\"mean\":2.5728105382319646e-05,\"min\":0.0,\"nobs\":246,\"skewness\":15.588588185998534,\"stdev\":0.00040352956522996095,\"variance\":1.6283611001468132e-07}}}");// 1 item writer.WriteLine("MT\t7\t8\t{\"G:A\":{\"ad\":[1,1,1,1],\"allele_type\":\"alt\",\"vrf\":[0.003205128205128205,0.002232142857142857,0.0037593984962406013,0.00273224043715847],\"vrf_stats\":{\"kurtosis\":64.96245848503843,\"max\":0.0037593984962406013,\"mean\":4.849150404743957e-05,\"min\":0.0,\"nobs\":246,\"skewness\":8.05974448165666,\"stdev\":0.00038478763089843624,\"variance\":1.4806152089243121e-07}},\"G:C\":{\"ad\":[1,1],\"allele_type\":\"alt\",\"vrf\":[0.0024813895781637717,0.004291845493562232],\"vrf_stats\":{\"kurtosis\":148.72822661048482,\"max\":0.0042918454935622317,\"mean\":2.7533475901325216e-05,\"min\":0.0,\"nobs\":246,\"skewness\":12.019856436922753,\"stdev\":0.00031552186298069995,\"variance\":9.9554046018811583e-08}},\"G:T\":{\"ad\":[1,1,1,1],\"allele_type\":\"alt\",\"vrf\":[0.0027624309392265192,0.002680965147453083,0.003236245954692557,0.0030211480362537764],\"vrf_stats\":{\"kurtosis\":57.92357810503749,\"max\":0.0032362459546925568,\"mean\":4.7564187307422503e-05,\"min\":0.0,\"nobs\":246,\"skewness\":7.717570354191911,\"stdev\":0.0003717728271743761,\"variance\":1.3821503502522855e-07}}}");//3 items writer.Flush(); stream.Position = 0; return stream; } [Fact] public void ParseItems() { using var parser = new MitoHeteroplasmyParser(GetStream()); var items = parser.GetOutputLines().ToList(); Assert.Equal(4, items.Count); Assert.Equal("6\tC\tA\t0.006\t1", items[0]); Assert.Equal("8\tG\tA\t0.002,0.003,0.004\t1,2,1", items[1]); } [Fact] public void DeserializeStats() { const string input = "{\"G:A\":{\"ad\":[1,1,1,1],\"allele_type\":\"alt\",\"vrf\":[0.003205128205128205,0.002232142857142857,0.0037593984962406013,0.00273224043715847],\"vrf_stats\":{\"kurtosis\":64.96245848503843,\"max\":0.0037593984962406013,\"mean\":4.849150404743957e-05,\"min\":0.0,\"nobs\":246,\"skewness\":8.05974448165666,\"stdev\":0.00038478763089843624,\"variance\":1.4806152089243121e-07}},\"G:C\":{\"ad\":[1,1],\"allele_type\":\"alt\",\"vrf\":[0.0024813895781637717,0.004291845493562232],\"vrf_stats\":{\"kurtosis\":148.72822661048482,\"max\":0.0042918454935622317,\"mean\":2.7533475901325216e-05,\"min\":0.0,\"nobs\":246,\"skewness\":12.019856436922753,\"stdev\":0.00031552186298069995,\"variance\":9.9554046018811583e-08}},\"G:T\":{\"ad\":[1,1,1,1],\"allele_type\":\"alt\",\"vrf\":[0.0027624309392265192,0.002680965147453083,0.003236245954692557,0.0030211480362537764],\"vrf_stats\":{\"kurtosis\":57.92357810503749,\"max\":0.0032362459546925568,\"mean\":4.7564187307422503e-05,\"min\":0.0,\"nobs\":246,\"skewness\":7.717570354191911,\"stdev\":0.0003717728271743761,\"variance\":1.3821503502522855e-07}}}"; var stats = MitoHeteroplasmyParser.DeserializeStats(input); Assert.NotNull(stats.G_A); Assert.Equal(0.003205128205128205, stats.G_A.vrf[0]); } } } ================================================ FILE: UnitTests/SAUtils/MitoMap/MitoMapSvReaderTests.cs ================================================ using System.IO; using System.Linq; using Genome; using SAUtils.MitoMap; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.MitoMap { public sealed class MitoMapSvReaderTests { private static readonly string RawSequence = "ABC" + new string('N', 200); private static readonly ISequence Sequence = new SimpleSequence(RawSequence); private readonly SimpleSequenceProvider _sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh37, Sequence, ChromosomeUtilities.RefNameToChromosome); [Theory] [InlineData("[\"5:105\",\"-101\",\"1837-1840/5447-5451\",\"D, 4/4\",\"1\"],", "DeletionsSingle", "\"chromosome\":\"MT\",\"begin\":4,\"end\":104,\"variantType\":\"deletion\"")] [InlineData("[\"2:122\",\"-121\",\"7439/13476\",\"D, 1/1\",\"1\"],", "DeletionsSingle", "\"chromosome\":\"MT\",\"begin\":3,\"end\":123,\"variantType\":\"deletion\"")] [InlineData("[\"Complete (16.5 kb)\",\"+266\",\"7-27 D-Loop region\",\"573 D-Loop region\",\"D, 7/7\",\"25\",\"4\"],", "InsertionsSimple", "\"chromosome\":\"MT\",\"begin\":16030,\"end\":16050,\"variantType\":\"duplication\"")] public void ParseLine_AsExpected(string line, string fileName, string expectedJsonString) { var reader = new MitoMapSvReader(new FileInfo(fileName), _sequenceProvider); var jsonString = reader.ParseLine(line).FirstOrDefault().GetJsonString(); Assert.Equal(expectedJsonString, jsonString); } } } ================================================ FILE: UnitTests/SAUtils/MitoMap/MitoMapVariantReaderTests.cs ================================================ using System.Collections.Generic; using System.Linq; using CacheUtils.TranscriptCache; using Genome; using SAUtils.InputFileParsers.ClinVar; using SAUtils.MitoMap; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.MitoMap { public sealed class MitoMapVariantReaderTests { private static readonly ISequence Sequence = new NSequence(); private static readonly SimpleSequenceProvider SequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh37, Sequence, ChromosomeUtilities.RefNameToChromosome); private static readonly VariantAligner VariantAligner = new VariantAligner(SequenceProvider?.Sequence); private static readonly MitoMapInputDb MitoMapInputDb = new MitoMapInputDb( new Dictionary {{"7616", "17616"},{"3510", "13510"},{"90282","190282"},{"99016","199016"}}); [Fact] public void GetAltAllelesTests() { const string altAlleleString1 = "ACT"; const string altAlleleString2 = "ACT;AGT"; const string altAlleleString3 = "AKY"; const string altAlleleString4 = "ACT;AKY"; const string altAlleleString5 = "CNT;AKY"; Assert.Equal(new[] { "ACT" }, MitoMapVariantReader.GetAltAlleles(altAlleleString1)); Assert.Equal(new[] { "ACT", "AGT" }, MitoMapVariantReader.GetAltAlleles(altAlleleString2)); Assert.Equal(new[] { "AGC", "AGT", "ATC", "ATT" }, MitoMapVariantReader.GetAltAlleles(altAlleleString3)); Assert.Equal(new[] { "ACT", "AGC", "AGT", "ATC", "ATT" }, MitoMapVariantReader.GetAltAlleles(altAlleleString4)); Assert.Equal(new[] { "CNT", "AGC", "AGT", "ATC", "ATT" }, MitoMapVariantReader.GetAltAlleles(altAlleleString5)); } [Theory] [InlineData("0 (0)", MitoMapDataTypes.MitoMapMutationsRNA, 0)] [InlineData("858 (0)\"", MitoMapDataTypes.MitoMapMutationsRNA, 858)] [InlineData("3657 (4688)", MitoMapDataTypes.MitoMapMutationsCodingControl, 3657)] [InlineData("36", MitoMapDataTypes.MitoMapPolymorphismsCoding, 36)] [InlineData("0", MitoMapDataTypes.MitoMapPolymorphismsCoding, 0)] [InlineData("0", MitoMapDataTypes.MitoMapPolymorphismsControl, 0)] [InlineData("5 (3/2)", MitoMapDataTypes.MitoMapPolymorphismsControl, 3)] [InlineData("38 (0/38)", MitoMapDataTypes.MitoMapPolymorphismsControl, 0)] public void GetNumFullLengthSequences_AsExpected(string field, string dataType, int numFullLengthSequences) { Assert.Equal(numFullLengthSequences, MitoMapVariantReader.GetNumFullLengthSequences(field, dataType)); } [Theory] [InlineData("[\"618\",\"MT-TF\",\"Ptosis CPEO MM & EXIT\",\"T618G\",\"tRNA Phe\",\"-\",\"+\",\"Reported\",\"77.50% \",\"0.0%
(0.0%)\",\"0 (0)\",\"1\"],", "MutationsRNA", "\"refAllele\":\"T\",\"altAllele\":\"G\",\"diseases\":[\"Ptosis CPEO MM & EXIT\"],\"hasHomoplasmy\":false,\"hasHeteroplasmy\":true,\"status\":\"Reported\",\"clinicalSignificance\":\"likely pathogenic\",\"scorePercentile\":77.50,\"numGenBankFullLengthSeqs\":0,\"pubMedIds\":[\"17616\"]")] [InlineData("[\"3308\",\"MT-ND1\",\"Sudden Infant Death\",\"T3308G\",\"T-G\",\"M-Term\",\"+\",\"+\",\"Reported\",\"0.0%
(0.0%)\",\"6 (0)\",\"1\"],", "MutationsCodingControl", "\"refAllele\":\"T\",\"altAllele\":\"G\",\"diseases\":[\"Sudden Infant Death\"],\"hasHomoplasmy\":true,\"hasHeteroplasmy\":true,\"status\":\"Reported\",\"numGenBankFullLengthSeqs\":6,\"pubMedIds\":[\"13510\"]")] [InlineData("[\"606\",\"MT-TF\",\"A-G\",\"-\",\"-\",\"tRNA\",\"0.0%\",\"15\",\"2\"],", "PolymorphismsCoding", "\"refAllele\":\"A\",\"altAllele\":\"G\",\"numGenBankFullLengthSeqs\":15,\"pubMedIds\":[\"190282\",\"199016\"]")] public void ParseLine_AsExpected(string line, string fileName, string expectedJsonString) { string jsonString = MitoMapVariantReader.ParseLine(line, fileName, SequenceProvider, VariantAligner, ChromosomeUtilities.ChrM, MitoMapInputDb) .FirstOrDefault() ?.GetJsonString(); Assert.Equal(expectedJsonString, jsonString); } } } ================================================ FILE: UnitTests/SAUtils/MitoMap/ParsingUtilitiesTests.cs ================================================ using System.Collections.Generic; using SAUtils.MitoMap; using Xunit; namespace UnitTests.SAUtils.MitoMap { public sealed class ParsingUtilitiesTests { private static readonly MitoMapInputDb MitoMapInputDb = new MitoMapInputDb(new Dictionary { {"1", "101"}, {"2", "102"}, {"13", "103"}, {"4100", "104"}, {"5678", "105"}, {"23202", "105"} }); [Theory] [InlineData("1", "104")] [InlineData("3", "101,103,105")] [InlineData("4", "101,102,103,104")] public void GetPubMedIds_AsExpected(string field, string pubmedIds) { Assert.Equal(string.Join(',', ParsingUtilities.GetPubMedIds(field, MitoMapInputDb)), pubmedIds); } [Theory] [InlineData("4", "1,2,13,4100")] [InlineData("97", "45,247,280,303,312,330,332,394,396,541,3311,3370,3427,3569,3584,3732,3943,4287,4946,5113,5329,5348,5451,5452,5628,6169,6221,6228,6421,6490,6531,6603,6623,7647,7695,8405,8492,20372,20373,90237,90296,90302,90311,90331,90333,90365,90377,90453,90468,90477,90483,90534,90572,90627,90764,90809,90812,90835,90903,90904,90941,90963,91015,91020,91030,91041,91056,91068,91077,91081,91094,91099,91206,91214,91215,91221,91227,91228,91256,91269,91311,91318,91394,91420,91431,91514,91687,91737,91775,91789,91830,91958,91999,92063,92082,92100,99016")] public void ExtractInternalIds_AsExpected(string field, string internalIds) { Assert.Equal(string.Join(',', ParsingUtilities.ExtractInternalIds(field)), internalIds); } } } ================================================ FILE: UnitTests/SAUtils/NsaWriters/IntervalWriterReaderTests.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using SAUtils.DataStructures; using UnitTests.TestUtilities; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Pools; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Variants; using Xunit; namespace UnitTests.SAUtils.NsaWriters { public sealed class IntervalWriterReaderTests { private static IEnumerable GetClinGenItems() { return new[] { new ClinGenItem("cg1", ChromosomeUtilities.Chr1, 145, 2743, VariantType.copy_number_gain, 3, 0, ClinicalInterpretation.likely_benign,true, new HashSet {"phenotype1", "phenotype2"}, new HashSet {"pid1", "pid2"} ), new ClinGenItem("cg2", ChromosomeUtilities.Chr1, 14585, 5872743, VariantType.copy_number_loss, 0, 5, ClinicalInterpretation.likely_pathogenic,true, new HashSet {"phenotype3", "phenotype5"}, new HashSet {"pid3", "pid5"} ), new ClinGenItem("cg3", ChromosomeUtilities.Chr2, 45759, 8792743, VariantType.deletion, 3, 0, ClinicalInterpretation.pathogenic,true, new HashSet {"phenotype1", "phenotype4"}, new HashSet {"pid1", "pid4"} ), new ClinGenItem("cg4", ChromosomeUtilities.Chr2, 5589745, 7987923, VariantType.insertion, 3, 0, ClinicalInterpretation.uncertain_significance, true, new HashSet {"phenotype10", "phenotype14"}, new HashSet {"pid10", "pid14"} ) }; } [Fact] public void Readback_clingen() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); using (var saStream = new MemoryStream()) { using(var siWriter = new NsiWriter(saStream, version, GenomeAssembly.GRCh37, "clingen", ReportFor.StructuralVariants, SaCommon.SchemaVersion, true)) { siWriter.Write(GetClinGenItems()); } saStream.Position = 0; var siReader = NsiReader.Read(saStream); var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 100, 14590, "", "", VariantType.deletion, "1:100:14590:del", false, false, false, null, null, true); var annotations = siReader.GetAnnotation(variant).ToArray(); string[] expected = { "\"chromosome\":\"1\",\"begin\":145,\"end\":2743,\"variantType\":\"copy_number_gain\",\"id\":\"cg1\",\"clinicalInterpretation\":\"likely benign\",\"phenotypes\":[\"phenotype1\",\"phenotype2\"],\"phenotypeIds\":[\"pid1\",\"pid2\"],\"observedGains\":3,\"validated\":true,\"reciprocalOverlap\":0.17935,\"annotationOverlap\":1", "\"chromosome\":\"1\",\"begin\":14585,\"end\":5872743,\"variantType\":\"copy_number_loss\",\"id\":\"cg2\",\"clinicalInterpretation\":\"likely pathogenic\",\"phenotypes\":[\"phenotype3\",\"phenotype5\"],\"phenotypeIds\":[\"pid3\",\"pid5\"],\"observedLosses\":5,\"validated\":true,\"reciprocalOverlap\":0,\"annotationOverlap\":0" }; Assert.Equal(2, annotations.Length); Assert.Equal(expected, annotations); VariantPool.Return(variant); } } } } ================================================ FILE: UnitTests/SAUtils/NsaWriters/NsaUtilitiesTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using SAUtils.DataStructures; using SAUtils.InputFileParsers.TOPMed; using UnitTests.SAUtils.InputFileParsers; using UnitTests.TestUtilities; using VariantAnnotation.Interface.SA; using Xunit; namespace UnitTests.SAUtils.NsaWriters { public sealed class NsaUtilitiesTests { private static Stream GetDupItemsStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##TopMED"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("chr5\t70220313\trs377439976;rs372466088\tTGCC\tT\t155\tSVM;DISCVRT=2;NS=62784;AN=125568;AC=43904;AF=0.349643;Het=12194;Hom=15855\tNA:FRQ 125568:0.349643"); writer.WriteLine("chr5\t70220313\trs377439976;rs372466088\tTGCC\tT\t155\tSVM;DISCVRT=2;NS=62784;AN=125568;AC=43904;AF=0.349643;Het=12194;Hom=15855\tNA:FRQ 125568:0.349643"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void RemoveConflictingAlleles_does_not_remove_duplicates() { var seqProvider = ParserTestUtils.GetSequenceProvider(70220313, "TGCC", 'A', ChromosomeUtilities.RefNameToChromosome); var topMedReader = new TopMedReader(new StreamReader(GetDupItemsStream()), seqProvider); var items = topMedReader.GetItems().ToList(); var saItems = new List(items); saItems = SuppDataUtilities.RemoveConflictingAlleles(saItems, false); Assert.Single(saItems); } } } ================================================ FILE: UnitTests/SAUtils/NsaWriters/WriterReaderTests.cs ================================================ using System; using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using Genome; using Moq; using SAUtils; using SAUtils.DataStructures; using SAUtils.gnomAD; using SAUtils.InputFileParsers.ClinVar; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Variants; using Xunit; namespace UnitTests.SAUtils.NsaWriters { public sealed class WriterReaderTests { private static IEnumerable GetClinvarItems() { var clinvarItems = new List { new ClinVarItem(ChromosomeUtilities.Chr1, 100, 100, "T", "A", ClinVarSchema.Get(), new[] {"origin1"}, "SNV", "RCV0001", null, ClinVarCommon.ReviewStatus.no_assertion, new[] {"medgen1"}, new[] {"omim1"}, new[] {"orpha1"}, new[] {"phenotype1"}, new[] {"significance"}, new[] {10024875684920}, 658794146787), new ClinVarItem(ChromosomeUtilities.Chr1, 101, 101, "A", "", ClinVarSchema.Get(), new[] {"origin1"}, "del", "RCV00011", "101", ClinVarCommon.ReviewStatus.no_assertion, new[] {"medgen1"}, new[] {"omim1"}, new[] {"orpha1"}, new[] {"phenotype1"}, new[] {"significance"}, new[] {10024875684920}, 658794146787), new ClinVarItem(ChromosomeUtilities.Chr1, 106, 106, "C", "", ClinVarSchema.Get(), new[] {"origin5"}, "del", "RCV0005", null, ClinVarCommon.ReviewStatus.multiple_submitters, new[] {"medgen5"}, new[] {"omim5"}, new[] {"orpha5"}, new[] {"phenotype5"}, new[] {"significance5"}, new[] {10024255684920}, 658794187787), new ClinVarItem(ChromosomeUtilities.Chr2, 200, 200, "G", "A", ClinVarSchema.Get(), new[] {"origin21"}, "SNV", "RCV20001", null, ClinVarCommon.ReviewStatus.multiple_submitters_no_conflict, new[] {"medgen20"}, new[] {"omim20"}, new[] {"orpha20"}, new[] {"phenotype20"}, new[] {"significance20"}, new[] {10024875684480}, 669794146787), new ClinVarItem(ChromosomeUtilities.Chr2, 205, 205, "T", "C", ClinVarSchema.Get(), new[] {"origin25"}, "ins", "RCV20005", null, ClinVarCommon.ReviewStatus.expert_panel, new[] {"medgen25"}, new[] {"omim25"}, new[] {"orpha25"}, new[] {"phenotype25"}, new[] {"significance25"}, new[] {10024255684925}, 658794187287) }; return clinvarItems; } private static ISequenceProvider GetSequenceProvider() { var sequence = new SimpleSequence(new string('A', 99) + "TAGTCGGTTAA" + new string('A', 89) + "GCCCAT"); return new SimpleSequenceProvider(GenomeAssembly.GRCh37, sequence, ChromosomeUtilities.RefNameToChromosome); } [Fact] public void Write_clinvar_basic() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); using (var saStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new NsaWriter(saStream, indexStream, version, GetSequenceProvider(), "clinvar", false, true, SaCommon.SchemaVersion, false, true, false, 1024, GenomeAssembly.GRCh37, true)) { saWriter.Write(GetClinvarItems()); } saStream.Position = 0; indexStream.Position = 0; using (var saReader = new NsaReader(saStream, indexStream, 1024)) { Assert.Equal(GenomeAssembly.GRCh37, saReader.Assembly); Assert.Equal(version.ToString(), saReader.Version.ToString()); saReader.PreLoad(ChromosomeUtilities.Chr1, new List {100, 101, 106}); var annotations = new List<(string refAllele, string altAllele, string annotation)>(); saReader.GetAnnotation(100, annotations); Assert.Equal("T", annotations[0].refAllele); Assert.Equal("A", annotations[0].altAllele); Assert.Equal( "\"id\":\"RCV0001\",\"reviewStatus\":\"no assertion provided\",\"alleleOrigins\":[\"origin1\"],\"refAllele\":\"T\",\"altAllele\":\"A\",\"phenotypes\":[\"phenotype1\"],\"medGenIds\":[\"medgen1\"],\"omimIds\":[\"omim1\"],\"orphanetIds\":[\"orpha1\"],\"significance\":[\"significance\"],\"lastUpdatedDate\":\"0001-01-01\",\"pubMedIds\":[\"10024875684920\"]", annotations[0].annotation); saReader.GetAnnotation(101, annotations); Assert.Equal("A", annotations[0].refAllele); Assert.Equal("", annotations[0].altAllele); Assert.Equal( "\"id\":\"RCV00011\",\"variationId\":\"101\",\"reviewStatus\":\"no assertion provided\",\"alleleOrigins\":[\"origin1\"],\"refAllele\":\"A\",\"altAllele\":\"-\",\"phenotypes\":[\"phenotype1\"],\"medGenIds\":[\"medgen1\"],\"omimIds\":[\"omim1\"],\"orphanetIds\":[\"orpha1\"],\"significance\":[\"significance\"],\"lastUpdatedDate\":\"0001-01-01\",\"pubMedIds\":[\"10024875684920\"]", annotations[0].annotation); saReader.PreLoad(ChromosomeUtilities.Chr2, new List {200, 205}); saReader.GetAnnotation(200,annotations); var (refAllele, altAllele, annotation) = annotations[0]; Assert.Equal("G", refAllele); Assert.Equal("A", altAllele); Assert.NotNull(annotation); } } } private static IEnumerable GetDbsnpItems(int count) { var items = new List(); var position = 100; for (int i = 0; i < count; i++, position += 5) { items.Add(new DbSnpItem(ChromosomeUtilities.Chr1, position, position, "A", "C", null)); } return items; } private static IEnumerable GetParRegionItems(int count) { var items = new List(); var position = 10_010; for (int i = 0; i < count; i++, position += 2) { items.Add(new DbSnpItem(ChromosomeUtilities.ChrY, position, position, "N", "C", null)); } return items; } private static ISequenceProvider GetAllASequenceProvider() { var seqProvider = new Mock(); seqProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); seqProvider.Setup(x => x.Sequence.Substring(It.IsAny(), 1)).Returns("A"); return seqProvider.Object; } [Fact] public void Preload() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); using (var saStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new NsaWriter(saStream, indexStream, version, GetAllASequenceProvider(), "dbsnp", true, true, SaCommon.SchemaVersion, false, true, false, 1024, GenomeAssembly.GRCh37, true)) { saWriter.Write(GetDbsnpItems(1000)); } saStream.Position = 0; indexStream.Position = 0; using (var saReader = new NsaReader(saStream, indexStream, 1024)) { saReader.PreLoad(ChromosomeUtilities.Chr1, GetAlternatePositions(50, 1000)); var annotations = new List<(string refAllele, string altAllele, string annotation)>(); saReader.GetAnnotation(90, annotations); Assert.True(annotations.Count==0); //before any SA existed saReader.GetAnnotation(100, annotations); Assert.True(annotations.Count > 0); //first entry of first block saReader.GetAnnotation(480, annotations); Assert.True(annotations.Count > 0); //last query of first block saReader.GetAnnotation(488, annotations); Assert.True(annotations.Count ==0);//between first and second block saReader.GetAnnotation(490, annotations); Assert.True(annotations.Count > 0);//first entry of second block } } } [Fact] public void WriteParRegion() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); var count = 1000; using (var saStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new NsaWriter(saStream, indexStream, version, GetAllASequenceProvider(), "dbsnp", true, true, SaCommon.SchemaVersion, false, true, false, 1024, GenomeAssembly.GRCh37, true)) { saWriter.Write(GetParRegionItems(count)); } saStream.Position = 0; indexStream.Position = 0; using (var saReader = new NsaReader(saStream, indexStream, 1024)) { saReader.PreLoad(ChromosomeUtilities.ChrY, GetAlternatePositions(10_010, 1000)); var annotations = new List<(string refAllele, string altAllele, string annotation)>(); var position = 10_010; for (int i = 0; i < count; i++, position += 2) { saReader.GetAnnotation(position, annotations); Assert.True(annotations.Count == 1); //before any SA existed } } } } private static List GetAlternatePositions(int start, int count) { var positions = new List(); for (var i = 0; i < count; i++, start += 2) { positions.Add(start); } return positions; } [Fact] public void WrongRefAllele_ThrowUserException() { var customItem = new CustomItem(ChromosomeUtilities.Chr1, 100, "A", "T", null, null, null); Assert.Throws(() => WriteCustomSaItem(customItem)); } private static void WriteCustomSaItem(CustomItem customItem) { using (var saStream = new MemoryStream()) using (var indexStream = new MemoryStream()) using (var saWriter = new NsaWriter( saStream, indexStream, new DataSourceVersion("customeSa", "test", DateTime.Now.Ticks), GetSequenceProvider(), "customeSa", false, true, SaCommon.SchemaVersion, false, false)) { saWriter.Write(new[] {customItem}); } } [Fact] public void RemoveConflictinItems() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); using (var saStream = new MemoryStream()) using (var indexStream = new MemoryStream()) using (var saWriter = new NsaWriter(saStream, indexStream, version, GetAllASequenceProvider(), "gnomad", true, true, SaCommon.SchemaVersion, false, true, false, 1024)) { Assert.Equal(0, saWriter.Write(GetConflictingGnomadItems())); } } private static Stream GetChr22_17467787_17467799_genome() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("22\t17467787\trs1013532764\tAAAAG\tA\t5607.38\tPASS\tAC=9;AN=7342;AF=0.00122582;rf_tp_probability=0.526938;FS=1.835;InbreedingCoeff=-0.0586;MQ=60.31;MQRankSum=-0.363;QD=12.01;ReadPosRankSum=0.416;SOR=0.869;BaseQRankSum=0.067;ClippingRankSum=0.263;DP=659925;VQSLOD=-0.9495;VQSR_culprit=FS;variant_type=indel;allele_type=del;n_alt_alleles=1;pab_max=0.864166;gq_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|17;gq_hist_all_bin_freq=2625|6415|2399|2552|894|245|475|590|299|567|573|228|560|58|171|68|135|8|78|194;dp_hist_alt_bin_freq=0|0|0|2|4|6|2|2|0|1|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=4|18|221|1132|2818|4248|4392|3451|2107|976|414|186|95|56|40|33|32|20|18|17;dp_hist_all_n_larger=32;ab_hist_alt_bin_freq=0|0|0|0|0|0|2|1|4|1|2|5|2|0|0|0|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=38;AF_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=1;controls_AN_afr_male=132;controls_AF_afr_male=0.00757576;controls_nhomalt_afr_male=0;non_topmed_AC_amr=1;non_topmed_AN_amr=168;non_topmed_AF_amr=0.00595238;non_topmed_nhomalt_amr=0;AC_raw=9;AN_raw=29502;AF_raw=0.000305064;nhomalt_raw=0;AC_fin_female=0;AN_fin_female=598;AF_fin_female=0;nhomalt_fin_female=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=12;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=1;non_neuro_AN_afr_male=154;non_neuro_AF_afr_male=0.00649351;non_neuro_nhomalt_afr_male=0;AC_afr_male=1;AN_afr_male=446;AF_afr_male=0.00224215;nhomalt_afr_male=0;AC_afr=2;AN_afr=756;AF_afr=0.0026455;nhomalt_afr=0;non_neuro_AC_afr_female=1;non_neuro_AN_afr_female=164;non_neuro_AF_afr_female=0.00609756;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=1;non_topmed_AN_amr_female=72;non_topmed_AF_amr_female=0.0138889;non_topmed_nhomalt_amr_female=0;non_topmed_AC_oth_female=2;non_topmed_AN_oth_female=110;non_topmed_AF_oth_female=0.0181818;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=12;AF_eas_female=0;nhomalt_eas_female=0;AC_afr_female=1;AN_afr_female=310;AF_afr_female=0.00322581;nhomalt_afr_female=0;non_neuro_AC_female=2;non_neuro_AN_female=2324;non_neuro_AF_female=0.000860585;non_neuro_nhomalt_female=0;controls_AC_afr=1;controls_AN_afr=228;controls_AF_afr=0.00438596;controls_nhomalt_afr=0;AC_nfe_onf=1;AN_nfe_onf=628;AF_nfe_onf=0.00159236;nhomalt_nfe_onf=0;controls_AC_fin_male=0;controls_AN_fin_male=200;controls_AF_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=2;non_neuro_AN_nfe_nwe=2582;non_neuro_AF_nfe_nwe=0.000774593;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=526;AF_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=0;AN_nfe_female=2104;AF_nfe_female=0;nhomalt_nfe_female=0;AC_amr=1;AN_amr=178;AF_amr=0.00561798;nhomalt_amr=0;non_topmed_AC_nfe_male=3;non_topmed_AN_nfe_male=1778;non_topmed_AF_nfe_male=0.00168729;non_topmed_nhomalt_nfe_male=0;AC_eas=0;AN_eas=48;AF_eas=0;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=0;non_neuro_AN_nfe_female=1840;non_neuro_AF_nfe_female=0;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=2;non_neuro_AN_afr=318;non_neuro_AF_afr=0.00628931;non_neuro_nhomalt_afr=0;controls_AC_raw=2;controls_AN_raw=10110;controls_AF_raw=0.000197824;controls_nhomalt_raw=0;controls_AC_male=2;controls_AN_male=1340;controls_AF_male=0.00149254;controls_nhomalt_male=0;non_topmed_AC_male=5;non_topmed_AN_male=3004;non_topmed_AF_male=0.00166445;non_topmed_nhomalt_male=0;controls_AC_nfe_female=0;controls_AN_nfe_female=740;controls_AF_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=114;non_neuro_AF_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=12;non_neuro_AF_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=1;AN_asj_male=50;AF_asj_male=0.02;nhomalt_asj_male=0;controls_AC_nfe_male=1;controls_AN_nfe_male=908;controls_AF_nfe_male=0.00110132;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=378;non_neuro_AF_fin=0;non_neuro_nhomalt_fin=0;AC_oth_female=2;AN_oth_female=112;AF_oth_female=0.0178571;nhomalt_oth_female=0;controls_AC_nfe=1;controls_AN_nfe=1648;controls_AF_nfe=0.000606796;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=48;controls_AF_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=8;controls_AF_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=58;non_neuro_AF_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=1;controls_AN_nfe_nwe=308;controls_AF_nfe_nwe=0.00324675;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=2;AN_nfe_nwe=2906;AF_nfe_nwe=0.000688231;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=16;controls_AF_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=56;non_neuro_AF_amr_female=0;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=1;non_neuro_AN_nfe_onf=464;non_neuro_AF_nfe_onf=0.00215517;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=34;non_topmed_AF_eas_male=0;non_topmed_nhomalt_eas_male=0;controls_AC_amr_female=0;controls_AN_amr_female=16;controls_AF_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=200;non_neuro_AF_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=4;AN_female=3236;AF_female=0.00123609;nhomalt_female=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=84;non_neuro_AF_oth_male=0;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=1352;non_topmed_AF_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=2;non_topmed_AN_nfe_nwe=1632;non_topmed_AF_nfe_nwe=0.00122549;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=96;non_topmed_AF_amr_male=0;non_topmed_nhomalt_amr_male=0;non_topmed_AC_nfe_onf=1;non_topmed_AN_nfe_onf=448;non_topmed_AF_nfe_onf=0.00223214;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=0;controls_AN_eas_male=16;controls_AF_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=52;controls_AF_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=9;non_topmed_AN=5806;non_topmed_AF=0.00155012;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=378;controls_AF_fin=0;controls_nhomalt_fin=0;non_neuro_AC_nfe=3;non_neuro_AN_nfe=4272;non_neuro_AF_nfe=0.000702247;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=178;non_neuro_AF_fin_female=0;non_neuro_nhomalt_fin_female=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=38;non_topmed_AF_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=12;controls_AF_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=1;non_topmed_AN_asj=38;non_topmed_AF_asj=0.0263158;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=124;controls_AF_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=7;non_neuro_AN=5332;non_neuro_AF=0.00131283;non_neuro_nhomalt=0;non_topmed_AC_nfe=3;non_topmed_AN_nfe=3470;non_topmed_AF_nfe=0.000864553;non_topmed_nhomalt_nfe=0;non_topmed_AC_raw=9;non_topmed_AN_raw=24832;non_topmed_AF_raw=0.000362436;non_topmed_nhomalt_raw=0;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=1212;non_neuro_AF_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=114;non_topmed_AF_oth_male=0;non_topmed_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=1356;AF_nfe_est=0;nhomalt_nfe_est=0;non_topmed_AC_afr_male=1;non_topmed_AN_afr_male=434;non_topmed_AF_afr_male=0.00230415;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=36;AF_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=28;controls_AF_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=36;non_neuro_AF_eas_male=0;non_neuro_nhomalt_eas_male=0;non_neuro_AC_asj_male=1;non_neuro_AN_asj_male=44;non_neuro_AF_asj_male=0.0227273;non_neuro_nhomalt_asj_male=0;controls_AC_oth=0;controls_AN_oth=100;controls_AF_oth=0;controls_nhomalt_oth=0;AC_nfe=3;AN_nfe=4928;AF_nfe=0.000608766;nhomalt_nfe=0;non_topmed_AC_female=4;non_topmed_AN_female=2802;non_topmed_AF_female=0.00142755;non_topmed_nhomalt_female=0;non_neuro_AC_asj=1;non_neuro_AN_asj=56;non_neuro_AF_asj=0.0178571;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=10;non_topmed_AF_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=7;non_neuro_AN_raw=20066;non_neuro_AF_raw=0.000348849;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=0;non_topmed_AN_eas=44;non_topmed_AF_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=526;non_topmed_AF_fin_male=0;non_topmed_nhomalt_fin_male=0;AC_fin=0;AN_fin=1124;AF_fin=0;nhomalt_fin=0;AC_nfe_male=3;AN_nfe_male=2824;AF_nfe_male=0.00106232;nhomalt_nfe_male=0;controls_AC_amr_male=0;controls_AN_amr_male=30;controls_AF_amr_male=0;controls_nhomalt_amr_male=0;controls_AC_afr_female=0;controls_AN_afr_female=96;controls_AF_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=46;controls_AF_amr=0;controls_nhomalt_amr=0;AC_asj_female=0;AN_asj_female=22;AF_asj_female=0;nhomalt_asj_female=0;non_neuro_AC_eas=0;non_neuro_AN_eas=48;non_neuro_AF_eas=0;non_neuro_nhomalt_eas=0;non_neuro_AC_male=5;non_neuro_AN_male=3008;non_neuro_AF_male=0.00166223;non_neuro_nhomalt_male=0;AC_asj=1;AN_asj=72;AF_asj=0.0138889;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=1200;controls_AF_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=16;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=2;non_topmed_AN_oth=224;non_topmed_AF_oth=0.00892857;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=598;non_topmed_AF_fin_female=0;non_topmed_nhomalt_fin_female=0;AC_oth=2;AN_oth=236;AF_oth=0.00847458;nhomalt_oth=0;non_neuro_AC_nfe_male=3;non_neuro_AN_nfe_male=2432;non_neuro_AF_nfe_male=0.00123355;non_neuro_nhomalt_nfe_male=0;controls_AC_female=0;controls_AN_female=1096;controls_AF_female=0;controls_nhomalt_female=0;non_topmed_AC_fin=0;non_topmed_AN_fin=1124;non_topmed_AF_fin=0;non_topmed_nhomalt_fin=0;non_topmed_AC_nfe_female=0;non_topmed_AN_nfe_female=1692;non_topmed_AF_nfe_female=0;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=2;controls_AF_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=1;non_topmed_AN_asj_male=22;non_topmed_AF_asj_male=0.0454545;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=1;non_neuro_AN_oth=146;non_neuro_AF_oth=0.00684932;non_neuro_nhomalt_oth=0;AC_male=5;AN_male=4106;AF_male=0.00121773;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=178;controls_AF_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_asj_female=0;controls_AN_asj_female=6;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=100;AF_amr_male=0;nhomalt_amr_male=0;AC_amr_female=1;AN_amr_female=78;AF_amr_female=0.0128205;nhomalt_amr_female=0;AC_oth_male=0;AN_oth_male=124;AF_oth_male=0;nhomalt_oth_male=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=14;non_neuro_AF_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=1;non_topmed_AN_afr_female=304;non_topmed_AF_afr_female=0.00328947;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=2;non_topmed_AN_afr=738;non_topmed_AF_afr=0.00271003;non_topmed_nhomalt_afr=0;controls_AC=2;controls_AN=2436;controls_AF=0.000821018;controls_nhomalt=0;non_neuro_AC_oth_female=1;non_neuro_AN_oth_female=62;non_neuro_AF_oth_female=0.016129;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0.000305;non_topmed_faf99_amr=0.000305;faf95_afr=0.00047001;faf99_afr=0.00046996;controls_faf95_afr=0.000224;controls_faf99_afr=0.000224;faf95_amr=0.000288;faf99_amr=0.000288;faf95_eas=0;faf99_eas=0;faf95=0.00063865;faf99=0.0006395;non_neuro_faf95_afr=0.00111728;non_neuro_faf99_afr=0.00111671;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;controls_faf95_nfe=3.1e-05;controls_faf99_nfe=3.1e-05;non_topmed_faf95=0.00080814;non_topmed_faf99=0.00080791;non_neuro_faf95_nfe=0.000191;non_neuro_faf99_nfe=0.00019047;non_neuro_faf95=0.00061599;non_neuro_faf99=0.00061588;non_topmed_faf95_nfe=0.0002353;non_topmed_faf99_nfe=0.00023558;controls_faf95_eas=0;controls_faf99_eas=0;faf95_nfe=0.0001658;faf99_nfe=0.00016511;non_topmed_faf95_eas=0;non_topmed_faf99_eas=0;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=0;non_neuro_faf99_eas=0;non_topmed_faf95_afr=0.00048118;non_topmed_faf99_afr=0.00048064;controls_faf95=0.00014568;controls_faf99=0.00014565;controls_popmax=afr;controls_AC_popmax=1;controls_AN_popmax=228;controls_AF_popmax=0.00438596;controls_nhomalt_popmax=0;popmax=amr;AC_popmax=1;AN_popmax=178;AF_popmax=0.00561798;nhomalt_popmax=0;age_hist_het_bin_freq=1|0|1|1|0|2|0|0|0|0;age_hist_het_n_smaller=1;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=afr;non_neuro_AC_popmax=2;non_neuro_AN_popmax=318;non_neuro_AF_popmax=0.00628931;non_neuro_nhomalt_popmax=0;non_topmed_popmax=amr;non_topmed_AC_popmax=1;non_topmed_AN_popmax=168;non_topmed_AF_popmax=0.00595238;non_topmed_nhomalt_popmax=0"); writer.WriteLine("22\t17467793\trs200526150\tAAGAA\tA\t2.96178e+06\tPASS\tAC=25;AN=13820;AF=0.00180897;rf_tp_probability=0.6944;FS=0;InbreedingCoeff=-0.0226;MQ=61.07;MQRankSum=0.061;QD=19.6;ReadPosRankSum=0.177;SOR=0.694;BaseQRankSum=-0.031;ClippingRankSum=-0.053;DP=657153;VQSLOD=5.11;VQSR_culprit=FS;variant_type=multi-indel;allele_type=del;n_alt_alleles=2;pab_max=1;gq_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|36;gq_hist_all_bin_freq=2892|4902|1140|827|277|141|343|478|268|556|481|207|525|87|178|89|169|40|119|5100;dp_hist_alt_bin_freq=0|0|0|1|5|8|10|5|4|1|0|0|1|1|0|0|0|0|1|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=3|25|286|1366|3137|4439|4355|3211|1821|851|331|175|79|53|32|42|22|27|18|12;dp_hist_all_n_larger=25;ab_hist_alt_bin_freq=0|0|0|0|0|0|2|2|6|8|3|6|7|2|0|0|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=60;AF_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=654;controls_AF_afr_male=0;controls_nhomalt_afr_male=0;non_topmed_AC_amr=17;non_topmed_AN_amr=272;non_topmed_AF_amr=0.0625;non_topmed_nhomalt_amr=1;AC_raw=25;AN_raw=28996;AF_raw=0.000862188;nhomalt_raw=1;AC_fin_female=0;AN_fin_female=834;AF_fin_female=0;nhomalt_fin_female=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=38;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=0;non_neuro_AN_afr_male=730;non_neuro_AF_afr_male=0;non_neuro_nhomalt_afr_male=0;AC_afr_male=2;AN_afr_male=2172;AF_afr_male=0.00092081;nhomalt_afr_male=0;AC_afr=2;AN_afr=3678;AF_afr=0.000543774;nhomalt_afr=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=754;non_neuro_AF_afr_female=0;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=9;non_topmed_AN_amr_female=132;non_topmed_AF_amr_female=0.0681818;non_topmed_nhomalt_amr_female=1;non_topmed_AC_oth_female=2;non_topmed_AN_oth_female=190;non_topmed_AF_oth_female=0.0105263;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=248;AF_eas_female=0;nhomalt_eas_female=0;AC_afr_female=0;AN_afr_female=1506;AF_afr_female=0;nhomalt_afr_female=0;non_neuro_AC_female=7;non_neuro_AN_female=4262;non_neuro_AF_female=0.00164242;non_neuro_nhomalt_female=0;controls_AC_afr=0;controls_AN_afr=1120;controls_AF_afr=0;controls_nhomalt_afr=0;AC_nfe_onf=0;AN_nfe_onf=904;AF_nfe_onf=0;nhomalt_nfe_onf=0;controls_AC_fin_male=0;controls_AN_fin_male=276;controls_AF_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=1;non_neuro_AN_nfe_nwe=3534;non_neuro_AF_nfe_nwe=0.000282965;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=708;AF_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=1;AN_nfe_female=3128;AF_nfe_female=0.000319693;nhomalt_nfe_female=0;AC_amr=18;AN_amr=286;AF_amr=0.0629371;nhomalt_amr=1;non_topmed_AC_nfe_male=1;non_topmed_AN_nfe_male=2566;non_topmed_AF_nfe_male=0.000389712;non_topmed_nhomalt_nfe_male=0;AC_eas=0;AN_eas=656;AF_eas=0;nhomalt_eas=0;nhomalt=1;non_neuro_AC_nfe_female=1;non_neuro_AN_nfe_female=2732;non_neuro_AF_nfe_female=0.000366032;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=0;non_neuro_AN_afr=1484;non_neuro_AF_afr=0;non_neuro_nhomalt_afr=0;controls_AC_raw=4;controls_AN_raw=9932;controls_AF_raw=0.000402739;controls_nhomalt_raw=0;controls_AC_male=3;controls_AN_male=2680;controls_AF_male=0.0011194;controls_nhomalt_male=0;non_topmed_AC_male=11;non_topmed_AN_male=6164;non_topmed_AF_male=0.00178456;non_topmed_nhomalt_male=0;controls_AC_nfe_female=0;controls_AN_nfe_female=1186;controls_AF_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=9;non_neuro_AN_amr=184;non_neuro_AF_amr=0.048913;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=248;non_neuro_AF_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=92;AF_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=1378;controls_AF_nfe_male=0;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=532;non_neuro_AF_fin=0;non_neuro_nhomalt_fin=0;AC_oth_female=2;AN_oth_female=194;AF_oth_female=0.0103093;nhomalt_oth_female=0;controls_AC_nfe=0;controls_AN_nfe=2564;controls_AF_nfe=0;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=76;controls_AF_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=20;controls_AF_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=4;non_neuro_AN_amr_male=74;non_neuro_AF_amr_male=0.0540541;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=426;controls_AF_nfe_nwe=0;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=2;AN_nfe_nwe=3958;AF_nfe_nwe=0.000505306;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=26;controls_AF_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=5;non_neuro_AN_amr_female=110;non_neuro_AF_amr_female=0.0454545;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=0;non_neuro_AN_nfe_onf=704;non_neuro_AF_nfe_onf=0;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=400;non_topmed_AF_eas_male=0;non_topmed_nhomalt_eas_male=0;controls_AC_amr_female=1;controls_AN_amr_female=46;controls_AF_amr_female=0.0217391;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=276;non_neuro_AF_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=13;AN_female=6098;AF_female=0.00213185;nhomalt_female=1;non_neuro_AC_oth_male=1;non_neuro_AN_oth_male=156;non_neuro_AF_oth_male=0.00641026;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=2184;non_topmed_AF_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=2;non_topmed_AN_nfe_nwe=2250;non_topmed_AF_nfe_nwe=0.000888889;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=8;non_topmed_AN_amr_male=140;non_topmed_AF_amr_male=0.0571429;non_topmed_nhomalt_amr_male=0;non_topmed_AC_nfe_onf=0;non_topmed_AN_nfe_onf=646;non_topmed_AF_nfe_onf=0;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=0;controls_AN_eas_male=244;controls_AF_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=84;controls_AF_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=23;non_topmed_AN=11642;non_topmed_AF=0.00197561;non_topmed_nhomalt=1;controls_AC_fin=0;controls_AN_fin=532;controls_AF_fin=0;controls_nhomalt_fin=0;non_neuro_AC_nfe=1;non_neuro_AN_nfe=6226;non_neuro_AF_nfe=0.000160617;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=256;non_neuro_AF_fin_female=0;non_neuro_nhomalt_fin_female=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=60;non_topmed_AF_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=172;controls_AF_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=68;non_topmed_AF_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=168;controls_AF_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=12;non_neuro_AN=9480;non_neuro_AF=0.00126582;non_neuro_nhomalt=0;non_topmed_AC_nfe=2;non_topmed_AN_nfe=5140;non_topmed_AF_nfe=0.000389105;non_topmed_nhomalt_nfe=0;non_topmed_AC_raw=23;non_topmed_AN_raw=24482;non_topmed_AF_raw=0.000939466;non_topmed_nhomalt_raw=1;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=1962;non_neuro_AF_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=1;non_topmed_AN_oth_male=184;non_topmed_AF_oth_male=0.00543478;non_topmed_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=2192;AF_nfe_est=0;nhomalt_nfe_est=0;non_topmed_AC_afr_male=1;non_topmed_AN_afr_male=2132;non_topmed_AF_afr_male=0.000469043;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=408;AF_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=416;controls_AF_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=408;non_neuro_AF_eas_male=0;non_neuro_nhomalt_eas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=80;non_neuro_AF_asj_male=0;non_neuro_nhomalt_asj_male=0;controls_AC_oth=0;controls_AN_oth=160;controls_AF_oth=0;controls_nhomalt_oth=0;AC_nfe=2;AN_nfe=7114;AF_nfe=0.000281136;nhomalt_nfe=0;non_topmed_AC_female=12;non_topmed_AN_female=5478;non_topmed_AF_female=0.00219058;non_topmed_nhomalt_female=1;non_neuro_AC_asj=0;non_neuro_AN_asj=118;non_neuro_AF_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=240;non_topmed_AF_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=12;non_neuro_AN_raw=19660;non_neuro_AF_raw=0.000610376;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=0;non_topmed_AN_eas=640;non_topmed_AF_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=708;non_topmed_AF_fin_male=0;non_topmed_nhomalt_fin_male=0;AC_fin=0;AN_fin=1542;AF_fin=0;nhomalt_fin=0;AC_nfe_male=1;AN_nfe_male=3986;AF_nfe_male=0.000250878;nhomalt_nfe_male=0;controls_AC_amr_male=3;controls_AN_amr_male=38;controls_AF_amr_male=0.0789474;controls_nhomalt_amr_male=0;controls_AC_afr_female=0;controls_AN_afr_female=466;controls_AF_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=4;controls_AN_amr=84;controls_AF_amr=0.047619;controls_nhomalt_amr=0;AC_asj_female=0;AN_asj_female=46;AF_asj_female=0;nhomalt_asj_female=0;non_neuro_AC_eas=0;non_neuro_AN_eas=656;non_neuro_AF_eas=0;non_neuro_nhomalt_eas=0;non_neuro_AC_male=5;non_neuro_AN_male=5218;non_neuro_AF_male=0.000958222;non_neuro_nhomalt_male=0;AC_asj=0;AN_asj=138;AF_asj=0;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=1944;controls_AF_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=34;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=3;non_topmed_AN_oth=374;non_topmed_AF_oth=0.00802139;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=834;non_topmed_AF_fin_female=0;non_topmed_nhomalt_fin_female=0;AC_oth=3;AN_oth=406;AF_oth=0.00738916;nhomalt_oth=0;non_neuro_AC_nfe_male=0;non_neuro_AN_nfe_male=3494;non_neuro_AF_nfe_male=0;non_neuro_nhomalt_nfe_male=0;controls_AC_female=1;controls_AN_female=2216;controls_AF_female=0.000451264;controls_nhomalt_female=0;non_topmed_AC_fin=0;non_topmed_AN_fin=1542;non_topmed_AF_fin=0;non_topmed_nhomalt_fin=0;non_topmed_AC_nfe_female=1;non_topmed_AN_nfe_female=2574;non_topmed_AF_nfe_female=0.0003885;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=6;controls_AF_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=34;non_topmed_AF_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=2;non_neuro_AN_oth=280;non_neuro_AF_oth=0.00714286;non_neuro_nhomalt_oth=0;AC_male=12;AN_male=7722;AF_male=0.001554;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=256;controls_AF_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_asj_female=0;controls_AN_asj_female=14;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=8;AN_amr_male=144;AF_amr_male=0.0555556;nhomalt_amr_male=0;AC_amr_female=10;AN_amr_female=142;AF_amr_female=0.0704225;nhomalt_amr_female=1;AC_oth_male=1;AN_oth_male=212;AF_oth_male=0.00471698;nhomalt_oth_male=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=26;non_neuro_AF_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=1474;non_topmed_AF_afr_female=0;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=1;non_topmed_AN_afr=3606;non_topmed_AF_afr=0.000277316;non_topmed_nhomalt_afr=0;controls_AC=4;controls_AN=4896;controls_AF=0.000816993;controls_nhomalt=0;non_neuro_AC_oth_female=1;non_neuro_AN_oth_female=124;non_neuro_AF_oth_female=0.00806452;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0.0398231;non_topmed_faf99_amr=0.0398236;faf95_afr=9.592e-05;faf99_afr=9.609e-05;controls_faf95_afr=0;controls_faf99_afr=0;faf95_amr=0.0406793;faf99_amr=0.0406792;faf95_eas=0;faf99_eas=0;faf95=0.00125772;faf99=0.00125736;non_neuro_faf95_afr=0;non_neuro_faf99_afr=0;non_neuro_faf95_amr=0.0255171;non_neuro_faf99_amr=0.0255167;controls_faf95_nfe=0;controls_faf99_nfe=0;non_topmed_faf95=0.00134988;non_topmed_faf99=0.00134945;non_neuro_faf95_nfe=8e-06;non_neuro_faf99_nfe=8e-06;non_neuro_faf95=0.00072973;non_neuro_faf99=0.00073008;non_topmed_faf95_nfe=6.881e-05;non_topmed_faf99_nfe=6.877e-05;controls_faf95_eas=0;controls_faf99_eas=0;faf95_nfe=4.922e-05;faf99_nfe=4.923e-05;non_topmed_faf95_eas=0;non_topmed_faf99_eas=0;controls_faf95_amr=0.0162655;controls_faf99_amr=0.0162653;non_neuro_faf95_eas=0;non_neuro_faf99_eas=0;non_topmed_faf95_afr=1.4e-05;non_topmed_faf99_afr=1.4e-05;controls_faf95=0.00027835;controls_faf99=0.00027827;controls_popmax=amr;controls_AC_popmax=4;controls_AN_popmax=84;controls_AF_popmax=0.047619;controls_nhomalt_popmax=0;popmax=amr;AC_popmax=18;AN_popmax=286;AF_popmax=0.0629371;nhomalt_popmax=1;age_hist_het_bin_freq=0|0|2|1|1|1|0|0|0|0;age_hist_het_n_smaller=4;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=amr;non_neuro_AC_popmax=9;non_neuro_AN_popmax=184;non_neuro_AF_popmax=0.048913;non_neuro_nhomalt_popmax=0;non_topmed_popmax=amr;non_topmed_AC_popmax=17;non_topmed_AN_popmax=272;non_topmed_AF_popmax=0.0625;non_topmed_nhomalt_popmax=1"); writer.WriteLine("22\t17467793\trs200526150\tAAGAA\tA\t2.96178e+06\tPASS\tAC=4501;AN=13820;AF=0.325687;rf_tp_probability=0.6944;FS=0;InbreedingCoeff=-0.0226;MQ=61.07;MQRankSum=0.061;QD=19.6;ReadPosRankSum=0.177;SOR=0.694;BaseQRankSum=-0.031;ClippingRankSum=-0.053;DP=657153;VQSLOD=5.11;VQSR_culprit=FS;variant_type=multi-indel;allele_type=del;n_alt_alleles=2;pab_max=1;gq_hist_alt_bin_freq=3|3|4|4|5|3|4|6|8|10|21|14|36|33|27|47|34|35|43|4884;gq_hist_all_bin_freq=2897|4907|1144|830|282|143|344|482|273|559|484|208|528|92|176|87|149|45|119|5070;dp_hist_alt_bin_freq=0|6|126|551|1133|1285|1033|600|260|102|40|27|13|13|3|11|1|6|7|2;dp_hist_alt_n_larger=5;dp_hist_all_bin_freq=3|25|286|1366|3137|4439|4355|3211|1821|851|331|175|79|53|32|42|22|27|18|12;dp_hist_all_n_larger=25;ab_hist_alt_bin_freq=0|7|1|7|36|124|277|456|835|741|1055|616|404|155|42|25|5|6|5|0;AC_nfe_seu=19;AN_nfe_seu=60;AF_nfe_seu=0.316667;nhomalt_nfe_seu=1;controls_AC_afr_male=325;controls_AN_afr_male=654;controls_AF_afr_male=0.496942;controls_nhomalt_afr_male=35;non_topmed_AC_amr=77;non_topmed_AN_amr=272;non_topmed_AF_amr=0.283088;non_topmed_nhomalt_amr=2;AC_raw=4527;AN_raw=28996;AF_raw=0.156125;nhomalt_raw=356;AC_fin_female=187;AN_fin_female=834;AF_fin_female=0.224221;nhomalt_fin_female=6;non_neuro_AC_asj_female=15;non_neuro_AN_asj_female=38;non_neuro_AF_asj_female=0.394737;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=358;non_neuro_AN_afr_male=730;non_neuro_AF_afr_male=0.490411;non_neuro_nhomalt_afr_male=37;AC_afr_male=1071;AN_afr_male=2172;AF_afr_male=0.493094;nhomalt_afr_male=113;AC_afr=1825;AN_afr=3678;AF_afr=0.496194;nhomalt_afr=196;non_neuro_AC_afr_female=376;non_neuro_AN_afr_female=754;non_neuro_AF_afr_female=0.498674;non_neuro_nhomalt_afr_female=42;non_topmed_AC_amr_female=35;non_topmed_AN_amr_female=132;non_topmed_AF_amr_female=0.265152;non_topmed_nhomalt_amr_female=0;non_topmed_AC_oth_female=58;non_topmed_AN_oth_female=190;non_topmed_AF_oth_female=0.305263;non_topmed_nhomalt_oth_female=6;AC_eas_female=135;AN_eas_female=248;AF_eas_female=0.544355;nhomalt_eas_female=14;AC_afr_female=754;AN_afr_female=1506;AF_afr_female=0.500664;nhomalt_afr_female=83;non_neuro_AC_female=1325;non_neuro_AN_female=4262;non_neuro_AF_female=0.310887;non_neuro_nhomalt_female=93;controls_AC_afr=566;controls_AN_afr=1120;controls_AF_afr=0.505357;controls_nhomalt_afr=67;AC_nfe_onf=233;AN_nfe_onf=904;AF_nfe_onf=0.257743;nhomalt_nfe_onf=13;controls_AC_fin_male=58;controls_AN_fin_male=276;controls_AF_fin_male=0.210145;controls_nhomalt_fin_male=2;non_neuro_AC_nfe_nwe=797;non_neuro_AN_nfe_nwe=3534;non_neuro_AF_nfe_nwe=0.225523;non_neuro_nhomalt_nfe_nwe=38;AC_fin_male=146;AN_fin_male=708;AF_fin_male=0.206215;nhomalt_fin_male=4;AC_nfe_female=774;AN_nfe_female=3128;AF_nfe_female=0.247442;nhomalt_nfe_female=42;AC_amr=79;AN_amr=286;AF_amr=0.276224;nhomalt_amr=2;non_topmed_AC_nfe_male=636;non_topmed_AN_nfe_male=2566;non_topmed_AF_nfe_male=0.247857;non_topmed_nhomalt_nfe_male=33;AC_eas=359;AN_eas=656;AF_eas=0.547256;nhomalt_eas=35;nhomalt=352;non_neuro_AC_nfe_female=666;non_neuro_AN_nfe_female=2732;non_neuro_AF_nfe_female=0.243777;non_neuro_nhomalt_nfe_female=30;non_neuro_AC_afr=734;non_neuro_AN_afr=1484;non_neuro_AF_afr=0.494609;non_neuro_nhomalt_afr=79;controls_AC_raw=1673;controls_AN_raw=9932;controls_AF_raw=0.168445;controls_nhomalt_raw=138;controls_AC_male=920;controls_AN_male=2680;controls_AF_male=0.343284;controls_nhomalt_male=78;non_topmed_AC_male=2163;non_topmed_AN_male=6164;non_topmed_AF_male=0.350909;non_topmed_nhomalt_male=179;controls_AC_nfe_female=300;controls_AN_nfe_female=1186;controls_AF_nfe_female=0.252951;controls_nhomalt_nfe_female=11;non_neuro_AC_amr=55;non_neuro_AN_amr=184;non_neuro_AF_amr=0.298913;non_neuro_nhomalt_amr=1;non_neuro_AC_eas_female=135;non_neuro_AN_eas_female=248;non_neuro_AF_eas_female=0.544355;non_neuro_nhomalt_eas_female=14;AC_asj_male=34;AN_asj_male=92;AF_asj_male=0.369565;nhomalt_asj_male=5;controls_AC_nfe_male=360;controls_AN_nfe_male=1378;controls_AF_nfe_male=0.261248;controls_nhomalt_nfe_male=21;non_neuro_AC_fin=118;non_neuro_AN_fin=532;non_neuro_AF_fin=0.221805;non_neuro_nhomalt_fin=3;AC_oth_female=60;AN_oth_female=194;AF_oth_female=0.309278;nhomalt_oth_female=7;controls_AC_nfe=660;controls_AN_nfe=2564;controls_AF_nfe=0.25741;controls_nhomalt_nfe=32;controls_AC_oth_female=19;controls_AN_oth_female=76;controls_AF_oth_female=0.25;controls_nhomalt_oth_female=1;controls_AC_asj=9;controls_AN_asj=20;controls_AF_asj=0.45;controls_nhomalt_asj=1;non_neuro_AC_amr_male=24;non_neuro_AN_amr_male=74;non_neuro_AF_amr_male=0.324324;non_neuro_nhomalt_amr_male=1;controls_AC_nfe_nwe=99;controls_AN_nfe_nwe=426;controls_AF_nfe_nwe=0.232394;controls_nhomalt_nfe_nwe=5;AC_nfe_nwe=894;AN_nfe_nwe=3958;AF_nfe_nwe=0.225872;nhomalt_nfe_nwe=44;controls_AC_nfe_seu=10;controls_AN_nfe_seu=26;controls_AF_nfe_seu=0.384615;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=31;non_neuro_AN_amr_female=110;non_neuro_AF_amr_female=0.281818;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=190;non_neuro_AN_nfe_onf=704;non_neuro_AF_nfe_onf=0.269886;non_neuro_nhomalt_nfe_onf=12;non_topmed_AC_eas_male=219;non_topmed_AN_eas_male=400;non_topmed_AF_eas_male=0.5475;non_topmed_nhomalt_eas_male=20;controls_AC_amr_female=18;controls_AN_amr_female=46;controls_AF_amr_female=0.391304;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=58;non_neuro_AN_fin_male=276;non_neuro_AF_fin_male=0.210145;non_neuro_nhomalt_fin_male=2;AC_female=1965;AN_female=6098;AF_female=0.322237;nhomalt_female=152;non_neuro_AC_oth_male=49;non_neuro_AN_oth_male=156;non_neuro_AF_oth_male=0.314103;non_neuro_nhomalt_oth_male=5;non_topmed_AC_nfe_est=577;non_topmed_AN_nfe_est=2184;non_topmed_AF_nfe_est=0.264194;non_topmed_nhomalt_nfe_est=32;non_topmed_AC_nfe_nwe=515;non_topmed_AN_nfe_nwe=2250;non_topmed_AF_nfe_nwe=0.228889;non_topmed_nhomalt_nfe_nwe=28;non_topmed_AC_amr_male=42;non_topmed_AN_amr_male=140;non_topmed_AF_amr_male=0.3;non_topmed_nhomalt_amr_male=2;non_topmed_AC_nfe_onf=169;non_topmed_AN_nfe_onf=646;non_topmed_AF_nfe_onf=0.26161;non_topmed_nhomalt_nfe_onf=8;controls_AC_eas_male=136;controls_AN_eas_male=244;controls_AF_eas_male=0.557377;controls_nhomalt_eas_male=15;controls_AC_oth_male=25;controls_AN_oth_male=84;controls_AF_oth_male=0.297619;controls_nhomalt_oth_male=4;non_topmed_AC=3972;non_topmed_AN=11642;non_topmed_AF=0.341178;non_topmed_nhomalt=324;controls_AC_fin=118;controls_AN_fin=532;controls_AF_fin=0.221805;controls_nhomalt_fin=3;non_neuro_AC_nfe=1506;non_neuro_AN_nfe=6226;non_neuro_AF_nfe=0.241889;non_neuro_nhomalt_nfe=73;non_neuro_AC_fin_female=60;non_neuro_AN_fin_female=256;non_neuro_AF_fin_female=0.234375;non_neuro_nhomalt_fin_female=1;non_topmed_AC_nfe_seu=19;non_topmed_AN_nfe_seu=60;non_topmed_AF_nfe_seu=0.316667;non_topmed_nhomalt_nfe_seu=1;controls_AC_eas_female=95;controls_AN_eas_female=172;controls_AF_eas_female=0.552326;controls_nhomalt_eas_female=12;non_topmed_AC_asj=24;non_topmed_AN_asj=68;non_topmed_AF_asj=0.352941;non_topmed_nhomalt_asj=1;controls_AC_nfe_onf=46;controls_AN_nfe_onf=168;controls_AF_nfe_onf=0.27381;controls_nhomalt_nfe_onf=4;non_neuro_AC=2909;non_neuro_AN=9480;non_neuro_AF=0.306857;non_neuro_nhomalt=207;non_topmed_AC_nfe=1280;non_topmed_AN_nfe=5140;non_topmed_AF_nfe=0.249027;non_topmed_nhomalt_nfe=69;non_topmed_AC_raw=3996;non_topmed_AN_raw=24482;non_topmed_AF_raw=0.163222;non_topmed_nhomalt_raw=327;non_neuro_AC_nfe_est=509;non_neuro_AN_nfe_est=1962;non_neuro_AF_nfe_est=0.259429;non_neuro_nhomalt_nfe_est=23;non_topmed_AC_oth_male=56;non_topmed_AN_oth_male=184;non_topmed_AF_oth_male=0.304348;non_topmed_nhomalt_oth_male=6;AC_nfe_est=579;AN_nfe_est=2192;AF_nfe_est=0.264142;nhomalt_nfe_est=32;non_topmed_AC_afr_male=1054;non_topmed_AN_afr_male=2132;non_topmed_AF_afr_male=0.494371;non_topmed_nhomalt_afr_male=113;AC_eas_male=224;AN_eas_male=408;AF_eas_male=0.54902;nhomalt_eas_male=21;controls_AC_eas=231;controls_AN_eas=416;controls_AF_eas=0.555288;controls_nhomalt_eas=27;non_neuro_AC_eas_male=224;non_neuro_AN_eas_male=408;non_neuro_AF_eas_male=0.54902;non_neuro_nhomalt_eas_male=21;non_neuro_AC_asj_male=31;non_neuro_AN_asj_male=80;non_neuro_AF_asj_male=0.3875;non_neuro_nhomalt_asj_male=5;controls_AC_oth=44;controls_AN_oth=160;controls_AF_oth=0.275;controls_nhomalt_oth=5;AC_nfe=1725;AN_nfe=7114;AF_nfe=0.24248;nhomalt_nfe=90;non_topmed_AC_female=1809;non_topmed_AN_female=5478;non_topmed_AF_female=0.33023;non_topmed_nhomalt_female=145;non_neuro_AC_asj=46;non_neuro_AN_asj=118;non_neuro_AF_asj=0.389831;non_neuro_nhomalt_asj=5;non_topmed_AC_eas_female=132;non_topmed_AN_eas_female=240;non_topmed_AF_eas_female=0.55;non_topmed_nhomalt_eas_female=14;non_neuro_AC_raw=2928;non_neuro_AN_raw=19660;non_neuro_AF_raw=0.148932;non_neuro_nhomalt_raw=211;non_topmed_AC_eas=351;non_topmed_AN_eas=640;non_topmed_AF_eas=0.548438;non_topmed_nhomalt_eas=34;non_topmed_AC_fin_male=146;non_topmed_AN_fin_male=708;non_topmed_AF_fin_male=0.206215;non_topmed_nhomalt_fin_male=4;AC_fin=333;AN_fin=1542;AF_fin=0.215953;nhomalt_fin=10;AC_nfe_male=951;AN_nfe_male=3986;AF_nfe_male=0.238585;nhomalt_nfe_male=48;controls_AC_amr_male=12;controls_AN_amr_male=38;controls_AF_amr_male=0.315789;controls_nhomalt_amr_male=0;controls_AC_afr_female=241;controls_AN_afr_female=466;controls_AF_afr_female=0.517167;controls_nhomalt_afr_female=32;controls_AC_amr=30;controls_AN_amr=84;controls_AF_amr=0.357143;controls_nhomalt_amr=0;AC_asj_female=18;AN_asj_female=46;AF_asj_female=0.391304;nhomalt_asj_female=0;non_neuro_AC_eas=359;non_neuro_AN_eas=656;non_neuro_AF_eas=0.547256;non_neuro_nhomalt_eas=35;non_neuro_AC_male=1584;non_neuro_AN_male=5218;non_neuro_AF_male=0.303565;non_neuro_nhomalt_male=114;AC_asj=52;AN_asj=138;AF_asj=0.376812;nhomalt_asj=5;controls_AC_nfe_est=505;controls_AN_nfe_est=1944;controls_AF_nfe_est=0.259774;controls_nhomalt_nfe_est=23;non_topmed_AC_asj_female=14;non_topmed_AN_asj_female=34;non_topmed_AF_asj_female=0.411765;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=114;non_topmed_AN_oth=374;non_topmed_AF_oth=0.304813;non_topmed_nhomalt_oth=12;non_topmed_AC_fin_female=187;non_topmed_AN_fin_female=834;non_topmed_AF_fin_female=0.224221;non_topmed_nhomalt_fin_female=6;AC_oth=128;AN_oth=406;AF_oth=0.315271;nhomalt_oth=14;non_neuro_AC_nfe_male=840;non_neuro_AN_nfe_male=3494;non_neuro_AF_nfe_male=0.240412;non_neuro_nhomalt_nfe_male=43;controls_AC_female=738;controls_AN_female=2216;controls_AF_female=0.333032;controls_nhomalt_female=57;non_topmed_AC_fin=333;non_topmed_AN_fin=1542;non_topmed_AF_fin=0.215953;non_topmed_nhomalt_fin=10;non_topmed_AC_nfe_female=644;non_topmed_AN_nfe_female=2574;non_topmed_AF_nfe_female=0.250194;non_topmed_nhomalt_nfe_female=36;controls_AC_asj_male=4;controls_AN_asj_male=6;controls_AF_asj_male=0.666667;controls_nhomalt_asj_male=1;non_topmed_AC_asj_male=10;non_topmed_AN_asj_male=34;non_topmed_AF_asj_male=0.294118;non_topmed_nhomalt_asj_male=1;non_neuro_AC_oth=91;non_neuro_AN_oth=280;non_neuro_AF_oth=0.325;non_neuro_nhomalt_oth=11;AC_male=2536;AN_male=7722;AF_male=0.328412;nhomalt_male=200;controls_AC_fin_female=60;controls_AN_fin_female=256;controls_AF_fin_female=0.234375;controls_nhomalt_fin_female=1;controls_AC_asj_female=5;controls_AN_asj_female=14;controls_AF_asj_female=0.357143;controls_nhomalt_asj_female=0;AC_amr_male=42;AN_amr_male=144;AF_amr_male=0.291667;nhomalt_amr_male=2;AC_amr_female=37;AN_amr_female=142;AF_amr_female=0.260563;nhomalt_amr_female=0;AC_oth_male=68;AN_oth_male=212;AF_oth_male=0.320755;nhomalt_oth_male=7;non_neuro_AC_nfe_seu=10;non_neuro_AN_nfe_seu=26;non_neuro_AF_nfe_seu=0.384615;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=739;non_topmed_AN_afr_female=1474;non_topmed_AF_afr_female=0.501357;non_topmed_nhomalt_afr_female=83;non_topmed_AC_afr=1793;non_topmed_AN_afr=3606;non_topmed_AF_afr=0.497227;non_topmed_nhomalt_afr=196;controls_AC=1658;controls_AN=4896;controls_AF=0.338644;controls_nhomalt=135;non_neuro_AC_oth_female=42;non_neuro_AN_oth_female=124;non_neuro_AF_oth_female=0.33871;non_neuro_nhomalt_oth_female=6;non_topmed_faf95_amr=0.232194;non_topmed_faf99_amr=0.232194;faf95_afr=0.477244;faf99_afr=0.477244;controls_faf95_afr=0.470932;controls_faf99_afr=0.470932;faf95_amr=0.227168;faf99_amr=0.227169;faf95_eas=0.500629;faf99_eas=0.500629;faf95=0.317744;faf99=0.317744;non_neuro_faf95_afr=0.464967;non_neuro_faf99_afr=0.464967;non_neuro_faf95_amr=0.235846;non_neuro_faf99_amr=0.235846;controls_faf95_nfe=0.241154;controls_faf99_nfe=0.241154;non_topmed_faf95=0.332322;non_topmed_faf99=0.332323;non_neuro_faf95_nfe=0.231727;non_neuro_faf99_nfe=0.231728;non_neuro_faf95=0.297558;non_neuro_faf99=0.297559;non_topmed_faf95_nfe=0.237689;non_topmed_faf99_nfe=0.23769;controls_faf95_eas=0.49659;controls_faf99_eas=0.49659;faf95_nfe=0.232957;faf99_nfe=0.232956;non_topmed_faf95_eas=0.501191;non_topmed_faf99_eas=0.501191;controls_faf95_amr=0.257071;controls_faf99_amr=0.257071;non_neuro_faf95_eas=0.500629;non_neuro_faf99_eas=0.500629;non_topmed_faf95_afr=0.47807;non_topmed_faf99_afr=0.47807;controls_faf95=0.32508;controls_faf99=0.325081;controls_popmax=eas;controls_AC_popmax=231;controls_AN_popmax=416;controls_AF_popmax=0.555288;controls_nhomalt_popmax=27;popmax=eas;AC_popmax=359;AN_popmax=656;AF_popmax=0.547256;nhomalt_popmax=35;age_hist_het_bin_freq=128|162|214|283|349|260|234|152|93|46;age_hist_het_n_smaller=717;age_hist_het_n_larger=23;age_hist_hom_bin_freq=9|11|18|24|26|15|20|8|12|6;age_hist_hom_n_smaller=82;age_hist_hom_n_larger=4;non_neuro_popmax=eas;non_neuro_AC_popmax=359;non_neuro_AN_popmax=656;non_neuro_AF_popmax=0.547256;non_neuro_nhomalt_popmax=35;non_topmed_popmax=eas;non_topmed_AC_popmax=351;non_topmed_AN_popmax=640;non_topmed_AF_popmax=0.548438;non_topmed_nhomalt_popmax=34"); writer.Flush(); stream.Position = 0; return stream; } private static IEnumerable GetConflictingGnomadItems() { var sequence = new SimpleSequence(new string('T', VariantUtils.MaxUpstreamLength) + "AAAGAAAGAAAG", 17467787 - 1 - VariantUtils.MaxUpstreamLength); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); var gnomadReader = new GnomadSnvReader(new StreamReader(GetChr22_17467787_17467799_genome()), null, sequenceProvider); return gnomadReader.GetCombinedItems(); } } } ================================================ FILE: UnitTests/SAUtils/Omim/OmimUtilitiesTests.cs ================================================ using System.Linq; using Newtonsoft.Json; using SAUtils.DataStructures; using SAUtils.Omim; using SAUtils.Omim.EntryApiResponse; using Xunit; namespace UnitTests.SAUtils.Omim { public sealed class OmimUtilitiesTests { [Theory] [InlineData("In unstressed cells, p53 (not removed) is {kept} inactive essentially through the actions of the ubiquitin ligase MDM2 ({164785}) and a 28-kD beta subunits (ETFB; {130410}), which inhibits p53 transcriptional activity and ubiquitinates p53 to promote its degradation. Activity of p53 is ubiquitously lost in human cancer either by mutation of the p53 gene itself or by loss of cell signaling upstream or downstream of p53 ({305:Toledo and Wahl, 2006}; {30:Bourdon, 2007}; {324:Vousden and Lane, 2007}).", "In unstressed cells, p53 (not removed) is kept inactive essentially through the actions of the ubiquitin ligase MDM2 and a 28-kD beta subunits (ETFB), which inhibits p53 transcriptional activity and ubiquitinates p53 to promote its degradation. Activity of p53 is ubiquitously lost in human cancer either by mutation of the p53 gene itself or by loss of cell signaling upstream or downstream of p53 (Toledo and Wahl, 2006; Bourdon, 2007; Vousden and Lane, 2007).")] [InlineData("macules (summary by {2:Baas et al., 2013}).", "macules (summary by Baas et al., 2013).")] [InlineData("(MMR) ({18,17:Fishel et al., 1993, 1994}).", "(MMR) (Fishel et al., 1993, 1994).")] [InlineData("({516030}, {516040}, and {516050})", "")] [InlineData("(e.g., D1, {168461}; D2, {123833}; D3, {123834})", "(e.g., D1; D2; D3)")] [InlineData("(desmocollins; see DSC2, {125645})", "(desmocollins; see DSC2)")] [InlineData("(e.g., see {102700}, {300755})", "")] [InlineData("(ADH, see {103700}). See also liver mitochondrial ALDH2 ({100650})", "(ADH). See also liver mitochondrial ALDH2")] [InlineData("(see, e.g., CACNA1A; {601011})", "(see, e.g., CACNA1A)")] [InlineData("(e.g., GSTA1; {138359}), mu (e.g., {138350})", "(e.g., GSTA1), mu")] [InlineData("(NFKB; see {164011})", "(NFKB)")] [InlineData("(see ISGF3G, {147574})", "(see ISGF3G)")] [InlineData("(DCK; {EC 2.7.1.74}; {125450})", "(DCK; EC 2.7.1.74)")] [InlineData("chromosome 13q21 (see {603680.0001} and {613289.0001}).", "chromosome 13q21.")] [InlineData("common genetic haptoglobin types, Hp1 ({140100.0001}), Hp2 ({140100.0002}), and the heterozygous phenotype Hp2-1.", "common genetic haptoglobin types, Hp1, Hp2, and the heterozygous phenotype Hp2-1.")] [InlineData("and RBBP7/4 ({300825}/{602923}).", "and RBBP7/4.")] [InlineData("ultimately to formation of fibrin ({134570}/{134580}).", "ultimately to formation of fibrin.")] public void RemoveLinks_AsExpected(string input, string output) { Assert.Equal(output, input.RemoveLinks()); } [Theory] [InlineData(" UGT1A Gene Complex", " UGT1A Gene Complex")] public void RemoveFormatControl_AsExpected(string input, string output) { Assert.Equal(output, input.RemoveFormatControl()); } [Theory] [InlineData("[Beta-glycopyranoside tasting], (3) {Alcohol dependence, susceptibility to}", "[Beta-glycopyranoside tasting], {Alcohol dependence, susceptibility to}", "2,3")] [InlineData("?Proteasome-associated autoinflammatory syndrome 3, digenic", "?Proteasome-associated autoinflammatory syndrome 3, digenic", "1")] [InlineData("{?Thyroid cancer, nonmedullary, 5}", "{?Thyroid cancer, nonmedullary, 5}", "3,1")] [InlineData("Methylmalonic aciduria, mut(0) type", "Methylmalonic aciduria, mut(0) type", "0")] [InlineData("?{Diabetes, susceptibility to},", "?{Diabetes, susceptibility to}", "1,3")] public void ExtractPhenotypeAndComments_AsExpected(string input, string expectedPhenotype, string commentsEnumString) { (string phenotype, var comments) = OmimUtilities.ExtractPhenotypeAndComments(input); var expectedComments = commentsEnumString.Split(',').Select(x => (OmimItem.Comment) byte.Parse(x)).Where(x => x != OmimItem.Comment.unknown).ToArray(); Assert.Equal(expectedPhenotype, phenotype); Assert.Equal(expectedComments, comments); } [Fact] public void ExtractAndProcessItemDescription_AsExpected() { const string textSectionJson = "{\"textSection\":{\"textSectionName\": \"description\",\"textSectionTitle\": \"Description\",\"textSectionContent\": \"Constitutional mismatch repair deficiency is a rare childhood cancer predisposition syndrome with 4 main tumor types: hematologic malignancies, brain/central nervous system tumors, colorectal tumors and multiple intestinal polyps, and other malignancies including embryonic tumors and rhabdomyosarcoma. Many patients show signs reminiscent of neurofibromatosis type I (NF1; {162200}), particularly multiple cafe-au-lait macules (summary by {2:Baas et al., 2013}).\n\n'Turcot syndrome' classically refers to the combination of colorectal polyposis and primary tumors of the central nervous system ({13:Hamilton et al., 1995}).\"}}"; var textSection = JsonConvert.DeserializeObject(textSectionJson); var entryItem = new EntryItem{textSectionList = new []{textSection}}; var description = OmimUtilities.ExtractAndProcessItemDescription(entryItem); const string expected = "Constitutional mismatch repair deficiency is a rare childhood cancer predisposition syndrome with 4 main tumor types: hematologic malignancies, brain/central nervous system tumors, colorectal tumors and multiple intestinal polyps, and other malignancies including embryonic tumors and rhabdomyosarcoma. Many patients show signs reminiscent of neurofibromatosis type I (NF1), particularly multiple cafe-au-lait macules (summary by Baas et al., 2013).\n\n'Turcot syndrome' classically refers to the combination of colorectal polyposis and primary tumors of the central nervous system (Hamilton et al., 1995)."; Assert.Equal(expected, description); } } } ================================================ FILE: UnitTests/SAUtils/ParseUtils/SplitLineTests.cs ================================================ using SAUtils.ParseUtils; using Xunit; namespace UnitTests.SAUtils.ParseUtils; public sealed class SplitLineTests { [Theory] [InlineData("SomeString\tAnotherString", 0, "SomeString")] [InlineData("SomeString\tAnotherString", 1, "AnotherString")] [InlineData("\tAnotherString", 0, "")] [InlineData("\tAnotherString", 1, "AnotherString")] [InlineData("SomeString\t", 1, "")] [InlineData("SomeString\t", 0, "SomeString")] [InlineData("\t", 0, "")] [InlineData("", 0, "")] public void TestGetString(string inputLine, int index, string expectedString) { var splitLine = new SplitLine(inputLine, '\t'); Assert.Equal(expectedString, splitLine.GetString(index)); } [Theory] [InlineData("SomeString\t1", 0, null)] [InlineData("SomeString\t1", 1, 1)] [InlineData("SomeString\t2.0", 1, 2)] [InlineData("\t1", 0, null)] [InlineData("\t1", 1, 1)] [InlineData("SomeString\t", 1, null)] [InlineData("SomeString\t", 0, null)] [InlineData("\t", 0, null)] [InlineData("", 0, null)] [InlineData("A1", 0, null)] [InlineData("-1", 0, -1)] public void TestParseInteger(string inputLine, int index, int? expectedInt) { var splitLine = new SplitLine(inputLine, '\t'); Assert.Equal(expectedInt, splitLine.ParseInteger(index)); } [Theory] [InlineData("SomeString\t1", 0, null)] [InlineData("SomeString\t1", 1, 1.0)] [InlineData("SomeString\t2.0", 1, 2.0)] [InlineData("\t1", 0, null)] [InlineData("\t1", 1, 1.0)] [InlineData("SomeString\t", 1, null)] [InlineData("SomeString\t", 0, null)] [InlineData("\t", 0, null)] [InlineData("", 0, null)] [InlineData("A1", 0, null)] [InlineData("-1", 0, -1.0)] public void TestParseDouble(string inputLine, int index, double? expectedDouble) { var splitLine = new SplitLine(inputLine, '\t'); Assert.Equal(expectedDouble, splitLine.ParseDouble(index)); } } ================================================ FILE: UnitTests/SAUtils/ParseUtils/TsvIndicesTests.cs ================================================ using SAUtils.ParseUtils; using Xunit; namespace UnitTests.SAUtils.ParseUtils; public class TsvIndicesTests { [Theory] [InlineData(0, 1)] public void TestTsvIndices(ushort chromosomeIndex, ushort startIndex) { var tsvIndices = new TsvIndices() { Chromosome = chromosomeIndex, Start = startIndex }; Assert.Equal(tsvIndices.Chromosome, chromosomeIndex); Assert.Equal(tsvIndices.Start, startIndex); Assert.Equal(tsvIndices.SvType, ushort.MaxValue); } } ================================================ FILE: UnitTests/SAUtils/PhylopTests.cs ================================================ using System; using System.IO; using Genome; using IO; using SAUtils.InputFileParsers; using SAUtils.PhyloP; using UnitTests.TestUtilities; using VariantAnnotation.PhyloP; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Xunit; namespace UnitTests.SAUtils { public sealed class PhylopTests { [Fact] public void LoopbackTest() { var wigFixFile = Resources.TopPath("mini.WigFix"); var version = new DataSourceVersion("phylop", "0", DateTime.Now.Ticks, "unit test"); using(var reader = new PhylopParser(FileUtilities.GetReadStream(wigFixFile),GenomeAssembly.GRCh37, ChromosomeUtilities.RefNameToChromosome)) using (var npdStream = new MemoryStream()) using(var indexStream = new MemoryStream()) using (var npdWriter = new NpdWriter(npdStream, indexStream, version, GenomeAssembly.GRCh37, SaCommon.PhylopTag, SaCommon.SchemaVersion)) { npdWriter.Write(reader.GetItems()); npdStream.Position = 0; indexStream.Position = 0; using (var phylopReader = new NpdReader(npdStream, indexStream)) { Assert.Equal(0.1, phylopReader.GetAnnotation(ChromosomeUtilities.Chr1, 100));//first position of first block Assert.Equal(0.1, phylopReader.GetAnnotation(ChromosomeUtilities.Chr1, 101));// second position Assert.Equal(0.1, phylopReader.GetAnnotation(ChromosomeUtilities.Chr1, 120));// some internal position Assert.Equal(0.1, phylopReader.GetAnnotation(ChromosomeUtilities.Chr1, 130));//last position of first block //moving on to the next block: should cause reloading from file Assert.Equal(0.1, phylopReader.GetAnnotation(ChromosomeUtilities.Chr1, 175));//first position of second block Assert.Equal(-2.1, phylopReader.GetAnnotation(ChromosomeUtilities.Chr1, 182));// some negative value //chrom 2 Assert.Null(phylopReader.GetAnnotation(ChromosomeUtilities.Chr2, 400));//values past the last phylop positions should return null } } } } } ================================================ FILE: UnitTests/SAUtils/PrimateAi/PrimateAiTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using Moq; using SAUtils.DataStructures; using SAUtils.PrimateAi; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using Xunit; namespace UnitTests.SAUtils.PrimateAi { public sealed class PrimateAiTests { private static ISequenceProvider GetSequenceProvider() { var mockProvider = new Mock(); mockProvider.SetupGet(x => x.RefNameToChromosome).Returns(ChromosomeUtilities.RefNameToChromosome); mockProvider.SetupGet(x => x.RefIndexToChromosome).Returns(ChromosomeUtilities.RefIndexToChromosome); return mockProvider.Object; } private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("#CHROM\tPOS\tREF\tALT\tGeneId\tScorePercentile"); writer.WriteLine("1\t69094\tG\tA\t79501\t0.79"); writer.WriteLine("1\t69094\tG\tC\t79501\t0.75"); writer.WriteLine("1\t69094\tG\tT\t79501\t0.75"); writer.WriteLine("1\t69097\tA\tG\t79501\t0.56"); writer.WriteLine("1\t69097\tA\tC\t79501\t0.57"); writer.WriteLine("1\t69097\tA\tT\t79501\t0.54"); writer.WriteLine("1\t56197104\tA\tG\tENSG00000234810\t0.80"); writer.WriteLine("1\t56197443\tC\tT\tENSG00000234810\t0.20"); writer.WriteLine("1\t56197476\tC\tT\tENSG00000234810\t0.40"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void ExtractEntries() { var entrezToHgnc = new Dictionary { { "79501", "Gene1" } }; var ensemblToHgnc = new Dictionary { {"ENSG00000234810", "Gene2" } }; var primateParser = new PrimateAiParser(GetStream(), GetSequenceProvider(), entrezToHgnc, ensemblToHgnc); var items = primateParser.GetItems().ToList(); Assert.Equal(9, items.Count); Assert.Equal("\"hgnc\":\"Gene1\",\"scorePercentile\":0.79", items[0].GetJsonString()); Assert.Equal("\"hgnc\":\"Gene2\",\"scorePercentile\":0.2", items[7].GetJsonString()); } private static Stream GetDuplicateItemStream() { var stream = new MemoryStream(); using (var writer = new StreamWriter(stream, Encoding.Default, 1024, true)) { writer.WriteLine("#CHROM\tPOS\tREF\tALT\tGeneId\tScorePercentile"); writer.WriteLine("4\t155713\tA\tG\t255403\t0.03"); writer.WriteLine("4\t155713\tA\tG\t255403\t0.93"); } stream.Position = 0; return stream; } [Fact] public void ResolveDuplicates() { var entrezToHgnc = new Dictionary { { "255403", "Gene1"} }; var ensemblToHgnc = new Dictionary { {"ENSG00000234810", "Gene2" } }; var primateParser = new PrimateAiParser(GetDuplicateItemStream(), GetSequenceProvider(), entrezToHgnc, ensemblToHgnc); var items = primateParser.GetItems().Cast().ToList(); var deDupItems = SuppDataUtilities.DeDuplicatePrimateAiItems(items); Assert.Single(deDupItems); Assert.Equal("\"hgnc\":\"Gene1\",\"scorePercentile\":0.93", deDupItems[0].GetJsonString()); } } } ================================================ FILE: UnitTests/SAUtils/ProteinConservation/ParserTests.cs ================================================ using System.IO; using System.Linq; using SAUtils.AAConservation; using Xunit; namespace UnitTests.SAUtils.ProteinConservation { public sealed class ParserTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("#Ensembl\tChromosome\tProteinSequence\tPercent Conservation at each AA residue"); writer.WriteLine("ENST00000641515\tchr1\tMKKVTAEAISWNESTSETNNSMVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKFZ\t40,38,11,34,37,11,31,17,25,31,38,39,37,38,12,35,33,33,52,9,47,20,66,56,61,73,50,40,57,71,62,55,28,47,34,55,51,61,6,44,58,61,5,29,66,38,42,43,71,10,41,55,46,35,65,79,54,49,56,44,32,47,54,17,34,63,40,49,79,67,54,81,78,77,63,52,74,46,65,72,61,37,40,71,55,23,41,50,57,55,57,44,79,65,53,40,18,60,45,43,46,16,27,60,40,73,70,57,35,61,81,46,15,80,41,62,37,43,60,45,50,58,51,44,69,20,42,41,79,37,48,78,9,49,81,81,75,17,77,72,76,46,80,79,37,80,44,49,64,70,24,30,29,8,66,31,41,30,8,40,15,5,73,38,23,63,54,50,57,56,41,41,57,39,41,49,40,39,9,73,1,67,75,62,58,63,36,40,53,44,54,48,76,71,42,58,44,56,47,52,74,60,77,43,62,57,48,36,43,37,49,41,47,41,48,51,56,60,44,44,39,45,36,37,58,33,43,55,44,50,70,73,43,31,66,61,20,45,48,36,18,27,43,5,25,10,42,41,81,72,52,61,79,43,39,44,76,49,52,67,66,42,63,64,57,52,55,48,11,56,49,67,43,40,63,50,43,35,35,47,45,58,58,49,41,49,58,47,53,39,56,22,55,72,46,62,80,76,43,78,80,56,70,61,65,61,43,52,64,30,57,19,39,4,50,35,31,28,10,8,8,30,8,19,33,7,39"); writer.WriteLine("ENST00000335137\tchr1\tMVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKFZ\t20,66,56,61,73,50,40,57,71,62,55,28,47,34,55,51,61,6,44,58,61,5,29,66,38,42,43,71,10,41,55,46,35,65,79,54,49,56,44,32,47,54,17,34,63,40,49,79,67,54,81,78,77,63,52,74,46,65,72,61,37,40,71,55,23,41,50,57,55,57,44,79,65,53,40,18,60,45,43,46,16,27,60,40,73,70,57,35,61,81,46,15,80,41,62,37,43,60,45,50,58,51,44,69,20,42,41,79,37,48,78,9,49,81,81,75,17,77,72,76,46,80,79,37,80,44,49,64,70,24,30,29,8,66,31,41,30,8,40,15,5,73,38,23,63,54,50,57,56,41,41,57,39,41,49,40,39,9,73,1,67,75,62,58,63,36,40,53,44,54,48,76,71,42,58,44,56,47,52,74,60,77,43,62,57,48,36,43,37,49,41,47,41,48,51,56,60,44,44,39,45,36,37,58,33,43,55,44,50,70,73,43,31,66,61,20,45,48,36,18,27,43,5,25,10,42,41,81,72,52,61,79,43,39,44,76,49,52,67,66,42,63,64,57,52,55,48,11,56,49,67,43,40,63,50,43,35,35,47,45,58,58,49,41,49,58,47,53,39,56,22,55,72,46,62,80,76,43,78,80,56,70,61,65,61,43,52,64,30,57,19,39,4,50,35,31,28,10,8,8,30,8,19,33,7,39"); writer.WriteLine("ENST00000379407\tchr1\tMGNSHCVPQAPRRLRASFSRKPSLKGNREDSARMSAGLPGPEAARSGDAAANKLFHYIPGTDILDLENQRENLEQPFLSVFKKGRRRVPVRNLGKVVHYAKVQLRFQHSQDVSDCYLELFPAHLYFQAHGSEGLTFQGLLPLTELSVCPLEGSREHAFQITGPLPAPLLVLCPSRAELDRWLYHLEKQTALLGGPRRCHSAPPQGSCGDELPWTLQRRLTRLRTASGHEPGGSAVCASRVKLQHLPAQEQWDRLLVLYPTSLAIFSEELDGLCFKGELPLRAVHINLEEKEKQIRSFLIEGPLINTIRVVCASYEDYGHWLLCLRAVTHREGAPPLPGAESFPGSQVMGSGRGSLSSGGQTSWDSGCLAPPSTRTSHSLPESSVPSTVGCSSQHTPLHRLSLESSPDAPDHTSETSHSPLYADPYTPPATSHRRVTDVRGLEEFLSAMQSARGPTPSSPLPSVPVSVPASDPRSCSSGPAGPYLLSKKGALQSRAAQRHRGSAKDGGPQPPDAPQLVSSAREGSPEPWLPLTDGRSPRRSRDPGYDHLWDETLSSSHQKCPQLGGPEASGGLVQWIZ\t67,70,66,54,41,68,60,69,67,60,66,55,63,54,71,61,68,66,59,68,67,63,66,67,68,51,40,56,55,53,20,35,57,9,3,50,52,55,1,64,12,52,47,12,7,11,47,58,28,39,53,15,50,15,49,25,56,53,61,56,45,48,60,15,13,50,60,31,65,31,72,52,67,49,73,61,67,60,71,59,72,71,62,68,65,71,49,42,51,80,73,60,73,79,66,58,48,60,74,48,76,78,53,60,59,75,72,77,60,76,56,46,57,69,74,64,76,73,75,73,53,15,57,80,69,72,72,60,54,70,58,40,77,73,73,53,75,67,61,59,61,66,25,56,63,52,42,52,46,41,54,53,48,46,59,55,57,59,57,59,61,56,70,68,58,51,70,69,54,68,52,69,57,51,11,53,68,62,14,40,70,73,65,66,70,68,71,69,11,57,72,16,61,68,9,22,40,52,48,39,38,46,48,71,51,12,7,27,47,46,50,52,48,39,44,41,30,33,44,40,36,53,41,33,36,47,50,22,28,12,12,53,38,47,12,61,54,58,55,59,55,61,52,52,53,52,9,52,59,53,42,54,59,52,64,56,61,57,60,39,52,59,52,59,52,59,57,55,8,46,56,59,42,57,57,66,64,66,62,62,13,60,18,47,49,54,54,54,57,39,47,57,53,55,54,58,63,63,62,65,64,18,65,68,66,54,69,70,69,41,70,57,65,66,62,68,66,13,43,70,65,54,70,67,33,17,38,12,18,51,13,45,29,29,27,52,41,54,11,50,44,45,43,65,15,29,41,8,54,33,54,52,63,60,58,50,60,10,61,13,54,55,56,40,51,53,47,17,45,41,36,47,42,25,55,64,40,56,57,61,51,44,42,39,45,37,44,14,47,45,9,13,36,8,11,46,35,9,38,30,39,55,45,13,26,40,9,46,51,41,35,6,12,48,36,11,53,61,64,59,57,60,58,60,57,59,59,64,56,54,57,49,25,15,15,52,57,51,33,47,60,14,56,56,57,46,52,44,47,46,5,3,55,59,2,48,37,60,44,11,51,34,48,55,52,59,54,68,12,51,46,45,3,23,2,19,13,42,27,5,35,46,5,49,47,43,41,57,46,39,36,49,31,56,51,6,50,53,51,52,51,60,16,54,10,8,44,43,23,54,43,46,3,53,53,44,53,50,19,31,44,50,31,43,55,13,49,3,28,52,15,13,44,44,15,22,43,28,17,9,4,6,18,13,33,20,10,12,47,42,16,24,21,45,17,56,36,39,34,11,50,19,23,38,18,47,49,44,10,36,52,49,18,58,59,59,58"); writer.WriteLine("ENST00000379319\tchr1\tMALRHLALLAGLLVGVASKSMENTAQLPECCVDVVGVNASCPGASLCGPGCYRRWNADGSASCVRCGNGTLPAYNGSECRSFAGPGAPFPMNRSSGTPGRPHPGAPRVAASLFLGTFFISSGLILSVAGFFYLKRSSKLPRACYRRNKAPALQPGEAAAMIPPPQSSVRKPRYVRRERPLDRATDPAAFPGEARISNVZ\t57,48,53,12,18,25,12,54,62,51,49,52,51,45,23,51,44,56,41,52,8,44,9,4,26,52,10,55,53,61,61,46,52,47,33,11,26,59,39,19,64,55,53,10,33,54,64,53,62,60,62,59,53,24,38,39,24,55,63,52,21,57,61,38,40,61,13,60,45,59,17,29,22,21,40,32,59,52,65,55,31,11,34,49,2,49,33,18,41,52,31,54,51,55,18,55,36,55,56,30,49,25,17,49,14,73,19,75,68,77,77,76,56,78,78,78,46,50,66,78,44,55,80,80,79,68,77,77,37,76,78,78,78,76,78,47,57,63,80,76,23,3,13,75,57,71,55,64,52,50,42,54,54,54,46,56,47,57,49,58,52,59,45,63,47,48,58,52,65,64,67,66,58,52,65,64,58,60,50,34,46,39,20,18,23,44,26,45,25,12,3,47,50,59,21,64,52,64,59"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void ReadItemsTest() { using (var stream = GetStream()) using (var parser = new ProteinConservationParser(stream)) { var items = parser.GetItems().ToArray(); Assert.Equal(4, items.Length); Assert.Equal("ENST00000641515", items[0].TranscriptId); Assert.Equal(new byte[]{40,38,11,34,37,11,31,17,25,31,38,39,37,38,12,35,33,33,52,9,47,20,66,56,61,73,50,40,57,71,62,55,28,47,34,55,51,61,6,44,58,61,5,29,66,38,42,43,71,10,41,55,46,35,65,79,54,49,56,44,32,47,54,17,34,63,40,49,79,67,54,81,78,77,63,52,74,46,65,72,61,37,40,71,55,23,41,50,57,55,57,44,79,65,53,40,18,60,45,43,46,16,27,60,40,73,70,57,35,61,81,46,15,80,41,62,37,43,60,45,50,58,51,44,69,20,42,41,79,37,48,78,9,49,81,81,75,17,77,72,76,46,80,79,37,80,44,49,64,70,24,30,29,8,66,31,41,30,8,40,15,5,73,38,23,63,54,50,57,56,41,41,57,39,41,49,40,39,9,73,1,67,75,62,58,63,36,40,53,44,54,48,76,71,42,58,44,56,47,52,74,60,77,43,62,57,48,36,43,37,49,41,47,41,48,51,56,60,44,44,39,45,36,37,58,33,43,55,44,50,70,73,43,31,66,61,20,45,48,36,18,27,43,5,25,10,42,41,81,72,52,61,79,43,39,44,76,49,52,67,66,42,63,64,57,52,55,48,11,56,49,67,43,40,63,50,43,35,35,47,45,58,58,49,41,49,58,47,53,39,56,22,55,72,46,62,80,76,43,78,80,56,70,61,65,61,43,52,64,30,57,19,39,4,50,35,31,28,10,8,8,30,8,19,33,7,39} , items[0].Scores); Assert.Equal("ENST00000379319", items[3].TranscriptId); } } } } ================================================ FILE: UnitTests/SAUtils/Revel/RevelParserTests.cs ================================================ using System.IO; using System.Linq; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.Revel { public sealed class RevelParserTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##REVEL"); writer.WriteLine("#chr\tpos\tref\talt\trefAA\taltAA\tscore"); writer.WriteLine("1\t35290\tG\tA\tP\tD\t0.035"); writer.WriteLine("1\t35290\tG\tA\tP\tS\t0.031"); writer.WriteLine("1\t35290\tG\tC\tP\tA\t0.040"); writer.WriteLine("1\t35290\tG\tT\tP\tT\t0.035"); writer.WriteLine("1\t35290\tG\tC\tP\tA\t0.063"); writer.WriteLine("1\t35291\tG\tC\tF\tL\t0.022"); writer.WriteLine("1\t35291\tG\tT\tF\tL\t0.022"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void RevelReader_GetItems_AsExpected() { var nucleotides = new[] {"A", "C", "G", "T"}; var revelParserSettings = new ParserSettings( new ColumnIndex(0, 1, 2, 3, 6, null), nucleotides, GenericScoreParser.MaxRepresentativeScores ); using (var streamReader = new StreamReader(GetStream())) using (var reader = new GenericScoreParser(revelParserSettings, streamReader, ChromosomeUtilities.RefNameToChromosome)) { var revelItems = reader.GetItems().ToArray(); Assert.Equal(5, revelItems.Length); Assert.Equal(35290, revelItems[0].Position); Assert.Equal("G", revelItems[0].RefAllele); Assert.Equal("A", revelItems[0].AltAllele); Assert.Equal("\"score\":0.035", revelItems[0].GetJsonString()); Assert.Equal(35290, revelItems[1].Position); Assert.Equal("G", revelItems[1].RefAllele); Assert.Equal("C", revelItems[1].AltAllele); Assert.Equal("\"score\":0.063", revelItems[1].GetJsonString()); Assert.Equal(35291, revelItems[4].Position); Assert.Equal("G", revelItems[4].RefAllele); Assert.Equal("T", revelItems[4].AltAllele); Assert.Equal("\"score\":0.022", revelItems[4].GetJsonString()); } } } } ================================================ FILE: UnitTests/SAUtils/SaJsonSchemaTests.cs ================================================ using System.Collections.Generic; using System.Text; using ErrorHandling.Exceptions; using SAUtils.Schema; using VariantAnnotation.SA; using Xunit; namespace UnitTests.SAUtils { public sealed class SaJsonSchemaTests { private const string SchemaVersion = "http://json-schema.org/draft-06/schema#"; [Fact] public void Create_InitialJsonObject_AsExpected() { var sb = new StringBuilder(); SaJsonSchema.Create(sb, "test", SaJsonValueType.ObjectArray, new List()); const string expectedJsonString = "{\"$schema\":\"" + SchemaVersion + "\",\"type\":\"object\",\"properties\":{\"test\":{\"type\":\"array\",\"items\":{\"type\":\"object\",\"properties\":{"; Assert.Equal(expectedJsonString, sb.ToString()); } [Fact] public void OutputKeyAnnotation_AsExpected() { var sb = new StringBuilder(); var jsonSchema = new SaJsonSchema(sb); jsonSchema.AddAnnotation("name", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.String, 0, null)); jsonSchema.OutputKeyAnnotation("name"); Assert.Equal("\"name\":{\"type\":\"string\"}", sb.ToString()); } [Fact] public void ToString_AsExpected() { var jsonSchema = SaJsonSchema.Create(new StringBuilder(), "test", SaJsonValueType.ObjectArray, new List { "name", "phone", "employed" }); jsonSchema.AddAnnotation("name", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.String, 0, null)); jsonSchema.AddAnnotation("phone", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Number, 0, "phone number")); jsonSchema.AddAnnotation("employed", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Bool, 0, null)); jsonSchema.TotalItems = 100; jsonSchema.KeyCounts["name"] = 100; jsonSchema.KeyCounts["phone"] = 50; jsonSchema.KeyCounts["employed"] = 0; const string expectedJsonSchemaString = "{\"$schema\":\"" + SchemaVersion + "\",\"type\":\"object\",\"properties\":{\"test\":{\"type\":\"array\",\"items\":{\"type\":\"object\",\"properties\":{" + "\"name\":{\"type\":\"string\"},\"phone\":{\"type\":\"number\",\"description\":\"phone number\"}}," + "\"required\":[\"name\"],\"additionalProperties\":false}}}}"; Assert.Equal(expectedJsonSchemaString, jsonSchema.ToString()); // make sure the returned string is the same when ToString method is called more than once Assert.Equal(expectedJsonSchemaString, jsonSchema.ToString()); } [Fact] public void GetJsonString_AsExpected() { var jsonSchema = SaJsonSchema.Create(new StringBuilder(), "test", SaJsonValueType.ObjectArray, new List { "name", "phone", "employed" }); jsonSchema.AddAnnotation("name", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.String, 0, null)); jsonSchema.AddAnnotation("phone", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Number, 0, "phone number")); jsonSchema.AddAnnotation("employed", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Bool, 0, null)); var jsonString = jsonSchema.GetJsonString(new List { new[] { "Ada" }, new[] { "123456" }, new[] { "true" } }); Assert.Equal("\"name\":\"Ada\",\"phone\":123456,\"employed\":true", jsonString); } [Fact] public void GetJsonString_DoubleValueHandling_AsExpected() { var jsonSchema = SaJsonSchema.Create(new StringBuilder(), "test", SaJsonValueType.ObjectArray, new List { "allAf", "doubleValue1", "doubleValue2" }); jsonSchema.AddAnnotation("allAf", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Number, CustomAnnotationCategories.AlleleFrequency, null)); jsonSchema.AddAnnotation("doubleValue1", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Number, 0, "A double value")); jsonSchema.AddAnnotation("doubleValue2", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Number, 0, "Another double value")); var jsonString = jsonSchema.GetJsonString(new List { new[] { "0.12345678" }, new[] { "0.12" }, new[] { "0.12345678" } }); Assert.Equal("\"allAf\":0.123457,\"doubleValue1\":0.12,\"doubleValue2\":0.12345678", jsonString); } [Fact] public void CheckAndGetBoolFromString_AsExpected() { Assert.True(SaJsonSchema.CheckAndGetBoolFromString("true")); Assert.True(SaJsonSchema.CheckAndGetBoolFromString("TRUE")); Assert.False(SaJsonSchema.CheckAndGetBoolFromString("false")); Assert.False(SaJsonSchema.CheckAndGetBoolFromString("False")); Assert.False(SaJsonSchema.CheckAndGetBoolFromString("")); Assert.False(SaJsonSchema.CheckAndGetBoolFromString(".")); } [Fact] public void CheckAndGetBoolFromString_InvalidValue_ThrowException() { Assert.Throws(() => SaJsonSchema.CheckAndGetBoolFromString("T")); Assert.Throws(() => SaJsonSchema.CheckAndGetBoolFromString("F")); Assert.Throws(() => SaJsonSchema.CheckAndGetBoolFromString("0")); Assert.Throws(() => SaJsonSchema.CheckAndGetBoolFromString("-")); } [Fact] public void CheckAndGetNullableDoubleFromString_GetNull_AsExpected() { Assert.Null(SaJsonSchema.CheckAndGetNullableDoubleFromString("")); Assert.Null(SaJsonSchema.CheckAndGetNullableDoubleFromString(".")); } [Fact] public void CheckAndGetNullableDoubleFromString_NotANum_ThrowException() { Assert.Throws(() => SaJsonSchema.CheckAndGetNullableDoubleFromString("Bob")); Assert.Throws(() => SaJsonSchema.CheckAndGetNullableDoubleFromString("1+1")); Assert.Throws(() => SaJsonSchema.CheckAndGetNullableDoubleFromString("bool")); } [Fact] public void GetJsonString__AsExpected() { var jsonSchema = SaJsonSchema.Create(new StringBuilder(), "test", SaJsonValueType.ObjectArray, new List { "allAf", "doubleValue1", "doubleValue2" }); jsonSchema.AddAnnotation("allAf", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Number, CustomAnnotationCategories.AlleleFrequency, null )); jsonSchema.AddAnnotation("doubleValue1", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Number, 0, "A double value" )); jsonSchema.AddAnnotation("doubleValue2", SaJsonKeyAnnotation.CreateFromProperties(SaJsonValueType.Number, 0, "Another double value" )); var jsonString = jsonSchema.GetJsonString(new List { new[] { "0.12345678" }, new[] { "0.12" }, new[] { "0.12345678" } }); Assert.Equal("\"allAf\":0.123457,\"doubleValue1\":0.12,\"doubleValue2\":0.12345678", jsonString); } } } ================================================ FILE: UnitTests/SAUtils/SpliceAi/SpliceAiTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using CacheUtils.TranscriptCache; using Genome; using Intervals; using Moq; using SAUtils.SpliceAi; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.Interface.Providers; using VariantAnnotation.IO.Caches; using Variants; using Xunit; namespace UnitTests.SAUtils.SpliceAi { public sealed class SpliceAiTests { private static Dictionary GetSpliceToNirvanaGenes() { return new Dictionary { {"TUBB8", "TUBB8"}, {"CDK11B", "CDK11B" }, {"MMP23B", "MMP23B" }, {"KRTAP19-3", "KRTAP19-3" }, {"KRTAP19-2", "KRTAP19-2" }, { "CECR5", "CECR5"}, { "SPLICE", "NIR91"} }; } private static ISequenceProvider GetSequenceProvider() { var mockProvider = new Mock(); mockProvider.SetupGet(x => x.RefNameToChromosome).Returns(ChromosomeUtilities.RefNameToChromosome); mockProvider.SetupGet(x => x.RefIndexToChromosome).Returns(ChromosomeUtilities.RefIndexToChromosome); //only for unit tests that uses variants at 17148654 mockProvider.SetupGet(x => x.Sequence).Returns(new SimpleSequence(new string('T', VariantUtils.MaxUpstreamLength) + "GAAAAA", 17148654 - 1 - VariantUtils.MaxUpstreamLength)); return mockProvider.Object; } private static ISequenceProvider GetCacheSequenceProvider() { var mockProvider = new Mock(); mockProvider.SetupGet(x => x.RefNameToChromosome).Returns(ChromosomeUtilities.RefNameToChromosome); mockProvider.SetupGet(x => x.RefIndexToChromosome).Returns(ChromosomeUtilities.RefIndexToChromosome); return mockProvider.Object; } private static Dictionary> GetSpliceIntervals() { var intervals10 = new[] { new Interval(92946 - SpliceUtilities.SpliceFlankLength, 92946 + SpliceUtilities.SpliceFlankLength, 0), new Interval(93816 - SpliceUtilities.SpliceFlankLength, 93816 + SpliceUtilities.SpliceFlankLength, 0) }; var intervals1 = new[] { new Interval(1577180 - SpliceUtilities.SpliceFlankLength, 1577180 + SpliceUtilities.SpliceFlankLength, 0) }; var intervals21 = new[] { new Interval(31859677 - SpliceUtilities.SpliceFlankLength, 31859677 + SpliceUtilities.SpliceFlankLength, 0), new Interval(35275955 - SpliceUtilities.SpliceFlankLength, 35275955 + SpliceUtilities.SpliceFlankLength, 0) }; var intervals22 = new[] { new Interval(17148600 - SpliceUtilities.SpliceFlankLength, 17148600 + SpliceUtilities.SpliceFlankLength, 0) }; return new Dictionary> { {ChromosomeUtilities.Chr1.Index, new IntervalArray(intervals1)}, {ChromosomeUtilities.Chr10.Index, new IntervalArray(intervals10)}, {ChromosomeUtilities.Chr21.Index, new IntervalArray(intervals21)}, {ChromosomeUtilities.Chr22.Index, new IntervalArray(intervals22)} }; } private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("##INFO=") ; writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); //this line should not produce any objects since all values are <0.10 and its far from splice sites writer.WriteLine("10\t92900\t.\tC\tT\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); // values are small but it is close to a splice site. So we report all of it writer.WriteLine("10\t92946\t.\tC\tT\t.\t.\tSpliceAI=T|TUBB8|0.00|0.00|0.00|0.00|-26|-10|3|35"); // not around a splice site but has higher than 0.1 value. So, we report the one that is significant writer.WriteLine("10\t93389\t.\tC\tA\t.\t.\tSpliceAI=A|TUBB8|0.11|0.00|0.00|0.00|-11|-29|-11|-32"); //should be reported back with 4 object since it is within splice interval; writer.WriteLine("10\t93816\t.\tC\tG\t.\t.\tSpliceAI=G|TUBB8|0.19|0.00|0.00|0.00|-7|-50|-7|-6"); writer.Flush(); stream.Position = 0; return stream; } private static Stream GetPositionCachingStream() { //testing the position caching using minHeap. All entries have significant entries, so all of them should be reported var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("##INFO="); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("10\t92900\t.\tC\tT\t.\t.\tSpliceAI=A|TUBB8|0.80|0.00|0.00|0.00|-4|-2|-12|25"); writer.WriteLine("10\t92946\t.\tC\tT\t.\t.\tSpliceAI=T|TUBB8|0.00|0.00|0.00|0.00|-26|-10|3|35"); writer.WriteLine("10\t92946\t.\tC\tA\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-10|-48|35|-21"); writer.WriteLine("10\t93389\t.\tC\tA\t.\t.\tSpliceAI=A|TUBB8|0.11|0.00|0.00|0.00|-11|-29|-11|-32"); writer.WriteLine("10\t93816\t.\tC\tG\t.\t.\tSpliceAI=G|TUBB8|0.19|0.00|0.00|0.00|-7|-50|-7|-6"); writer.Flush(); stream.Position = 0; return stream; } private static Stream GetMultiGeneStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("##INFO="); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("1\t1577180\t.\tC\tT\t.\t.\tSpliceAI=T|MMP23B|0.00|0.00|0.00|0.00|8|-16|-16|26"); writer.WriteLine("1\t1577180\t.\tC\tT\t.\t.\tSpliceAI=T|CDK11B|0.92|0.00|0.00|0.00|-2|-8|33|-13"); writer.Flush(); stream.Position = 0; return stream; } private static Stream GetMissingEntryStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("##INFO="); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("21\t35275955\t.\tG\tA\t.\t.\tSpliceAI=A|AP000304.12|0.14|0.00|0.00|0.00|-12|24|-41|5"); writer.WriteLine("21\t35275955\t.\tG\tA\t.\t.\tSpliceAI=A|ATP5O|0.00|0.00|0.00|0.00|-12|24|-41|-12"); writer.Flush(); stream.Position = 0; return stream; } private static Stream GetMultChromosomeStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("##INFO="); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); //having two gene symbols at the same position should avoid updating gene symbol writer.WriteLine("10\t92900\t.\tC\tT\t.\t.\tSpliceAI=A|TUBB8|0.00|0.50|0.00|0.00|-4|-2|-12|25"); writer.WriteLine("10\t92900\t.\tC\tT\t.\t.\tSpliceAI=A|SPLICE|0.00|0.00|0.00|0.20|-4|-2|-12|25"); //The previous entries should be flushed since we changed chromosome writer.WriteLine("1\t92900\t.\tC\tT\t.\t.\tSpliceAI=A|TUBB8|0.30|0.00|0.00|0.00|-4|-2|-12|25"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void Check_multi_chromosome_gene_update() { using (var spliceParser = new SpliceAiParser(GetMultChromosomeStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); // the third item will be skipped since hgnc is null Assert.Equal(3, spliceItems.Count); Assert.Equal("TUBB8", spliceItems[0].Hgnc); //checking a case where the splice AI gene is different from Nirvana Assert.Equal("NIR91", spliceItems[1].Hgnc); } } [Fact] public void Parse_standard_lines() { using (var spliceParser = new SpliceAiParser(GetStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); Assert.Equal(3,spliceItems.Count); Assert.Equal("\"hgnc\":\"TUBB8\",\"acceptorGainScore\":0,\"acceptorGainDistance\":-26,\"acceptorLossScore\":0,\"acceptorLossDistance\":-10,\"donorGainScore\":0,\"donorGainDistance\":3,\"donorLossScore\":0,\"donorLossDistance\":35", spliceItems[0].GetJsonString()); Assert.Equal("\"hgnc\":\"TUBB8\",\"acceptorGainScore\":0.1,\"acceptorGainDistance\":-11", spliceItems[1].GetJsonString()); Assert.Equal("\"hgnc\":\"TUBB8\",\"acceptorGainScore\":0.2,\"acceptorGainDistance\":-7,\"acceptorLossScore\":0,\"acceptorLossDistance\":-50,\"donorGainScore\":0,\"donorGainDistance\":-7,\"donorLossScore\":0,\"donorLossDistance\":-6", spliceItems[2].GetJsonString()); } } [Fact] public void MissingEntry() { using (var spliceParser = new SpliceAiParser(GetMissingEntryStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); Assert.Equal(2,spliceItems.Count); Assert.Equal("\"hgnc\":\"AP000304.12\",\"acceptorGainScore\":0.1,\"acceptorGainDistance\":-12,\"acceptorLossScore\":0,\"acceptorLossDistance\":24,\"donorGainScore\":0,\"donorGainDistance\":-41,\"donorLossScore\":0,\"donorLossDistance\":5", spliceItems[0].GetJsonString()); } } private static Stream GetMultiScoreStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("##INFO="); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("10\t93816\t.\tC\tG\t.\t.\tSpliceAI=G|TUBB8|0.19|0.40|0.00|0.20|-7|-50|-7|-6"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void Parse_multiScore_entry() { using (var spliceParser = new SpliceAiParser(GetMultiScoreStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); Assert.Single(spliceItems); Assert.Equal("\"hgnc\":\"TUBB8\",\"acceptorGainScore\":0.2,\"acceptorGainDistance\":-7,\"acceptorLossScore\":0.4,\"acceptorLossDistance\":-50,\"donorGainScore\":0,\"donorGainDistance\":-7,\"donorLossScore\":0.2,\"donorLossDistance\":-6", spliceItems[0].GetJsonString()); } } [Fact] public void Parse_multiGene_entry() { using (var spliceParser = new SpliceAiParser(GetMultiGeneStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); Assert.Equal(2,spliceItems.Count); Assert.Equal("\"hgnc\":\"MMP23B\",\"acceptorGainScore\":0,\"acceptorGainDistance\":8,\"acceptorLossScore\":0,\"acceptorLossDistance\":-16,\"donorGainScore\":0,\"donorGainDistance\":-16,\"donorLossScore\":0,\"donorLossDistance\":26", spliceItems[0].GetJsonString()); Assert.Equal("\"hgnc\":\"CDK11B\",\"acceptorGainScore\":0.9,\"acceptorGainDistance\":-2,\"acceptorLossScore\":0,\"acceptorLossDistance\":-8,\"donorGainScore\":0,\"donorGainDistance\":33,\"donorLossScore\":0,\"donorLossDistance\":-13", spliceItems[1].GetJsonString()); } } private static Stream GetShiftableInsertionStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("##INFO="); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("22\t17148654\t.\tG\tGA\t.\t.\tSpliceAI=GA|CECR5|0.10|0.00|0.00|0.00|-10|10|-10|-15"); writer.WriteLine("22\t17148655\t.\tA\tAA\t.\t.\tSpliceAI=AA|CECR5|0.10|0.00|0.00|0.00|-11|9|-11|-16"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void Skip_shiftable_indels() { using (var spliceParser = new SpliceAiParser(GetShiftableInsertionStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); Assert.Single(spliceItems); Assert.Equal("\"hgnc\":\"CECR5\",\"acceptorGainScore\":0.1,\"acceptorGainDistance\":-10", spliceItems[0].GetJsonString()); } } [Fact] public void Check_position_caching() { using (var spliceParser = new SpliceAiParser(GetPositionCachingStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); Assert.Equal(5, spliceItems.Count); } } private static Stream GetCacheStream() { const GenomeAssembly genomeAssembly = GenomeAssembly.GRCh38; var baseHeader = new Header("test", 2, 3, Source.BothRefSeqAndEnsembl, 4, genomeAssembly); var customHeader = new TranscriptCacheCustomHeader(1, 2); var expectedHeader = new CacheHeader(baseHeader, customHeader); var transcriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 100, 199, 300, 399), new TranscriptRegion(TranscriptRegionType.Intron, 1, 200, 299, 399, 400), new TranscriptRegion(TranscriptRegionType.Exon, 2, 300, 399, 400, 499) }; var mirnas = new IInterval[2]; mirnas[0] = new Interval(100, 200); mirnas[1] = new Interval(300, 400); var peptideSeqs = new[] { "MASE*" }; var genes = new IGene[1]; genes[0] = new Gene(ChromosomeUtilities.Chr3, 100, 200, true, "TP53", 300, CompactId.Convert("7157"), CompactId.Convert("ENSG00000141510")); var regulatoryRegions = new IRegulatoryRegion[2]; regulatoryRegions[0] = new RegulatoryRegion(ChromosomeUtilities.Chr3, 1200, 1300, CompactId.Convert("123"), RegulatoryRegionType.enhancer); regulatoryRegions[1] = new RegulatoryRegion(ChromosomeUtilities.Chr3, 1250, 1450, CompactId.Convert("456"), RegulatoryRegionType.enhancer); var regulatoryRegionIntervalArrays = regulatoryRegions.ToIntervalArrays(3); var transcripts = GetTranscripts(ChromosomeUtilities.Chr3, genes, transcriptRegions, mirnas); var transcriptIntervalArrays = transcripts.ToIntervalArrays(3); var expectedCacheData = new TranscriptCacheData(expectedHeader, genes, transcriptRegions, mirnas, peptideSeqs, transcriptIntervalArrays, regulatoryRegionIntervalArrays); var ms = new MemoryStream(); using (var writer = new TranscriptCacheWriter(ms, expectedHeader, true)) { writer.Write(expectedCacheData); } ms.Position = 0; return ms; } private static ITranscript[] GetTranscripts(Chromosome chromosome, IGene[] genes, ITranscriptRegion[] regions, IInterval[] mirnas) { return new ITranscript[] { new Transcript(chromosome, 120, 180, CompactId.Convert("789"), null, BioType.IG_D_gene, genes[0], 0, 0, false, regions, 0, mirnas, -1, -1, Source.None, false, false, null, null) }; } [Fact] public void GetSpliceIntervals_standard() { using (var transcriptCacheReader = new TranscriptCacheReader(GetCacheStream())) { var seqProvider = GetCacheSequenceProvider(); var transcriptData = transcriptCacheReader.Read(seqProvider.RefIndexToChromosome); var spliceIntervals = SpliceUtilities.GetSpliceIntervals(seqProvider, transcriptData); Assert.Single(spliceIntervals); //given 2 exons, there should be 2 splice intervals Assert.Equal(2, spliceIntervals[2].Array.Length); } } private static Stream GetMultiGeneAtSameLocationStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("##INFO="); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("21\t31859677\t.\tG\tA\t.\t.\tSpliceAI=A|KRTAP19-3|0.00|0.00|0.00|0.00|-42|38|23|38"); writer.WriteLine("21\t31859677\t.\tG\tA\t.\t.\tSpliceAI=A|KRTAP19-2|0.01|0.00|0.0262|0.00|-42|38|23|-11"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void Two_symbols_in_spliceAi() { using (var spliceParser = new SpliceAiParser(GetMultiGeneAtSameLocationStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); Assert.Equal(2, spliceItems.Count); Assert.Equal("\"hgnc\":\"KRTAP19-3\",\"acceptorGainScore\":0,\"acceptorGainDistance\":-42,\"acceptorLossScore\":0,\"acceptorLossDistance\":38,\"donorGainScore\":0,\"donorGainDistance\":23,\"donorLossScore\":0,\"donorLossDistance\":38", spliceItems[0].GetJsonString()); Assert.Equal("\"hgnc\":\"KRTAP19-2\",\"acceptorGainScore\":0,\"acceptorGainDistance\":-42,\"acceptorLossScore\":0,\"acceptorLossDistance\":38,\"donorGainScore\":0,\"donorGainDistance\":23,\"donorLossScore\":0,\"donorLossDistance\":-11", spliceItems[1].GetJsonString()); } } private static Stream GetInsertionStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("##INFO="); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); // insertions at the boundary of splice intervals // splice boundary is [92931- 92961] //insertion just before the interval should be skipped writer.WriteLine("10\t92930\t.\tC\tCT\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); // insertion right on the boundary should be kept writer.WriteLine("10\t92931\t.\tC\tCT\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); // insertion just after the interval should be skipped writer.WriteLine("10\t92961\t.\tC\tCT\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); // insertion right on the interval boundary should be kept writer.WriteLine("10\t92960\t.\tC\tCT\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void Parse_insertions() { using (var spliceParser = new SpliceAiParser(GetInsertionStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); Assert.Equal(2, spliceItems.Count); } } private static Stream GetDeletionStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##fileformat=VCFv4.0"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("##INFO="); // deletions at the boundary of splice intervals // splice boundary is [92931- 92961] // deletion just before the interval should be skipped writer.WriteLine("10\t92929\t.\tCT\tC\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); // deletion that start before the interval but is long enough to go into the interval is kept writer.WriteLine("10\t92929\t.\tCTA\tC\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); // deletion right on the boundary should be kept writer.WriteLine("10\t92930\t.\tCT\tC\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); // deletion just after the interval should be skipped writer.WriteLine("10\t92961\t.\tCT\tC\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); // deletion right on the interval boundary should be kept writer.WriteLine("10\t92959\t.\tCT\tC\t.\t.\tSpliceAI=A|TUBB8|0.00|0.00|0.00|0.00|-4|-2|-12|25"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void Parse_deletions() { using (var spliceParser = new SpliceAiParser(GetDeletionStream(), GetSequenceProvider(), GetSpliceIntervals(), GetSpliceToNirvanaGenes())) { var spliceItems = spliceParser.GetItems().ToList(); Assert.Equal(3, spliceItems.Count); } } } } ================================================ FILE: UnitTests/SAUtils/gnomAD/GnomadGeneScoreTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using SAUtils.GnomadGeneScores; using Xunit; namespace UnitTests.SAUtils.gnomAD { public sealed class GnomadGeneScoreTests { private static Stream GetStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("gene\ttranscript\tobs_mis\texp_mis\toe_mis\tmu_mis\tpossible_mis\tobs_mis_pphen\texp_mis_pphen\toe_mis_pphen\tpossible_mis_pphen\tobs_syn\texp_syn\toe_syn\tmu_syn\tpossible_syn\tobs_lof\tmu_lof\tpossible_lof\texp_lof\tpLI\tpNull\tpRec\toe_lof\toe_syn_lower\toe_syn_upper\toe_mis_lower\toe_mis_upper\toe_lof_lower\toe_lof_upper\tconstraint_flag\tsyn_z\tmis_z\tlof_z\toe_lof_upper_rank\toe_lof_upper_bin\toe_lof_upper_bin_6\tn_sites\tclassic_caf\tmax_af\tno_lofs\tobs_het_lof\tobs_hom_lof\tdefined\tp\texp_hom_lof\tclassic_caf_afr\tclassic_caf_amr\tclassic_caf_asj\tclassic_caf_eas\tclassic_caf_fin\tclassic_caf_nfe\tclassic_caf_oth\tclassic_caf_sas\tp_afr\tp_amr\tp_asj\tp_eas\tp_fin\tp_nfe\tp_oth\tp_sas\ttranscript_type\tgene_id\ttranscript_level\tcds_length\tnum_coding_exons\tgene_type\tgene_length\texac_pLI\texac_obs_lof\texac_exp_lof\texac_oe_lof\tbrain_expression\tchromosome\tstart_position\tend_position"); writer.WriteLine("MED13\tENST00000397786\t871\t1.1178e+03\t7.7921e-01\t5.5598e-05\t14195\t314\t5.2975e+02\t5.9273e-01\t6708\t422\t3.8753e+02\t1.0890e+00\t1.9097e-05\t4248\t0\t4.9203e-06\t1257\t9.8429e+01\t1.0000e+00\t8.9436e-40\t1.8383e-16\t0.0000e+00\t1.0050e+00\t1.1800e+00\t7.3600e-01\t8.2400e-01\t0.0000e+00\t3.0000e-02\t\t-1.3765e+00\t2.6232e+00\t9.1935e+00\t0\t0\t0\t2\t1.2058e-05\t8.0492e-06\t124782\t3\t0\t124785\t1.2021e-05\t1.8031e-05\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t9.2812e-05\t8.8571e-06\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t9.2760e-05\t8.8276e-06\t0.0000e+00\t0.0000e+00\tprotein_coding\tENSG00000108510\t2\t6522\t30\tprotein_coding\t122678\t1.0000e+00\t0\t6.4393e+01\t0.0000e+00\tNA\t17\t60019966\t60142643"); writer.WriteLine("NIPBL\tENST00000282516\t846\t1.4415e+03\t5.8688e-01\t7.3808e-05\t18540\t158\t5.4310e+02\t2.9092e-01\t7135\t496\t4.9501e+02\t1.0020e+00\t2.4942e-05\t5211\t1\t9.4214e-06\t1781\t1.5032e+02\t1.0000e+00\t2.9773e-59\t3.5724e-24\t6.6527e-03\t9.3000e-01\t1.0790e+00\t5.5400e-01\t6.2100e-01\t1.0000e-03\t3.2000e-02\t\t-3.5119e-02\t5.5737e+00\t1.1286e+01\t1\t0\t0\t2\t1.1943e-05\t7.9636e-06\t125693\t3\t0\t125696\t1.1934e-05\t1.7901e-05\t0.0000e+00\t0.0000e+00\t9.9246e-05\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t6.5338e-05\t0.0000e+00\t0.0000e+00\t9.9231e-05\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t6.5327e-05\tprotein_coding\tENSG00000164190\t2\t8412\t46\tprotein_coding\t189655\t1.0000e+00\t1\t1.1057e+02\t9.0443e-03\tNA\t5\t36876861\t37066515"); writer.WriteLine("RPS17\tENST00000330339\t0\t0.0000e+00\tNaN\t0.0000e+00\t879\t0\t3.4388e-02\t0.0000e+00\t75\t0\t0.0000e+00\tNaN\t0.0000e+00\t254\t0\t1.7630e-08\t85\t4.1103e-01\t3.2566e-01\t1.7556e-01\t4.9878e-01\t0.0000e+00\t0.0000e+00\t1.9000e+00\t0.0000e+00\t1.9000e+00\t0.0000e+00\t1.8490e+00\tno_exp_mis|no_exp_syn|no_variants\tNaN\tNaN\t5.9410e-01\t17933\t9\t5\t0\t0.0000e+00\t0.0000e+00\t0\t0\t0\t0\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\t0.0000e+00\tprotein_coding\tENSG00000184779\t2\t405\t5\tprotein_coding\t3815\tNA\tNA\tNA\tNA\tNA\t15\t82821158\t82824972"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItems() { var geneIdToSymbols = new Dictionary { {"ENSG00000108510", "MED13"}, {"ENSG00000164190", "NIPBL"}, {"tENSG00000184779", "RPS17" } }; using (var reader = new GnomadGeneParser(new StreamReader(GetStream()), geneIdToSymbols)) { var items = reader.GetItems().ToList(); Assert.Equal(3, items.Count); Assert.Equal("{\"pLi\":1.00e0,\"pRec\":1.84e-16,\"pNull\":8.94e-40,\"synZ\":-1.38e0,\"misZ\":2.62e0,\"loeuf\":3.00e-2}", items[0].Value[0].GetJsonString()); Assert.Equal("{\"pLi\":3.26e-1,\"pRec\":4.99e-1,\"pNull\":1.76e-1,\"loeuf\":1.85e0}", items[2].Value[0].GetJsonString()); } } private static Stream GetStream_with_duplicate_gene_entries() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("gene\ttranscript\tobs_mis\texp_mis\toe_mis\tmu_mis\tpossible_mis\tobs_mis_pphen\texp_mis_pphen\toe_mis_pphen\tpossible_mis_pphen\tobs_syn\texp_syn\toe_syn\tmu_syn\tpossible_syn\tobs_lof\tmu_lof\tpossible_lof\texp_lof\tpLI\tpNull\tpRec\toe_lof\toe_syn_lower\toe_syn_upper\toe_mis_lower\toe_mis_upper\toe_lof_lower\toe_lof_upper\tconstraint_flag\tsyn_z\tmis_z\tlof_z\toe_lof_upper_rank\toe_lof_upper_bin\toe_lof_upper_bin_6\tn_sites\tclassic_caf\tmax_af\tno_lofs\tobs_het_lof\tobs_hom_lof\tdefined\tp\texp_hom_lof\tclassic_caf_afr\tclassic_caf_amr\tclassic_caf_asj\tclassic_caf_eas\tclassic_caf_fin\tclassic_caf_nfe\tclassic_caf_oth\tclassic_caf_sas\tp_afr\tp_amr\tp_asj\tp_eas\tp_fin\tp_nfe\tp_oth\tp_sas\ttranscript_type\tgene_id\ttranscript_level\tcds_length\tnum_coding_exons\tgene_type\tgene_length\texac_pLI\texac_obs_lof\texac_exp_lof\texac_oe_lof\tbrain_expression\tchromosome\tstart_position\tend_position"); writer.WriteLine("MDGA2\tENST00000426342\t306\t4.0043e+02\t7.6419e-01\t2.1096e-05\t4724\t78\t1.6525e+02\t4.7202e-01\t1923\t125\t1.3737e+02\t9.0993e-01\t7.1973e-06\t1413\t4\t2.0926e-06\t453\t3.8316e+01\t9.9922e-01\t8.6490e-12\t7.8128e-04\t1.0440e-01\t7.8600e-01\t1.0560e+00\t6.9500e-01\t8.4000e-01\t5.0000e-02\t2.3900e-01\t\t8.2988e-01\t1.6769e+00\t5.1372e+00\t1529\t0\t0\t7\t2.8103e-05\t4.0317e-06\t124784\t7\t0\t124791\t2.8047e-05\t9.8167e-05\t0.0000e+00\t2.8962e-05\t0.0000e+00\t0.0000e+00\t0.0000e+00\t3.5391e-05\t1.6672e-04\t3.2680e-05\t0.0000e+00\t2.8962e-05\t0.0000e+00\t0.0000e+00\t0.0000e+00\t3.5308e-05\t1.6492e-04\t3.2678e-05\tprotein_coding\tENSG00000139915\t2\t2181\t13\tprotein_coding\t835332\t9.9322e-01\t3\t2.7833e+01\t1.0779e-01\tNA\t14\t47308826\t48144157"); writer.WriteLine("MDGA2\tENST00000439988\t438\t5.5311e+02\t7.9189e-01\t2.9490e-05\t6608\t105\t2.0496e+02\t5.1228e-01\t2386\t180\t1.9491e+02\t9.2351e-01\t9.8371e-06\t2048\t11\t2.8074e-06\t627\t5.1882e+01\t6.6457e-01\t5.5841e-10\t3.3543e-01\t2.1202e-01\t8.1700e-01\t1.0450e+00\t7.3100e-01\t8.5700e-01\t1.3200e-01\t3.5100e-01\t\t8.3940e-01\t1.7393e+00\t5.2595e+00\t2989\t1\t0\t9\t3.6173e-05\t4.0463e-06\t124782\t9\t0\t124791\t3.6061e-05\t1.6228e-04\t6.4986e-05\t2.8962e-05\t0.0000e+00\t0.0000e+00\t0.0000e+00\t4.4275e-05\t1.6672e-04\t3.2680e-05\t6.4577e-05\t2.8962e-05\t0.0000e+00\t0.0000e+00\t0.0000e+00\t4.4135e-05\t1.6492e-04\t3.2678e-05\tprotein_coding\tENSG00000272781\t3\t3075\t17\tprotein_coding\t832866\tNA\tNA\tNA\tNA\tNA\t14\t47311134\t48143999"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetNonDuplicateItems() { var geneIdToSymbols = new Dictionary { {"ENST00000426342", "MDGA2"}, {"ENST00000439988", "MDGA2"} }; using (var reader = new GnomadGeneParser(new StreamReader(GetStream_with_duplicate_gene_entries()), geneIdToSymbols)) { var items = reader.GetItems().ToList(); Assert.Single(items); Assert.Equal("{\"pLi\":9.99e-1,\"pRec\":7.81e-4,\"pNull\":8.65e-12,\"synZ\":8.30e-1,\"misZ\":1.68e0,\"loeuf\":2.39e-1}", items[0].Value[0].GetJsonString()); } } private static Stream GetStream_resolve_without_loeuf() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("gene\ttranscript\tobs_mis\texp_mis\toe_mis\tmu_mis\tpossible_mis\tobs_mis_pphen\texp_mis_pphen\toe_mis_pphen\tpossible_mis_pphen\tobs_syn\texp_syn\toe_syn\tmu_syn\tpossible_syn\tobs_lof\tmu_lof\tpossible_lof\texp_lof\tpLI\tpNull\tpRec\toe_lof\toe_syn_lower\toe_syn_upper\toe_mis_lower\toe_mis_upper\toe_lof_lower\toe_lof_upper\tconstraint_flag\tsyn_z\tmis_z\tlof_z\toe_lof_upper_rank\toe_lof_upper_bin\toe_lof_upper_bin_6\tn_sites\tclassic_caf\tmax_af\tno_lofs\tobs_het_lof\tobs_hom_lof\tdefined\tp\texp_hom_lof\tclassic_caf_afr\tclassic_caf_amr\tclassic_caf_asj\tclassic_caf_eas\tclassic_caf_fin\tclassic_caf_nfe\tclassic_caf_oth\tclassic_caf_sas\tp_afr\tp_amr\tp_asj\tp_eas\tp_fin\tp_nfe\tp_oth\tp_sas\ttranscript_type\tgene_id\ttranscript_level\tcds_length\tnum_coding_exons\tgene_type\tgene_length\texac_pLI\texac_obs_lof\texac_exp_lof\texac_oe_lof\tbrain_expression\tchromosome\tstart_position\tend_position"); writer.WriteLine("NBPF8\tENST00000369365\t75\t3.0353e+01\t2.4709e+00\t1.4000e-06\t24299\t60\t2.5873e+01\t2.3190e+00\t14469\t22\t1.3347e+01\t1.6483e+00\t5.9757e-07\t6838\t8\t1.5346e-07\t3145\t3.7051e+00\t1.4181e-07\t9.6601e-01\t3.3989e-02\t2.1592e+00\t1.1460e+00\t1.9490e+00\t1.7400e+00\t1.9940e+00\t1.0280e+00\t1.9670e+00\tmis_too_many\t-1.8618e+00\t-2.8797e+00\t-2.0676e+00\t19130\t9\t5\t11\t8.5551e-05\t2.3640e-05\t120533\t19\t0\t120552\t7.8807e-05\t7.4870e-04\t2.1519e-04\t3.0820e-05\t0.0000e+00\t1.6820e-04\t4.8035e-05\t1.1691e-04\t0.0000e+00\t0.0000e+00\t2.1446e-04\t2.9290e-05\t0.0000e+00\t1.6411e-04\t4.7779e-05\t1.0166e-04\t0.0000e+00\t0.0000e+00\tprotein_coding\tENSG00000162825\t2\t11420\t91\tprotein_coding\t77674\tNA\tNA\tNA\tNA\tNA\t1\t144146808\t144224481"); writer.WriteLine("NBPF20\tENST00000369202\t450\t1.6927e+02\t2.6584e+00\t8.8521e-06\t29674\t110\t2.3246e+01\t4.7320e+00\t17017\t169\t6.7362e+01\t2.5089e+00\t3.9243e-06\t8313\t26\t6.7292e-07\t3810\t1.3886e+01\t1.9221e-22\t9.9999e-01\t7.9649e-06\t1.8724e+00\t1.8630e+00\t1.9970e+00\t1.9490e+00\t1.9990e+00\t1.2990e+00\t1.9710e+00\tmis_too_many|syn_outlier\t-9.7346e+00\t-7.6675e+00\t-3.0124e+00\t19150\t9\t5\t22\t3.0519e-04\t1.0143e-04\t125629\t75\t0\t125704\t2.9836e-04\t1.1190e-02\t1.5454e-04\t2.3190e-04\t1.0022e-04\t5.6199e-05\t7.5773e-04\t3.3476e-04\t3.2954e-04\t2.6141e-04\t1.2342e-04\t2.3129e-04\t9.9211e-05\t5.4367e-05\t7.3937e-04\t3.2537e-04\t3.2600e-04\t2.6134e-04\tprotein_coding\tENSG00000203832\t2\t13863\t110\tprotein_coding\t97258\t4.4592e-04\t7\t1.1898e+01\t5.8834e-01\tNA\t1\t148250249\t148347506"); writer.WriteLine("FAM231B\tENST00000601199\t100\t6.6880e+01\t1.4952e+00\t3.5319e-06\t1044\t29\t1.9078e+01\t1.5201e+00\t297\t40\t2.6953e+01\t1.4840e+00\t1.4161e-06\t357\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\t1.1490e+00\t1.8690e+00\t1.2710e+00\t1.7630e+00\tNA\tNA\tno_exp_lof\t-1.9754e+00\t-1.4391e+00\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tprotein_coding\tENSG00000268674\t3\t507\t1\tprotein_coding\t510\tNA\tNA\tNA\tNA\tNA\t1\t16865561\t16866070"); writer.WriteLine("FAM231D\tENST00000369173\t67\t7.4600e+01\t8.9812e-01\t3.7640e-06\t963\t18\t1.9611e+01\t9.1786e-01\t240\t20\t2.7120e+01\t7.3747e-01\t1.1812e-06\t318\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\t5.1800e-01\t1.0720e+00\t7.3700e-01\t1.1010e+00\tNA\tNA\tno_exp_lof\t1.0747e+00\t3.1268e-01\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tprotein_coding\tENSG00000203815\t3\t507\t1\tprotein_coding\t632\tNA\tNA\tNA\tNA\tNA\t1\t149675978\t149676609"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void ResolveDuplicateWithout_loeuf() { var geneIdToSymbols = new Dictionary { {"ENSG00000162825", "NBPF20"}, {"ENSG00000203832", "NBPF20"}, {"ENSG00000268674", "FAM231D"}, {"ENSG00000203815", "FAM231D"} }; using (var reader = new GnomadGeneParser(new StreamReader(GetStream_resolve_without_loeuf()), geneIdToSymbols)) { var items = reader.GetItems().ToList(); Assert.Equal(2, items.Count); Assert.Equal("{\"pLi\":1.42e-7,\"pRec\":3.40e-2,\"pNull\":9.66e-1,\"synZ\":-1.86e0,\"misZ\":-2.88e0,\"loeuf\":1.97e0}", items[0].Value[0].GetJsonString()); Assert.Equal("{\"synZ\":-1.98e0,\"misZ\":-1.44e0}", items[1].Value[0].GetJsonString()); } } } } ================================================ FILE: UnitTests/SAUtils/gnomAD/GnomadReaderTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using SAUtils; using SAUtils.DataStructures; using SAUtils.gnomAD; using SAUtils.ParseUtils; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.Interface.SA; using Variants; using Xunit; namespace UnitTests.SAUtils.gnomAD { public sealed class GnomadReaderTests { private static Stream GetGenomeStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("chr1\t10031\t.\tT\tC\t.\tAC0;AS_VQSR\tAC=0;AN=56642;AC_oth=0;AN_oth=782;nhomalt_oth=0;AC_sas=0;AN_sas=1120;nhomalt_sas=0;AC_XX=0;AN_XX=29308;nhomalt_XX=0;AC_fin=0;AN_fin=4326;nhomalt_fin=0;AC_XY=0;AN_XY=27334;nhomalt_XY=0;AC_eas=0;AN_eas=1712;nhomalt_eas=0;AC_amr=0;AN_amr=6420;nhomalt_amr=0;AC_afr=0;AN_afr=14642;nhomalt_afr=0;nhomalt=0;AC_asj=0;AN_asj=1550;nhomalt_asj=0;AC_controls_and_biobanks=0;AN_controls_and_biobanks=11608;AC_nfe=0;AN_nfe=25546;nhomalt_nfe=0;VarDP=35"); writer.WriteLine("chr1\t10114\trs1570391787\tT\tC\t.\tAS_VQSR\tAC=5;AN=22208;AC_oth=0;AN_oth=368;nhomalt_oth=0;AC_sas=0;AN_sas=518;nhomalt_sas=0;AC_XX=4;AN_XX=12336;nhomalt_XX=0;AC_fin=1;AN_fin=888;nhomalt_fin=0;AC_XY=1;AN_XY=9872;nhomalt_XY=0;AC_eas=0;AN_eas=560;nhomalt_eas=0;AC_amr=0;AN_amr=1580;nhomalt_amr=0;AC_afr=1;AN_afr=5362;nhomalt_afr=0;nhomalt=0;AC_asj=0;AN_asj=702;nhomalt_asj=0;AC_controls_and_biobanks=1;AN_controls_and_biobanks=3484;AC_nfe=3;AN_nfe=12078;nhomalt_nfe=0;VarDP=1597"); writer.WriteLine("chr1\t10120\trs1390810297\tT\tC\t.\tAC0;AS_VQSR\tAC=0;AN=34082;AC_oth=0;AN_oth=468;nhomalt_oth=0;AC_sas=0;AN_sas=716;nhomalt_sas=0;AC_XX=0;AN_XX=18954;nhomalt_XX=0;AC_fin=0;AN_fin=1110;nhomalt_fin=0;AC_XY=0;AN_XY=15128;nhomalt_XY=0;AC_eas=0;AN_eas=872;nhomalt_eas=0;AC_amr=0;AN_amr=2280;nhomalt_amr=0;AC_afr=0;AN_afr=8666;nhomalt_afr=0;nhomalt=0;AC_asj=0;AN_asj=1076;nhomalt_asj=0;AC_controls_and_biobanks=0;AN_controls_and_biobanks=4984;AC_nfe=0;AN_nfe=18672;nhomalt_nfe=0;VarDP=1035"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void GetItems_test() { var sequence = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "TGTGTTGTTATTCTGTGTGCAT", 10114 - VariantUtils.MaxUpstreamLength); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); var reader = new StreamReader(GetGenomeStream()); var gnomadReader = new GnomadSnvReader(reader, null, sequenceProvider); var items = gnomadReader.GetCombinedItems().ToList(); Assert.Equal(3, items.Count); Assert.Equal("\"coverage\":0,\"failedFilter\":true,\"allAf\":0,\"allAn\":56642,\"allAc\":0,\"allHc\":0,\"afrAf\":0,\"afrAn\":14642,\"afrAc\":0,\"afrHc\":0,\"amrAf\":0,\"amrAn\":6420,\"amrAc\":0,\"amrHc\":0,\"easAf\":0,\"easAn\":1712,\"easAc\":0,\"easHc\":0,\"finAf\":0,\"finAn\":4326,\"finAc\":0,\"finHc\":0,\"nfeAf\":0,\"nfeAn\":25546,\"nfeAc\":0,\"nfeHc\":0,\"asjAf\":0,\"asjAn\":1550,\"asjAc\":0,\"asjHc\":0,\"sasAf\":0,\"sasAn\":1120,\"sasAc\":0,\"sasHc\":0,\"othAf\":0,\"othAn\":782,\"othAc\":0,\"othHc\":0,\"maleAf\":0,\"maleAn\":27334,\"maleAc\":0,\"maleHc\":0,\"femaleAf\":0,\"femaleAn\":29308,\"femaleAc\":0,\"femaleHc\":0,\"controlsAllAf\":0,\"controlsAllAn\":11608,\"controlsAllAc\":0", items[0].GetJsonString()); Assert.Equal("\"coverage\":0,\"failedFilter\":true,\"allAf\":0.000225,\"allAn\":22208,\"allAc\":5,\"allHc\":0,\"afrAf\":0.000186,\"afrAn\":5362,\"afrAc\":1,\"afrHc\":0,\"amrAf\":0,\"amrAn\":1580,\"amrAc\":0,\"amrHc\":0,\"easAf\":0,\"easAn\":560,\"easAc\":0,\"easHc\":0,\"finAf\":0.001126,\"finAn\":888,\"finAc\":1,\"finHc\":0,\"nfeAf\":0.000248,\"nfeAn\":12078,\"nfeAc\":3,\"nfeHc\":0,\"asjAf\":0,\"asjAn\":702,\"asjAc\":0,\"asjHc\":0,\"sasAf\":0,\"sasAn\":518,\"sasAc\":0,\"sasHc\":0,\"othAf\":0,\"othAn\":368,\"othAc\":0,\"othHc\":0,\"maleAf\":0.000101,\"maleAn\":9872,\"maleAc\":1,\"maleHc\":0,\"femaleAf\":0.000324,\"femaleAn\":12336,\"femaleAc\":4,\"femaleHc\":0,\"controlsAllAf\":0.000287,\"controlsAllAn\":3484,\"controlsAllAc\":1", items[1].GetJsonString()); } private static Stream GetConflictingItemsStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("22\t16558315\trs369787349\tT\tC,G,T,ACTGGCTGCCTGGCTTG\t818363\tAC0;LCR;RF;SEGDUP\tAC=87,7,0,2;AF=5.30488e-01,4.26829e-02,0.00000e+00,1.21951e-02;AN=164;AC_AFR=31,1,0,2;AC_AMR=3,0,0,0;AC_ASJ=0,0,0,0;AC_EAS=4,0,0,0;AC_FIN=33,5,0,0;AC_NFE=13,1,0,0;AC_OTH=3,0,0,0;AC_Male=40,1,0,0;AC_Female=47,6,0,2;AN_AFR=56;AN_AMR=4;AN_ASJ=0;AN_EAS=6;AN_FIN=64;AN_NFE=28;AN_OTH=6;AN_Male=78;AN_Female=86;AF_AFR=5.53571e-01,1.78571e-02,0.00000e+00,3.57143e-02;AF_AMR=7.50000e-01,0.00000e+00,0.00000e+00,0.00000e+00;AF_ASJ=.,.,.,.;AF_EAS=6.66667e-01,0.00000e+00,0.00000e+00,0.00000e+00;AF_FIN=5.15625e-01,7.81250e-02,0.00000e+00,0.00000e+00;AF_NFE=4.64286e-01,3.57143e-02,0.00000e+00,0.00000e+00;AF_OTH=5.00000e-01,0.00000e+00,0.00000e+00,0.00000e+00;AF_Male=5.12821e-01,1.28205e-02,0.00000e+00,0.00000e+00;AF_Female=5.46512e-01,6.97674e-02,0.00000e+00,2.32558e-02;GC_AFR=3,16,7,0,1,0,0,0,0,0,0,0,0,0,1;GC_AMR=0,1,1,0,0,0,0,0,0,0,0,0,0,0,0;GC_ASJ=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;GC_EAS=0,2,1,0,0,0,0,0,0,0,0,0,0,0,0;GC_FIN=2,18,6,0,3,1,0,0,0,0,0,0,0,0,0;GC_NFE=3,8,2,0,1,0,0,0,0,0,0,0,0,0,0;GC_OTH=1,1,1,0,0,0,0,0,0,0,0,0,0,0,0;GC_Male=6,23,8,0,1,0,0,0,0,0,0,0,0,0,0;GC_Female=3,23,10,0,4,1,0,0,0,0,0,0,0,0,1;AC_raw=7179,402,23,4;AN_raw=13956;AF_raw=5.14402e-01,2.88048e-02,1.64804e-03,2.86615e-04;GC_raw=2158,1885,2598,68,90,122,3,8,0,6,0,0,0,0,2;GC=9,46,18,0,5,1,0,0,0,0,0,0,0,0,1;AC_POPMAX=3,5,.,2;AN_POPMAX=4,64,.,56;AF_POPMAX=7.50000e-01,7.81250e-02,.,3.57143e-02"); writer.WriteLine("22\t16558315\trs376808508\tTAAGCCAGCCAGCCAGCCAAGCTGGCCAAGCCAGACAGGCAGCCAAGCCAACCAAGACACCCAGGCAGCCAAGCCAGC\tCAAGCCAGCCAGCCAGCCAAGCTGGCCAAGCCAGACAGGCAGCCAAGCCAACCAAGACACCCAGGCAGCCAAGCCAGC,T\t3.62825e+06\tLCR;RF;SEGDUP\tAC=155,1;AF=9.63451e-03,6.21581e-05;AN=16088;AC_AFR=46,1;AC_AMR=6,0;AC_ASJ=1,0;AC_EAS=3,0;AC_FIN=27,0;AC_NFE=67,0;AC_OTH=5,0;AC_Male=83,1;AC_Female=72,0;AN_AFR=3744;AN_AMR=534;AN_ASJ=186;AN_EAS=986;AN_FIN=1770;AN_NFE=8370;AN_OTH=498;AN_Male=8994;AN_Female=7094;AF_AFR=1.22863e-02,2.67094e-04;AF_AMR=1.12360e-02,0.00000e+00;AF_ASJ=5.37634e-03,0.00000e+00;AF_EAS=3.04260e-03,0.00000e+00;AF_FIN=1.52542e-02,0.00000e+00;AF_NFE=8.00478e-03,0.00000e+00;AF_OTH=1.00402e-02,0.00000e+00;AF_Male=9.22837e-03,1.11185e-04;AF_Female=1.01494e-02,0.00000e+00;GC_AFR=602,46,0,1,0,0;GC_AMR=64,6,0,0,0,0;GC_ASJ=20,1,0,0,0,0;GC_EAS=204,3,0,0,0,0;GC_FIN=255,23,2,0,0,0;GC_NFE=1083,51,8,0,0,0;GC_OTH=59,5,0,0,0,0;GC_Male=1304,71,6,1,0,0;GC_Female=983,64,4,0,0,0;AC_raw=413,1;AN_raw=28686;AF_raw=1.43973e-02,3.48602e-05;GC_raw=7802,349,30,1,0,0;GC=2287,135,10,1,0,0;AC_POPMAX=27,1;AN_POPMAX=1770,3744;AF_POPMAX=1.52542e-02,2.67094e-04"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void IdentifyConflictingItems() { var sequence = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "TAAGCCAGCCAGCCAGCCAAGCTGGCCAAGCCAGACAGGCAGCCAAGCCAACCAAGACACCCAGGCAGCCAAGCCAGC", 16558315 - VariantUtils.MaxUpstreamLength); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); var reader = new StreamReader(GetConflictingItemsStream()); var gnomadReader = new GnomadSnvReader(reader, null, sequenceProvider); var items = new List(); foreach (GnomadItem item in gnomadReader.GetCombinedItems()) { //item.Trim(); if (item.Position == 16558315) items.Add(item); } items = SuppDataUtilities.RemoveConflictingAlleles(items, false); //two if the items were removed as conflicting items Assert.Equal(3, items.Count); } private static Stream GetShiftingItemsStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("6\t157100396\trs572236007\tGCGC\tGCGCCGC,G,GCGCCGCCGC\t584951.32\tPASS\tAC=1218,2,16;AF=5.55505e-02,9.12159e-05,7.29727e-04;AN=21926;BaseQRankSum=-1.09000e-01;ClippingRankSum=0.00000e+00;DP=300864;FS=0.00000e+00;InbreedingCoeff=1.17500e-01;MQ=6.00000e+01;MQRankSum=5.00000e-02;QD=2.04400e+01;ReadPosRankSum=-3.20000e-02;SOR=6.96000e-01;VQSLOD=1.36000e+00;VQSR_culprit=FS;GQ_HIST_ALT=2|51|6|22|28|15|24|50|9|10|13|14|14|30|30|40|21|2|7|1303,0|0|0|0|0|0|0|1|0|0|2|0|2|0|1|1|0|0|0|0,0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|18;DP_HIST_ALT=118|405|463|353|224|88|27|9|4|0|0|0|0|0|0|0|0|0|0|0,1|3|3|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|3|3|5|4|3|0|1|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|2|2|15|66|110|105|196|124|306|173|171|111|49|51|27|13|4|0,0|0|0|1|3|0|1|0|1|0|0|0|0|1|0|0|0|0|0|0,0|0|0|0|0|0|1|0|2|3|3|3|5|2|0|0|0|0|0|0;GQ_HIST_ALL=884|807|422|735|789|518|941|1060|612|1106|1091|578|1371|377|750|327|591|92|328|1739;DP_HIST_ALL=1331|2305|2891|3184|2842|1490|781|244|45|4|1|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|2|3|18|66|112|105|199|127|309|175|176|115|49|51|27|13|4|0;AC_Male=661,1,6;AC_Female=557,1,10;AN_Male=12450;AN_Female=9476;AF_Male=5.30924e-02,8.03213e-05,4.81928e-04;AF_Female=5.87801e-02,1.05530e-04,1.05530e-03;GC_Male=5587,601,30,1,0,0,6,0,0,0;GC_Female=4205,487,35,1,0,0,10,0,0,0;GC_raw=13401,1525,166,7,0,0,19,0,0,0;AC_raw=1857,7,19;AN_raw=30236;GC=9792,1088,65,2,0,0,16,0,0,0;AF_raw=6.14169e-02,2.31512e-04,6.28390e-04;Hom_AFR=0,0,0;Hom_AMR=1,0,0;Hom_ASJ=0,0,0;Hom_EAS=0,0,0;Hom_FIN=5,0,0;Hom_NFE=56,0,0;Hom_OTH=3,0,0;Hom=65,0,0;Hom_raw=166,0,0;AC_AFR=123,0,15;AC_AMR=8,0,0;AC_ASJ=6,0,0;AC_EAS=0,0,0;AC_FIN=69,0,0;AC_NFE=979,2,1;AC_OTH=33,0,0;AN_AFR=7512;AN_AMR=376;AN_ASJ=206;AN_EAS=1556;AN_FIN=694;AN_NFE=11022;AN_OTH=560;AF_AFR=1.63738e-02,0.00000e+00,1.99681e-03;AF_AMR=2.12766e-02,0.00000e+00,0.00000e+00;AF_ASJ=2.91262e-02,0.00000e+00,0.00000e+00;AF_EAS=0.00000e+00,0.00000e+00,0.00000e+00;AF_FIN=9.94236e-02,0.00000e+00,0.00000e+00;AF_NFE=8.88224e-02,1.81455e-04,9.07276e-05;AF_OTH=5.89286e-02,0.00000e+00,0.00000e+00;POPMAX=FIN,NFE,AFR;AC_POPMAX=69,2,15;AN_POPMAX=694,11022,7512;AF_POPMAX=9.94236e-02,1.81455e-04,1.99681e-03;DP_MEDIAN=13,7,17;DREF_MEDIAN=1.00000e-25,5.31547e-07,3.16228e-39;GQ_MEDIAN=99,60,99;AB_MEDIAN=5.00000e-01,2.77778e-01,5.71429e-01;AS_RF=7.69554e-01,3.84245e-02,8.79158e-01;AS_FilterStatus=PASS,RF,PASS;CSQ=CGCCGC|inframe_insertion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000275248|protein_coding|1/20||ENST00000275248.4:c.1171_1176dupCCGCCG|ENSP00000275248.4:p.Pro391_Pro392dup|1314-1315|1162-1163|388|P/PPP|ccg/cCGCCGCcg|rs766249098|3||1||insertion|1|HGNC|18040|||||ENSP00000275248||G3XAA0|UPI0000231CAD|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656&hmmpanther:PTHR12656:SF11|14|||||||||||||||||||||||||||||,CGCCGC|inframe_insertion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000346085|protein_coding|1/20||ENST00000346085.5:c.1345_1350dupCCGCCG|ENSP00000344546.4:p.Pro449_Pro450dup|1337-1338|1336-1337|446|P/PPP|ccg/cCGCCGCcg|rs766249098|3||1||insertion|1|HGNC|18040|YES|||CCDS55072.1|ENSP00000344546|Q8NFD5||UPI000058E4B2|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656:SF11&hmmpanther:PTHR12656|14|||||||||||||||||||||||||||||,CGCCGC|inframe_insertion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000350026|protein_coding|1/19||ENST00000350026.5:c.1345_1350dupCCGCCG|ENSP00000055163.7:p.Pro449_Pro450dup|1337-1338|1336-1337|446|P/PPP|ccg/cCGCCGCcg|rs766249098|3||1||insertion|1|HGNC|18040||||CCDS5251.2|ENSP00000055163|Q8NFD5||UPI000058E2EA|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656&hmmpanther:PTHR12656:SF11|14|||||||||||||||||||||||||||||,CGCCGC|inframe_insertion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000367148|protein_coding|1/20||ENST00000367148.1:c.1345_1350dupCCGCCG|ENSP00000356116.1:p.Pro449_Pro450dup|1336-1337|1336-1337|446|P/PPP|ccg/cCGCCGCcg|rs766249098|3||1||insertion|1|HGNC|18040|||||ENSP00000356116|Q8NFD5|G3XAA0|UPI000058E4B3|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656:SF11&hmmpanther:PTHR12656|14|||||||||||||||||||||||||||||,CGCCGC|upstream_gene_variant|MODIFIER|ARID1B|ENSG00000049618|Transcript|ENST00000414678|protein_coding||||||||||rs766249098|3|167|1|cds_start_NF|insertion|1|HGNC|18040|||||ENSP00000412835||H0Y7H8|UPI0001D3BCFD|1|||||||||||||||||||||||||||||||||,CGCCGC|upstream_gene_variant|MODIFIER|RP11-230C9.2|ENSG00000271551|Transcript|ENST00000603191|lincRNA||||||||||rs766249098|3|2188|-1||insertion|1|Clone_based_vega_gene||YES|||||||||||||||||||||||||||||||||||||||||,CGCCGC|upstream_gene_variant|MODIFIER|RP11-230C9.4|ENSG00000271265|Transcript|ENST00000604082|lincRNA||||||||||rs766249098|3|4603|-1||insertion|1|Clone_based_vega_gene||YES|||||||||||||||||||||||||||||||||||||||||,CGCCGC|downstream_gene_variant|MODIFIER|RP11-230C9.3|ENSG00000270487|Transcript|ENST00000604792|antisense||||||||||rs766249098|3|1061|-1||insertion|1|Clone_based_vega_gene||YES|||||||||||||||||||||||||||||||||||||||||,CGCCGC|downstream_gene_variant|MODIFIER|MIR4466|ENSG00000271899|Transcript|ENST00000606121|miRNA||||||||||rs766249098|3|412|-1||insertion|1|HGNC|41726|YES|||||||||||||||||||||||||||||||||||||||||,-|inframe_deletion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000275248|protein_coding|1/20||ENST00000275248.4:c.1174_1176delCCG|ENSP00000275248.4:p.Pro392del|1312-1314|1160-1162|387-388|AP/A|gCGCcg/gcg|rs766249098|2||1||insertion|1|HGNC|18040|||||ENSP00000275248||G3XAA0|UPI0000231CAD|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656&hmmpanther:PTHR12656:SF11|14|||||||||||||||||||||||||||||,-|inframe_deletion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000346085|protein_coding|1/20||ENST00000346085.5:c.1348_1350delCCG|ENSP00000344546.4:p.Pro450del|1335-1337|1334-1336|445-446|AP/A|gCGCcg/gcg|rs766249098|2||1||insertion|1|HGNC|18040|YES|||CCDS55072.1|ENSP00000344546|Q8NFD5||UPI000058E4B2|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656:SF11&hmmpanther:PTHR12656|14|||||||||||||||||||||||||||||,-|inframe_deletion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000350026|protein_coding|1/19||ENST00000350026.5:c.1348_1350delCCG|ENSP00000055163.7:p.Pro450del|1335-1337|1334-1336|445-446|AP/A|gCGCcg/gcg|rs766249098|2||1||insertion|1|HGNC|18040||||CCDS5251.2|ENSP00000055163|Q8NFD5||UPI000058E2EA|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656&hmmpanther:PTHR12656:SF11|14|||||||||||||||||||||||||||||,-|inframe_deletion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000367148|protein_coding|1/20||ENST00000367148.1:c.1348_1350delCCG|ENSP00000356116.1:p.Pro450del|1334-1336|1334-1336|445-446|AP/A|gCGCcg/gcg|rs766249098|2||1||insertion|1|HGNC|18040|||||ENSP00000356116|Q8NFD5|G3XAA0|UPI000058E4B3|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656:SF11&hmmpanther:PTHR12656|14|||||||||||||||||||||||||||||,-|upstream_gene_variant|MODIFIER|ARID1B|ENSG00000049618|Transcript|ENST00000414678|protein_coding||||||||||rs766249098|2|168|1|cds_start_NF|insertion|1|HGNC|18040|||||ENSP00000412835||H0Y7H8|UPI0001D3BCFD|1|||||||||||||||||||||||||||||||||,-|upstream_gene_variant|MODIFIER|RP11-230C9.2|ENSG00000271551|Transcript|ENST00000603191|lincRNA||||||||||rs766249098|2|2186|-1||insertion|1|Clone_based_vega_gene||YES|||||||||||||||||||||||||||||||||||||||||,-|upstream_gene_variant|MODIFIER|RP11-230C9.4|ENSG00000271265|Transcript|ENST00000604082|lincRNA||||||||||rs766249098|2|4601|-1||insertion|1|Clone_based_vega_gene||YES|||||||||||||||||||||||||||||||||||||||||,-|downstream_gene_variant|MODIFIER|RP11-230C9.3|ENSG00000270487|Transcript|ENST00000604792|antisense||||||||||rs766249098|2|1062|-1||insertion|1|Clone_based_vega_gene||YES|||||||||||||||||||||||||||||||||||||||||,-|downstream_gene_variant|MODIFIER|MIR4466|ENSG00000271899|Transcript|ENST00000606121|miRNA||||||||||rs766249098|2|413|-1||insertion|1|HGNC|41726|YES|||||||||||||||||||||||||||||||||||||||||,CGC|inframe_insertion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000275248|protein_coding|1/20||ENST00000275248.4:c.1174_1176dupCCG|ENSP00000275248.4:p.Pro392dup|1314-1315|1162-1163|388|P/PP|ccg/cCGCcg|rs766249098|1||1||insertion|1|HGNC|18040|||||ENSP00000275248||G3XAA0|UPI0000231CAD|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656&hmmpanther:PTHR12656:SF11|14|||||||||||||||||||||||||||||,CGC|inframe_insertion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000346085|protein_coding|1/20||ENST00000346085.5:c.1348_1350dupCCG|ENSP00000344546.4:p.Pro450dup|1337-1338|1336-1337|446|P/PP|ccg/cCGCcg|rs766249098|1||1||insertion|1|HGNC|18040|YES|||CCDS55072.1|ENSP00000344546|Q8NFD5||UPI000058E4B2|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656:SF11&hmmpanther:PTHR12656|14|||||||||||||||||||||||||||||,CGC|inframe_insertion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000350026|protein_coding|1/19||ENST00000350026.5:c.1348_1350dupCCG|ENSP00000055163.7:p.Pro450dup|1337-1338|1336-1337|446|P/PP|ccg/cCGCcg|rs766249098|1||1||insertion|1|HGNC|18040||||CCDS5251.2|ENSP00000055163|Q8NFD5||UPI000058E2EA|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656&hmmpanther:PTHR12656:SF11|14|||||||||||||||||||||||||||||,CGC|inframe_insertion|MODERATE|ARID1B|ENSG00000049618|Transcript|ENST00000367148|protein_coding|1/20||ENST00000367148.1:c.1348_1350dupCCG|ENSP00000356116.1:p.Pro450dup|1336-1337|1336-1337|446|P/PP|ccg/cCGCcg|rs766249098|1||1||insertion|1|HGNC|18040|||||ENSP00000356116|Q8NFD5|G3XAA0|UPI000058E4B3|1|||Low_complexity_(Seg):seg&hmmpanther:PTHR12656:SF11&hmmpanther:PTHR12656|14|||||||||||||||||||||||||||||,CGC|upstream_gene_variant|MODIFIER|ARID1B|ENSG00000049618|Transcript|ENST00000414678|protein_coding||||||||||rs766249098|1|167|1|cds_start_NF|insertion|1|HGNC|18040|||||ENSP00000412835||H0Y7H8|UPI0001D3BCFD|1|||||||||||||||||||||||||||||||||,CGC|upstream_gene_variant|MODIFIER|RP11-230C9.2|ENSG00000271551|Transcript|ENST00000603191|lincRNA||||||||||rs766249098|1|2188|-1||insertion|1|Clone_based_vega_gene||YES|||||||||||||||||||||||||||||||||||||||||,CGC|upstream_gene_variant|MODIFIER|RP11-230C9.4|ENSG00000271265|Transcript|ENST00000604082|lincRNA||||||||||rs766249098|1|4603|-1||insertion|1|Clone_based_vega_gene||YES|||||||||||||||||||||||||||||||||||||||||,CGC|downstream_gene_variant|MODIFIER|RP11-230C9.3|ENSG00000270487|Transcript|ENST00000604792|antisense||||||||||rs766249098|1|1061|-1||insertion|1|Clone_based_vega_gene||YES|||||||||||||||||||||||||||||||||||||||||,CGC|downstream_gene_variant|MODIFIER|MIR4466|ENSG00000271899|Transcript|ENST00000606121|miRNA||||||||||rs766249098|1|412|-1||insertion|1|HGNC|41726|YES|||||||||||||||||||||||||||||||||||||||||,CGCCGC|regulatory_region_variant|MODIFIER|||RegulatoryFeature|ENSR00001231649|promoter||||||||||rs766249098|3||||insertion|1||||||||||||||||||||||||||||||||||||||||||||,-|regulatory_region_variant|MODIFIER|||RegulatoryFeature|ENSR00001231649|promoter||||||||||rs766249098|2||||insertion|1||||||||||||||||||||||||||||||||||||||||||||,CGC|regulatory_region_variant|MODIFIER|||RegulatoryFeature|ENSR00001231649|promoter||||||||||rs766249098|1||||insertion|1||||||||||||||||||||||||||||||||||||||||||||,CGCCGC|TF_binding_site_variant|MODIFIER|||MotifFeature|MA0162.2|||||||||||rs766249098|3||1||insertion|1|||||||||||||||||||||||||||||||||||||Egr1:MA0162.2|5|N|||||,CGCCGC|TF_binding_site_variant|MODIFIER|||MotifFeature|MA0162.2|||||||||||rs766249098|3||1||insertion|1|||||||||||||||||||||||||||||||||||||Egr1:MA0162.2|2|N|||||,CGC|TF_binding_site_variant|MODIFIER|||MotifFeature|MA0162.2|||||||||||rs766249098|1||1||insertion|1|||||||||||||||||||||||||||||||||||||Egr1:MA0162.2|5|N|||||,CGC|TF_binding_site_variant|MODIFIER|||MotifFeature|MA0162.2|||||||||||rs766249098|1||1||insertion|1|||||||||||||||||||||||||||||||||||||Egr1:MA0162.2|2|N|||||,-|TF_binding_site_variant|MODIFIER|||MotifFeature|MA0162.2|||||||||||rs766249098|2||1||insertion|1|||||||||||||||||||||||||||||||||||||Egr1:MA0162.2|2|N|||||,-|TF_binding_site_variant|MODIFIER|||MotifFeature|MA0162.2|||||||||||rs766249098|2||1||insertion|1|||||||||||||||||||||||||||||||||||||Egr1:MA0162.2|-1|N|||||;GC_AFR=3618,123,0,0,0,0,15,0,0,0;GC_AMR=181,6,1,0,0,0,0,0,0,0;GC_ASJ=97,6,0,0,0,0,0,0,0,0;GC_EAS=778,0,0,0,0,0,0,0,0,0;GC_FIN=283,59,5,0,0,0,0,0,0,0;GC_NFE=4585,867,56,2,0,0,1,0,0,0;GC_OTH=250,27,3,0,0,0,0,0,0,0;Hom_Male=30,0,0;Hom_Female=35,0,0\n"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void LeftShiftingItems() { var sequence = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "GCGCGC", 157100394 -1 - VariantUtils.MaxUpstreamLength); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); var reader = new StreamReader(GetShiftingItemsStream()); var gnomadReader = new GnomadSnvReader(reader, null, sequenceProvider); var items = gnomadReader.GetCombinedItems().ToList(); Assert.Equal(3, items.Count); Assert.Equal(157100397, items[0].Position); Assert.Equal(157100397, items[1].Position); Assert.Equal(157100397, items[2].Position); } private static Stream GetChr22ExomeStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("22\t15528107\trs1231414491\tT\tC\t434.31\tPASS\tAC=2;AN=57352;AF=3.48724e-05;rf_tp_probability=0.205705;FS=0;InbreedingCoeff=0.0527;MQ=21.67;MQRankSum=0.289;QD=1.65;ReadPosRankSum=0.413;SOR=0.085;BaseQRankSum=0.881;ClippingRankSum=-0.175;DP=1563548;VQSLOD=2.41;VQSR_culprit=QD;segdup;rf_negative_label;rf_label=FP;rf_train;variant_type=multi-snv;allele_type=snv;n_alt_alleles=2;pab_max=0.387695;gq_hist_alt_bin_freq=0|0|0|0|0|1|0|0|0|0|0|0|0|1|0|1|0|0|0|0;gq_hist_all_bin_freq=35633|21017|2883|2706|1410|581|1244|1433|923|2307|2802|1588|3776|2801|1906|2362|2569|822|1708|3862;dp_hist_alt_bin_freq=0|0|1|0|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=126884|6290|2994|4863|7505|6812|4743|2599|957|376|174|63|25|20|15|5|2|2|1|1;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|1|1|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=120;AF_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=958;controls_AF_afr_male=0;controls_nhomalt_afr_male=0;non_neuro_AC_eas_kor=0;non_neuro_AN_eas_kor=1844;non_neuro_AF_eas_kor=0;non_neuro_nhomalt_eas_kor=0;non_topmed_AC_amr=0;non_topmed_AN_amr=10422;non_topmed_AF_amr=0;non_topmed_nhomalt_amr=0;non_cancer_AC_asj_female=0;non_cancer_AN_asj_female=828;non_cancer_AF_asj_female=0;non_cancer_nhomalt_asj_female=0;AC_raw=3;AN_raw=151718;AF_raw=1.97735e-05;nhomalt_raw=0;AC_fin_female=0;AN_fin_female=840;AF_fin_female=0;nhomalt_fin_female=0;non_cancer_AC_oth_female=0;non_cancer_AN_oth_female=944;non_cancer_AF_oth_female=0;non_cancer_nhomalt_oth_female=0;AC_nfe_bgr=0;AN_nfe_bgr=28;AF_nfe_bgr=0;nhomalt_nfe_bgr=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=324;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;AC_sas_male=0;AN_sas_male=3672;AF_sas_male=0;nhomalt_sas_male=0;non_neuro_AC_afr_male=0;non_neuro_AN_afr_male=2218;non_neuro_AF_afr_male=0;non_neuro_nhomalt_afr_male=0;AC_afr_male=0;AN_afr_male=2234;AF_afr_male=0;nhomalt_afr_male=0;AC_afr=0;AN_afr=5236;AF_afr=0;nhomalt_afr=0;controls_AC_nfe_swe=0;controls_AN_nfe_swe=6;controls_AF_nfe_swe=0;controls_nhomalt_nfe_swe=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=2988;non_neuro_AF_afr_female=0;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=0;non_topmed_AN_amr_female=5636;non_topmed_AF_amr_female=0;non_topmed_nhomalt_amr_female=0;non_cancer_AC_female=1;non_cancer_AN_female=27774;non_cancer_AF_female=3.60049e-05;non_cancer_nhomalt_female=0;non_cancer_AC_nfe_onf=0;non_cancer_AN_nfe_onf=4264;non_cancer_AF_nfe_onf=0;non_cancer_nhomalt_nfe_onf=0;non_cancer_AC_male=1;non_cancer_AN_male=28730;non_cancer_AF_male=3.48068e-05;non_cancer_nhomalt_male=0;non_topmed_AC_oth_female=0;non_topmed_AN_oth_female=940;non_topmed_AF_oth_female=0;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=3794;AF_eas_female=0;nhomalt_eas_female=0;non_cancer_AC_sas_female=0;non_cancer_AN_sas_female=2380;non_cancer_AF_sas_female=0;non_cancer_nhomalt_sas_female=0;AC_afr_female=0;AN_afr_female=3002;AF_afr_female=0;nhomalt_afr_female=0;AC_sas=0;AN_sas=6056;AF_sas=0;nhomalt_sas=0;non_neuro_AC_female=1;non_neuro_AN_female=24066;non_neuro_AF_female=4.15524e-05;non_neuro_nhomalt_female=0;controls_AC_afr=0;controls_AN_afr=2244;controls_AF_afr=0;controls_nhomalt_afr=0;non_neuro_AC_eas_jpn=0;non_neuro_AN_eas_jpn=12;non_neuro_AF_eas_jpn=0;non_neuro_nhomalt_eas_jpn=0;AC_nfe_onf=0;AN_nfe_onf=4602;AF_nfe_onf=0;nhomalt_nfe_onf=0;non_cancer_AC_amr_male=0;non_cancer_AN_amr_male=4802;non_cancer_AF_amr_male=0;non_cancer_nhomalt_amr_male=0;controls_AC_fin_male=0;controls_AN_fin_male=718;controls_AF_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=2;non_neuro_AN_nfe_nwe=13330;non_neuro_AF_nfe_nwe=0.000150038;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=1076;AF_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=1;AN_nfe_female=10978;AF_nfe_female=9.10913e-05;nhomalt_nfe_female=0;AC_amr=0;AN_amr=10460;AF_amr=0;nhomalt_amr=0;non_topmed_AC_nfe_male=1;non_topmed_AN_nfe_male=10888;non_topmed_AF_nfe_male=9.18442e-05;non_topmed_nhomalt_nfe_male=0;non_neuro_AC_sas=0;non_neuro_AN_sas=6050;non_neuro_AF_sas=0;non_neuro_nhomalt_sas=0;non_cancer_AC_fin_male=0;non_cancer_AN_fin_male=1076;non_cancer_AF_fin_male=0;non_cancer_nhomalt_fin_male=0;non_cancer_AC_nfe_seu=0;non_cancer_AN_nfe_seu=58;non_cancer_AF_nfe_seu=0;non_cancer_nhomalt_nfe_seu=0;AC_eas=0;AN_eas=7828;AF_eas=0;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=1;non_neuro_AN_nfe_female=9114;non_neuro_AF_nfe_female=0.000109721;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=0;non_neuro_AN_afr=5206;non_neuro_AF_afr=0;non_neuro_nhomalt_afr=0;controls_AC_raw=0;controls_AN_raw=68088;controls_AF_raw=0;controls_nhomalt_raw=0;non_cancer_AC_eas=0;non_cancer_AN_eas=7814;non_cancer_AF_eas=0;non_cancer_nhomalt_eas=0;non_cancer_AC_amr_female=0;non_cancer_AN_amr_female=5648;non_cancer_AF_amr_female=0;non_cancer_nhomalt_amr_female=0;non_neuro_AC_nfe_swe=0;non_neuro_AN_nfe_swe=66;non_neuro_AF_nfe_swe=0;non_neuro_nhomalt_nfe_swe=0;controls_AC_male=0;controls_AN_male=11462;controls_AF_male=0;controls_nhomalt_male=0;non_topmed_AC_male=1;non_topmed_AN_male=28306;non_topmed_AF_male=3.53282e-05;non_topmed_nhomalt_male=0;controls_AC_eas_jpn=0;controls_AN_eas_jpn=12;controls_AF_eas_jpn=0;controls_nhomalt_eas_jpn=0;controls_AC_nfe_female=0;controls_AN_nfe_female=3742;controls_AF_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=7252;non_neuro_AF_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=3788;non_neuro_AF_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=694;AF_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=3332;controls_AF_nfe_male=0;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=1690;non_neuro_AF_fin=0;non_neuro_nhomalt_fin=0;non_topmed_AC_sas=0;non_topmed_AN_sas=6056;non_topmed_AF_sas=0;non_topmed_nhomalt_sas=0;non_cancer_AC_nfe_female=1;non_cancer_AN_nfe_female=10408;non_cancer_AF_nfe_female=9.60799e-05;non_cancer_nhomalt_nfe_female=0;AC_oth_female=0;AN_oth_female=962;AF_oth_female=0;nhomalt_oth_female=0;non_cancer_AC_asj=0;non_cancer_AN_asj=1506;non_cancer_AF_asj=0;non_cancer_nhomalt_asj=0;AC_nfe_swe=0;AN_nfe_swe=78;AF_nfe_swe=0;nhomalt_nfe_swe=0;controls_AC_nfe=0;controls_AN_nfe=7074;controls_AF_nfe=0;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=358;controls_AF_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=276;controls_AF_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=3272;non_neuro_AF_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=5804;controls_AF_nfe_nwe=0;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=2;AN_nfe_nwe=17428;AF_nfe_nwe=0.000114758;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=8;controls_AF_nfe_seu=0;controls_nhomalt_nfe_seu=0;controls_AC_sas_female=0;controls_AN_sas_female=1110;controls_AF_sas_female=0;controls_nhomalt_sas_female=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=3980;non_neuro_AF_amr_female=0;non_neuro_nhomalt_amr_female=0;non_cancer_AC_eas_jpn=0;non_cancer_AN_eas_jpn=12;non_cancer_AF_eas_jpn=0;non_cancer_nhomalt_eas_jpn=0;non_neuro_AC_nfe_onf=0;non_neuro_AN_nfe_onf=3664;non_neuro_AF_nfe_onf=0;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=4032;non_topmed_AF_eas_male=0;non_topmed_nhomalt_eas_male=0;AC_eas_jpn=0;AN_eas_jpn=12;AF_eas_jpn=0;nhomalt_eas_jpn=0;non_cancer_AC_afr_male=0;non_cancer_AN_afr_male=2224;non_cancer_AF_afr_male=0;non_cancer_nhomalt_afr_male=0;non_cancer_AC_afr=0;non_cancer_AN_afr=5168;non_cancer_AF_afr=0;non_cancer_nhomalt_afr=0;controls_AC_amr_female=0;controls_AN_amr_female=3004;controls_AF_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=1004;non_neuro_AF_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=1;AN_female=28482;AF_female=3.51099e-05;nhomalt_female=0;non_neuro_AC_nfe_bgr=0;non_neuro_AN_nfe_bgr=12;non_neuro_AF_nfe_bgr=0;non_neuro_nhomalt_nfe_bgr=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=744;non_neuro_AF_oth_male=0;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=66;non_topmed_AF_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=2;non_topmed_AN_nfe_nwe=16722;non_topmed_AF_nfe_nwe=0.000119603;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=4786;non_topmed_AF_amr_male=0;non_topmed_nhomalt_amr_male=0;non_cancer_AC_amr=0;non_cancer_AN_amr=10450;non_cancer_AF_amr=0;non_cancer_nhomalt_amr=0;non_topmed_AC_nfe_swe=0;non_topmed_AN_nfe_swe=78;non_topmed_AF_nfe_swe=0;non_topmed_nhomalt_nfe_swe=0;non_topmed_AC_nfe_onf=0;non_topmed_AN_nfe_onf=4310;non_topmed_AF_nfe_onf=0;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_kor=0;controls_AN_eas_kor=918;controls_AF_eas_kor=0;controls_nhomalt_eas_kor=0;non_topmed_AC_eas_oea=0;non_topmed_AN_eas_oea=5968;non_topmed_AF_eas_oea=0;non_topmed_nhomalt_eas_oea=0;controls_AC_eas_male=0;controls_AN_eas_male=1922;controls_AF_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=340;controls_AF_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=2;non_topmed_AN=56086;non_topmed_AF=3.56595e-05;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=1248;controls_AF_fin=0;controls_nhomalt_fin=0;AC_eas_kor=0;AN_eas_kor=1846;AF_eas_kor=0;nhomalt_eas_kor=0;non_neuro_AC_nfe=2;non_neuro_AN_nfe=17204;non_neuro_AF_nfe=0.000116252;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=686;non_neuro_AF_fin_female=0;non_neuro_nhomalt_fin_female=0;non_cancer_AC_nfe_male=1;non_cancer_AN_nfe_male=11238;non_cancer_AF_nfe_male=8.89838e-05;non_cancer_nhomalt_nfe_male=0;controls_AC_eas_oea=0;controls_AN_eas_oea=2942;controls_AF_eas_oea=0;controls_nhomalt_eas_oea=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=120;non_topmed_AF_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=1950;controls_AF_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=1528;non_topmed_AF_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=1234;controls_AF_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=2;non_neuro_AN=47470;non_neuro_AF=4.21319e-05;non_neuro_nhomalt=0;AC_eas_oea=0;AN_eas_oea=5970;AF_eas_oea=0;nhomalt_eas_oea=0;non_topmed_AC_nfe=2;non_topmed_AN_nfe=21324;non_topmed_AF_nfe=9.3791e-05;non_topmed_nhomalt_nfe=0;non_cancer_AC_oth=0;non_cancer_AN_oth=1952;non_cancer_AF_oth=0;non_cancer_nhomalt_oth=0;non_topmed_AC_raw=3;non_topmed_AN_raw=148374;non_topmed_AF_raw=2.02192e-05;non_topmed_nhomalt_raw=0;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=60;non_neuro_AF_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=990;non_topmed_AF_oth_male=0;non_topmed_nhomalt_oth_male=0;non_cancer_AC_oth_male=0;non_cancer_AN_oth_male=1008;non_cancer_AF_oth_male=0;non_cancer_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=70;AF_nfe_est=0;nhomalt_nfe_est=0;non_cancer_AC_afr_female=0;non_cancer_AN_afr_female=2944;non_cancer_AF_afr_female=0;non_cancer_nhomalt_afr_female=0;non_topmed_AC_afr_male=0;non_topmed_AN_afr_male=2182;non_topmed_AF_afr_male=0;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=4034;AF_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=3872;controls_AF_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=4034;non_neuro_AF_eas_male=0;non_neuro_nhomalt_eas_male=0;non_cancer_AC_nfe_nwe=2;non_cancer_AN_nfe_nwe=17224;non_cancer_AF_nfe_nwe=0.000116117;non_cancer_nhomalt_nfe_nwe=0;controls_AC_sas=0;controls_AN_sas=2804;controls_AF_sas=0;controls_nhomalt_sas=0;non_neuro_AC_sas_male=0;non_neuro_AN_sas_male=3666;non_neuro_AF_sas_male=0;non_neuro_nhomalt_sas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=376;non_neuro_AF_asj_male=0;non_neuro_nhomalt_asj_male=0;non_cancer_AC_nfe_bgr=0;non_cancer_AN_nfe_bgr=20;non_cancer_AF_nfe_bgr=0;non_cancer_nhomalt_nfe_bgr=0;controls_AC_oth=0;controls_AN_oth=698;controls_AF_oth=0;controls_nhomalt_oth=0;non_cancer_AC_eas_female=0;non_cancer_AN_eas_female=3782;non_cancer_AF_eas_female=0;non_cancer_nhomalt_eas_female=0;AC_nfe=2;AN_nfe=22326;AF_nfe=8.95817e-05;nhomalt_nfe=0;non_topmed_AC_female=1;non_topmed_AN_female=27780;non_topmed_AF_female=3.59971e-05;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=700;non_neuro_AF_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=3794;non_topmed_AF_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=2;non_neuro_AN_raw=125778;non_neuro_AF_raw=1.5901e-05;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=0;non_topmed_AN_eas=7826;non_topmed_AF_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=1076;non_topmed_AF_fin_male=0;non_topmed_nhomalt_fin_male=0;non_cancer_AC_asj_male=0;non_cancer_AN_asj_male=678;non_cancer_AF_asj_male=0;non_cancer_nhomalt_asj_male=0;AC_fin=0;AN_fin=1916;AF_fin=0;nhomalt_fin=0;AC_nfe_male=1;AN_nfe_male=11348;AF_nfe_male=8.81213e-05;nhomalt_nfe_male=0;non_topmed_AC_eas_kor=0;non_topmed_AN_eas_kor=1846;non_topmed_AF_eas_kor=0;non_topmed_nhomalt_eas_kor=0;controls_AC_amr_male=0;controls_AN_amr_male=2346;controls_AF_amr_male=0;controls_nhomalt_amr_male=0;non_neuro_AC_eas_oea=0;non_neuro_AN_eas_oea=5966;non_neuro_AF_eas_oea=0;non_neuro_nhomalt_eas_oea=0;AC_sas_female=0;AN_sas_female=2384;AF_sas_female=0;nhomalt_sas_female=0;controls_AC_afr_female=0;controls_AN_afr_female=1286;controls_AF_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=5350;controls_AF_amr=0;controls_nhomalt_amr=0;non_topmed_AC_eas_jpn=0;non_topmed_AN_eas_jpn=12;non_topmed_AF_eas_jpn=0;non_topmed_nhomalt_eas_jpn=0;AC_asj_female=0;AN_asj_female=864;AF_asj_female=0;nhomalt_asj_female=0;non_topmed_AC_nfe_bgr=0;non_topmed_AN_nfe_bgr=28;non_topmed_AF_nfe_bgr=0;non_topmed_nhomalt_nfe_bgr=0;non_cancer_AC_nfe_est=0;non_cancer_AN_nfe_est=62;non_cancer_AF_nfe_est=0;non_cancer_nhomalt_nfe_est=0;non_neuro_AC_eas=0;non_neuro_AN_eas=7822;non_neuro_AF_eas=0;non_neuro_nhomalt_eas=0;non_cancer_AC_nfe=2;non_cancer_AN_nfe=21646;non_cancer_AF_nfe=9.23958e-05;non_cancer_nhomalt_nfe=0;non_neuro_AC_male=1;non_neuro_AN_male=23404;non_neuro_AF_male=4.27277e-05;non_neuro_nhomalt_male=0;non_neuro_AC_sas_female=0;non_neuro_AN_sas_female=2384;non_neuro_AF_sas_female=0;non_neuro_nhomalt_sas_female=0;AC_asj=0;AN_asj=1558;AF_asj=0;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=22;controls_AF_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=848;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_cancer_AC_nfe_swe=0;non_cancer_AN_nfe_swe=18;non_cancer_AF_nfe_swe=0;non_cancer_nhomalt_nfe_swe=0;non_cancer_AC=2;non_cancer_AN=56504;non_cancer_AF=3.53957e-05;non_cancer_nhomalt=0;non_topmed_AC_oth=0;non_topmed_AN_oth=1930;non_topmed_AF_oth=0;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=838;non_topmed_AF_fin_female=0;non_topmed_nhomalt_fin_female=0;non_cancer_AC_fin_female=0;non_cancer_AN_fin_female=840;non_cancer_AF_fin_female=0;non_cancer_nhomalt_fin_female=0;AC_oth=0;AN_oth=1972;AF_oth=0;nhomalt_oth=0;non_neuro_AC_nfe_male=1;non_neuro_AN_nfe_male=8090;non_neuro_AF_nfe_male=0.000123609;non_neuro_nhomalt_nfe_male=0;controls_AC_female=0;controls_AN_female=12104;controls_AF_female=0;controls_nhomalt_female=0;non_cancer_AC_fin=0;non_cancer_AN_fin=1916;non_cancer_AF_fin=0;non_cancer_nhomalt_fin=0;non_topmed_AC_fin=0;non_topmed_AN_fin=1914;non_topmed_AF_fin=0;non_topmed_nhomalt_fin=0;non_cancer_AC_eas_oea=0;non_cancer_AN_eas_oea=5956;non_cancer_AF_eas_oea=0;non_cancer_nhomalt_eas_oea=0;non_topmed_AC_nfe_female=1;non_topmed_AN_nfe_female=10436;non_topmed_AF_nfe_female=9.58222e-05;non_topmed_nhomalt_nfe_female=0;non_cancer_AC_sas_male=0;non_cancer_AN_sas_male=3672;non_cancer_AF_sas_male=0;non_cancer_nhomalt_sas_male=0;controls_AC_asj_male=0;controls_AN_asj_male=152;controls_AF_asj_male=0;controls_nhomalt_asj_male=0;non_cancer_AC_raw=3;non_cancer_AN_raw=142294;non_cancer_AF_raw=2.10831e-05;non_cancer_nhomalt_raw=0;non_cancer_AC_eas_male=0;non_cancer_AN_eas_male=4032;non_cancer_AF_eas_male=0;non_cancer_nhomalt_eas_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=680;non_topmed_AF_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=0;non_neuro_AN_oth=1546;non_neuro_AF_oth=0;non_neuro_nhomalt_oth=0;AC_male=1;AN_male=28870;AF_male=3.4638e-05;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=530;controls_AF_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_nfe_bgr=0;controls_AN_nfe_bgr=0;controls_nhomalt_nfe_bgr=0;controls_AC_asj_female=0;controls_AN_asj_female=124;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=4802;AF_amr_male=0;nhomalt_amr_male=0;AC_amr_female=0;AN_amr_female=5658;AF_amr_female=0;nhomalt_amr_female=0;non_topmed_AC_sas_male=0;non_topmed_AN_sas_male=3672;non_topmed_AF_sas_male=0;non_topmed_nhomalt_sas_male=0;AC_oth_male=0;AN_oth_male=1010;AF_oth_male=0;nhomalt_oth_male=0;non_cancer_AC_sas=0;non_cancer_AN_sas=6052;non_cancer_AF_sas=0;non_cancer_nhomalt_sas=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=72;non_neuro_AF_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_cancer_AC_eas_kor=0;non_cancer_AN_eas_kor=1846;non_cancer_AF_eas_kor=0;non_cancer_nhomalt_eas_kor=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=2904;non_topmed_AF_afr_female=0;non_topmed_nhomalt_afr_female=0;controls_AC_sas_male=0;controls_AN_sas_male=1694;controls_AF_sas_male=0;controls_nhomalt_sas_male=0;non_topmed_AC_sas_female=0;non_topmed_AN_sas_female=2384;non_topmed_AF_sas_female=0;non_topmed_nhomalt_sas_female=0;non_topmed_AC_afr=0;non_topmed_AN_afr=5086;non_topmed_AF_afr=0;non_topmed_nhomalt_afr=0;controls_AC=0;controls_AN=23566;controls_AF=0;controls_nhomalt=0;non_neuro_AC_oth_female=0;non_neuro_AN_oth_female=802;non_neuro_AF_oth_female=0;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0;non_topmed_faf99_amr=0;faf95_afr=0;faf99_afr=0;faf95_sas=0;faf99_sas=0;controls_faf95_afr=0;controls_faf99_afr=0;faf95_amr=0;faf99_amr=0;non_neuro_faf95_sas=0;non_neuro_faf99_sas=0;faf95_eas=0;faf99_eas=0;faf95=5.78e-06;faf99=6.16e-06;non_neuro_faf95_afr=0;non_neuro_faf99_afr=0;non_cancer_faf95_eas=0;non_cancer_faf99_eas=0;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;non_topmed_faf95_sas=0;non_topmed_faf99_sas=0;controls_faf95_nfe=0;controls_faf99_nfe=0;non_cancer_faf95_afr=0;non_cancer_faf99_afr=0;non_cancer_faf95_amr=0;non_cancer_faf99_amr=0;non_topmed_faf95=5.91e-06;non_topmed_faf99=6.21e-06;non_neuro_faf95_nfe=2.021e-05;non_neuro_faf99_nfe=2.018e-05;non_neuro_faf95=6.98e-06;non_neuro_faf99=6.61e-06;non_topmed_faf95_nfe=1.651e-05;non_topmed_faf99_nfe=1.58e-05;controls_faf95_eas=0;controls_faf99_eas=0;controls_faf95_sas=0;controls_faf99_sas=0;faf95_nfe=1.581e-05;faf99_nfe=1.554e-05;non_topmed_faf95_eas=0;non_topmed_faf99_eas=0;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=0;non_neuro_faf99_eas=0;non_cancer_faf95_nfe=1.628e-05;non_cancer_faf99_nfe=1.571e-05;non_cancer_faf95=5.87e-06;non_cancer_faf99=6.2e-06;non_cancer_faf95_sas=0;non_cancer_faf99_sas=0;non_topmed_faf95_afr=0;non_topmed_faf99_afr=0;controls_faf95=0;controls_faf99=0;popmax=nfe;AC_popmax=2;AN_popmax=22326;AF_popmax=8.95817e-05;nhomalt_popmax=0;age_hist_het_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_het_n_smaller=0;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_topmed_popmax=nfe;non_topmed_AC_popmax=2;non_topmed_AN_popmax=21324;non_topmed_AF_popmax=9.3791e-05;non_topmed_nhomalt_popmax=0;non_neuro_popmax=nfe;non_neuro_AC_popmax=2;non_neuro_AN_popmax=17204;non_neuro_AF_popmax=0.000116252;non_neuro_nhomalt_popmax=0;non_cancer_popmax=nfe;non_cancer_AC_popmax=2;non_cancer_AN_popmax=21646;non_cancer_AF_popmax=9.23958e-05;non_cancer_nhomalt_popmax=0"); writer.WriteLine("22\t15528109\trs755148717\tT\tG\t137.61\tRF\tAC=7;AN=57430;AF=0.000121888;rf_tp_probability=0.0196743;FS=0;InbreedingCoeff=-0.0742;MQ=21.11;MQRankSum=-0.938;QD=0.39;ReadPosRankSum=0.505;SOR=0.108;BaseQRankSum=-1.754;ClippingRankSum=0.331;DP=1566925;VQSLOD=3.54;VQSR_culprit=QD;segdup;rf_negative_label;rf_label=FP;rf_train;variant_type=snv;allele_type=snv;n_alt_alleles=1;pab_max=1;gq_hist_alt_bin_freq=0|5|3|5|1|5|2|2|1|1|2|1|0|0|1|0|0|0|0|0;gq_hist_all_bin_freq=36157|22748|3213|2957|1501|619|1254|1305|872|2125|2649|1489|4471|1984|2560|1805|3116|433|2157|4076;dp_hist_alt_bin_freq=8|8|3|4|4|0|2|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=126796|6316|2994|4736|7636|6775|4792|2642|957|379|174|63|25|20|15|5|2|2|1|1;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|2|4|5|4|0|0|3|0|2|1|0|3|0|2|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=124;AF_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=940;controls_AF_afr_male=0;controls_nhomalt_afr_male=0;non_neuro_AC_eas_kor=0;non_neuro_AN_eas_kor=1846;non_neuro_AF_eas_kor=0;non_neuro_nhomalt_eas_kor=0;non_topmed_AC_amr=0;non_topmed_AN_amr=10454;non_topmed_AF_amr=0;non_topmed_nhomalt_amr=0;non_cancer_AC_asj_female=0;non_cancer_AN_asj_female=830;non_cancer_AF_asj_female=0;non_cancer_nhomalt_asj_female=0;AC_raw=29;AN_raw=156464;AF_raw=0.000185346;nhomalt_raw=2;AC_fin_female=0;AN_fin_female=862;AF_fin_female=0;nhomalt_fin_female=0;non_cancer_AC_oth_female=0;non_cancer_AN_oth_female=946;non_cancer_AF_oth_female=0;non_cancer_nhomalt_oth_female=0;AC_nfe_bgr=0;AN_nfe_bgr=28;AF_nfe_bgr=0;nhomalt_nfe_bgr=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=328;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;AC_sas_male=0;AN_sas_male=3676;AF_sas_male=0;nhomalt_sas_male=0;non_neuro_AC_afr_male=1;non_neuro_AN_afr_male=2182;non_neuro_AF_afr_male=0.000458295;non_neuro_nhomalt_afr_male=0;AC_afr_male=1;AN_afr_male=2198;AF_afr_male=0.000454959;nhomalt_afr_male=0;AC_afr=7;AN_afr=5128;AF_afr=0.00136505;nhomalt_afr=0;controls_AC_nfe_swe=0;controls_AN_nfe_swe=6;controls_AF_nfe_swe=0;controls_nhomalt_nfe_swe=0;non_neuro_AC_afr_female=6;non_neuro_AN_afr_female=2916;non_neuro_AF_afr_female=0.00205761;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=0;non_topmed_AN_amr_female=5658;non_topmed_AF_amr_female=0;non_topmed_nhomalt_amr_female=0;non_cancer_AC_female=6;non_cancer_AN_female=27772;non_cancer_AF_female=0.000216045;non_cancer_nhomalt_female=0;non_cancer_AC_nfe_onf=0;non_cancer_AN_nfe_onf=4272;non_cancer_AF_nfe_onf=0;non_cancer_nhomalt_nfe_onf=0;non_cancer_AC_male=1;non_cancer_AN_male=28776;non_cancer_AF_male=3.47512e-05;non_cancer_nhomalt_male=0;non_topmed_AC_oth_female=0;non_topmed_AN_oth_female=942;non_topmed_AF_oth_female=0;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=3796;AF_eas_female=0;nhomalt_eas_female=0;non_cancer_AC_sas_female=0;non_cancer_AN_sas_female=2380;non_cancer_AF_sas_female=0;non_cancer_nhomalt_sas_female=0;AC_afr_female=6;AN_afr_female=2930;AF_afr_female=0.00204778;nhomalt_afr_female=0;AC_sas=0;AN_sas=6060;AF_sas=0;nhomalt_sas=0;non_neuro_AC_female=6;non_neuro_AN_female=24070;non_neuro_AF_female=0.000249273;non_neuro_nhomalt_female=0;controls_AC_afr=3;controls_AN_afr=2194;controls_AF_afr=0.00136737;controls_nhomalt_afr=0;non_neuro_AC_eas_jpn=0;non_neuro_AN_eas_jpn=12;non_neuro_AF_eas_jpn=0;non_neuro_nhomalt_eas_jpn=0;AC_nfe_onf=0;AN_nfe_onf=4622;AF_nfe_onf=0;nhomalt_nfe_onf=0;non_cancer_AC_amr_male=0;non_cancer_AN_amr_male=4812;non_cancer_AF_amr_male=0;non_cancer_nhomalt_amr_male=0;controls_AC_fin_male=0;controls_AN_fin_male=748;controls_AF_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=0;non_neuro_AN_nfe_nwe=13348;non_neuro_AF_nfe_nwe=0;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=1106;AF_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=0;AN_nfe_female=11018;AF_nfe_female=0;nhomalt_nfe_female=0;AC_amr=0;AN_amr=10492;AF_amr=0;nhomalt_amr=0;non_topmed_AC_nfe_male=0;non_topmed_AN_nfe_male=10926;non_topmed_AF_nfe_male=0;non_topmed_nhomalt_nfe_male=0;non_neuro_AC_sas=0;non_neuro_AN_sas=6054;non_neuro_AF_sas=0;non_neuro_nhomalt_sas=0;non_cancer_AC_fin_male=0;non_cancer_AN_fin_male=1106;non_cancer_AF_fin_male=0;non_cancer_nhomalt_fin_male=0;non_cancer_AC_nfe_seu=0;non_cancer_AN_nfe_seu=60;non_cancer_AF_nfe_seu=0;non_cancer_nhomalt_nfe_seu=0;AC_eas=0;AN_eas=7834;AF_eas=0;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=0;non_neuro_AN_nfe_female=9140;non_neuro_AF_nfe_female=0;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=7;non_neuro_AN_afr=5098;non_neuro_AF_afr=0.00137309;non_neuro_nhomalt_afr=0;controls_AC_raw=14;controls_AN_raw=70130;controls_AF_raw=0.000199629;controls_nhomalt_raw=1;non_cancer_AC_eas=0;non_cancer_AN_eas=7820;non_cancer_AF_eas=0;non_cancer_nhomalt_eas=0;non_cancer_AC_amr_female=0;non_cancer_AN_amr_female=5670;non_cancer_AF_amr_female=0;non_cancer_nhomalt_amr_female=0;non_neuro_AC_nfe_swe=0;non_neuro_AN_nfe_swe=68;non_neuro_AF_nfe_swe=0;non_neuro_nhomalt_nfe_swe=0;controls_AC_male=0;controls_AN_male=11500;controls_AF_male=0;controls_nhomalt_male=0;non_topmed_AC_male=1;non_topmed_AN_male=28366;non_topmed_AF_male=3.52535e-05;non_topmed_nhomalt_male=0;controls_AC_eas_jpn=0;controls_AN_eas_jpn=12;controls_AF_eas_jpn=0;controls_nhomalt_eas_jpn=0;controls_AC_nfe_female=0;controls_AN_nfe_female=3742;controls_AF_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=7284;non_neuro_AF_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=3790;non_neuro_AF_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=698;AF_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=3336;controls_AF_nfe_male=0;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=1742;non_neuro_AF_fin=0;non_neuro_nhomalt_fin=0;non_topmed_AC_sas=0;non_topmed_AN_sas=6060;non_topmed_AF_sas=0;non_topmed_nhomalt_sas=0;non_cancer_AC_nfe_female=0;non_cancer_AN_nfe_female=10426;non_cancer_AF_nfe_female=0;non_cancer_nhomalt_nfe_female=0;AC_oth_female=0;AN_oth_female=964;AF_oth_female=0;nhomalt_oth_female=0;non_cancer_AC_asj=0;non_cancer_AN_asj=1512;non_cancer_AF_asj=0;non_cancer_nhomalt_asj=0;AC_nfe_swe=0;AN_nfe_swe=82;AF_nfe_swe=0;nhomalt_nfe_swe=0;controls_AC_nfe=0;controls_AN_nfe=7078;controls_AF_nfe=0;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=360;controls_AF_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=276;controls_AF_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=3284;non_neuro_AF_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=5808;controls_AF_nfe_nwe=0;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=0;AN_nfe_nwe=17476;AF_nfe_nwe=0;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=8;controls_AF_nfe_seu=0;controls_nhomalt_nfe_seu=0;controls_AC_sas_female=0;controls_AN_sas_female=1110;controls_AF_sas_female=0;controls_nhomalt_sas_female=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=4000;non_neuro_AF_amr_female=0;non_neuro_nhomalt_amr_female=0;non_cancer_AC_eas_jpn=0;non_cancer_AN_eas_jpn=12;non_cancer_AF_eas_jpn=0;non_cancer_nhomalt_eas_jpn=0;non_neuro_AC_nfe_onf=0;non_neuro_AN_nfe_onf=3678;non_neuro_AF_nfe_onf=0;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=4036;non_topmed_AF_eas_male=0;non_topmed_nhomalt_eas_male=0;AC_eas_jpn=0;AN_eas_jpn=12;AF_eas_jpn=0;nhomalt_eas_jpn=0;non_cancer_AC_afr_male=1;non_cancer_AN_afr_male=2186;non_cancer_AF_afr_male=0.000457457;non_cancer_nhomalt_afr_male=0;non_cancer_AC_afr=7;non_cancer_AN_afr=5060;non_cancer_AF_afr=0.0013834;non_cancer_nhomalt_afr=0;controls_AC_amr_female=0;controls_AN_amr_female=3012;controls_AF_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=1034;non_neuro_AF_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=6;AN_female=28502;AF_female=0.000210512;nhomalt_female=0;non_neuro_AC_nfe_bgr=0;non_neuro_AN_nfe_bgr=12;non_neuro_AF_nfe_bgr=0;non_neuro_nhomalt_nfe_bgr=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=748;non_neuro_AF_oth_male=0;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=66;non_topmed_AF_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=0;non_topmed_AN_nfe_nwe=16772;non_topmed_AF_nfe_nwe=0;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=4796;non_topmed_AF_amr_male=0;non_topmed_nhomalt_amr_male=0;non_cancer_AC_amr=0;non_cancer_AN_amr=10482;non_cancer_AF_amr=0;non_cancer_nhomalt_amr=0;non_topmed_AC_nfe_swe=0;non_topmed_AN_nfe_swe=82;non_topmed_AF_nfe_swe=0;non_topmed_nhomalt_nfe_swe=0;non_topmed_AC_nfe_onf=0;non_topmed_AN_nfe_onf=4330;non_topmed_AF_nfe_onf=0;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_kor=0;controls_AN_eas_kor=920;controls_AF_eas_kor=0;controls_nhomalt_eas_kor=0;non_topmed_AC_eas_oea=0;non_topmed_AN_eas_oea=5972;non_topmed_AF_eas_oea=0;non_topmed_nhomalt_eas_oea=0;controls_AC_eas_male=0;controls_AN_eas_male=1924;controls_AF_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=342;controls_AF_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=7;non_topmed_AN=56168;non_topmed_AF=0.000124626;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=1300;controls_AF_fin=0;controls_nhomalt_fin=0;AC_eas_kor=0;AN_eas_kor=1848;AF_eas_kor=0;nhomalt_eas_kor=0;non_neuro_AC_nfe=0;non_neuro_AN_nfe=17240;non_neuro_AF_nfe=0;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=708;non_neuro_AF_fin_female=0;non_neuro_nhomalt_fin_female=0;non_cancer_AC_nfe_male=0;non_cancer_AN_nfe_male=11268;non_cancer_AF_nfe_male=0;non_cancer_nhomalt_nfe_male=0;controls_AC_eas_oea=0;controls_AN_eas_oea=2942;controls_AF_eas_oea=0;controls_nhomalt_eas_oea=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=124;non_topmed_AF_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=1950;controls_AF_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=1536;non_topmed_AF_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=1234;controls_AF_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=7;non_neuro_AN=47502;non_neuro_AF=0.000147362;non_neuro_nhomalt=0;AC_eas_oea=0;AN_eas_oea=5974;AF_eas_oea=0;nhomalt_eas_oea=0;non_topmed_AC_nfe=0;non_topmed_AN_nfe=21402;non_topmed_AF_nfe=0;non_topmed_nhomalt_nfe=0;non_cancer_AC_oth=0;non_cancer_AN_oth=1956;non_cancer_AF_oth=0;non_cancer_nhomalt_oth=0;non_topmed_AC_raw=25;non_topmed_AN_raw=153024;non_topmed_AF_raw=0.000163373;non_topmed_nhomalt_raw=2;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=60;non_neuro_AF_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=996;non_topmed_AF_oth_male=0;non_topmed_nhomalt_oth_male=0;non_cancer_AC_oth_male=0;non_cancer_AN_oth_male=1010;non_cancer_AF_oth_male=0;non_cancer_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=70;AF_nfe_est=0;nhomalt_nfe_est=0;non_cancer_AC_afr_female=6;non_cancer_AN_afr_female=2874;non_cancer_AF_afr_female=0.00208768;non_cancer_nhomalt_afr_female=0;non_topmed_AC_afr_male=1;non_topmed_AN_afr_male=2146;non_topmed_AF_afr_male=0.000465983;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=4038;AF_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=3874;controls_AF_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=4038;non_neuro_AF_eas_male=0;non_neuro_nhomalt_eas_male=0;non_cancer_AC_nfe_nwe=0;non_cancer_AN_nfe_nwe=17260;non_cancer_AF_nfe_nwe=0;non_cancer_nhomalt_nfe_nwe=0;controls_AC_sas=0;controls_AN_sas=2808;controls_AF_sas=0;controls_nhomalt_sas=0;non_neuro_AC_sas_male=0;non_neuro_AN_sas_male=3670;non_neuro_AF_sas_male=0;non_neuro_nhomalt_sas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=376;non_neuro_AF_asj_male=0;non_neuro_nhomalt_asj_male=0;non_cancer_AC_nfe_bgr=0;non_cancer_AN_nfe_bgr=20;non_cancer_AF_nfe_bgr=0;non_cancer_nhomalt_nfe_bgr=0;controls_AC_oth=0;controls_AN_oth=702;controls_AF_oth=0;controls_nhomalt_oth=0;non_cancer_AC_eas_female=0;non_cancer_AN_eas_female=3784;non_cancer_AF_eas_female=0;non_cancer_nhomalt_eas_female=0;AC_nfe=0;AN_nfe=22402;AF_nfe=0;nhomalt_nfe=0;non_topmed_AC_female=6;non_topmed_AN_female=27802;non_topmed_AF_female=0.000215812;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=704;non_neuro_AF_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=3796;non_topmed_AF_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=28;non_neuro_AN_raw=129652;non_neuro_AF_raw=0.000215963;non_neuro_nhomalt_raw=2;non_topmed_AC_eas=0;non_topmed_AN_eas=7832;non_topmed_AF_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=1106;non_topmed_AF_fin_male=0;non_topmed_nhomalt_fin_male=0;non_cancer_AC_asj_male=0;non_cancer_AN_asj_male=682;non_cancer_AF_asj_male=0;non_cancer_nhomalt_asj_male=0;AC_fin=0;AN_fin=1968;AF_fin=0;nhomalt_fin=0;AC_nfe_male=0;AN_nfe_male=11384;AF_nfe_male=0;nhomalt_nfe_male=0;non_topmed_AC_eas_kor=0;non_topmed_AN_eas_kor=1848;non_topmed_AF_eas_kor=0;non_topmed_nhomalt_eas_kor=0;controls_AC_amr_male=0;controls_AN_amr_male=2360;controls_AF_amr_male=0;controls_nhomalt_amr_male=0;non_neuro_AC_eas_oea=0;non_neuro_AN_eas_oea=5970;non_neuro_AF_eas_oea=0;non_neuro_nhomalt_eas_oea=0;AC_sas_female=0;AN_sas_female=2384;AF_sas_female=0;nhomalt_sas_female=0;controls_AC_afr_female=3;controls_AN_afr_female=1254;controls_AF_afr_female=0.00239234;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=5372;controls_AF_amr=0;controls_nhomalt_amr=0;non_topmed_AC_eas_jpn=0;non_topmed_AN_eas_jpn=12;non_topmed_AF_eas_jpn=0;non_topmed_nhomalt_eas_jpn=0;AC_asj_female=0;AN_asj_female=868;AF_asj_female=0;nhomalt_asj_female=0;non_topmed_AC_nfe_bgr=0;non_topmed_AN_nfe_bgr=28;non_topmed_AF_nfe_bgr=0;non_topmed_nhomalt_nfe_bgr=0;non_cancer_AC_nfe_est=0;non_cancer_AN_nfe_est=62;non_cancer_AF_nfe_est=0;non_cancer_nhomalt_nfe_est=0;non_neuro_AC_eas=0;non_neuro_AN_eas=7828;non_neuro_AF_eas=0;non_neuro_nhomalt_eas=0;non_cancer_AC_nfe=0;non_cancer_AN_nfe=21694;non_cancer_AF_nfe=0;non_cancer_nhomalt_nfe=0;non_neuro_AC_male=1;non_neuro_AN_male=23432;non_neuro_AF_male=4.26767e-05;non_neuro_nhomalt_male=0;non_neuro_AC_sas_female=0;non_neuro_AN_sas_female=2384;non_neuro_AF_sas_female=0;non_neuro_nhomalt_sas_female=0;AC_asj=0;AN_asj=1566;AF_asj=0;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=22;controls_AF_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=852;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_cancer_AC_nfe_swe=0;non_cancer_AN_nfe_swe=20;non_cancer_AF_nfe_swe=0;non_cancer_nhomalt_nfe_swe=0;non_cancer_AC=7;non_cancer_AN=56548;non_cancer_AF=0.000123789;non_cancer_nhomalt=0;non_topmed_AC_oth=0;non_topmed_AN_oth=1938;non_topmed_AF_oth=0;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=860;non_topmed_AF_fin_female=0;non_topmed_nhomalt_fin_female=0;non_cancer_AC_fin_female=0;non_cancer_AN_fin_female=862;non_cancer_AF_fin_female=0;non_cancer_nhomalt_fin_female=0;AC_oth=0;AN_oth=1980;AF_oth=0;nhomalt_oth=0;non_neuro_AC_nfe_male=0;non_neuro_AN_nfe_male=8100;non_neuro_AF_nfe_male=0;non_neuro_nhomalt_nfe_male=0;controls_AC_female=3;controls_AN_female=12104;controls_AF_female=0.000247852;controls_nhomalt_female=0;non_cancer_AC_fin=0;non_cancer_AN_fin=1968;non_cancer_AF_fin=0;non_cancer_nhomalt_fin=0;non_topmed_AC_fin=0;non_topmed_AN_fin=1966;non_topmed_AF_fin=0;non_topmed_nhomalt_fin=0;non_cancer_AC_eas_oea=0;non_cancer_AN_eas_oea=5960;non_cancer_AF_eas_oea=0;non_cancer_nhomalt_eas_oea=0;non_topmed_AC_nfe_female=0;non_topmed_AN_nfe_female=10476;non_topmed_AF_nfe_female=0;non_topmed_nhomalt_nfe_female=0;non_cancer_AC_sas_male=0;non_cancer_AN_sas_male=3676;non_cancer_AF_sas_male=0;non_cancer_nhomalt_sas_male=0;controls_AC_asj_male=0;controls_AN_asj_male=152;controls_AF_asj_male=0;controls_nhomalt_asj_male=0;non_cancer_AC_raw=26;non_cancer_AN_raw=146718;non_cancer_AF_raw=0.000177211;non_cancer_nhomalt_raw=2;non_cancer_AC_eas_male=0;non_cancer_AN_eas_male=4036;non_cancer_AF_eas_male=0;non_cancer_nhomalt_eas_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=684;non_topmed_AF_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=0;non_neuro_AN_oth=1552;non_neuro_AF_oth=0;non_neuro_nhomalt_oth=0;AC_male=1;AN_male=28928;AF_male=3.45686e-05;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=552;controls_AF_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_nfe_bgr=0;controls_AN_nfe_bgr=0;controls_nhomalt_nfe_bgr=0;controls_AC_asj_female=0;controls_AN_asj_female=124;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=4812;AF_amr_male=0;nhomalt_amr_male=0;AC_amr_female=0;AN_amr_female=5680;AF_amr_female=0;nhomalt_amr_female=0;non_topmed_AC_sas_male=0;non_topmed_AN_sas_male=3676;non_topmed_AF_sas_male=0;non_topmed_nhomalt_sas_male=0;AC_oth_male=0;AN_oth_male=1016;AF_oth_male=0;nhomalt_oth_male=0;non_cancer_AC_sas=0;non_cancer_AN_sas=6056;non_cancer_AF_sas=0;non_cancer_nhomalt_sas=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=74;non_neuro_AF_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_cancer_AC_eas_kor=0;non_cancer_AN_eas_kor=1848;non_cancer_AF_eas_kor=0;non_cancer_nhomalt_eas_kor=0;non_topmed_AC_afr_female=6;non_topmed_AN_afr_female=2834;non_topmed_AF_afr_female=0.00211715;non_topmed_nhomalt_afr_female=0;controls_AC_sas_male=0;controls_AN_sas_male=1698;controls_AF_sas_male=0;controls_nhomalt_sas_male=0;non_topmed_AC_sas_female=0;non_topmed_AN_sas_female=2384;non_topmed_AF_sas_female=0;non_topmed_nhomalt_sas_female=0;non_topmed_AC_afr=7;non_topmed_AN_afr=4980;non_topmed_AF_afr=0.00140562;non_topmed_nhomalt_afr=0;controls_AC=3;controls_AN=23604;controls_AF=0.000127097;controls_nhomalt=0;non_neuro_AC_oth_female=0;non_neuro_AN_oth_female=804;non_neuro_AF_oth_female=0;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0;non_topmed_faf99_amr=0;faf95_afr=0.0006402;faf99_afr=0.00064042;faf95_sas=0;faf99_sas=0;controls_faf95_afr=0.00037237;controls_faf99_afr=0.00037266;faf95_amr=0;faf99_amr=0;non_neuro_faf95_sas=0;non_neuro_faf99_sas=0;faf95_eas=0;faf99_eas=0;faf95=5.65e-05;faf99=5.643e-05;non_neuro_faf95_afr=0.00064349;non_neuro_faf99_afr=0.00064347;non_cancer_faf95_eas=0;non_cancer_faf99_eas=0;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;non_topmed_faf95_sas=0;non_topmed_faf99_sas=0;controls_faf95_nfe=0;controls_faf99_nfe=0;non_cancer_faf95_afr=0.00064865;non_cancer_faf99_afr=0.00064914;non_cancer_faf95_amr=0;non_cancer_faf99_amr=0;non_topmed_faf95=5.763e-05;non_topmed_faf99=5.815e-05;non_neuro_faf95_nfe=0;non_neuro_faf99_nfe=0;non_neuro_faf95=6.903e-05;non_neuro_faf99=6.818e-05;non_topmed_faf95_nfe=0;non_topmed_faf99_nfe=0;controls_faf95_eas=0;controls_faf99_eas=0;controls_faf95_sas=0;controls_faf99_sas=0;faf95_nfe=0;faf99_nfe=0;non_topmed_faf95_eas=0;non_topmed_faf99_eas=0;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=0;non_neuro_faf99_eas=0;non_cancer_faf95_nfe=0;non_cancer_faf99_nfe=0;non_cancer_faf95=5.729e-05;non_cancer_faf99=5.793e-05;non_cancer_faf95_sas=0;non_cancer_faf99_sas=0;non_topmed_faf95_afr=0.00065872;non_topmed_faf99_afr=0.0006589;controls_faf95=3.464e-05;controls_faf99=3.431e-05;popmax=afr;AC_popmax=7;AN_popmax=5128;AF_popmax=0.00136505;nhomalt_popmax=0;age_hist_het_bin_freq=0|0|0|1|1|0|1|1|1|2;age_hist_het_n_smaller=0;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_topmed_popmax=afr;non_topmed_AC_popmax=7;non_topmed_AN_popmax=4980;non_topmed_AF_popmax=0.00140562;non_topmed_nhomalt_popmax=0;non_neuro_popmax=afr;non_neuro_AC_popmax=7;non_neuro_AN_popmax=5098;non_neuro_AF_popmax=0.00137309;non_neuro_nhomalt_popmax=0;non_cancer_popmax=afr;non_cancer_AC_popmax=7;non_cancer_AN_popmax=5060;non_cancer_AF_popmax=0.0013834;non_cancer_nhomalt_popmax=0;controls_popmax=afr;controls_AC_popmax=3;controls_AN_popmax=2194;controls_AF_popmax=0.00136737;controls_nhomalt_popmax=0"); writer.WriteLine("22\t15528127\trs1458314445\tGA\tA\t274.09\tRF\tAC=2;AN=60120;AF=3.32668e-05;rf_tp_probability=0.178222;FS=0;InbreedingCoeff=-0.0715;MQ=21.62;MQRankSum=1.31;QD=6.53;ReadPosRankSum=-0.261;SOR=0.237;BaseQRankSum=1.6;ClippingRankSum=0.933;DP=1659589;VQSLOD=-0.4257;VQSR_culprit=FS;segdup;rf_negative_label;rf_label=FP;rf_train;variant_type=indel;allele_type=del;n_alt_alleles=1;pab_max=0.189247;gq_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|2;gq_hist_all_bin_freq=36581|32253|5753|5266|2344|885|1349|1351|748|1737|2154|1284|4459|1376|2844|1696|3341|644|2633|6374;dp_hist_alt_bin_freq=0|0|0|0|2|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=121880|9604|3553|4752|7628|6874|5091|3164|1075|401|173|65|25|20|15|5|2|2|1|1;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|0|0|0|2|0|0|0|0|0|0|0|0|0|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=198;AF_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=980;controls_AF_afr_male=0;controls_nhomalt_afr_male=0;non_neuro_AC_eas_kor=0;non_neuro_AN_eas_kor=1852;non_neuro_AF_eas_kor=0;non_neuro_nhomalt_eas_kor=0;non_topmed_AC_amr=0;non_topmed_AN_amr=10508;non_topmed_AF_amr=0;non_topmed_nhomalt_amr=0;non_cancer_AC_asj_female=0;non_cancer_AN_asj_female=838;non_cancer_AF_asj_female=0;non_cancer_nhomalt_asj_female=0;AC_raw=2;AN_raw=182702;AF_raw=1.09468e-05;nhomalt_raw=0;AC_fin_female=0;AN_fin_female=1502;AF_fin_female=0;nhomalt_fin_female=0;non_cancer_AC_oth_female=0;non_cancer_AN_oth_female=950;non_cancer_AF_oth_female=0;non_cancer_nhomalt_oth_female=0;AC_nfe_bgr=0;AN_nfe_bgr=68;AF_nfe_bgr=0;nhomalt_nfe_bgr=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=340;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;AC_sas_male=0;AN_sas_male=3712;AF_sas_male=0;nhomalt_sas_male=0;non_neuro_AC_afr_male=0;non_neuro_AN_afr_male=2252;non_neuro_AF_afr_male=0;non_neuro_nhomalt_afr_male=0;AC_afr_male=0;AN_afr_male=2270;AF_afr_male=0;nhomalt_afr_male=0;AC_afr=0;AN_afr=5334;AF_afr=0;nhomalt_afr=0;controls_AC_nfe_swe=0;controls_AN_nfe_swe=8;controls_AF_nfe_swe=0;controls_nhomalt_nfe_swe=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=3050;non_neuro_AF_afr_female=0;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=0;non_topmed_AN_amr_female=5682;non_topmed_AF_amr_female=0;non_topmed_nhomalt_amr_female=0;non_cancer_AC_female=1;non_cancer_AN_female=28894;non_cancer_AF_female=3.46093e-05;non_cancer_nhomalt_female=0;non_cancer_AC_nfe_onf=0;non_cancer_AN_nfe_onf=4430;non_cancer_AF_nfe_onf=0;non_cancer_nhomalt_nfe_onf=0;non_cancer_AC_male=1;non_cancer_AN_male=29890;non_cancer_AF_male=3.3456e-05;non_cancer_nhomalt_male=0;non_topmed_AC_oth_female=0;non_topmed_AN_oth_female=960;non_topmed_AF_oth_female=0;non_topmed_nhomalt_oth_female=0;AC_eas_female=1;AN_eas_female=3838;AF_eas_female=0.000260552;nhomalt_eas_female=0;non_cancer_AC_sas_female=0;non_cancer_AN_sas_female=2396;non_cancer_AF_sas_female=0;non_cancer_nhomalt_sas_female=0;AC_afr_female=0;AN_afr_female=3064;AF_afr_female=0;nhomalt_afr_female=0;AC_sas=0;AN_sas=6114;AF_sas=0;nhomalt_sas=0;non_neuro_AC_female=1;non_neuro_AN_female=25264;non_neuro_AF_female=3.9582e-05;non_neuro_nhomalt_female=0;controls_AC_afr=0;controls_AN_afr=2304;controls_AF_afr=0;controls_nhomalt_afr=0;non_neuro_AC_eas_jpn=0;non_neuro_AN_eas_jpn=34;non_neuro_AF_eas_jpn=0;non_neuro_nhomalt_eas_jpn=0;AC_nfe_onf=0;AN_nfe_onf=4940;AF_nfe_onf=0;nhomalt_nfe_onf=0;non_cancer_AC_amr_male=0;non_cancer_AN_amr_male=4840;non_cancer_AF_amr_male=0;non_cancer_nhomalt_amr_male=0;controls_AC_fin_male=0;controls_AN_fin_male=1312;controls_AF_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=0;non_neuro_AN_nfe_nwe=13618;non_neuro_AF_nfe_nwe=0;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=1686;AF_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=0;AN_nfe_female=11558;AF_nfe_female=0;nhomalt_nfe_female=0;AC_amr=0;AN_amr=10546;AF_amr=0;nhomalt_amr=0;non_topmed_AC_nfe_male=0;non_topmed_AN_nfe_male=11396;non_topmed_AF_nfe_male=0;non_topmed_nhomalt_nfe_male=0;non_neuro_AC_sas=0;non_neuro_AN_sas=6108;non_neuro_AF_sas=0;non_neuro_nhomalt_sas=0;non_cancer_AC_fin_male=0;non_cancer_AN_fin_male=1686;non_cancer_AF_fin_male=0;non_cancer_nhomalt_fin_male=0;non_cancer_AC_nfe_seu=0;non_cancer_AN_nfe_seu=98;non_cancer_AF_nfe_seu=0;non_cancer_nhomalt_nfe_seu=0;AC_eas=2;AN_eas=7908;AF_eas=0.000252908;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=0;non_neuro_AN_nfe_female=9464;non_neuro_AF_nfe_female=0;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=0;non_neuro_AN_afr=5302;non_neuro_AF_afr=0;non_neuro_nhomalt_afr=0;controls_AC_raw=1;controls_AN_raw=81800;controls_AF_raw=1.22249e-05;controls_nhomalt_raw=0;non_cancer_AC_eas=2;non_cancer_AN_eas=7872;non_cancer_AF_eas=0.000254065;non_cancer_nhomalt_eas=0;non_cancer_AC_amr_female=0;non_cancer_AN_amr_female=5692;non_cancer_AF_amr_female=0;non_cancer_nhomalt_amr_female=0;non_neuro_AC_nfe_swe=0;non_neuro_AN_nfe_swe=82;non_neuro_AF_nfe_swe=0;non_neuro_nhomalt_nfe_swe=0;controls_AC_male=0;controls_AN_male=12202;controls_AF_male=0;controls_nhomalt_male=0;non_topmed_AC_male=1;non_topmed_AN_male=29614;non_topmed_AF_male=3.37678e-05;non_topmed_nhomalt_male=0;controls_AC_eas_jpn=0;controls_AN_eas_jpn=34;controls_AF_eas_jpn=0;controls_nhomalt_eas_jpn=0;controls_AC_nfe_female=0;controls_AN_nfe_female=3772;controls_AF_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=7334;non_neuro_AF_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=1;non_neuro_AN_eas_female=3832;non_neuro_AF_eas_female=0.00026096;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=714;AF_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=3362;controls_AF_nfe_male=0;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=2934;non_neuro_AF_fin=0;non_neuro_nhomalt_fin=0;non_topmed_AC_sas=0;non_topmed_AN_sas=6114;non_topmed_AF_sas=0;non_topmed_nhomalt_sas=0;non_cancer_AC_nfe_female=0;non_cancer_AN_nfe_female=10722;non_cancer_AF_nfe_female=0;non_cancer_nhomalt_nfe_female=0;AC_oth_female=0;AN_oth_female=982;AF_oth_female=0;nhomalt_oth_female=0;non_cancer_AC_asj=0;non_cancer_AN_asj=1530;non_cancer_AF_asj=0;non_cancer_nhomalt_asj=0;AC_nfe_swe=0;AN_nfe_swe=114;AF_nfe_swe=0;nhomalt_nfe_swe=0;controls_AC_nfe=0;controls_AN_nfe=7134;controls_AF_nfe=0;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=362;controls_AF_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=276;controls_AF_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=3312;non_neuro_AF_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=5834;controls_AF_nfe_nwe=0;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=0;AN_nfe_nwe=18022;AF_nfe_nwe=0;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=20;controls_AF_nfe_seu=0;controls_nhomalt_nfe_seu=0;controls_AC_sas_female=0;controls_AN_sas_female=1122;controls_AF_sas_female=0;controls_nhomalt_sas_female=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=4022;non_neuro_AF_amr_female=0;non_neuro_nhomalt_amr_female=0;non_cancer_AC_eas_jpn=0;non_cancer_AN_eas_jpn=34;non_cancer_AF_eas_jpn=0;non_cancer_nhomalt_eas_jpn=0;non_neuro_AC_nfe_onf=0;non_neuro_AN_nfe_onf=3866;non_neuro_AF_nfe_onf=0;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=1;non_topmed_AN_eas_male=4068;non_topmed_AF_eas_male=0.000245821;non_topmed_nhomalt_eas_male=0;AC_eas_jpn=0;AN_eas_jpn=34;AF_eas_jpn=0;nhomalt_eas_jpn=0;non_cancer_AC_afr_male=0;non_cancer_AN_afr_male=2254;non_cancer_AF_afr_male=0;non_cancer_nhomalt_afr_male=0;non_cancer_AC_afr=0;non_cancer_AN_afr=5234;non_cancer_AF_afr=0;non_cancer_nhomalt_afr=0;controls_AC_amr_female=0;controls_AN_amr_female=3018;controls_AF_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=1600;non_neuro_AF_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=1;AN_female=29938;AF_female=3.34024e-05;nhomalt_female=0;non_neuro_AC_nfe_bgr=0;non_neuro_AN_nfe_bgr=16;non_neuro_AF_nfe_bgr=0;non_neuro_nhomalt_nfe_bgr=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=758;non_neuro_AF_oth_male=0;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=72;non_topmed_AF_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=0;non_topmed_AN_nfe_nwe=17314;non_topmed_AF_nfe_nwe=0;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=4826;non_topmed_AF_amr_male=0;non_topmed_nhomalt_amr_male=0;non_cancer_AC_amr=0;non_cancer_AN_amr=10532;non_cancer_AF_amr=0;non_cancer_nhomalt_amr=0;non_topmed_AC_nfe_swe=0;non_topmed_AN_nfe_swe=114;non_topmed_AF_nfe_swe=0;non_topmed_nhomalt_nfe_swe=0;non_topmed_AC_nfe_onf=0;non_topmed_AN_nfe_onf=4648;non_topmed_AF_nfe_onf=0;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_kor=0;controls_AN_eas_kor=922;controls_AF_eas_kor=0;controls_nhomalt_eas_kor=0;non_topmed_AC_eas_oea=2;non_topmed_AN_eas_oea=6018;non_topmed_AF_eas_oea=0.000332336;non_topmed_nhomalt_eas_oea=0;controls_AC_eas_male=0;controls_AN_eas_male=1940;controls_AF_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=346;controls_AF_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=2;non_topmed_AN=58850;non_topmed_AF=3.39847e-05;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=2490;controls_AF_fin=0;controls_nhomalt_fin=0;AC_eas_kor=0;AN_eas_kor=1854;AF_eas_kor=0;nhomalt_eas_kor=0;non_neuro_AC_nfe=0;non_neuro_AN_nfe=17768;non_neuro_AF_nfe=0;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=1334;non_neuro_AF_fin_female=0;non_neuro_nhomalt_fin_female=0;non_cancer_AC_nfe_male=0;non_cancer_AN_nfe_male=11634;non_cancer_AF_nfe_male=0;non_cancer_nhomalt_nfe_male=0;controls_AC_eas_oea=1;controls_AN_eas_oea=2964;controls_AF_eas_oea=0.000337382;controls_nhomalt_eas_oea=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=198;non_topmed_AF_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=1;controls_AN_eas_female=1980;controls_AF_eas_female=0.000505051;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=1572;non_topmed_AF_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=1250;controls_AF_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=2;non_neuro_AN=49650;non_neuro_AF=4.0282e-05;non_neuro_nhomalt=0;AC_eas_oea=2;AN_eas_oea=6020;AF_eas_oea=0.000332226;nhomalt_eas_oea=0;non_topmed_AC_nfe=0;non_topmed_AN_nfe=22414;non_topmed_AF_nfe=0;non_topmed_nhomalt_nfe=0;non_cancer_AC_oth=0;non_cancer_AN_oth=1966;non_cancer_AF_oth=0;non_cancer_nhomalt_oth=0;non_topmed_AC_raw=2;non_topmed_AN_raw=178626;non_topmed_AF_raw=1.11966e-05;non_topmed_nhomalt_raw=0;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=64;non_neuro_AF_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=1008;non_topmed_AF_oth_male=0;non_topmed_nhomalt_oth_male=0;non_cancer_AC_oth_male=0;non_cancer_AN_oth_male=1016;non_cancer_AF_oth_male=0;non_cancer_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=76;AF_nfe_est=0;nhomalt_nfe_est=0;non_cancer_AC_afr_female=0;non_cancer_AN_afr_female=2980;non_cancer_AF_afr_female=0;non_cancer_nhomalt_afr_female=0;non_topmed_AC_afr_male=0;non_topmed_AN_afr_male=2218;non_topmed_AF_afr_male=0;non_topmed_nhomalt_afr_male=0;AC_eas_male=1;AN_eas_male=4070;AF_eas_male=0.0002457;nhomalt_eas_male=0;controls_AC_eas=1;controls_AN_eas=3920;controls_AF_eas=0.000255102;controls_nhomalt_eas=0;non_neuro_AC_eas_male=1;non_neuro_AN_eas_male=4070;non_neuro_AF_eas_male=0.0002457;non_neuro_nhomalt_eas_male=0;non_cancer_AC_nfe_nwe=0;non_cancer_AN_nfe_nwe=17670;non_cancer_AF_nfe_nwe=0;non_cancer_nhomalt_nfe_nwe=0;controls_AC_sas=0;controls_AN_sas=2854;controls_AF_sas=0;controls_nhomalt_sas=0;non_neuro_AC_sas_male=0;non_neuro_AN_sas_male=3706;non_neuro_AF_sas_male=0;non_neuro_nhomalt_sas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=384;non_neuro_AF_asj_male=0;non_neuro_nhomalt_asj_male=0;non_cancer_AC_nfe_bgr=0;non_cancer_AN_nfe_bgr=56;non_cancer_AF_nfe_bgr=0;non_cancer_nhomalt_nfe_bgr=0;controls_AC_oth=0;controls_AN_oth=708;controls_AF_oth=0;controls_nhomalt_oth=0;non_cancer_AC_eas_female=1;non_cancer_AN_eas_female=3816;non_cancer_AF_eas_female=0.000262055;non_cancer_nhomalt_eas_female=0;AC_nfe=0;AN_nfe=23418;AF_nfe=0;nhomalt_nfe=0;non_topmed_AC_female=1;non_topmed_AN_female=29236;non_topmed_AF_female=3.42044e-05;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=724;non_neuro_AF_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=1;non_topmed_AN_eas_female=3838;non_topmed_AF_eas_female=0.000260552;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=2;non_neuro_AN_raw=151340;non_neuro_AF_raw=1.32153e-05;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=2;non_topmed_AN_eas=7906;non_topmed_AF_eas=0.000252972;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=1686;non_topmed_AF_fin_male=0;non_topmed_nhomalt_fin_male=0;non_cancer_AC_asj_male=0;non_cancer_AN_asj_male=692;non_cancer_AF_asj_male=0;non_cancer_nhomalt_asj_male=0;AC_fin=0;AN_fin=3188;AF_fin=0;nhomalt_fin=0;AC_nfe_male=0;AN_nfe_male=11860;AF_nfe_male=0;nhomalt_nfe_male=0;non_topmed_AC_eas_kor=0;non_topmed_AN_eas_kor=1854;non_topmed_AF_eas_kor=0;non_topmed_nhomalt_eas_kor=0;controls_AC_amr_male=0;controls_AN_amr_male=2378;controls_AF_amr_male=0;controls_nhomalt_amr_male=0;non_neuro_AC_eas_oea=2;non_neuro_AN_eas_oea=6016;non_neuro_AF_eas_oea=0.000332447;non_neuro_nhomalt_eas_oea=0;AC_sas_female=0;AN_sas_female=2402;AF_sas_female=0;nhomalt_sas_female=0;controls_AC_afr_female=0;controls_AN_afr_female=1324;controls_AF_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=5396;controls_AF_amr=0;controls_nhomalt_amr=0;non_topmed_AC_eas_jpn=0;non_topmed_AN_eas_jpn=34;non_topmed_AF_eas_jpn=0;non_topmed_nhomalt_eas_jpn=0;AC_asj_female=0;AN_asj_female=888;AF_asj_female=0;nhomalt_asj_female=0;non_topmed_AC_nfe_bgr=0;non_topmed_AN_nfe_bgr=68;non_topmed_AF_nfe_bgr=0;non_topmed_nhomalt_nfe_bgr=0;non_cancer_AC_nfe_est=0;non_cancer_AN_nfe_est=64;non_cancer_AF_nfe_est=0;non_cancer_nhomalt_nfe_est=0;non_neuro_AC_eas=2;non_neuro_AN_eas=7902;non_neuro_AF_eas=0.0002531;non_neuro_nhomalt_eas=0;non_cancer_AC_nfe=0;non_cancer_AN_nfe=22356;non_cancer_AF_nfe=0;non_cancer_nhomalt_nfe=0;non_neuro_AC_male=1;non_neuro_AN_male=24386;non_neuro_AF_male=4.10071e-05;non_neuro_nhomalt_male=0;non_neuro_AC_sas_female=0;non_neuro_AN_sas_female=2402;non_neuro_AF_sas_female=0;non_neuro_nhomalt_sas_female=0;AC_asj=0;AN_asj=1602;AF_asj=0;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=22;controls_AF_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=872;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_cancer_AC_nfe_swe=0;non_cancer_AN_nfe_swe=38;non_cancer_AF_nfe_swe=0;non_cancer_nhomalt_nfe_swe=0;non_cancer_AC=2;non_cancer_AN=58784;non_cancer_AF=3.40229e-05;non_cancer_nhomalt=0;non_topmed_AC_oth=0;non_topmed_AN_oth=1968;non_topmed_AF_oth=0;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=1500;non_topmed_AF_fin_female=0;non_topmed_nhomalt_fin_female=0;non_cancer_AC_fin_female=0;non_cancer_AN_fin_female=1500;non_cancer_AF_fin_female=0;non_cancer_nhomalt_fin_female=0;AC_oth=0;AN_oth=2010;AF_oth=0;nhomalt_oth=0;non_neuro_AC_nfe_male=0;non_neuro_AN_nfe_male=8304;non_neuro_AF_nfe_male=0;non_neuro_nhomalt_nfe_male=0;controls_AC_female=1;controls_AN_female=12880;controls_AF_female=7.76398e-05;controls_nhomalt_female=0;non_cancer_AC_fin=0;non_cancer_AN_fin=3186;non_cancer_AF_fin=0;non_cancer_nhomalt_fin=0;non_topmed_AC_fin=0;non_topmed_AN_fin=3186;non_topmed_AF_fin=0;non_topmed_nhomalt_fin=0;non_cancer_AC_eas_oea=2;non_cancer_AN_eas_oea=5984;non_cancer_AF_eas_oea=0.000334225;non_cancer_nhomalt_eas_oea=0;non_topmed_AC_nfe_female=0;non_topmed_AN_nfe_female=11018;non_topmed_AF_nfe_female=0;non_topmed_nhomalt_nfe_female=0;non_cancer_AC_sas_male=0;non_cancer_AN_sas_male=3712;non_cancer_AF_sas_male=0;non_cancer_nhomalt_sas_male=0;controls_AC_asj_male=0;controls_AN_asj_male=152;controls_AF_asj_male=0;controls_nhomalt_asj_male=0;non_cancer_AC_raw=2;non_cancer_AN_raw=171340;non_cancer_AF_raw=1.16727e-05;non_cancer_nhomalt_raw=0;non_cancer_AC_eas_male=1;non_cancer_AN_eas_male=4056;non_cancer_AF_eas_male=0.000246548;non_cancer_nhomalt_eas_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=700;non_topmed_AF_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=0;non_neuro_AN_oth=1578;non_neuro_AF_oth=0;non_neuro_nhomalt_oth=0;AC_male=1;AN_male=30182;AF_male=3.31323e-05;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=1178;controls_AF_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_nfe_bgr=0;controls_AN_nfe_bgr=0;controls_nhomalt_nfe_bgr=0;controls_AC_asj_female=0;controls_AN_asj_female=124;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=4842;AF_amr_male=0;nhomalt_amr_male=0;AC_amr_female=0;AN_amr_female=5704;AF_amr_female=0;nhomalt_amr_female=0;non_topmed_AC_sas_male=0;non_topmed_AN_sas_male=3712;non_topmed_AF_sas_male=0;non_topmed_nhomalt_sas_male=0;AC_oth_male=0;AN_oth_male=1028;AF_oth_male=0;nhomalt_oth_male=0;non_cancer_AC_sas=0;non_cancer_AN_sas=6108;non_cancer_AF_sas=0;non_cancer_nhomalt_sas=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=122;non_neuro_AF_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_cancer_AC_eas_kor=0;non_cancer_AN_eas_kor=1854;non_cancer_AF_eas_kor=0;non_cancer_nhomalt_eas_kor=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=2964;non_topmed_AF_afr_female=0;non_topmed_nhomalt_afr_female=0;controls_AC_sas_male=0;controls_AN_sas_male=1732;controls_AF_sas_male=0;controls_nhomalt_sas_male=0;non_topmed_AC_sas_female=0;non_topmed_AN_sas_female=2402;non_topmed_AF_sas_female=0;non_topmed_nhomalt_sas_female=0;non_topmed_AC_afr=0;non_topmed_AN_afr=5182;non_topmed_AF_afr=0;non_topmed_nhomalt_afr=0;controls_AC=1;controls_AN=25082;controls_AF=3.98692e-05;controls_nhomalt=0;non_neuro_AC_oth_female=0;non_neuro_AN_oth_female=820;non_neuro_AF_oth_female=0;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0;non_topmed_faf99_amr=0;faf95_afr=0;faf99_afr=0;faf95_sas=0;faf99_sas=0;controls_faf95_afr=0;controls_faf99_afr=0;faf95_amr=0;faf99_amr=0;non_neuro_faf95_sas=0;non_neuro_faf99_sas=0;faf95_eas=4.461e-05;faf99_eas=4.452e-05;faf95=5.52e-06;faf99=5.07e-06;non_neuro_faf95_afr=0;non_neuro_faf99_afr=0;non_cancer_faf95_eas=4.479e-05;non_cancer_faf99_eas=4.459e-05;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;non_topmed_faf95_sas=0;non_topmed_faf99_sas=0;controls_faf95_nfe=0;controls_faf99_nfe=0;non_cancer_faf95_afr=0;non_cancer_faf99_afr=0;non_cancer_faf95_amr=0;non_cancer_faf99_amr=0;non_topmed_faf95=5.64e-06;non_topmed_faf99=5.11e-06;non_neuro_faf95_nfe=0;non_neuro_faf99_nfe=0;non_neuro_faf95=6.68e-06;non_neuro_faf99=6.5e-06;non_topmed_faf95_nfe=0;non_topmed_faf99_nfe=0;controls_faf95_eas=1.3e-05;controls_faf99_eas=1.3e-05;controls_faf95_sas=0;controls_faf99_sas=0;faf95_nfe=0;faf99_nfe=0;non_topmed_faf95_eas=4.462e-05;non_topmed_faf99_eas=4.452e-05;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=4.464e-05;non_neuro_faf99_eas=4.453e-05;non_cancer_faf95_nfe=0;non_cancer_faf99_nfe=0;non_cancer_faf95=5.64e-06;non_cancer_faf99=5.11e-06;non_cancer_faf95_sas=0;non_cancer_faf99_sas=0;non_topmed_faf95_afr=0;non_topmed_faf99_afr=0;controls_faf95=2e-06;controls_faf99=2e-06;popmax=eas;AC_popmax=2;AN_popmax=7908;AF_popmax=0.000252908;nhomalt_popmax=0;age_hist_het_bin_freq=0|1|0|0|1|0|0|0|0|0;age_hist_het_n_smaller=0;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_topmed_popmax=eas;non_topmed_AC_popmax=2;non_topmed_AN_popmax=7906;non_topmed_AF_popmax=0.000252972;non_topmed_nhomalt_popmax=0;non_neuro_popmax=eas;non_neuro_AC_popmax=2;non_neuro_AN_popmax=7902;non_neuro_AF_popmax=0.0002531;non_neuro_nhomalt_popmax=0;non_cancer_popmax=eas;non_cancer_AC_popmax=2;non_cancer_AN_popmax=7872;non_cancer_AF_popmax=0.000254065;non_cancer_nhomalt_popmax=0;controls_popmax=eas;controls_AC_popmax=1;controls_AN_popmax=3920;controls_AF_popmax=0.000255102;controls_nhomalt_popmax=0"); writer.WriteLine("22\t15528135\trs1260541090\tT\tC\t38.68\tAC0;RF\tAC=0;AN=60606;AF=0;rf_tp_probability=0.0116862;FS=0;InbreedingCoeff=-0.0683;MQ=21.65;MQRankSum=0.922;QD=0.99;ReadPosRankSum=0.198;SOR=0.126;BaseQRankSum=0.198;ClippingRankSum=1.04;DP=1695127;VQSLOD=2.04;VQSR_culprit=QD;segdup;rf_negative_label;rf_label=FP;rf_train;variant_type=snv;allele_type=snv;n_alt_alleles=1;pab_max=0.000113074;gq_hist_alt_bin_freq=0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0;gq_hist_all_bin_freq=35156|35322|7058|6597|2982|985|1608|1457|808|1793|2156|1278|4500|1271|3081|1464|3504|580|2731|6670;dp_hist_alt_bin_freq=0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=120746|10478|3527|4643|7607|6825|5314|3379|1100|402|174|65|25|20|15|5|2|2|1|1;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=202;AF_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=994;controls_AF_afr_male=0;controls_nhomalt_afr_male=0;non_neuro_AC_eas_kor=0;non_neuro_AN_eas_kor=1852;non_neuro_AF_eas_kor=0;non_neuro_nhomalt_eas_kor=0;non_topmed_AC_amr=0;non_topmed_AN_amr=10546;non_topmed_AF_amr=0;non_topmed_nhomalt_amr=0;non_cancer_AC_asj_female=0;non_cancer_AN_asj_female=838;non_cancer_AF_asj_female=0;non_cancer_nhomalt_asj_female=0;AC_raw=1;AN_raw=191364;AF_raw=5.22564e-06;nhomalt_raw=0;AC_fin_female=0;AN_fin_female=1632;AF_fin_female=0;nhomalt_fin_female=0;non_cancer_AC_oth_female=0;non_cancer_AN_oth_female=950;non_cancer_AF_oth_female=0;non_cancer_nhomalt_oth_female=0;AC_nfe_bgr=0;AN_nfe_bgr=66;AF_nfe_bgr=0;nhomalt_nfe_bgr=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=342;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;AC_sas_male=0;AN_sas_male=3722;AF_sas_male=0;nhomalt_sas_male=0;non_neuro_AC_afr_male=0;non_neuro_AN_afr_male=2270;non_neuro_AF_afr_male=0;non_neuro_nhomalt_afr_male=0;AC_afr_male=0;AN_afr_male=2288;AF_afr_male=0;nhomalt_afr_male=0;AC_afr=0;AN_afr=5362;AF_afr=0;nhomalt_afr=0;controls_AC_nfe_swe=0;controls_AN_nfe_swe=8;controls_AF_nfe_swe=0;controls_nhomalt_nfe_swe=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=3060;non_neuro_AF_afr_female=0;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=0;non_topmed_AN_amr_female=5704;non_topmed_AF_amr_female=0;non_topmed_nhomalt_amr_female=0;non_cancer_AC_female=0;non_cancer_AN_female=29084;non_cancer_AF_female=0;non_cancer_nhomalt_female=0;non_cancer_AC_nfe_onf=0;non_cancer_AN_nfe_onf=4460;non_cancer_AF_nfe_onf=0;non_cancer_nhomalt_nfe_onf=0;non_cancer_AC_male=0;non_cancer_AN_male=30116;non_cancer_AF_male=0;non_cancer_nhomalt_male=0;non_topmed_AC_oth_female=0;non_topmed_AN_oth_female=964;non_topmed_AF_oth_female=0;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=3842;AF_eas_female=0;nhomalt_eas_female=0;non_cancer_AC_sas_female=0;non_cancer_AN_sas_female=2398;non_cancer_AF_sas_female=0;non_cancer_nhomalt_sas_female=0;AC_afr_female=0;AN_afr_female=3074;AF_afr_female=0;nhomalt_afr_female=0;AC_sas=0;AN_sas=6126;AF_sas=0;nhomalt_sas=0;non_neuro_AC_female=0;non_neuro_AN_female=25486;non_neuro_AF_female=0;non_neuro_nhomalt_female=0;controls_AC_afr=0;controls_AN_afr=2324;controls_AF_afr=0;controls_nhomalt_afr=0;non_neuro_AC_eas_jpn=0;non_neuro_AN_eas_jpn=34;non_neuro_AF_eas_jpn=0;non_neuro_nhomalt_eas_jpn=0;AC_nfe_onf=0;AN_nfe_onf=5004;AF_nfe_onf=0;nhomalt_nfe_onf=0;non_cancer_AC_amr_male=0;non_cancer_AN_amr_male=4856;non_cancer_AF_amr_male=0;non_cancer_nhomalt_amr_male=0;controls_AC_fin_male=0;controls_AN_fin_male=1434;controls_AF_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=0;non_neuro_AN_nfe_nwe=13672;non_neuro_AF_nfe_nwe=0;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=1812;AF_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=0;AN_nfe_female=11620;AF_nfe_female=0;nhomalt_nfe_female=0;AC_amr=0;AN_amr=10584;AF_amr=0;nhomalt_amr=0;non_topmed_AC_nfe_male=0;non_topmed_AN_nfe_male=11470;non_topmed_AF_nfe_male=0;non_topmed_nhomalt_nfe_male=0;non_neuro_AC_sas=0;non_neuro_AN_sas=6120;non_neuro_AF_sas=0;non_neuro_nhomalt_sas=0;non_cancer_AC_fin_male=0;non_cancer_AN_fin_male=1812;non_cancer_AF_fin_male=0;non_cancer_nhomalt_fin_male=0;non_cancer_AC_nfe_seu=0;non_cancer_AN_nfe_seu=98;non_cancer_AF_nfe_seu=0;non_cancer_nhomalt_nfe_seu=0;AC_eas=0;AN_eas=7914;AF_eas=0;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=0;non_neuro_AN_nfe_female=9518;non_neuro_AF_nfe_female=0;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=0;non_neuro_AN_afr=5330;non_neuro_AF_afr=0;non_neuro_nhomalt_afr=0;controls_AC_raw=1;controls_AN_raw=85756;controls_AF_raw=1.1661e-05;controls_nhomalt_raw=0;non_cancer_AC_eas=0;non_cancer_AN_eas=7874;non_cancer_AF_eas=0;non_cancer_nhomalt_eas=0;non_cancer_AC_amr_female=0;non_cancer_AN_amr_female=5714;non_cancer_AF_amr_female=0;non_cancer_nhomalt_amr_female=0;non_neuro_AC_nfe_swe=0;non_neuro_AN_nfe_swe=80;non_neuro_AF_nfe_swe=0;non_neuro_nhomalt_nfe_swe=0;controls_AC_male=0;controls_AN_male=12354;controls_AF_male=0;controls_nhomalt_male=0;non_topmed_AC_male=0;non_topmed_AN_male=29860;non_topmed_AF_male=0;non_topmed_nhomalt_male=0;controls_AC_eas_jpn=0;controls_AN_eas_jpn=34;controls_AF_eas_jpn=0;controls_nhomalt_eas_jpn=0;controls_AC_nfe_female=0;controls_AN_nfe_female=3774;controls_AF_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=7370;non_neuro_AF_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=3836;non_neuro_AF_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=718;AF_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=3370;controls_AF_nfe_male=0;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=3180;non_neuro_AF_fin=0;non_neuro_nhomalt_fin=0;non_topmed_AC_sas=0;non_topmed_AN_sas=6126;non_topmed_AF_sas=0;non_topmed_nhomalt_sas=0;non_cancer_AC_nfe_female=0;non_cancer_AN_nfe_female=10750;non_cancer_AF_nfe_female=0;non_cancer_nhomalt_nfe_female=0;AC_oth_female=0;AN_oth_female=986;AF_oth_female=0;nhomalt_oth_female=0;non_cancer_AC_asj=0;non_cancer_AN_asj=1532;non_cancer_AF_asj=0;non_cancer_nhomalt_asj=0;AC_nfe_swe=0;AN_nfe_swe=110;AF_nfe_swe=0;nhomalt_nfe_swe=0;controls_AC_nfe=0;controls_AN_nfe=7144;controls_AF_nfe=0;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=362;controls_AF_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=276;controls_AF_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=3326;non_neuro_AF_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=5838;controls_AF_nfe_nwe=0;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=0;AN_nfe_nwe=18098;AF_nfe_nwe=0;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=20;controls_AF_nfe_seu=0;controls_nhomalt_nfe_seu=0;controls_AC_sas_female=0;controls_AN_sas_female=1124;controls_AF_sas_female=0;controls_nhomalt_sas_female=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=4044;non_neuro_AF_amr_female=0;non_neuro_nhomalt_amr_female=0;non_cancer_AC_eas_jpn=0;non_cancer_AN_eas_jpn=34;non_cancer_AF_eas_jpn=0;non_cancer_nhomalt_eas_jpn=0;non_neuro_AC_nfe_onf=0;non_neuro_AN_nfe_onf=3914;non_neuro_AF_nfe_onf=0;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=4070;non_topmed_AF_eas_male=0;non_topmed_nhomalt_eas_male=0;AC_eas_jpn=0;AN_eas_jpn=34;AF_eas_jpn=0;nhomalt_eas_jpn=0;non_cancer_AC_afr_male=0;non_cancer_AN_afr_male=2270;non_cancer_AF_afr_male=0;non_cancer_nhomalt_afr_male=0;non_cancer_AC_afr=0;non_cancer_AN_afr=5256;non_cancer_AF_afr=0;non_cancer_nhomalt_afr=0;controls_AC_amr_female=0;controls_AN_amr_female=3036;controls_AF_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=1722;non_neuro_AF_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=0;AN_female=30174;AF_female=0;nhomalt_female=0;non_neuro_AC_nfe_bgr=0;non_neuro_AN_nfe_bgr=16;non_neuro_AF_nfe_bgr=0;non_neuro_nhomalt_nfe_bgr=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=756;non_neuro_AF_oth_male=0;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=72;non_topmed_AF_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=0;non_topmed_AN_nfe_nwe=17390;non_topmed_AF_nfe_nwe=0;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=4842;non_topmed_AF_amr_male=0;non_topmed_nhomalt_amr_male=0;non_cancer_AC_amr=0;non_cancer_AN_amr=10570;non_cancer_AF_amr=0;non_cancer_nhomalt_amr=0;non_topmed_AC_nfe_swe=0;non_topmed_AN_nfe_swe=110;non_topmed_AF_nfe_swe=0;non_topmed_nhomalt_nfe_swe=0;non_topmed_AC_nfe_onf=0;non_topmed_AN_nfe_onf=4712;non_topmed_AF_nfe_onf=0;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_kor=0;controls_AN_eas_kor=924;controls_AF_eas_kor=0;controls_nhomalt_eas_kor=0;non_topmed_AC_eas_oea=0;non_topmed_AN_eas_oea=6024;non_topmed_AF_eas_oea=0;non_topmed_nhomalt_eas_oea=0;controls_AC_eas_male=0;controls_AN_eas_male=1940;controls_AF_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=344;controls_AF_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=0;non_topmed_AN=59334;non_topmed_AF=0;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=2736;controls_AF_fin=0;controls_nhomalt_fin=0;AC_eas_kor=0;AN_eas_kor=1854;AF_eas_kor=0;nhomalt_eas_kor=0;non_neuro_AC_nfe=0;non_neuro_AN_nfe=17872;non_neuro_AF_nfe=0;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=1458;non_neuro_AF_fin_female=0;non_neuro_nhomalt_fin_female=0;non_cancer_AC_nfe_male=0;non_cancer_AN_nfe_male=11692;non_cancer_AF_nfe_male=0;non_cancer_nhomalt_nfe_male=0;controls_AC_eas_oea=0;controls_AN_eas_oea=2964;controls_AF_eas_oea=0;controls_nhomalt_eas_oea=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=202;non_topmed_AF_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=1982;controls_AF_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=1578;non_topmed_AF_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=1256;controls_AF_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=0;non_neuro_AN=50088;non_neuro_AF=0;non_neuro_nhomalt=0;AC_eas_oea=0;AN_eas_oea=6026;AF_eas_oea=0;nhomalt_eas_oea=0;non_topmed_AC_nfe=0;non_topmed_AN_nfe=22552;non_topmed_AF_nfe=0;non_topmed_nhomalt_nfe=0;non_cancer_AC_oth=0;non_cancer_AN_oth=1964;non_cancer_AF_oth=0;non_cancer_nhomalt_oth=0;non_topmed_AC_raw=1;non_topmed_AN_raw=187054;non_topmed_AF_raw=5.34605e-06;non_topmed_nhomalt_raw=0;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=64;non_neuro_AF_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=1006;non_topmed_AF_oth_male=0;non_topmed_nhomalt_oth_male=0;non_cancer_AC_oth_male=0;non_cancer_AN_oth_male=1014;non_cancer_AF_oth_male=0;non_cancer_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=76;AF_nfe_est=0;nhomalt_nfe_est=0;non_cancer_AC_afr_female=0;non_cancer_AN_afr_female=2986;non_cancer_AF_afr_female=0;non_cancer_nhomalt_afr_female=0;non_topmed_AC_afr_male=0;non_topmed_AN_afr_male=2234;non_topmed_AF_afr_male=0;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=4072;AF_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=3922;controls_AF_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=4072;non_neuro_AF_eas_male=0;non_neuro_nhomalt_eas_male=0;non_cancer_AC_nfe_nwe=0;non_cancer_AN_nfe_nwe=17730;non_cancer_AF_nfe_nwe=0;non_cancer_nhomalt_nfe_nwe=0;controls_AC_sas=0;controls_AN_sas=2864;controls_AF_sas=0;controls_nhomalt_sas=0;non_neuro_AC_sas_male=0;non_neuro_AN_sas_male=3716;non_neuro_AF_sas_male=0;non_neuro_nhomalt_sas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=386;non_neuro_AF_asj_male=0;non_neuro_nhomalt_asj_male=0;non_cancer_AC_nfe_bgr=0;non_cancer_AN_nfe_bgr=54;non_cancer_AF_nfe_bgr=0;non_cancer_nhomalt_nfe_bgr=0;controls_AC_oth=0;controls_AN_oth=706;controls_AF_oth=0;controls_nhomalt_oth=0;non_cancer_AC_eas_female=0;non_cancer_AN_eas_female=3818;non_cancer_AF_eas_female=0;non_cancer_nhomalt_eas_female=0;AC_nfe=0;AN_nfe=23556;AF_nfe=0;nhomalt_nfe=0;non_topmed_AC_female=0;non_topmed_AN_female=29474;non_topmed_AF_female=0;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=728;non_neuro_AF_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=3842;non_topmed_AF_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=1;non_neuro_AN_raw=158316;non_neuro_AF_raw=6.31648e-06;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=0;non_topmed_AN_eas=7912;non_topmed_AF_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=1812;non_topmed_AF_fin_male=0;non_topmed_nhomalt_fin_male=0;non_cancer_AC_asj_male=0;non_cancer_AN_asj_male=694;non_cancer_AF_asj_male=0;non_cancer_nhomalt_asj_male=0;AC_fin=0;AN_fin=3444;AF_fin=0;nhomalt_fin=0;AC_nfe_male=0;AN_nfe_male=11936;AF_nfe_male=0;nhomalt_nfe_male=0;non_topmed_AC_eas_kor=0;non_topmed_AN_eas_kor=1854;non_topmed_AF_eas_kor=0;non_topmed_nhomalt_eas_kor=0;controls_AC_amr_male=0;controls_AN_amr_male=2380;controls_AF_amr_male=0;controls_nhomalt_amr_male=0;non_neuro_AC_eas_oea=0;non_neuro_AN_eas_oea=6022;non_neuro_AF_eas_oea=0;non_neuro_nhomalt_eas_oea=0;AC_sas_female=0;AN_sas_female=2404;AF_sas_female=0;nhomalt_sas_female=0;controls_AC_afr_female=0;controls_AN_afr_female=1330;controls_AF_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=5416;controls_AF_amr=0;controls_nhomalt_amr=0;non_topmed_AC_eas_jpn=0;non_topmed_AN_eas_jpn=34;non_topmed_AF_eas_jpn=0;non_topmed_nhomalt_eas_jpn=0;AC_asj_female=0;AN_asj_female=890;AF_asj_female=0;nhomalt_asj_female=0;non_topmed_AC_nfe_bgr=0;non_topmed_AN_nfe_bgr=66;non_topmed_AF_nfe_bgr=0;non_topmed_nhomalt_nfe_bgr=0;non_cancer_AC_nfe_est=0;non_cancer_AN_nfe_est=64;non_cancer_AF_nfe_est=0;non_cancer_nhomalt_nfe_est=0;non_neuro_AC_eas=0;non"); writer.Flush(); stream.Position = 0; return stream; } private static Stream GetChr22GenomeStream() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("chr22\t15528101\trs1340431007\tT\tC\t.\tAS_VQSR\tAC=2;AN=147274;AF=1.35801e-05;popmax=afr;faf95_popmax=8.52000e-06;AC-non_v2-XX=1;AN-non_v2-XX=58332;AF-non_v2-XX=1.71432e-05;nhomalt-non_v2-XX=0;AC-non_cancer-fin-XX=0;AN-non_cancer-fin-XX=2470;AF-non_cancer-fin-XX=0.00000;nhomalt-non_cancer-fin-XX=0;AC-non_neuro-nfe=0;AN-non_neuro-nfe=63168;AF-non_neuro-nfe=0.00000;nhomalt-non_neuro-nfe=0;AC-non_neuro-afr-XY=0;AN-non_neuro-afr-XY=12982;AF-non_neuro-afr-XY=0.00000;nhomalt-non_neuro-afr-XY=0;AC-non_neuro-nfe-XY=0;AN-non_neuro-nfe-XY=25918;AF-non_neuro-nfe-XY=0.00000;nhomalt-non_neuro-nfe-XY=0;AC-controls_and_biobanks-eas-XY=0;AN-controls_and_biobanks-eas-XY=1362;AF-controls_and_biobanks-eas-XY=0.00000;nhomalt-controls_and_biobanks-eas-XY=0;AC-non_neuro-sas-XX=0;AN-non_neuro-sas-XX=1126;AF-non_neuro-sas-XX=0.00000;nhomalt-non_neuro-sas-XX=0;AC-non_v2=1;AN-non_v2=110588;AF-non_v2=9.04257e-06;nhomalt-non_v2=0;AC-non_topmed-nfe-XX=0;AN-non_topmed-nfe-XX=9102;AF-non_topmed-nfe-XX=0.00000;nhomalt-non_topmed-nfe-XX=0;AC-non_v2-mid=0;AN-non_v2-mid=308;AF-non_v2-mid=0.00000;nhomalt-non_v2-mid=0;AC-non_topmed-sas=0;AN-non_topmed-sas=4616;AF-non_topmed-sas=0.00000;nhomalt-non_topmed-sas=0;AC-non_cancer-eas-XX=0;AN-non_cancer-eas-XX=2106;AF-non_cancer-eas-XX=0.00000;nhomalt-non_cancer-eas-XX=0;AC-amr-XY=0;AN-amr-XY=8204;AF-amr-XY=0.00000;nhomalt-amr-XY=0;AC-non_v2-nfe-XX=0;AN-non_v2-nfe-XX=31470;AF-non_v2-nfe-XX=0.00000;nhomalt-non_v2-nfe-XX=0;AC-controls_and_biobanks-XY=0;AN-controls_and_biobanks-XY=18696;AF-controls_and_biobanks-XY=0.00000;nhomalt-controls_and_biobanks-XY=0;AC-non_neuro-asj-XY=0;AN-non_neuro-asj-XY=1530;AF-non_neuro-asj-XY=0.00000;nhomalt-non_neuro-asj-XY=0;AC-oth=0;AN-oth=1992;AF-oth=0.00000;nhomalt-oth=0;AC-non_topmed-mid-XY=0;AN-non_topmed-mid-XY=130;AF-non_topmed-mid-XY=0.00000;nhomalt-non_topmed-mid-XY=0;AC-non_cancer-asj-XX=0;AN-non_cancer-asj-XX=1730;AF-non_cancer-asj-XX=0.00000;nhomalt-non_cancer-asj-XX=0;AC-sas-XY=0;AN-sas-XY=3518;AF-sas-XY=0.00000;nhomalt-sas-XY=0;AC-non_neuro-fin=0;AN-non_neuro-fin=6798;AF-non_neuro-fin=0.00000;nhomalt-non_neuro-fin=0;AC-non_topmed-amr-XY=0;AN-non_topmed-amr-XY=7318;AF-non_topmed-amr-XY=0.00000;nhomalt-non_topmed-amr-XY=0;AC-non_neuro-XX=1;AN-non_neuro-XX=68726;AF-non_neuro-XX=1.45505e-05;nhomalt-non_neuro-XX=0;AC-fin-XX=0;AN-fin-XX=2470;AF-fin-XX=0.00000;nhomalt-fin-XX=0;AC-controls_and_biobanks-asj-XX=0;AN-controls_and_biobanks-asj-XX=84;AF-controls_and_biobanks-asj-XX=0.00000;nhomalt-controls_and_biobanks-asj-XX=0;AC-non_v2-raw=1;AN-non_v2-raw=114266;AF-non_v2-raw=8.75151e-06;nhomalt-non_v2-raw=0;AC-non_v2-asj=0;AN-non_v2-asj=3036;AF-non_v2-asj=0.00000;nhomalt-non_v2-asj=0;AC-nfe-XX=0;AN-nfe-XX=38906;AF-nfe-XX=0.00000;nhomalt-nfe-XX=0;AC-controls_and_biobanks-raw=0;AN-controls_and_biobanks-raw=32822;AF-controls_and_biobanks-raw=0.00000;nhomalt-controls_and_biobanks-raw=0;AC-controls_and_biobanks-ami=0;AN-controls_and_biobanks-ami=58;AF-controls_and_biobanks-ami=0.00000;nhomalt-controls_and_biobanks-ami=0;AC-non_topmed-eas=0;AN-non_topmed-eas=3516;AF-non_topmed-eas=0.00000;nhomalt-non_topmed-eas=0;AC-non_v2-amr=0;AN-non_v2-amr=13232;AF-non_v2-amr=0.00000;nhomalt-non_v2-amr=0;AC-non_neuro-sas=0;AN-non_neuro-sas=4642;AF-non_neuro-sas=0.00000;nhomalt-non_neuro-sas=0;AC-non_cancer-fin-XY=0;AN-non_cancer-fin-XY=7820;AF-non_cancer-fin-XY=0.00000;nhomalt-non_cancer-fin-XY=0;AC-non_cancer-nfe-XY=0;AN-non_cancer-nfe-XY=26344;AF-non_cancer-nfe-XY=0.00000;nhomalt-non_cancer-nfe-XY=0;AC-non_v2-oth=0;AN-non_v2-oth=1762;AF-non_v2-oth=0.00000;nhomalt-non_v2-oth=0;AC-ami=0;AN-ami=898;AF-ami=0.00000;nhomalt-ami=0;AC-non_cancer-XY=1;AN-non_cancer-XY=69278;AF-non_cancer-XY=1.44346e-05;nhomalt-non_cancer-XY=0;AC-non_v2-sas=0;AN-non_v2-sas=3718;AF-non_v2-sas=0.00000;nhomalt-non_v2-sas=0;AC-non_topmed-afr-XX=0;AN-non_topmed-afr-XX=10838;AF-non_topmed-afr-XX=0.00000;nhomalt-non_topmed-afr-XX=0;AC-sas=0;AN-sas=4644;AF-sas=0.00000;nhomalt-sas=0;AC-non_neuro-nfe-XX=0;AN-non_neuro-nfe-XX=37250;AF-non_neuro-nfe-XX=0.00000;nhomalt-non_neuro-nfe-XX=0;AC-non_topmed-ami-XX=0;AN-non_topmed-ami-XX=64;AF-non_topmed-ami-XX=0.00000;nhomalt-non_topmed-ami-XX=0;AC-ami-XY=0;AN-ami-XY=438;AF-ami-XY=0.00000;nhomalt-ami-XY=0;AC-oth-XX=0;AN-oth-XX=986;AF-oth-XX=0.00000;nhomalt-oth-XX=0;AC-non_cancer-eas=0;AN-non_cancer-eas=4718;AF-non_cancer-eas=0.00000;nhomalt-non_cancer-eas=0;AC-non_topmed-XY=1;AN-non_topmed-XY=45584;AF-non_topmed-XY=2.19375e-05;nhomalt-non_topmed-XY=0;AC-non_v2-ami=0;AN-non_v2-ami=896;AF-non_v2-ami=0.00000;nhomalt-non_v2-ami=0;AC-non_neuro=1;AN-non_neuro=131044;AF-non_neuro=7.63102e-06;nhomalt-non_neuro=0;AC-amr-XX=0;AN-amr-XX=6532;AF-amr-XX=0.00000;nhomalt-amr-XX=0;AC-controls_and_biobanks-nfe-XY=0;AN-controls_and_biobanks-nfe-XY=3508;AF-controls_and_biobanks-nfe-XY=0.00000;nhomalt-controls_and_biobanks-nfe-XY=0;AC-controls_and_biobanks-eas=0;AN-controls_and_biobanks-eas=2378;AF-controls_and_biobanks-eas=0.00000;nhomalt-controls_and_biobanks-eas=0;AC-XX=1;AN-XX=75568;AF-XX=1.32331e-05;nhomalt-XX=0;AC-non_cancer-oth-XY=0;AN-non_cancer-oth-XY=962;AF-non_cancer-oth-XY=0.00000;nhomalt-non_cancer-oth-XY=0;AC-non_v2-XY=0;AN-non_v2-XY=52256;AF-non_v2-XY=0.00000;nhomalt-non_v2-XY=0;AC-non_topmed-amr-XX=0;AN-non_topmed-amr-XX=5084;AF-non_topmed-amr-XX=0.00000;nhomalt-non_topmed-amr-XX=0;AC-fin=0;AN-fin=10290;AF-fin=0.00000;nhomalt-fin=0;AC-controls_and_biobanks-nfe-XX=0;AN-controls_and_biobanks-nfe-XX=3210;AF-controls_and_biobanks-nfe-XX=0.00000;nhomalt-controls_and_biobanks-nfe-XX=0;AC-controls_and_biobanks-afr=0;AN-controls_and_biobanks-afr=8390;AF-controls_and_biobanks-afr=0.00000;nhomalt-controls_and_biobanks-afr=0;AC-asj-XX=0;AN-asj-XX=1850;AF-asj-XX=0.00000;nhomalt-asj-XX=0;AC-non_topmed-mid=0;AN-non_topmed-mid=270;AF-non_topmed-mid=0.00000;nhomalt-non_topmed-mid=0;AC-non_cancer-sas-XY=0;AN-non_cancer-sas-XY=3504;AF-non_cancer-sas-XY=0.00000;nhomalt-non_cancer-sas-XY=0;AC-sas-XX=0;AN-sas-XX=1126;AF-sas-XX=0.00000;nhomalt-sas-XX=0;AC-non_topmed=1;AN-non_topmed=76604;AF-non_topmed=1.30541e-05;nhomalt-non_topmed=0;AC-non_v2-oth-XX=0;AN-non_v2-oth-XX=886;AF-non_v2-oth-XX=0.00000;nhomalt-non_v2-oth-XX=0;AC-non_neuro-ami-XY=0;AN-non_neuro-ami-XY=428;AF-non_neuro-ami-XY=0.00000;nhomalt-non_neuro-ami-XY=0;AC-controls_and_biobanks-afr-XY=0;AN-controls_and_biobanks-afr-XY=4022;AF-controls_and_biobanks-afr-XY=0.00000;nhomalt-controls_and_biobanks-afr-XY=0;AC-controls_and_biobanks-amr-XX=0;AN-controls_and_biobanks-amr-XX=2404;AF-controls_and_biobanks-amr-XX=0.00000;nhomalt-controls_and_biobanks-amr-XX=0;AC-non_topmed-amr=0;AN-non_topmed-amr=12402;AF-non_topmed-amr=0.00000;nhomalt-non_topmed-amr=0;AC-controls_and_biobanks-sas-XX=0;AN-controls_and_biobanks-sas-XX=828;AF-controls_and_biobanks-sas-XX=0.00000;nhomalt-controls_and_biobanks-sas-XX=0;AC-controls_and_biobanks-amr=0;AN-controls_and_biobanks-amr=4556;AF-controls_and_biobanks-amr=0.00000;nhomalt-controls_and_biobanks-amr=0;AC-non_neuro-fin-XX=0;AN-non_neuro-fin-XX=644;AF-non_neuro-fin-XX=0.00000;nhomalt-non_neuro-fin-XX=0;AC-non_cancer-raw=2;AN-non_cancer-raw=147644;AF-non_cancer-raw=1.35461e-05;nhomalt-non_cancer-raw=0;AC-non_neuro-mid=0;AN-non_neuro-mid=308;AF-non_neuro-mid=0.00000;nhomalt-non_neuro-mid=0;AC-non_v2-asj-XY=0;AN-non_v2-asj-XY=1364;AF-non_v2-asj-XY=0.00000;nhomalt-non_v2-asj-XY=0;AC-non_v2-afr=1;AN-non_v2-afr=26636;AF-non_v2-afr=3.75432e-05;nhomalt-non_v2-afr=0;AC-non_neuro-fin-XY=0;AN-non_neuro-fin-XY=6154;AF-non_neuro-fin-XY=0.00000;nhomalt-non_neuro-fin-XY=0;AC-non_cancer-afr=2;AN-non_cancer-afr=38572;AF-non_cancer-afr=5.18511e-05;nhomalt-non_cancer-afr=0;AC-non_topmed-sas-XY=0;AN-non_topmed-sas-XY=3496;AF-non_topmed-sas-XY=0.00000;nhomalt-non_topmed-sas-XY=0;AC-mid-XY=0;AN-mid-XY=152;AF-mid-XY=0.00000;nhomalt-mid-XY=0;AC-non_v2-oth-XY=0;AN-non_v2-oth-XY=876;AF-non_v2-oth-XY=0.00000;nhomalt-non_v2-oth-XY=0;AC-controls_and_biobanks-fin=0;AN-controls_and_biobanks-fin=5356;AF-controls_and_biobanks-fin=0.00000;nhomalt-controls_and_biobanks-fin=0;AC-non_neuro-eas-XY=0;AN-non_neuro-eas-XY=2740;AF-non_neuro-eas-XY=0.00000;nhomalt-non_neuro-eas-XY=0;AC-non_topmed-eas-XX=0;AN-non_topmed-eas-XX=1344;AF-non_topmed-eas-XX=0.00000;nhomalt-non_topmed-eas-XX=0;AC-non_v2-afr-XX=1;AN-non_v2-afr-XX=14752;AF-non_v2-afr-XX=6.77874e-05;nhomalt-non_v2-afr-XX=0;AC-non_neuro-amr-XX=0;AN-non_neuro-amr-XX=6348;AF-non_neuro-amr-XX=0.00000;nhomalt-non_neuro-amr-XX=0;AC-non_cancer-ami=0;AN-non_cancer-ami=898;AF-non_cancer-ami=0.00000;nhomalt-non_cancer-ami=0;AC-XY=1;AN-XY=71706;AF-XY=1.39458e-05;nhomalt-XY=0;AC-non_topmed-asj-XX=0;AN-non_topmed-asj-XX=272;AF-non_topmed-asj-XX=0.00000;nhomalt-non_topmed-asj-XX=0;AC-non_topmed-eas-XY=0;AN-non_topmed-eas-XY=2172;AF-non_topmed-eas-XY=0.00000;nhomalt-non_topmed-eas-XY=0;AC-non_v2-eas-XY=0;AN-non_v2-eas-XY=1362;AF-non_v2-eas-XY=0.00000;nhomalt-non_v2-eas-XY=0;AC-eas=0;AN-eas=4944;AF-eas=0.00000;nhomalt-eas=0;AC-asj-XY=0;AN-asj-XY=1576;AF-asj-XY=0.00000;nhomalt-asj-XY=0;AC-non_v2-eas-XX=0;AN-non_v2-eas-XX=1258;AF-non_v2-eas-XX=0.00000;nhomalt-non_v2-eas-XX=0;AC-controls_and_biobanks-mid-XY=0;AN-controls_and_biobanks-mid-XY=112;AF-controls_and_biobanks-mid-XY=0.00000;nhomalt-controls_and_biobanks-mid-XY=0;AC-fin-XY=0;AN-fin-XY=7820;AF-fin-XY=0.00000;nhomalt-fin-XY=0;AC-non_topmed-nfe=0;AN-non_topmed-nfe=20552;AF-non_topmed-nfe=0.00000;nhomalt-non_topmed-nfe=0;AC-amr=0;AN-amr=14736;AF-amr=0.00000;nhomalt-amr=0;AC-non_neuro-ami=0;AN-non_neuro-ami=854;AF-non_neuro-ami=0.00000;nhomalt-non_neuro-ami=0;AC-non_cancer-nfe-XX=0;AN-non_cancer-nfe-XX=37672;AF-non_cancer-nfe-XX=0.00000;nhomalt-non_cancer-nfe-XX=0;AC-non_cancer-mid=0;AN-non_cancer-mid=304;AF-non_cancer-mid=0.00000;nhomalt-non_cancer-mid=0;AC-non_v2-mid-XY=0;AN-non_v2-mid-XY=146;AF-non_v2-mid-XY=0.00000;nhomalt-non_v2-mid-XY=0;AC-controls_and_biobanks-amr-XY=0;AN-controls_and_biobanks-amr-XY=2152;AF-controls_and_biobanks-amr-XY=0.00000;nhomalt-controls_and_biobanks-amr-XY=0;AC-non_cancer-ami-XY=0;AN-non_cancer-ami-XY=438;AF-non_cancer-ami-XY=0.00000;nhomalt-non_cancer-ami-XY=0;AC-non_neuro-asj-XX=0;AN-non_neuro-asj-XX=1814;AF-non_neuro-asj-XX=0.00000;nhomalt-non_neuro-asj-XX=0;AC-afr=2;AN-afr=38874;AF-afr=5.14483e-05;nhomalt-afr=0;AC-non_v2-sas-XX=0;AN-non_v2-sas-XX=758;AF-non_v2-sas-XX=0.00000;nhomalt-non_v2-sas-XX=0;AC-non_neuro-afr-XX=1;AN-non_neuro-afr-XX=17802;AF-non_neuro-afr-XX=5.61735e-05;nhomalt-non_neuro-afr-XX=0;AC-non_cancer-sas=0;AN-non_cancer-sas=4612;AF-non_cancer-sas=0.00000;nhomalt-non_cancer-sas=0;AC-non_topmed-fin=0;AN-non_topmed-fin=10198;AF-non_topmed-fin=0.00000;nhomalt-non_topmed-fin=0;AC-non_cancer-asj-XY=0;AN-non_cancer-asj-XY=1532;AF-non_cancer-asj-XY=0.00000;nhomalt-non_cancer-asj-XY=0;AC-non_cancer-mid-XY=0;AN-non_cancer-mid-XY=144;AF-non_cancer-mid-XY=0.00000;nhomalt-non_cancer-mid-XY=0;AC-raw=2;AN-raw=151834;AF-raw=1.31723e-05;nhomalt-raw=0;AC-non_topmed-XX=0;AN-non_topmed-XX=31020;AF-non_topmed-XX=0.00000;nhomalt-non_topmed-XX=0;AC-ami-XX=0;AN-ami-XX=460;AF-ami-XX=0.00000;nhomalt-ami-XX=0;AC-eas-XY=0;AN-eas-XY=2740;AF-eas-XY=0.00000;nhomalt-eas-XY=0;AC-controls_and_biobanks-mid=0;AN-controls_and_biobanks-mid=246;AF-controls_and_biobanks-mid=0.00000;nhomalt-controls_and_biobanks-mid=0;AC-non_v2-nfe-XY=0;AN-non_v2-nfe-XY=19880;AF-non_v2-nfe-XY=0.00000;nhomalt-non_v2-nfe-XY=0;AC-controls_and_biobanks-sas=0;AN-controls_and_biobanks-sas=3022;AF-controls_and_biobanks-sas=0.00000;nhomalt-controls_and_biobanks-sas=0;AC-non_v2-eas=0;AN-non_v2-eas=2620;AF-non_v2-eas=0.00000;nhomalt-non_v2-eas=0;AC-mid=0;AN-mid=314;AF-mid=0.00000;nhomalt-mid=0;AC-oth-XY=0;AN-oth-XY=1006;AF-oth-XY=0.00000;nhomalt-oth-XY=0;AC-non_cancer-nfe=0;AN-non_cancer-nfe=64016;AF-non_cancer-nfe=0.00000;nhomalt-non_cancer-nfe=0;AC-non_neuro-eas-XX=0;AN-non_neuro-eas-XX=2204;AF-non_neuro-eas-XX=0.00000;nhomalt-non_neuro-eas-XX=0;AC-non_neuro-sas-XY=0;AN-non_neuro-sas-XY=3516;AF-non_neuro-sas-XY=0.00000;nhomalt-non_neuro-sas-XY=0;AC-non_cancer-ami-XX=0;AN-non_cancer-ami-XX=460;AF-non_cancer-ami-XX=0.00000;nhomalt-non_cancer-ami-XX=0;AC-mid-XX=0;AN-mid-XX=162;AF-mid-XX=0.00000;nhomalt-mid-XX=0;AC-non_topmed-asj=0;AN-non_topmed-asj=978;AF-non_topmed-asj=0.00000;nhomalt-non_topmed-asj=0;AC-non_v2-asj-XX=0;AN-non_v2-asj-XX=1672;AF-non_v2-asj-XX=0.00000;nhomalt-non_v2-asj-XX=0;nhomalt=0;AC-non_v2-amr-XY=0;AN-non_v2-amr-XY=7436;AF-non_v2-amr-XY=0.00000;nhomalt-non_v2-amr-XY=0;AC-non_cancer-amr-XX=0;AN-non_cancer-amr-XX=6482;AF-non_cancer-amr-XX=0.00000;nhomalt-non_cancer-amr-XX=0;AC-controls_and_biobanks-afr-XX=0;AN-controls_and_biobanks-afr-XX=4368;AF-controls_and_biobanks-afr-XX=0.00000;nhomalt-controls_and_biobanks-afr-XX=0;AC-asj=0;AN-asj=3426;AF-asj=0.00000;nhomalt-asj=0;AC-non_topmed-asj-XY=0;AN-non_topmed-asj-XY=706;AF-non_topmed-asj-XY=0.00000;nhomalt-non_topmed-asj-XY=0;AC-non_v2-fin-XX=0;AN-non_v2-fin-XX=1118;AF-non_v2-fin-XX=0.00000;nhomalt-non_v2-fin-XX=0;AC-non_topmed-ami=0;AN-non_topmed-ami=104;AF-non_topmed-ami=0.00000;nhomalt-non_topmed-ami=0;AC-controls_and_biobanks-eas-XX=0;AN-controls_and_biobanks-eas-XX=1016;AF-controls_and_biobanks-eas-XX=0.00000;nhomalt-controls_and_biobanks-eas-XX=0;AC-controls_and_biobanks-fin-XX=0;AN-controls_and_biobanks-fin-XX=486;AF-controls_and_biobanks-fin-XX=0.00000;nhomalt-controls_and_biobanks-fin-XX=0;AC-non_topmed-raw=1;AN-non_topmed-raw=80440;AF-non_topmed-raw=1.24316e-05;nhomalt-non_topmed-raw=0;AC-non_cancer-eas-XY=0;AN-non_cancer-eas-XY=2612;AF-non_cancer-eas-XY=0.00000;nhomalt-non_cancer-eas-XY=0;AC-non_cancer=2;AN-non_cancer=143152;AF-non_cancer=1.39712e-05;nhomalt-non_cancer=0;AC-controls_and_biobanks-ami-XY=0;AN-controls_and_biobanks-ami-XY=28;AF-controls_and_biobanks-ami-XY=0.00000;nhomalt-controls_and_biobanks-ami-XY=0;AC-controls_and_biobanks-mid-XX=0;AN-controls_and_biobanks-mid-XX=134;AF-controls_and_biobanks-mid-XX=0.00000;nhomalt-controls_and_biobanks-mid-XX=0;AC-non_v2-afr-XY=0;AN-non_v2-afr-XY=11884;AF-non_v2-afr-XY=0.00000;nhomalt-non_v2-afr-XY=0;AC-non_v2-sas-XY=0;AN-non_v2-sas-XY=2960;AF-non_v2-sas-XY=0.00000;nhomalt-non_v2-sas-XY=0;AC-non_v2-fin=0;AN-non_v2-fin=7030;AF-non_v2-fin=0.00000;nhomalt-non_v2-fin=0;AC-non_neuro-oth=0;AN-non_neuro-oth=1902;AF-non_neuro-oth=0.00000;nhomalt-non_neuro-oth=0;AC-non_cancer-sas-XX=0;AN-non_cancer-sas-XX=1108;AF-non_cancer-sas-XX=0.00000;nhomalt-non_cancer-sas-XX=0;AC-non_neuro-asj=0;AN-non_neuro-asj=3344;AF-non_neuro-asj=0.00000;nhomalt-non_neuro-asj=0;AC-non_topmed-afr=1;AN-non_topmed-afr=22548;AF-non_topmed-afr=4.43498e-05;nhomalt-non_topmed-afr=0;AC-non_topmed-afr-XY=1;AN-non_topmed-afr-XY=11710;AF-non_topmed-afr-XY=8.53971e-05;nhomalt-non_topmed-afr-XY=0;AC-non_neuro-eas=0;AN-non_neuro-eas=4944;AF-non_neuro-eas=0.00000;nhomalt-non_neuro-eas=0;AC-afr-XX=1;AN-afr-XX=20872;AF-afr-XX=4.79111e-05;nhomalt-afr-XX=0;AC-non_neuro-mid-XY=0;AN-non_neuro-mid-XY=146;AF-non_neuro-mid-XY=0.00000;nhomalt-non_neuro-mid-XY=0;AC-non_topmed-fin-XX=0;AN-non_topmed-fin-XX=2412;AF-non_topmed-fin-XX=0.00000;nhomalt-non_topmed-fin-XX=0;AC-non_cancer-amr=0;AN-non_cancer-amr=14556;AF-non_cancer-amr=0.00000;nhomalt-non_cancer-amr=0;AC-non_v2-ami-XX=0;AN-non_v2-ami-XX=460;AF-non_v2-ami-XX=0.00000;nhomalt-non_v2-ami-XX=0;AC-afr-XY=1;AN-afr-XY=18002;AF-afr-XY=5.55494e-05;nhomalt-afr-XY=0;AC-non_v2-mid-XX=0;AN-non_v2-mid-XX=162;AF-non_v2-mid-XX=0.00000;nhomalt-non_v2-mid-XX=0;AC-non_topmed-fin-XY=0;AN-non_topmed-fin-XY=7786;AF-non_topmed-fin-XY=0.00000;nhomalt-non_topmed-fin-XY=0;AC-non_neuro-amr-XY=0;AN-non_neuro-amr-XY=7952;AF-non_neuro-amr-XY=0.00000;nhomalt-non_neuro-amr-XY=0;AC-non_topmed-mid-XX=0;AN-non_topmed-mid-XX=140;AF-non_topmed-mid-XX=0.00000;nhomalt-non_topmed-mid-XX=0;AC-controls_and_biobanks-asj-XY=0;AN-controls_and_biobanks-asj-XY=50;AF-controls_and_biobanks-asj-XY=0.00000;nhomalt-controls_and_biobanks-asj-XY=0;AC-non_v2-fin-XY=0;AN-non_v2-fin-XY=5912;AF-non_v2-fin-XY=0.00000;nhomalt-non_v2-fin-XY=0;AC-controls_and_biobanks-ami-XX=0;AN-controls_and_biobanks-ami-XX=30;AF-controls_and_biobanks-ami-XX=0.00000;nhomalt-controls_and_biobanks-ami-XX=0;AC-eas-XX=0;AN-eas-XX=2204;AF-eas-XX=0.00000;nhomalt-eas-XX=0;AC-non_cancer-amr-XY=0;AN-non_cancer-amr-XY=8074;AF-non_cancer-amr-XY=0.00000;nhomalt-non_cancer-amr-XY=0;AC-non_neuro-ami-XX=0;AN-non_neuro-ami-XX=426;AF-non_neuro-ami-XX=0.00000;nhomalt-non_neuro-ami-XX=0;AC-controls_and_biobanks=0;AN-controls_and_biobanks=31606;AF-controls_and_biobanks=0.00000;nhomalt-controls_and_biobanks=0;AC-controls_and_biobanks-oth=0;AN-controls_and_biobanks-oth=748;AF-controls_and_biobanks-oth=0.00000;nhomalt-controls_and_biobanks-oth=0;AC-nfe-XY=0;AN-nfe-XY=28250;AF-nfe-XY=0.00000;nhomalt-nfe-XY=0;AC-non_cancer-afr-XX=1;AN-non_cancer-afr-XX=20724;AF-non_cancer-afr-XX=4.82532e-05;nhomalt-non_cancer-afr-XX=0;AC-controls_and_biobanks-sas-XY=0;AN-controls_and_biobanks-sas-XY=2194;AF-controls_and_biobanks-sas-XY=0.00000;nhomalt-controls_and_biobanks-sas-XY=0;AC-non_cancer-oth=0;AN-non_cancer-oth=1924;AF-non_cancer-oth=0.00000;nhomalt-non_cancer-oth=0;AC-non_topmed-oth=0;AN-non_topmed-oth=1420;AF-non_topmed-oth=0.00000;nhomalt-non_topmed-oth=0;AC-non_topmed-nfe-XY=0;AN-non_topmed-nfe-XY=11450;AF-non_topmed-nfe-XY=0.00000;nhomalt-non_topmed-nfe-XY=0;AC-non_topmed-sas-XX=0;AN-non_topmed-sas-XX=1120;AF-non_topmed-sas-XX=0.00000;nhomalt-non_topmed-sas-XX=0;AC-non_v2-nfe=0;AN-non_v2-nfe=51350;AF-non_v2-nfe=0.00000;nhomalt-non_v2-nfe=0;AC-non_topmed-oth-XX=0;AN-non_topmed-oth-XX=644;AF-non_topmed-oth-XX=0.00000;nhomalt-non_topmed-oth-XX=0;AC-non_cancer-mid-XX=0;AN-non_cancer-mid-XX=160;AF-non_cancer-mid-XX=0.00000;nhomalt-non_cancer-mid-XX=0;AC-controls_and_biobanks-nfe=0;AN-controls_and_biobanks-nfe=6718;AF-controls_and_biobanks-nfe=0.00000;nhomalt-controls_and_biobanks-nfe=0;AC-controls_and_biobanks-oth-XY=0;AN-controls_and_biobanks-oth-XY=398;AF-controls_and_biobanks-oth-XY=0.00000;nhomalt-controls_and_biobanks-oth-XY=0;AC-controls_and_biobanks-fin-XY=0;AN-controls_and_biobanks-fin-XY=4870;AF-controls_and_biobanks-fin-XY=0.00000;nhomalt-controls_and_biobanks-fin-XY=0;AC-non_v2-amr-XX=0;AN-non_v2-amr-XX=5796;AF-non_v2-amr-XX=0.00000;nhomalt-non_v2-amr-XX=0;AC-non_cancer-asj=0;AN-non_cancer-asj=3262;AF-non_cancer-asj=0.00000;nhomalt-non_cancer-asj=0;AC-non_cancer-oth-XX=0;AN-non_cancer-oth-XX=962;AF-non_cancer-oth-XX=0.00000;nhomalt-non_cancer-oth-XX=0;AC-non_neuro-amr=0;AN-non_neuro-amr=14300;AF-non_neuro-amr=0.00000;nhomalt-non_neuro-amr=0;AC-non_cancer-XX=1;AN-non_cancer-XX=73874;AF-non_cancer-XX=1.35366e-05;nhomalt-non_cancer-XX=0;AC-non_v2-ami-XY=0;AN-non_v2-ami-XY=436;AF-non_v2-ami-XY=0.00000;nhomalt-non_v2-ami-XY=0;AC-non_neuro-raw=1;AN-non_neuro-raw=134494;AF-non_neuro-raw=7.43528e-06;nhomalt-non_neuro-raw=0;AC-non_neuro-afr=1;AN-non_neuro-afr=30784;AF-non_neuro-afr=3.24844e-05;nhomalt-non_neuro-afr=0;AC-non_topmed-ami-XY=0;AN-non_topmed-ami-XY=40;AF-non_topmed-ami-XY=0.00000;nhomalt-non_topmed-ami-XY=0;AC-non_neuro-oth-XY=0;AN-non_neuro-oth-XY=952;AF-non_neuro-oth-XY=0.00000;nhomalt-non_neuro-oth-XY=0;AC-non_neuro-oth-XX=0;AN-non_neuro-oth-XX=950;AF-non_neuro-oth-XX=0.00000;nhomalt-non_neuro-oth-XX=0;AC-controls_and_biobanks-XX=0;AN-controls_and_biobanks-XX=12910;AF-controls_and_biobanks-XX=0.00000;nhomalt-controls_and_biobanks-XX=0;AC-non_cancer-afr-XY=1;AN-non_cancer-afr-XY=17848;AF-non_cancer-afr-XY=5.60287e-05;nhomalt-non_cancer-afr-XY=0;AC-non_cancer-fin=0;AN-non_cancer-fin=10290;AF-non_cancer-fin=0.00000;nhomalt-non_cancer-fin=0;AC-controls_and_biobanks-asj=0;AN-controls_and_biobanks-asj=134;AF-controls_and_biobanks-asj=0.00000;nhomalt-controls_and_biobanks-asj=0;AC-non_topmed-oth-XY=0;AN-non_topmed-oth-XY=776;AF-non_topmed-oth-XY=0.00000;nhomalt-non_topmed-oth-XY=0;AC-non_neuro-mid-XX=0;AN-non_neuro-mid-XX=162;AF-non_neuro-mid-XX=0.00000;nhomalt-non_neuro-mid-XX=0;AC-controls_and_biobanks-oth-XX=0;AN-controls_and_biobanks-oth-XX=350;AF-controls_and_biobanks-oth-XX=0.00000;nhomalt-controls_and_biobanks-oth-XX=0;AC-non_neuro-XY=0;AN-non_neuro-XY=62318;AF-non_neuro-XY=0.00000;nhomalt-non_neuro-XY=0;AC-nfe=0;AN-nfe=67156;AF-nfe=0.00000;nhomalt-nfe=0;AC_popmax=2;AN_popmax=38874;AF_popmax=5.14483e-05;nhomalt_popmax=0;faf95-sas=0.00000;faf99-sas=0.00000;faf95-eas=0.00000;faf99-eas=0.00000;faf95-amr=0.00000;faf99-amr=0.00000;faf95-afr=8.52000e-06;faf99-afr=3.19000e-06;faf95=2.26000e-06;faf99=8.50000e-07;faf95-nfe=0.00000;faf99-nfe=0.00000;age_hist_het_bin_freq=0|0|0|0|0|0|1|0|0|0;age_hist_het_n_smaller=1;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;FS=.;MQ=30.0616;MQRankSum=0.369000;QD=8.54054;ReadPosRankSum=0.791000;VarDP=37;QUALapprox=316;AS_FS=.;AS_MQ=30.0616;AS_MQRankSum=0.369000;AS_pab_max=0.823803;AS_QD=8.54054;AS_ReadPosRankSum=0.791000;AS_SOR=0.329753;InbreedingCoeff=-1.32322e-05;AS_VQSLOD=-3.36890;AS_culprit=AS_MQ;allele_type=snv;n_alt_alleles=1;variant_type=snv;segdup;gq_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|2;gq_hist_all_bin_freq=0|0|0|0|40527|14808|9280|4429|2130|1202|599|208|149|95|57|53|35|17|12|36;dp_hist_alt_bin_freq=0|0|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=0|0|7908|30432|19007|11405|4364|412|62|15|14|4|6|4|2|1|0|0|1|0;dp_hist_all_n_smaller=0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0;cadd_raw_score=0.738478;cadd_phred=8.80700;vep=C|upstream_gene_variant|MODIFIER|OR11H1|ENSG00000130538|Transcript|ENST00000252835|protein_coding||||||||||1|58|1|SNV||HGNC|HGNC:15404|YES||P2|CCDS74807.1|ENSP00000252835||||||||||||||||,C|upstream_gene_variant|MODIFIER|OR11H1|ENSG00000130538|Transcript|ENST00000643195|protein_coding||||||||||1|91|1|SNV||HGNC|HGNC:15404|||A2||ENSP00000495403||||||||||||||||,C|upstream_gene_variant|MODIFIER|OR11H1|81061|Transcript|NM_001005239.1|protein_coding||||||||||1|58|1|SNV||EntrezGene|HGNC:15404|YES||||NP_001005239.1||||||||||||||||"); writer.WriteLine("chr22\t15528106\trs1285219414\tT\tC\t.\tAS_VQSR\tAC=2;AN=148296;AF=1.34865e-05;popmax=sas;faf95_popmax=0.00000;AC-non_v2-XX=1;AN-non_v2-XX=58720;AF-non_v2-XX=1.70300e-05;nhomalt-non_v2-XX=0;AC-non_cancer-fin-XX=0;AN-non_cancer-fin-XX=2506;AF-non_cancer-fin-XX=0.00000;nhomalt-non_cancer-fin-XX=0;AC-non_neuro-nfe=0;AN-non_neuro-nfe=63364;AF-non_neuro-nfe=0.00000;nhomalt-non_neuro-nfe=0;AC-non_neuro-afr-XY=0;AN-non_neuro-afr-XY=13138;AF-non_neuro-afr-XY=0.00000;nhomalt-non_neuro-afr-XY=0;AC-non_neuro-nfe-XY=0;AN-non_neuro-nfe-XY=26012;AF-non_neuro-nfe-XY=0.00000;nhomalt-non_neuro-nfe-XY=0;AC-controls_and_biobanks-eas-XY=0;AN-controls_and_biobanks-eas-XY=1376;AF-controls_and_biobanks-eas-XY=0.00000;nhomalt-controls_and_biobanks-eas-XY=0;AC-non_neuro-sas-XX=1;AN-non_neuro-sas-XX=1132;AF-non_neuro-sas-XX=0.000883392;nhomalt-non_neuro-sas-XX=0;AC-non_v2=1;AN-non_v2=111410;AF-non_v2=8.97585e-06;nhomalt-non_v2=0;AC-non_topmed-nfe-XX=0;AN-non_topmed-nfe-XX=9164;AF-non_topmed-nfe-XX=0.00000;nhomalt-non_topmed-nfe-XX=0;AC-non_v2-mid=0;AN-non_v2-mid=308;AF-non_v2-mid=0.00000;nhomalt-non_v2-mid=0;AC-non_topmed-sas=1;AN-non_topmed-sas=4650;AF-non_topmed-sas=0.000215054;nhomalt-non_topmed-sas=0;AC-non_cancer-eas-XX=0;AN-non_cancer-eas-XX=2120;AF-non_cancer-eas-XX=0.00000;nhomalt-non_cancer-eas-XX=0;AC-amr-XY=0;AN-amr-XY=8258;AF-amr-XY=0.00000;nhomalt-amr-XY=0;AC-non_v2-nfe-XX=0;AN-non_v2-nfe-XX=31546;AF-non_v2-nfe-XX=0.00000;nhomalt-non_v2-nfe-XX=0;AC-controls_and_biobanks-XY=0;AN-controls_and_biobanks-XY=18868;AF-controls_and_biobanks-XY=0.00000;nhomalt-controls_and_biobanks-XY=0;AC-non_neuro-asj-XY=0;AN-non_neuro-asj-XY=1536;AF-non_neuro-asj-XY=0.00000;nhomalt-non_neuro-asj-XY=0;AC-oth=0;AN-oth=2008;AF-oth=0.00000;nhomalt-oth=0;AC-non_topmed-mid-XY=0;AN-non_topmed-mid-XY=130;AF-non_topmed-mid-XY=0.00000;nhomalt-non_topmed-mid-XY=0;AC-non_cancer-asj-XX=0;AN-non_cancer-asj-XX=1730;AF-non_cancer-asj-XX=0.00000;nhomalt-non_cancer-asj-XX=0;AC-sas-XY=0;AN-sas-XY=3546;AF-sas-XY=0.00000;nhomalt-sas-XY=0;AC-non_neuro-fin=0;AN-non_neuro-fin=6858;AF-non_neuro-fin=0.00000;nhomalt-non_neuro-fin=0;AC-non_topmed-amr-XY=0;AN-non_topmed-amr-XY=7364;AF-non_topmed-amr-XY=0.00000;nhomalt-non_topmed-amr-XY=0;AC-non_neuro-XX=1;AN-non_neuro-XX=69114;AF-non_neuro-XX=1.44688e-05;nhomalt-non_neuro-XX=0;AC-fin-XX=0;AN-fin-XX=2506;AF-fin-XX=0.00000;nhomalt-fin-XX=0;AC-controls_and_biobanks-asj-XX=0;AN-controls_and_biobanks-asj-XX=84;AF-controls_and_biobanks-asj-XX=0.00000;nhomalt-controls_and_biobanks-asj-XX=0;AC-non_v2-raw=1;AN-non_v2-raw=114440;AF-non_v2-raw=8.73820e-06;nhomalt-non_v2-raw=0;AC-non_v2-asj=0;AN-non_v2-asj=3044;AF-non_v2-asj=0.00000;nhomalt-non_v2-asj=0;AC-nfe-XX=0;AN-nfe-XX=39022;AF-nfe-XX=0.00000;nhomalt-nfe-XX=0;AC-controls_and_biobanks-raw=0;AN-controls_and_biobanks-raw=32864;AF-controls_and_biobanks-raw=0.00000;nhomalt-controls_and_biobanks-raw=0;AC-controls_and_biobanks-ami=0;AN-controls_and_biobanks-ami=58;AF-controls_and_biobanks-ami=0.00000;nhomalt-controls_and_biobanks-ami=0;AC-non_topmed-eas=0;AN-non_topmed-eas=3540;AF-non_topmed-eas=0.00000;nhomalt-non_topmed-eas=0;AC-non_v2-amr=0;AN-non_v2-amr=13332;AF-non_v2-amr=0.00000;nhomalt-non_v2-amr=0;AC-non_neuro-sas=1;AN-non_neuro-sas=4676;AF-non_neuro-sas=0.000213858;nhomalt-non_neuro-sas=0;AC-non_cancer-fin-XY=0;AN-non_cancer-fin-XY=7900;AF-non_cancer-fin-XY=0.00000;nhomalt-non_cancer-fin-XY=0;AC-non_cancer-nfe-XY=0;AN-non_cancer-nfe-XY=26438;AF-non_cancer-nfe-XY=0.00000;nhomalt-non_cancer-nfe-XY=0;AC-non_v2-oth=0;AN-non_v2-oth=1780;AF-non_v2-oth=0.00000;nhomalt-non_v2-oth=0;AC-ami=0;AN-ami=902;AF-ami=0.00000;nhomalt-ami=0;AC-non_cancer-XY=0;AN-non_cancer-XY=69802;AF-non_cancer-XY=0.00000;nhomalt-non_cancer-XY=0;AC-non_v2-sas=0;AN-non_v2-sas=3746;AF-non_v2-sas=0.00000;nhomalt-non_v2-sas=0;AC-non_topmed-afr-XX=1;AN-non_topmed-afr-XX=11042;AF-non_topmed-afr-XX=9.05633e-05;nhomalt-non_topmed-afr-XX=0;AC-sas=1;AN-sas=4678;AF-sas=0.000213767;nhomalt-sas=0;AC-non_neuro-nfe-XX=0;AN-non_neuro-nfe-XX=37352;AF-non_neuro-nfe-XX=0.00000;nhomalt-non_neuro-nfe-XX=0;AC-non_topmed-ami-XX=0;AN-non_topmed-ami-XX=64;AF-non_topmed-ami-XX=0.00000;nhomalt-non_topmed-ami-XX=0;AC-ami-XY=0;AN-ami-XY=436;AF-ami-XY=0.00000;nhomalt-ami-XY=0;AC-oth-XX=0;AN-oth-XX=996;AF-oth-XX=0.00000;nhomalt-oth-XX=0;AC-non_cancer-eas=0;AN-non_cancer-eas=4744;AF-non_cancer-eas=0.00000;nhomalt-non_cancer-eas=0;AC-non_topmed-XY=0;AN-non_topmed-XY=46034;AF-non_topmed-XY=0.00000;nhomalt-non_topmed-XY=0;AC-non_v2-ami=0;AN-non_v2-ami=900;AF-non_v2-ami=0.00000;nhomalt-non_v2-ami=0;AC-non_neuro=1;AN-non_neuro=131840;AF-non_neuro=7.58495e-06;nhomalt-non_neuro=0;AC-amr-XX=0;AN-amr-XX=6584;AF-amr-XX=0.00000;nhomalt-amr-XX=0;AC-controls_and_biobanks-nfe-XY=0;AN-controls_and_biobanks-nfe-XY=3528;AF-controls_and_biobanks-nfe-XY=0.00000;nhomalt-controls_and_biobanks-nfe-XY=0;AC-controls_and_biobanks-eas=0;AN-controls_and_biobanks-eas=2396;AF-controls_and_biobanks-eas=0.00000;nhomalt-controls_and_biobanks-eas=0;AC-XX=2;AN-XX=76046;AF-XX=2.62999e-05;nhomalt-XX=0;AC-non_cancer-oth-XY=0;AN-non_cancer-oth-XY=968;AF-non_cancer-oth-XY=0.00000;nhomalt-non_cancer-oth-XY=0;AC-non_v2-XY=0;AN-non_v2-XY=52690;AF-non_v2-XY=0.00000;nhomalt-non_v2-XY=0;AC-non_topmed-amr-XX=0;AN-non_topmed-amr-XX=5130;AF-non_topmed-amr-XX=0.00000;nhomalt-non_topmed-amr-XX=0;AC-fin=0;AN-fin=10406;AF-fin=0.00000;nhomalt-fin=0;AC-controls_and_biobanks-nfe-XX=0;AN-controls_and_biobanks-nfe-XX=3224;AF-controls_and_biobanks-nfe-XX=0.00000;nhomalt-controls_and_biobanks-nfe-XX=0;AC-controls_and_biobanks-afr=0;AN-controls_and_biobanks-afr=8532;AF-controls_and_biobanks-afr=0.00000;nhomalt-controls_and_biobanks-afr=0;AC-asj-XX=0;AN-asj-XX=1852;AF-asj-XX=0.00000;nhomalt-asj-XX=0;AC-non_topmed-mid=0;AN-non_topmed-mid=270;AF-non_topmed-mid=0.00000;nhomalt-non_topmed-mid=0;AC-non_cancer-sas-XY=0;AN-non_cancer-sas-XY=3532;AF-non_cancer-sas-XY=0.00000;nhomalt-non_cancer-sas-XY=0;AC-sas-XX=1;AN-sas-XX=1132;AF-sas-XX=0.000883392;nhomalt-sas-XX=0;AC-non_topmed=2;AN-non_topmed=77434;AF-non_topmed=2.58284e-05;nhomalt-non_topmed=0;AC-non_v2-oth-XX=0;AN-non_v2-oth-XX=898;AF-non_v2-oth-XX=0.00000;nhomalt-non_v2-oth-XX=0;AC-non_neuro-ami-XY=0;AN-non_neuro-ami-XY=426;AF-non_neuro-ami-XY=0.00000;nhomalt-non_neuro-ami-XY=0;AC-controls_and_biobanks-afr-XY=0;AN-controls_and_biobanks-afr-XY=4086;AF-controls_and_biobanks-afr-XY=0.00000;nhomalt-controls_and_biobanks-afr-XY=0;AC-controls_and_biobanks-amr-XX=0;AN-controls_and_biobanks-amr-XX=2432;AF-controls_and_biobanks-amr-XX=0.00000;nhomalt-controls_and_biobanks-amr-XX=0;AC-non_topmed-amr=0;AN-non_topmed-amr=12494;AF-non_topmed-amr=0.00000;nhomalt-non_topmed-amr=0;AC-controls_and_biobanks-sas-XX=0;AN-controls_and_biobanks-sas-XX=832;AF-controls_and_biobanks-sas-XX=0.00000;nhomalt-controls_and_biobanks-sas-XX=0;AC-controls_and_biobanks-amr=0;AN-controls_and_biobanks-amr=4590;AF-controls_and_biobanks-amr=0.00000;nhomalt-controls_and_biobanks-amr=0;AC-non_neuro-fin-XX=0;AN-non_neuro-fin-XX=648;AF-non_neuro-fin-XX=0.00000;nhomalt-non_neuro-fin-XX=0;AC-non_cancer-raw=2;AN-non_cancer-raw=147812;AF-non_cancer-raw=1.35307e-05;nhomalt-non_cancer-raw=0;AC-non_neuro-mid=0;AN-non_neuro-mid=308;AF-non_neuro-mid=0.00000;nhomalt-non_neuro-mid=0;AC-non_v2-asj-XY=0;AN-non_v2-asj-XY=1370;AF-non_v2-asj-XY=0.00000;nhomalt-non_v2-asj-XY=0;AC-non_v2-afr=1;AN-non_v2-afr=27044;AF-non_v2-afr=3.69768e-05;nhomalt-non_v2-afr=0;AC-non_neuro-fin-XY=0;AN-non_neuro-fin-XY=6210;AF-non_neuro-fin-XY=0.00000;nhomalt-non_neuro-fin-XY=0;AC-non_cancer-afr=1;AN-non_cancer-afr=39046;AF-non_cancer-afr=2.56108e-05;nhomalt-non_cancer-afr=0;AC-non_topmed-sas-XY=0;AN-non_topmed-sas-XY=3524;AF-non_topmed-sas-XY=0.00000;nhomalt-non_topmed-sas-XY=0;AC-mid-XY=0;AN-mid-XY=152;AF-mid-XY=0.00000;nhomalt-mid-XY=0;AC-non_v2-oth-XY=0;AN-non_v2-oth-XY=882;AF-non_v2-oth-XY=0.00000;nhomalt-non_v2-oth-XY=0;AC-controls_and_biobanks-fin=0;AN-controls_and_biobanks-fin=5400;AF-controls_and_biobanks-fin=0.00000;nhomalt-controls_and_biobanks-fin=0;AC-non_neuro-eas-XY=0;AN-non_neuro-eas-XY=2752;AF-non_neuro-eas-XY=0.00000;nhomalt-non_neuro-eas-XY=0;AC-non_topmed-eas-XX=0;AN-non_topmed-eas-XX=1356;AF-non_topmed-eas-XX=0.00000;nhomalt-non_topmed-eas-XX=0;AC-non_v2-afr-XX=1;AN-non_v2-afr-XX=14962;AF-non_v2-afr-XX=6.68360e-05;nhomalt-non_v2-afr-XX=0;AC-non_neuro-amr-XX=0;AN-non_neuro-amr-XX=6398;AF-non_neuro-amr-XX=0.00000;nhomalt-non_neuro-amr-XX=0;AC-non_cancer-ami=0;AN-non_cancer-ami=902;AF-non_cancer-ami=0.00000;nhomalt-non_cancer-ami=0;AC-XY=0;AN-XY=72250;AF-XY=0.00000;nhomalt-XY=0;AC-non_topmed-asj-XX=0;AN-non_topmed-asj-XX=276;AF-non_topmed-asj-XX=0.00000;nhomalt-non_topmed-asj-XX=0;AC-non_topmed-eas-XY=0;AN-non_topmed-eas-XY=2184;AF-non_topmed-eas-XY=0.00000;nhomalt-non_topmed-eas-XY=0;AC-non_v2-eas-XY=0;AN-non_v2-eas-XY=1364;AF-non_v2-eas-XY=0.00000;nhomalt-non_v2-eas-XY=0;AC-eas=0;AN-eas=4974;AF-eas=0.00000;nhomalt-eas=0;AC-asj-XY=0;AN-asj-XY=1584;AF-asj-XY=0.00000;nhomalt-asj-XY=0;AC-non_v2-eas-XX=0;AN-non_v2-eas-XX=1262;AF-non_v2-eas-XX=0.00000;nhomalt-non_v2-eas-XX=0;AC-controls_and_biobanks-mid-XY=0;AN-controls_and_biobanks-mid-XY=112;AF-controls_and_biobanks-mid-XY=0.00000;nhomalt-controls_and_biobanks-mid-XY=0;AC-fin-XY=0;AN-fin-XY=7900;AF-fin-XY=0.00000;nhomalt-fin-XY=0;AC-non_topmed-nfe=0;AN-non_topmed-nfe=20666;AF-non_topmed-nfe=0.00000;nhomalt-non_topmed-nfe=0;AC-amr=0;AN-amr=14842;AF-amr=0.00000;nhomalt-amr=0;AC-non_neuro-ami=0;AN-non_neuro-ami=858;AF-non_neuro-ami=0.00000;nhomalt-non_neuro-ami=0;AC-non_cancer-nfe-XX=0;AN-non_cancer-nfe-XX=37772;AF-non_cancer-nfe-XX=0.00000;nhomalt-non_cancer-nfe-XX=0;AC-non_cancer-mid=0;AN-non_cancer-mid=304;AF-non_cancer-mid=0.00000;nhomalt-non_cancer-mid=0;AC-non_v2-mid-XY=0;AN-non_v2-mid-XY=146;AF-non_v2-mid-XY=0.00000;nhomalt-non_v2-mid-XY=0;AC-controls_and_biobanks-amr-XY=0;AN-controls_and_biobanks-amr-XY=2158;AF-controls_and_biobanks-amr-XY=0.00000;nhomalt-controls_and_biobanks-amr-XY=0;AC-non_cancer-ami-XY=0;AN-non_cancer-ami-XY=436;AF-non_cancer-ami-XY=0.00000;nhomalt-non_cancer-ami-XY=0;AC-non_neuro-asj-XX=0;AN-non_neuro-asj-XX=1816;AF-non_neuro-asj-XX=0.00000;nhomalt-non_neuro-asj-XX=0;AC-afr=1;AN-afr=39358;AF-afr=2.54078e-05;nhomalt-afr=0;AC-non_v2-sas-XX=0;AN-non_v2-sas-XX=760;AF-non_v2-sas-XX=0.00000;nhomalt-non_v2-sas-XX=0;AC-non_neuro-afr-XX=0;AN-non_neuro-afr-XX=17992;AF-non_neuro-afr-XX=0.00000;nhomalt-non_neuro-afr-XX=0;AC-non_cancer-sas=1;AN-non_cancer-sas=4646;AF-non_cancer-sas=0.000215239;nhomalt-non_cancer-sas=0;AC-non_topmed-fin=0;AN-non_topmed-fin=10314;AF-non_topmed-fin=0.00000;nhomalt-non_topmed-fin=0;AC-non_cancer-asj-XY=0;AN-non_cancer-asj-XY=1538;AF-non_cancer-asj-XY=0.00000;nhomalt-non_cancer-asj-XY=0;AC-non_cancer-mid-XY=0;AN-non_cancer-mid-XY=144;AF-non_cancer-mid-XY=0.00000;nhomalt-non_cancer-mid-XY=0;AC-raw=2;AN-raw=152032;AF-raw=1.31551e-05;nhomalt-raw=0;AC-non_topmed-XX=2;AN-non_topmed-XX=31400;AF-non_topmed-XX=6.36943e-05;nhomalt-non_topmed-XX=0;AC-ami-XX=0;AN-ami-XX=466;AF-ami-XX=0.00000;nhomalt-ami-XX=0;AC-eas-XY=0;AN-eas-XY=2752;AF-eas-XY=0.00000;nhomalt-eas-XY=0;AC-controls_and_biobanks-mid=0;AN-controls_and_biobanks-mid=246;AF-controls_and_biobanks-mid=0.00000;nhomalt-controls_and_biobanks-mid=0;AC-non_v2-nfe-XY=0;AN-non_v2-nfe-XY=19952;AF-non_v2-nfe-XY=0.00000;nhomalt-non_v2-nfe-XY=0;AC-controls_and_biobanks-sas=0;AN-controls_and_biobanks-sas=3046;AF-controls_and_biobanks-sas=0.00000;nhomalt-controls_and_biobanks-sas=0;AC-non_v2-eas=0;AN-non_v2-eas=2626;AF-non_v2-eas=0.00000;nhomalt-non_v2-eas=0;AC-mid=0;AN-mid=314;AF-mid=0.00000;nhomalt-mid=0;AC-oth-XY=0;AN-oth-XY=1012;AF-oth-XY=0.00000;nhomalt-oth-XY=0;AC-non_cancer-nfe=0;AN-non_cancer-nfe=64210;AF-non_cancer-nfe=0.00000;nhomalt-non_cancer-nfe=0;AC-non_neuro-eas-XX=0;AN-non_neuro-eas-XX=2222;AF-non_neuro-eas-XX=0.00000;nhomalt-non_neuro-eas-XX=0;AC-non_neuro-sas-XY=0;AN-non_neuro-sas-XY=3544;AF-non_neuro-sas-XY=0.00000;nhomalt-non_neuro-sas-XY=0;AC-non_cancer-ami-XX=0;AN-non_cancer-ami-XX=466;AF-non_cancer-ami-XX=0.00000;nhomalt-non_cancer-ami-XX=0;AC-mid-XX=0;AN-mid-XX=162;AF-mid-XX=0.00000;nhomalt-mid-XX=0;AC-non_topmed-asj=0;AN-non_topmed-asj=990;AF-non_topmed-asj=0.00000;nhomalt-non_topmed-asj=0;AC-non_v2-asj-XX=0;AN-non_v2-asj-XX=1674;AF-non_v2-asj-XX=0.00000;nhomalt-non_v2-asj-XX=0;nhomalt=0;AC-non_v2-amr-XY=0;AN-non_v2-amr-XY=7490;AF-non_v2-amr-XY=0.00000;nhomalt-non_v2-amr-XY=0;AC-non_cancer-amr-XX=0;AN-non_cancer-amr-XX=6534;AF-non_cancer-amr-XX=0.00000;nhomalt-non_cancer-amr-XX=0;AC-controls_and_biobanks-afr-XX=0;AN-controls_and_biobanks-afr-XX=4446;AF-controls_and_biobanks-afr-XX=0.00000;nhomalt-controls_and_biobanks-afr-XX=0;AC-asj=0;AN-asj=3436;AF-asj=0.00000;nhomalt-asj=0;AC-non_topmed-asj-XY=0;AN-non_topmed-asj-XY=714;AF-non_topmed-asj-XY=0.00000;nhomalt-non_topmed-asj-XY=0;AC-non_v2-fin-XX=0;AN-non_v2-fin-XX=1148;AF-non_v2-fin-XX=0.00000;nhomalt-non_v2-fin-XX=0;AC-non_topmed-ami=0;AN-non_topmed-ami=104;AF-non_topmed-ami=0.00000;nhomalt-non_topmed-ami=0;AC-controls_and_biobanks-eas-XX=0;AN-controls_and_biobanks-eas-XX=1020;AF-controls_and_biobanks-eas-XX=0.00000;nhomalt-controls_and_biobanks-eas-XX=0;AC-controls_and_biobanks-fin-XX=0;AN-controls_and_biobanks-fin-XX=486;AF-controls_and_biobanks-fin-XX=0.00000;nhomalt-controls_and_biobanks-fin-XX=0;AC-non_topmed-raw=2;AN-non_topmed-raw=80614;AF-non_topmed-raw=2.48096e-05;nhomalt-non_topmed-raw=0;AC-non_cancer-eas-XY=0;AN-non_cancer-eas-XY=2624;AF-non_cancer-eas-XY=0.00000;nhomalt-non_cancer-eas-XY=0;AC-non_cancer=2;AN-non_cancer=144126;AF-non_cancer=1.38767e-05;nhomalt-non_cancer=0;AC-controls_and_biobanks-ami-XY=0;AN-controls_and_biobanks-ami-XY=28;AF-controls_and_biobanks-ami-XY=0.00000;nhomalt-controls_and_biobanks-ami-XY=0;AC-controls_and_biobanks-mid-XX=0;AN-controls_and_biobanks-mid-XX=134;AF-controls_and_biobanks-mid-XX=0.00000;nhomalt-controls_and_biobanks-mid-XX=0;AC-non_v2-afr-XY=0;AN-non_v2-afr-XY=12082;AF-non_v2-afr-XY=0.00000;nhomalt-non_v2-afr-XY=0;AC-non_v2-sas-XY=0;AN-non_v2-sas-XY=2986;AF-non_v2-sas-XY=0.00000;nhomalt-non_v2-sas-XY=0;AC-non_v2-fin=0;AN-non_v2-fin=7132;AF-non_v2-fin=0.00000;nhomalt-non_v2-fin=0;AC-non_neuro-oth=0;AN-non_neuro-oth=1918;AF-non_neuro-oth=0.00000;nhomalt-non_neuro-oth=0;AC-non_cancer-sas-XX=1;AN-non_cancer-sas-XX=1114;AF-non_cancer-sas-XX=0.000897666;nhomalt-non_cancer-sas-XX=0;AC-non_neuro-asj=0;AN-non_neuro-asj=3352;AF-non_neuro-asj=0.00000;nhomalt-non_neuro-asj=0;AC-non_topmed-afr=1;AN-non_topmed-afr=22970;AF-non_topmed-afr=4.35350e-05;nhomalt-non_topmed-afr=0;AC-non_topmed-afr-XY=0;AN-non_topmed-afr-XY=11928;AF-non_topmed-afr-XY=0.00000;nhomalt-non_topmed-afr-XY=0;AC-non_neuro-eas=0;AN-non_neuro-eas=4974;AF-non_neuro-eas=0.00000;nhomalt-non_neuro-eas=0;AC-afr-XX=1;AN-afr-XX=21104;AF-afr-XX=4.73844e-05;nhomalt-afr-XX=0;AC-non_neuro-mid-XY=0;AN-non_neuro-mid-XY=146;AF-non_neuro-mid-XY=0.00000;nhomalt-non_neuro-mid-XY=0;AC-non_topmed-fin-XX=0;AN-non_topmed-fin-XX=2448;AF-non_topmed-fin-XX=0.00000;nhomalt-non_topmed-fin-XX=0;AC-non_cancer-amr=0;AN-non_cancer-amr=14660;AF-non_cancer-amr=0.00000;nhomalt-non_cancer-amr=0;AC-non_v2-ami-XX=0;AN-non_v2-ami-XX=466;AF-non_v2-ami-XX=0.00000;nhomalt-non_v2-ami-XX=0;AC-afr-XY=0;AN-afr-XY=18254;AF-afr-XY=0.00000;nhomalt-afr-XY=0;AC-non_v2-mid-XX=0;AN-non_v2-mid-XX=162;AF-non_v2-mid-XX=0.00000;nhomalt-non_v2-mid-XX=0;AC-non_topmed-fin-XY=0;AN-non_topmed-fin-XY=7866;AF-non_topmed-fin-XY=0.00000;nhomalt-non_topmed-fin-XY=0;AC-non_neuro-amr-XY=0;AN-non_neuro-amr-XY=8004;AF-non_neuro-amr-XY=0.00000;nhomalt-non_neuro-amr-XY=0;AC-non_topmed-mid-XX=0;AN-non_topmed-mid-XX=140;AF-non_topmed-mid-XX=0.00000;nhomalt-non_topmed-mid-XX=0;AC-controls_and_biobanks-asj-XY=0;AN-controls_and_biobanks-asj-XY=50;AF-controls_and_biobanks-asj-XY=0.00000;nhomalt-controls_and_biobanks-asj-XY=0;AC-non_v2-fin-XY=0;AN-non_v2-fin-XY=5984;AF-non_v2-fin-XY=0.00000;nhomalt-non_v2-fin-XY=0;AC-controls_and_biobanks-ami-XX=0;AN-controls_and_biobanks-ami-XX=30;AF-controls_and_biobanks-ami-XX=0.00000;nhomalt-controls_and_biobanks-ami-XX=0;AC-eas-XX=0;AN-eas-XX=2222;AF-eas-XX=0.00000;nhomalt-eas-XX=0;AC-non_cancer-amr-XY=0;AN-non_cancer-amr-XY=8126;AF-non_cancer-amr-XY=0.00000;nhomalt-non_cancer-amr-XY=0;AC-non_neuro-ami-XX=0;AN-non_neuro-ami-XX=432;AF-non_neuro-ami-XX=0.00000;nhomalt-non_neuro-ami-XX=0;AC-controls_and_biobanks=0;AN-controls_and_biobanks=31908;AF-controls_and_biobanks=0.00000;nhomalt-controls_and_biobanks=0;AC-controls_and_biobanks-oth=0;AN-controls_and_biobanks-oth=754;AF-controls_and_biobanks-oth=0.00000;nhomalt-controls_and_biobanks-oth=0;AC-nfe-XY=0;AN-nfe-XY=28356;AF-nfe-XY=0.00000;nhomalt-nfe-XY=0;AC-non_cancer-afr-XX=1;AN-non_cancer-afr-XX=20950;AF-non_cancer-afr-XX=4.77327e-05;nhomalt-non_cancer-afr-XX=0;AC-controls_and_biobanks-sas-XY=0;AN-controls_and_biobanks-sas-XY=2214;AF-controls_and_biobanks-sas-XY=0.00000;nhomalt-controls_and_biobanks-sas-XY=0;AC-non_cancer-oth=0;AN-non_cancer-oth=1940;AF-non_cancer-oth=0.00000;nhomalt-non_cancer-oth=0;AC-non_topmed-oth=0;AN-non_topmed-oth=1436;AF-non_topmed-oth=0.00000;nhomalt-non_topmed-oth=0;AC-non_topmed-nfe-XY=0;AN-non_topmed-nfe-XY=11502;AF-non_topmed-nfe-XY=0.00000;nhomalt-non_topmed-nfe-XY=0;AC-non_topmed-sas-XX=1;AN-non_topmed-sas-XX=1126;AF-non_topmed-sas-XX=0.000888099;nhomalt-non_topmed-sas-XX=0;AC-non_v2-nfe=0;AN-non_v2-nfe=51498;AF-non_v2-nfe=0.00000;nhomalt-non_v2-nfe=0;AC-non_topmed-oth-XX=0;AN-non_topmed-oth-XX=654;AF-non_topmed-oth-XX=0.00000;nhomalt-non_topmed-oth-XX=0;AC-non_cancer-mid-XX=0;AN-non_cancer-mid-XX=160;AF-non_cancer-mid-XX=0.00000;nhomalt-non_cancer-mid-XX=0;AC-controls_and_biobanks-nfe=0;AN-controls_and_biobanks-nfe=6752;AF-controls_and_biobanks-nfe=0.00000;nhomalt-controls_and_biobanks-nfe=0;AC-controls_and_biobanks-oth-XY=0;AN-controls_and_biobanks-oth-XY=402;AF-controls_and_biobanks-oth-XY=0.00000;nhomalt-controls_and_biobanks-oth-XY=0;AC-controls_and_biobanks-fin-XY=0;AN-controls_and_biobanks-fin-XY=4914;AF-controls_and_biobanks-fin-XY=0.00000;nhomalt-controls_and_biobanks-fin-XY=0;AC-non_v2-amr-XX=0;AN-non_v2-amr-XX=5842;AF-non_v2-amr-XX=0.00000;nhomalt-non_v2-amr-XX=0;AC-non_cancer-asj=0;AN-non_cancer-asj=3268;AF-non_cancer-asj=0.00000;nhomalt-non_cancer-asj=0;AC-non_cancer-oth-XX=0;AN-non_cancer-oth-XX=972;AF-non_cancer-oth-XX=0.00000;nhomalt-non_cancer-oth-XX=0;AC-non_neuro-amr=0;AN-non_neuro-amr=14402;AF-non_neuro-amr=0.00000;nhomalt-non_neuro-amr=0;AC-non_cancer-XX=2;AN-non_cancer-XX=74324;AF-non_cancer-XX=2.69092e-05;nhomalt-non_cancer-XX=0;AC-non_v2-ami-XY=0;AN-non_v2-ami-XY=434;AF-non_v2-ami-XY=0.00000;nhomalt-non_v2-ami-XY=0;AC-non_neuro-raw=1;AN-non_neuro-raw=134658;AF-non_neuro-raw=7.42622e-06;nhomalt-non_neuro-raw=0;AC-non_neuro-afr=0;AN-non_neuro-afr=31130;AF-non_neuro-afr=0.00000;nhomalt-non_neuro-afr=0;AC-non_topmed-ami-XY=0;AN-non_topmed-ami-XY=40;AF-non_topmed-ami-XY=0.00000;nhomalt-non_topmed-ami-XY=0;AC-non_neuro-oth-XY=0;AN-non_neuro-oth-XY=958;AF-non_neuro-oth-XY=0.00000;nhomalt-non_neuro-oth-XY=0;AC-non_neuro-oth-XX=0;AN-non_neuro-oth-XX=960;AF-non_neuro-oth-XX=0.00000;nhomalt-non_neuro-oth-XX=0;AC-controls_and_biobanks-XX=0;AN-controls_and_biobanks-XX=13040;AF-controls_and_biobanks-XX=0.00000;nhomalt-controls_and_biobanks-XX=0;AC-non_cancer-afr-XY=0;AN-non_cancer-afr-XY=18096;AF-non_cancer-afr-XY=0.00000;nhomalt-non_cancer-afr-XY=0;AC-non_cancer-fin=0;AN-non_cancer-fin=10406;AF-non_cancer-fin=0.00000;nhomalt-non_cancer-fin=0;AC-controls_and_biobanks-asj=0;AN-controls_and_biobanks-asj=134;AF-controls_and_biobanks-asj=0.00000;nhomalt-controls_and_biobanks-asj=0;AC-non_topmed-oth-XY=0;AN-non_topmed-oth-XY=782;AF-non_topmed-oth-XY=0.00000;nhomalt-non_topmed-oth-XY=0;AC-non_neuro-mid-XX=0;AN-non_neuro-mid-XX=162;AF-non_neuro-mid-XX=0.00000;nhomalt-non_neuro-mid-XX=0;AC-controls_and_biobanks-oth-XX=0;AN-controls_and_biobanks-oth-XX=352;AF-controls_and_biobanks-oth-XX=0.00000;nhomalt-controls_and_biobanks-oth-XX=0;AC-non_neuro-XY=0;AN-non_neuro-XY=62726;AF-non_neuro-XY=0.00000;nhomalt-non_neuro-XY=0;AC-nfe=0;AN-nfe=67378;AF-nfe=0.00000;nhomalt-nfe=0;AC_popmax=1;AN_popmax=4678;AF_popmax=0.000213767;nhomalt_popmax=0;faf95-sas=0.00000;faf99-sas=0.00000;faf95-eas=0.00000;faf99-eas=0.00000;faf95-amr=0.00000;faf99-amr=0.00000;faf95-afr=0.00000;faf99-afr=0.00000;faf95=2.24000e-06;faf99=8.40000e-07;faf95-nfe=0.00000;faf99-nfe=0.00000;age_hist_het_bin_freq=0|0|0|0|0|1|0|0|0|0;age_hist_het_n_smaller=0;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;FS=.;MQ=29.1577;MQRankSum=1.04300;QD=6.02439;ReadPosRankSum=0.231000;VarDP=41;QUALapprox=247;AS_FS=.;AS_MQ=29.1577;AS_MQRankSum=1.04300;AS_pab_max=0.266846;AS_QD=6.02439;AS_ReadPosRankSum=0.231000;AS_SOR=0.172084;InbreedingCoeff=-1.32322e-05;AS_VQSLOD=-3.89780;AS_culprit=AS_MQ;allele_type=snv;n_alt_alleles=1;variant_type=snv;segdup;gq_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|1;gq_hist_all_bin_freq=0|0|0|0|40506|14946|9410|4551|2182|1256|619|214|153|96|57|56|36|18|12|36;dp_hist_alt_bin_freq=0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=0|0|7574|30642|19321|11617|4452|425|69|16|14|4|6|4|2|1|0|0|1|0;dp_hist_all_n_smaller=0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|0|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0;cadd_raw_score=0.476082;cadd_phred=6.25300;vep=C|upstream_gene_variant|MODIFIER|OR11H1|ENSG00000130538|Transcript|ENST00000252835|protein_coding||||||||||1|53|1|SNV||HGNC|HGNC:15404|YES||P2|CCDS74807.1|ENSP00000252835||||||||||||||||,C|upstream_gene_variant|MODIFIER|OR11H1|ENSG00000130538|Transcript|ENST00000643195|protein_coding||||||||||1|86|1|SNV||HGNC|HGNC:15404|||A2||ENSP00000495403||||||||||||||||,C|upstream_gene_variant|MODIFIER|OR11H1|81061|Transcript|NM_001005239.1|protein_coding||||||||||1|53|1|SNV||EntrezGene|HGNC:15404|YES||||NP_001005239.1||||||||||||||||"); writer.WriteLine("chr22\t15528109\trs755148717\tT\tG\t.\tAS_VQSR\tAC=222;AN=147506;AF=0.00150502;popmax=afr;faf95_popmax=0.00503537;AC-non_v2-XX=88;AN-non_v2-XX=58422;AF-non_v2-XX=0.00150628;nhomalt-non_v2-XX=0;AC-non_cancer-fin-XX=0;AN-non_cancer-fin-XX=2524;AF-non_cancer-fin-XX=0.00000;nhomalt-non_cancer-fin-XX=0;AC-non_neuro-nfe=0;AN-non_neuro-nfe=63442;AF-non_neuro-nfe=0.00000;nhomalt-non_neuro-nfe=0;AC-non_neuro-afr-XY=63;AN-non_neuro-afr-XY=12798;AF-non_neuro-afr-XY=0.00492264;nhomalt-non_neuro-afr-XY=0;AC-non_neuro-nfe-XY=0;AN-non_neuro-nfe-XY=26044;AF-non_neuro-nfe-XY=0.00000;nhomalt-non_neuro-nfe-XY=0;AC-controls_and_biobanks-eas-XY=0;AN-controls_and_biobanks-eas-XY=1376;AF-controls_and_biobanks-eas-XY=0.00000;nhomalt-controls_and_biobanks-eas-XY=0;AC-non_neuro-sas-XX=0;AN-non_neuro-sas-XX=1138;AF-non_neuro-sas-XX=0.00000;nhomalt-non_neuro-sas-XX=0;AC-non_v2=138;AN-non_v2=110904;AF-non_v2=0.00124432;nhomalt-non_v2=0;AC-non_topmed-nfe-XX=0;AN-non_topmed-nfe-XX=9198;AF-non_topmed-nfe-XX=0.00000;nhomalt-non_topmed-nfe-XX=0;AC-non_v2-mid=0;AN-non_v2-mid=308;AF-non_v2-mid=0.00000;nhomalt-non_v2-mid=0;AC-non_topmed-sas=0;AN-non_topmed-sas=4672;AF-non_topmed-sas=0.00000;nhomalt-non_topmed-sas=0;AC-non_cancer-eas-XX=0;AN-non_cancer-eas-XX=2124;AF-non_cancer-eas-XX=0.00000;nhomalt-non_cancer-eas-XX=0;AC-amr-XY=2;AN-amr-XY=8270;AF-amr-XY=0.000241838;nhomalt-amr-XY=0;AC-non_v2-nfe-XX=0;AN-non_v2-nfe-XX=31582;AF-non_v2-nfe-XX=0.00000;nhomalt-non_v2-nfe-XX=0;AC-controls_and_biobanks-XY=23;AN-controls_and_biobanks-XY=18818;AF-controls_and_biobanks-XY=0.00122223;nhomalt-controls_and_biobanks-XY=0;AC-non_neuro-asj-XY=0;AN-non_neuro-asj-XY=1534;AF-non_neuro-asj-XY=0.00000;nhomalt-non_neuro-asj-XY=0;AC-oth=1;AN-oth=2004;AF-oth=0.000499002;nhomalt-oth=0;AC-non_topmed-mid-XY=0;AN-non_topmed-mid-XY=130;AF-non_topmed-mid-XY=0.00000;nhomalt-non_topmed-mid-XY=0;AC-non_cancer-asj-XX=0;AN-non_cancer-asj-XX=1732;AF-non_cancer-asj-XX=0.00000;nhomalt-non_cancer-asj-XX=0;AC-sas-XY=0;AN-sas-XY=3562;AF-sas-XY=0.00000;nhomalt-sas-XY=0;AC-non_neuro-fin=0;AN-non_neuro-fin=6870;AF-non_neuro-fin=0.00000;nhomalt-non_neuro-fin=0;AC-non_topmed-amr-XY=2;AN-non_topmed-amr-XY=7376;AF-non_topmed-amr-XY=0.000271150;nhomalt-non_topmed-amr-XY=0;AC-non_neuro-XX=122;AN-non_neuro-XX=68724;AF-non_neuro-XX=0.00177522;nhomalt-non_neuro-XX=0;AC-fin-XX=0;AN-fin-XX=2524;AF-fin-XX=0.00000;nhomalt-fin-XX=0;AC-controls_and_biobanks-asj-XX=0;AN-controls_and_biobanks-asj-XX=84;AF-controls_and_biobanks-asj-XX=0.00000;nhomalt-controls_and_biobanks-asj-XX=0;AC-non_v2-raw=266;AN-non_v2-raw=113936;AF-non_v2-raw=0.00233464;nhomalt-non_v2-raw=2;AC-non_v2-asj=0;AN-non_v2-asj=3044;AF-non_v2-asj=0.00000;nhomalt-non_v2-asj=0;AC-nfe-XX=0;AN-nfe-XX=39080;AF-nfe-XX=0.00000;nhomalt-nfe-XX=0;AC-controls_and_biobanks-raw=109;AN-controls_and_biobanks-raw=32720;AF-controls_and_biobanks-raw=0.00333130;nhomalt-controls_and_biobanks-raw=0;AC-controls_and_biobanks-ami=0;AN-controls_and_biobanks-ami=58;AF-controls_and_biobanks-ami=0.00000;nhomalt-controls_and_biobanks-ami=0;AC-non_topmed-eas=0;AN-non_topmed-eas=3550;AF-non_topmed-eas=0.00000;nhomalt-non_topmed-eas=0;AC-non_v2-amr=4;AN-non_v2-amr=13344;AF-non_v2-amr=0.000299760;nhomalt-non_v2-amr=0;AC-non_neuro-sas=0;AN-non_neuro-sas=4698;AF-non_neuro-sas=0.00000;nhomalt-non_neuro-sas=0;AC-non_cancer-fin-XY=0;AN-non_cancer-fin-XY=7912;AF-non_cancer-fin-XY=0.00000;nhomalt-non_cancer-fin-XY=0;AC-non_cancer-nfe-XY=0;AN-non_cancer-nfe-XY=26472;AF-non_cancer-nfe-XY=0.00000;nhomalt-non_cancer-nfe-XY=0;AC-non_v2-oth=1;AN-non_v2-oth=1778;AF-non_v2-oth=0.000562430;nhomalt-non_v2-oth=0;AC-ami=0;AN-ami=904;AF-ami=0.00000;nhomalt-ami=0;AC-non_cancer-XY=85;AN-non_cancer-XY=69406;AF-non_cancer-XY=0.00122468;nhomalt-non_cancer-XY=0;AC-non_v2-sas=0;AN-non_v2-sas=3766;AF-non_v2-sas=0.00000;nhomalt-non_v2-sas=0;AC-non_topmed-afr-XX=75;AN-non_topmed-afr-XX=10846;AF-non_topmed-afr-XX=0.00691499;nhomalt-non_topmed-afr-XX=0;AC-sas=0;AN-sas=4700;AF-sas=0.00000;nhomalt-sas=0;AC-non_neuro-nfe-XX=0;AN-non_neuro-nfe-XX=37398;AF-non_neuro-nfe-XX=0.00000;nhomalt-non_neuro-nfe-XX=0;AC-non_topmed-ami-XX=0;AN-non_topmed-ami-XX=64;AF-non_topmed-ami-XX=0.00000;nhomalt-non_topmed-ami-XX=0;AC-ami-XY=0;AN-ami-XY=438;AF-ami-XY=0.00000;nhomalt-ami-XY=0;AC-oth-XX=1;AN-oth-XX=990;AF-oth-XX=0.00101010;nhomalt-oth-XX=0;AC-non_cancer-eas=0;AN-non_cancer-eas=4762;AF-non_cancer-eas=0.00000;nhomalt-non_cancer-eas=0;AC-non_topmed-XY=59;AN-non_topmed-XY=45836;AF-non_topmed-XY=0.00128720;nhomalt-non_topmed-XY=0;AC-non_v2-ami=0;AN-non_v2-ami=902;AF-non_v2-ami=0.00000;nhomalt-non_v2-ami=0;AC-non_neuro=187;AN-non_neuro=131196;AF-non_neuro=0.00142535;nhomalt-non_neuro=0;AC-amr-XX=2;AN-amr-XX=6588;AF-amr-XX=0.000303582;nhomalt-amr-XX=0;AC-controls_and_biobanks-nfe-XY=0;AN-controls_and_biobanks-nfe-XY=3532;AF-controls_and_biobanks-nfe-XY=0.00000;nhomalt-controls_and_biobanks-nfe-XY=0;AC-controls_and_biobanks-eas=0;AN-controls_and_biobanks-eas=2398;AF-controls_and_biobanks-eas=0.00000;nhomalt-controls_and_biobanks-eas=0;AC-XX=137;AN-XX=75644;AF-XX=0.00181112;nhomalt-XX=0;AC-non_cancer-oth-XY=0;AN-non_cancer-oth-XY=968;AF-non_cancer-oth-XY=0.00000;nhomalt-non_cancer-oth-XY=0;AC-non_v2-XY=50;AN-non_v2-XY=52482;AF-non_v2-XY=0.000952708;nhomalt-non_v2-XY=0;AC-non_topmed-amr-XX=2;AN-non_topmed-amr-XX=5134;AF-non_topmed-amr-XX=0.000389560;nhomalt-non_topmed-amr-XX=0;AC-fin=0;AN-fin=10436;AF-fin=0.00000;nhomalt-fin=0;AC-controls_and_biobanks-nfe-XX=0;AN-controls_and_biobanks-nfe-XX=3228;AF-controls_and_biobanks-nfe-XX=0.00000;nhomalt-controls_and_biobanks-nfe-XX=0;AC-controls_and_biobanks-afr=69;AN-controls_and_biobanks-afr=8388;AF-controls_and_biobanks-afr=0.00822604;nhomalt-controls_and_biobanks-afr=0;AC-asj-XX=0;AN-asj-XX=1854;AF-asj-XX=0.00000;nhomalt-asj-XX=0;AC-non_topmed-mid=0;AN-non_topmed-mid=270;AF-non_topmed-mid=0.00000;nhomalt-non_topmed-mid=0;AC-non_cancer-sas-XY=0;AN-non_cancer-sas-XY=3548;AF-non_cancer-sas-XY=0.00000;nhomalt-non_cancer-sas-XY=0;AC-sas-XX=0;AN-sas-XX=1138;AF-sas-XX=0.00000;nhomalt-sas-XX=0;AC-non_topmed=137;AN-non_topmed=77092;AF-non_topmed=0.00177710;nhomalt-non_topmed=0;AC-non_v2-oth-XX=1;AN-non_v2-oth-XX=894;AF-non_v2-oth-XX=0.00111857;nhomalt-non_v2-oth-XX=0;AC-non_neuro-ami-XY=0;AN-non_neuro-ami-XY=426;AF-non_neuro-ami-XY=0.00000;nhomalt-non_neuro-ami-XY=0;AC-controls_and_biobanks-afr-XY=21;AN-controls_and_biobanks-afr-XY=4012;AF-controls_and_biobanks-afr-XY=0.00523430;nhomalt-controls_and_biobanks-afr-XY=0;AC-controls_and_biobanks-amr-XX=2;AN-controls_and_biobanks-amr-XX=2424;AF-controls_and_biobanks-amr-XX=0.000825083;nhomalt-controls_and_biobanks-amr-XX=0;AC-non_topmed-amr=4;AN-non_topmed-amr=12510;AF-non_topmed-amr=0.000319744;nhomalt-non_topmed-amr=0;AC-controls_and_biobanks-sas-XX=0;AN-controls_and_biobanks-sas-XX=836;AF-controls_and_biobanks-sas-XX=0.00000;nhomalt-controls_and_biobanks-sas-XX=0;AC-controls_and_biobanks-amr=4;AN-controls_and_biobanks-amr=4594;AF-controls_and_biobanks-amr=0.000870701;nhomalt-controls_and_biobanks-amr=0;AC-non_neuro-fin-XX=0;AN-non_neuro-fin-XX=650;AF-non_neuro-fin-XX=0.00000;nhomalt-non_neuro-fin-XX=0;AC-non_cancer-raw=418;AN-non_cancer-raw=147154;AF-non_cancer-raw=0.00284056;nhomalt-non_cancer-raw=2;AC-non_neuro-mid=0;AN-non_neuro-mid=308;AF-non_neuro-mid=0.00000;nhomalt-non_neuro-mid=0;AC-non_v2-asj-XY=0;AN-non_v2-asj-XY=1368;AF-non_v2-asj-XY=0.00000;nhomalt-non_v2-asj-XY=0;AC-non_v2-afr=133;AN-non_v2-afr=26392;AF-non_v2-afr=0.00503941;nhomalt-non_v2-afr=0;AC-non_neuro-fin-XY=0;AN-non_neuro-fin-XY=6220;AF-non_neuro-fin-XY=0.00000;nhomalt-non_neuro-fin-XY=0;AC-non_cancer-afr=217;AN-non_cancer-afr=38086;AF-non_cancer-afr=0.00569763;nhomalt-non_cancer-afr=0;AC-non_topmed-sas-XY=0;AN-non_topmed-sas-XY=3540;AF-non_topmed-sas-XY=0.00000;nhomalt-non_topmed-sas-XY=0;AC-mid-XY=0;AN-mid-XY=152;AF-mid-XY=0.00000;nhomalt-mid-XY=0;AC-non_v2-oth-XY=0;AN-non_v2-oth-XY=884;AF-non_v2-oth-XY=0.00000;nhomalt-non_v2-oth-XY=0;AC-controls_and_biobanks-fin=0;AN-controls_and_biobanks-fin=5404;AF-controls_and_biobanks-fin=0.00000;nhomalt-controls_and_biobanks-fin=0;AC-non_neuro-eas-XY=0;AN-non_neuro-eas-XY=2766;AF-non_neuro-eas-XY=0.00000;nhomalt-non_neuro-eas-XY=0;AC-non_topmed-eas-XX=0;AN-non_topmed-eas-XX=1352;AF-non_topmed-eas-XX=0.00000;nhomalt-non_topmed-eas-XX=0;AC-non_v2-afr-XX=85;AN-non_v2-afr-XX=14598;AF-non_v2-afr-XX=0.00582272;nhomalt-non_v2-afr-XX=0;AC-non_neuro-amr-XX=2;AN-non_neuro-amr-XX=6402;AF-non_neuro-amr-XX=0.000312402;nhomalt-non_neuro-amr-XX=0;AC-non_cancer-ami=0;AN-non_cancer-ami=904;AF-non_cancer-ami=0.00000;nhomalt-non_cancer-ami=0;AC-XY=85;AN-XY=71862;AF-XY=0.00118282;nhomalt-XY=0;AC-non_topmed-asj-XX=0;AN-non_topmed-asj-XX=276;AF-non_topmed-asj-XX=0.00000;nhomalt-non_topmed-asj-XX=0;AC-non_topmed-eas-XY=0;AN-non_topmed-eas-XY=2198;AF-non_topmed-eas-XY=0.00000;nhomalt-non_topmed-eas-XY=0;AC-non_v2-eas-XY=0;AN-non_v2-eas-XY=1378;AF-non_v2-eas-XY=0.00000;nhomalt-non_v2-eas-XY=0;AC-eas=0;AN-eas=4986;AF-eas=0.00000;nhomalt-eas=0;AC-asj-XY=0;AN-asj-XY=1582;AF-asj-XY=0.00000;nhomalt-asj-XY=0;AC-non_v2-eas-XX=0;AN-non_v2-eas-XX=1262;AF-non_v2-eas-XX=0.00000;nhomalt-non_v2-eas-XX=0;AC-controls_and_biobanks-mid-XY=0;AN-controls_and_biobanks-mid-XY=112;AF-controls_and_biobanks-mid-XY=0.00000;nhomalt-controls_and_biobanks-mid-XY=0;AC-fin-XY=0;AN-fin-XY=7912;AF-fin-XY=0.00000;nhomalt-fin-XY=0;AC-non_topmed-nfe=0;AN-non_topmed-nfe=20720;AF-non_topmed-nfe=0.00000;nhomalt-non_topmed-nfe=0;AC-amr=4;AN-amr=14858;AF-amr=0.000269215;nhomalt-amr=0;AC-non_neuro-ami=0;AN-non_neuro-ami=858;AF-non_neuro-ami=0.00000;nhomalt-non_neuro-ami=0;AC-non_cancer-nfe-XX=0;AN-non_cancer-nfe-XX=37828;AF-non_cancer-nfe-XX=0.00000;nhomalt-non_cancer-nfe-XX=0;AC-non_cancer-mid=0;AN-non_cancer-mid=304;AF-non_cancer-mid=0.00000;nhomalt-non_cancer-mid=0;AC-non_v2-mid-XY=0;AN-non_v2-mid-XY=146;AF-non_v2-mid-XY=0.00000;nhomalt-non_v2-mid-XY=0;AC-controls_and_biobanks-amr-XY=2;AN-controls_and_biobanks-amr-XY=2170;AF-controls_and_biobanks-amr-XY=0.000921659;nhomalt-controls_and_biobanks-amr-XY=0;AC-non_cancer-ami-XY=0;AN-non_cancer-ami-XY=438;AF-non_cancer-ami-XY=0.00000;nhomalt-non_cancer-ami-XY=0;AC-non_neuro-asj-XX=0;AN-non_neuro-asj-XX=1818;AF-non_neuro-asj-XX=0.00000;nhomalt-non_neuro-asj-XX=0;AC-afr=217;AN-afr=38392;AF-afr=0.00565222;nhomalt-afr=0;AC-non_v2-sas-XX=0;AN-non_v2-sas-XX=766;AF-non_v2-sas-XX=0.00000;nhomalt-non_v2-sas-XX=0;AC-non_neuro-afr-XX=119;AN-non_neuro-afr-XX=17550;AF-non_neuro-afr-XX=0.00678063;nhomalt-non_neuro-afr-XX=0;AC-non_cancer-sas=0;AN-non_cancer-sas=4668;AF-non_cancer-sas=0.00000;nhomalt-non_cancer-sas=0;AC-non_topmed-fin=0;AN-non_topmed-fin=10344;AF-non_topmed-fin=0.00000;nhomalt-non_topmed-fin=0;AC-non_cancer-asj-XY=0;AN-non_cancer-asj-XY=1536;AF-non_cancer-asj-XY=0.00000;nhomalt-non_cancer-asj-XY=0;AC-non_cancer-mid-XY=0;AN-non_cancer-mid-XY=144;AF-non_cancer-mid-XY=0.00000;nhomalt-non_cancer-mid-XY=0;AC-raw=421;AN-raw=151380;AF-raw=0.00278108;nhomalt-raw=2;AC-non_topmed-XX=78;AN-non_topmed-XX=31256;AF-non_topmed-XX=0.00249552;nhomalt-non_topmed-XX=0;AC-ami-XX=0;AN-ami-XX=466;AF-ami-XX=0.00000;nhomalt-ami-XX=0;AC-eas-XY=0;AN-eas-XY=2766;AF-eas-XY=0.00000;nhomalt-eas-XY=0;AC-controls_and_biobanks-mid=0;AN-controls_and_biobanks-mid=246;AF-controls_and_biobanks-mid=0.00000;nhomalt-controls_and_biobanks-mid=0;AC-non_v2-nfe-XY=0;AN-non_v2-nfe-XY=19988;AF-non_v2-nfe-XY=0.00000;nhomalt-non_v2-nfe-XY=0;AC-controls_and_biobanks-sas=0;AN-controls_and_biobanks-sas=3056;AF-controls_and_biobanks-sas=0.00000;nhomalt-controls_and_biobanks-sas=0;AC-non_v2-eas=0;AN-non_v2-eas=2640;AF-non_v2-eas=0.00000;nhomalt-non_v2-eas=0;AC-mid=0;AN-mid=314;AF-mid=0.00000;nhomalt-mid=0;AC-oth-XY=0;AN-oth-XY=1014;AF-oth-XY=0.00000;nhomalt-oth-XY=0;AC-non_cancer-nfe=0;AN-non_cancer-nfe=64300;AF-non_cancer-nfe=0.00000;nhomalt-non_cancer-nfe=0;AC-non_neuro-eas-XX=0;AN-non_neuro-eas-XX=2220;AF-non_neuro-eas-XX=0.00000;nhomalt-non_neuro-eas-XX=0;AC-non_neuro-sas-XY=0;AN-non_neuro-sas-XY=3560;AF-non_neuro-sas-XY=0.00000;nhomalt-non_neuro-sas-XY=0;AC-non_cancer-ami-XX=0;AN-non_cancer-ami-XX=466;AF-non_cancer-ami-XX=0.00000;nhomalt-non_cancer-ami-XX=0;AC-mid-XX=0;AN-mid-XX=162;AF-mid-XX=0.00000;nhomalt-mid-XX=0;AC-non_topmed-asj=0;AN-non_topmed-asj=988;AF-non_topmed-asj=0.00000;nhomalt-non_topmed-asj=0;AC-non_v2-asj-XX=0;AN-non_v2-asj-XX=1676;AF-non_v2-asj-XX=0.00000;nhomalt-non_v2-asj-XX=0;nhomalt=0;AC-non_v2-amr-XY=2;AN-non_v2-amr-XY=7496;AF-non_v2-amr-XY=0.000266809;nhomalt-non_v2-amr-XY=0;AC-non_cancer-amr-XX=2;AN-non_cancer-amr-XX=6538;AF-non_cancer-amr-XX=0.000305904;nhomalt-non_cancer-amr-XX=0;AC-controls_and_biobanks-afr-XX=48;AN-controls_and_biobanks-afr-XX=4376;AF-controls_and_biobanks-afr-XX=0.0109689;nhomalt-controls_and_biobanks-afr-XX=0;AC-asj=0;AN-asj=3436;AF-asj=0.00000;nhomalt-asj=0;AC-non_topmed-asj-XY=0;AN-non_topmed-asj-XY=712;AF-non_topmed-asj-XY=0.00000;nhomalt-non_topmed-asj-XY=0;AC-non_v2-fin-XX=0;AN-non_v2-fin-XX=1168;AF-non_v2-fin-XX=0.00000;nhomalt-non_v2-fin-XX=0;AC-non_topmed-ami=0;AN-non_topmed-ami=106;AF-non_topmed-ami=0.00000;nhomalt-non_topmed-ami=0;AC-controls_and_biobanks-eas-XX=0;AN-controls_and_biobanks-eas-XX=1022;AF-controls_and_biobanks-eas-XX=0.00000;nhomalt-controls_and_biobanks-eas-XX=0;AC-controls_and_biobanks-fin-XX=0;AN-controls_and_biobanks-fin-XX=486;AF-controls_and_biobanks-fin-XX=0.00000;nhomalt-controls_and_biobanks-fin-XX=0;AC-non_topmed-raw=228;AN-non_topmed-raw=80170;AF-non_topmed-raw=0.00284396;nhomalt-non_topmed-raw=1;AC-non_cancer-eas-XY=0;AN-non_cancer-eas-XY=2638;AF-non_cancer-eas-XY=0.00000;nhomalt-non_cancer-eas-XY=0;AC-non_cancer=222;AN-non_cancer=143336;AF-non_cancer=0.00154881;nhomalt-non_cancer=0;AC-controls_and_biobanks-ami-XY=0;AN-controls_and_biobanks-ami-XY=28;AF-controls_and_biobanks-ami-XY=0.00000;nhomalt-controls_and_biobanks-ami-XY=0;AC-controls_and_biobanks-mid-XX=0;AN-controls_and_biobanks-mid-XX=134;AF-controls_and_biobanks-mid-XX=0.00000;nhomalt-controls_and_biobanks-mid-XX=0;AC-non_v2-afr-XY=48;AN-non_v2-afr-XY=11794;AF-non_v2-afr-XY=0.00406987;nhomalt-non_v2-afr-XY=0;AC-non_v2-sas-XY=0;AN-non_v2-sas-XY=3000;AF-non_v2-sas-XY=0.00000;nhomalt-non_v2-sas-XY=0;AC-non_v2-fin=0;AN-non_v2-fin=7160;AF-non_v2-fin=0.00000;nhomalt-non_v2-fin=0;AC-non_neuro-oth=1;AN-non_neuro-oth=1914;AF-non_neuro-oth=0.000522466;nhomalt-non_neuro-oth=0;AC-non_cancer-sas-XX=0;AN-non_cancer-sas-XX=1120;AF-non_cancer-sas-XX=0.00000;nhomalt-non_cancer-sas-XX=0;AC-non_neuro-asj=0;AN-non_neuro-asj=3352;AF-non_neuro-asj=0.00000;nhomalt-non_neuro-asj=0;AC-non_topmed-afr=132;AN-non_topmed-afr=22500;AF-non_topmed-afr=0.00586667;nhomalt-non_topmed-afr=0;AC-non_topmed-afr-XY=57;AN-non_topmed-afr-XY=11654;AF-non_topmed-afr-XY=0.00489102;nhomalt-non_topmed-afr-XY=0;AC-non_neuro-eas=0;AN-non_neuro-eas=4986;AF-non_neuro-eas=0.00000;nhomalt-non_neuro-eas=0;AC-afr-XX=134;AN-afr-XX=20622;AF-afr-XX=0.00649791;nhomalt-afr-XX=0;AC-non_neuro-mid-XY=0;AN-non_neuro-mid-XY=146;AF-non_neuro-mid-XY=0.00000;nhomalt-non_neuro-mid-XY=0;AC-non_topmed-fin-XX=0;AN-non_topmed-fin-XX=2466;AF-non_topmed-fin-XX=0.00000;nhomalt-non_topmed-fin-XX=0;AC-non_cancer-amr=4;AN-non_cancer-amr=14672;AF-non_cancer-amr=0.000272628;nhomalt-non_cancer-amr=0;AC-non_v2-ami-XX=0;AN-non_v2-ami-XX=466;AF-non_v2-ami-XX=0.00000;nhomalt-non_v2-ami-XX=0;AC-afr-XY=83;AN-afr-XY=17770;AF-afr-XY=0.00467079;nhomalt-afr-XY=0;AC-non_v2-mid-XX=0;AN-non_v2-mid-XX=162;AF-non_v2-mid-XX=0.00000;nhomalt-non_v2-mid-XX=0;AC-non_topmed-fin-XY=0;AN-non_topmed-fin-XY=7878;AF-non_topmed-fin-XY=0.00000;nhomalt-non_topmed-fin-XY=0;AC-non_neuro-amr-XY=2;AN-non_neuro-amr-XY=8018;AF-non_neuro-amr-XY=0.000249439;nhomalt-non_neuro-amr-XY=0;AC-non_topmed-mid-XX=0;AN-non_topmed-mid-XX=140;AF-non_topmed-mid-XX=0.00000;nhomalt-non_topmed-mid-XX=0;AC-controls_and_biobanks-asj-XY=0;AN-controls_and_biobanks-asj-XY=50;AF-controls_and_biobanks-asj-XY=0.00000;nhomalt-controls_and_biobanks-asj-XY=0;AC-non_v2-fin-XY=0;AN-non_v2-fin-XY=5992;AF-non_v2-fin-XY=0.00000;nhomalt-non_v2-fin-XY=0;AC-controls_and_biobanks-ami-XX=0;AN-controls_and_biobanks-ami-XX=30;AF-controls_and_biobanks-ami-XX=0.00000;nhomalt-controls_and_biobanks-ami-XX=0;AC-eas-XX=0;AN-eas-XX=2220;AF-eas-XX=0.00000;nhomalt-eas-XX=0;AC-non_cancer-amr-XY=2;AN-non_cancer-amr-XY=8134;AF-non_cancer-amr-XY=0.000245881;nhomalt-non_cancer-amr-XY=0;AC-non_neuro-ami-XX=0;AN-non_neuro-ami-XX=432;AF-non_neuro-ami-XX=0.00000;nhomalt-non_neuro-ami-XX=0;AC-controls_and_biobanks=74;AN-controls_and_biobanks=31786;AF-controls_and_biobanks=0.00232807;nhomalt-controls_and_biobanks=0;AC-controls_and_biobanks-oth=1;AN-controls_and_biobanks-oth=748;AF-controls_and_biobanks-oth=0.00133690;nhomalt-controls_and_biobanks-oth=0;AC-nfe-XY=0;AN-nfe-XY=28396;AF-nfe-XY=0.00000;nhomalt-nfe-XY=0;AC-non_cancer-afr-XX=134;AN-non_cancer-afr-XX=20470;AF-non_cancer-afr-XX=0.00654617;nhomalt-non_cancer-afr-XX=0;AC-controls_and_biobanks-sas-XY=0;AN-controls_and_biobanks-sas-XY=2220;AF-controls_and_biobanks-sas-XY=0.00000;nhomalt-controls_and_biobanks-sas-XY=0;AC-non_cancer-oth=1;AN-non_cancer-oth=1936;AF-non_cancer-oth=0.000516529;nhomalt-non_cancer-oth=0;AC-non_topmed-oth=1;AN-non_topmed-oth=1432;AF-non_topmed-oth=0.000698324;nhomalt-non_topmed-oth=0;AC-non_topmed-nfe-XY=0;AN-non_topmed-nfe-XY=11522;AF-non_topmed-nfe-XY=0.00000;nhomalt-non_topmed-nfe-XY=0;AC-non_topmed-sas-XX=0;AN-non_topmed-sas-XX=1132;AF-non_topmed-sas-XX=0.00000;nhomalt-non_topmed-sas-XX=0;AC-non_v2-nfe=0;AN-non_v2-nfe=51570;AF-non_v2-nfe=0.00000;nhomalt-non_v2-nfe=0;AC-non_topmed-oth-XX=1;AN-non_topmed-oth-XX=648;AF-non_topmed-oth-XX=0.00154321;nhomalt-non_topmed-oth-XX=0;AC-non_cancer-mid-XX=0;AN-non_cancer-mid-XX=160;AF-non_cancer-mid-XX=0.00000;nhomalt-non_cancer-mid-XX=0;AC-controls_and_biobanks-nfe=0;AN-controls_and_biobanks-nfe=6760;AF-controls_and_biobanks-nfe=0.00000;nhomalt-controls_and_biobanks-nfe=0;AC-controls_and_biobanks-oth-XY=0;AN-controls_and_biobanks-oth-XY=400;AF-controls_and_biobanks-oth-XY=0.00000;nhomalt-controls_and_biobanks-oth-XY=0;AC-controls_and_biobanks-fin-XY=0;AN-controls_and_biobanks-fin-XY=4918;AF-controls_and_biobanks-fin-XY=0.00000;nhomalt-controls_and_biobanks-fin-XY=0;AC-non_v2-amr-XX=2;AN-non_v2-amr-XX=5848;AF-non_v2-amr-XX=0.000341997;nhomalt-non_v2-amr-XX=0;AC-non_cancer-asj=0;AN-non_cancer-asj=3268;AF-non_cancer-asj=0.00000;nhomalt-non_cancer-asj=0;AC-non_cancer-oth-XX=1;AN-non_cancer-oth-XX=968;AF-non_cancer-oth-XX=0.00103306;nhomalt-non_cancer-oth-XX=0;AC-non_neuro-amr=4;AN-non_neuro-amr=14420;AF-non_neuro-amr=0.000277393;nhomalt-non_neuro-amr=0;AC-non_cancer-XX=137;AN-non_cancer-XX=73930;AF-non_cancer-XX=0.00185310;nhomalt-non_cancer-XX=0;AC-non_v2-ami-XY=0;AN-non_v2-ami-XY=436;AF-non_v2-ami-XY=0.00000;nhomalt-non_v2-ami-XY=0;AC-non_neuro-raw=358;AN-non_neuro-raw=134154;AF-non_neuro-raw=0.00266857;nhomalt-non_neuro-raw=2;AC-non_neuro-afr=182;AN-non_neuro-afr=30348;AF-non_neuro-afr=0.00599710;nhomalt-non_neuro-afr=0;AC-non_topmed-ami-XY=0;AN-non_topmed-ami-XY=42;AF-non_topmed-ami-XY=0.00000;nhomalt-non_topmed-ami-XY=0;AC-non_neuro-oth-XY=0;AN-non_neuro-oth-XY=960;AF-non_neuro-oth-XY=0.00000;nhomalt-non_neuro-oth-XY=0;AC-non_neuro-oth-XX=1;AN-non_neuro-oth-XX=954;AF-non_neuro-oth-XX=0.00104822;nhomalt-non_neuro-oth-XX=0;AC-controls_and_biobanks-XX=51;AN-controls_and_biobanks-XX=12968;AF-controls_and_biobanks-XX=0.00393276;nhomalt-controls_and_biobanks-XX=0;AC-non_cancer-afr-XY=83;AN-non_cancer-afr-XY=17616;AF-non_cancer-afr-XY=0.00471163;nhomalt-non_cancer-afr-XY=0;AC-non_cancer-fin=0;AN-non_cancer-fin=10436;AF-non_cancer-fin=0.00000;nhomalt-non_cancer-fin=0;AC-controls_and_biobanks-asj=0;AN-controls_and_biobanks-asj=134;AF-controls_and_biobanks-asj=0.00000;nhomalt-controls_and_biobanks-asj=0;AC-non_topmed-oth-XY=0;AN-non_topmed-oth-XY=784;AF-non_topmed-oth-XY=0.00000;nhomalt-non_topmed-oth-XY=0;AC-non_neuro-mid-XX=0;AN-non_neuro-mid-XX=162;AF-non_neuro-mid-XX=0.00000;nhomalt-non_neuro-mid-XX=0;AC-controls_and_biobanks-oth-XX=1;AN-controls_and_biobanks-oth-XX=348;AF-controls_and_biobanks-oth-XX=0.00287356;nhomalt-controls_and_biobanks-oth-XX=0;AC-non_neuro-XY=65;AN-non_neuro-XY=62472;AF-non_neuro-XY=0.00104047;nhomalt-non_neuro-XY=0;AC-nfe=0;AN-nfe=67476;AF-nfe=0.00000;nhomalt-nfe=0;AC_popmax=217;AN_popmax=38392;AF_popmax=0.00565222;nhomalt_popmax=0;faf95-sas=0.00000;faf99-sas=0.00000;faf95-eas=0.00000;faf99-eas=0.00000;faf95-amr=9.15000e-05;faf99-amr=5.50200e-05;faf95-afr=0.00503537;faf99-afr=0.00479761;faf95=0.00134206;faf99=0.00127995;faf95-nfe=0.00000;faf99-nfe=0.00000;age_hist_het_bin_freq=1|7|10|9|17|9|19|6|4|5;age_hist_het_n_smaller=14;age_hist_het_n_larger=1;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;FS=4.82164e-16;MQ=29.2276;MQRankSum=-0.221000;QD=3.15747;ReadPosRankSum=0.267000;VarDP=17667;QUALapprox=55783;AS_FS=4.82164e-16;AS_MQ=29.2276;AS_MQRankSum=-0.230000;AS_pab_max=1.00000;AS_QD=3.15747;AS_ReadPosRankSum=0.265000;AS_SOR=6.79842;InbreedingCoeff=0.00673884;AS_VQSLOD=-40.1202;AS_culprit=AS_SOR;allele_type=snv;n_alt_alleles=1;variant_type=snv;segdup;gq_hist_alt_bin_freq=0|0|0|0|0|2|6|12|13|15|16|11|21|4|10|15|8|14|12|63;gq_hist_all_bin_freq=0|0|0|0|40100|14870|9325|4541|2186|1270|637|224|171|97|66|72|43|31|24|96;dp_hist_alt_bin_freq=0|0|72|72|44|11|16|5|1|0|1|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=0|0|7226|30432|19384|11668|4490|433|70|17|15|4|6|4|2|1|0|0|1|0;dp_hist_all_n_smaller=0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|0|115|53|40|8|3|1|2|0|0|0|0|0|0|0|0|0;cadd_raw_score=0.439889;cadd_phred=5.86400;vep=G|upstream_gene_variant|MODIFIER|OR11H1|ENSG00000130538|Transcript|ENST00000252835|protein_coding||||||||||1|50|1|SNV||HGNC|HGNC:15404|YES||P2|CCDS74807.1|ENSP00000252835||||||||||||||||,G|upstream_gene_variant|MODIFIER|OR11H1|ENSG00000130538|Transcript|ENST00000643195|protein_coding||||||||||1|83|1|SNV||HGNC|HGNC:15404|||A2||ENSP00000495403||||||||||||||||,G|upstream_gene_variant|MODIFIER|OR11H1|81061|Transcript|NM_001005239.1|protein_coding||||||||||1|50|1|SNV||EntrezGene|HGNC:15404|YES||||NP_001005239.1||||||||||||||||"); writer.WriteLine("chr22\t15528137\t.\tT\tA\t.\tAS_VQSR\tAC=3;AN=150706;AF=1.99063e-05;popmax=afr;faf95_popmax=1.95500e-05;AC-non_v2-XX=1;AN-non_v2-XX=59624;AF-non_v2-XX=1.67718e-05;nhomalt-non_v2-XX=0;AC-non_cancer-fin-XX=0;AN-non_cancer-fin-XX=2558;AF-non_cancer-fin-XX=0.00000;nhomalt-non_cancer-fin-XX=0;AC-non_neuro-nfe=0;AN-non_neuro-nfe=63704;AF-non_neuro-nfe=0.00000;nhomalt-non_neuro-nfe=0;AC-non_neuro-afr-XY=1;AN-non_neuro-afr-XY=13522;AF-non_neuro-afr-XY=7.39536e-05;nhomalt-non_neuro-afr-XY=0;AC-non_neuro-nfe-XY=0;AN-non_neuro-nfe-XY=26178;AF-non_neuro-nfe-XY=0.00000;nhomalt-non_neuro-nfe-XY=0;AC-controls_and_biobanks-eas-XY=0;AN-controls_and_biobanks-eas-XY=1388;AF-controls_and_biobanks-eas-XY=0.00000;nhomalt-controls_and_biobanks-eas-XY=0;AC-non_neuro-sas-XX=0;AN-non_neuro-sas-XX=1158;AF-non_neuro-sas-XX=0.00000;nhomalt-non_neuro-sas-XX=0;AC-non_v2=2;AN-non_v2=113394;AF-non_v2=1.76376e-05;nhomalt-non_v2=0;AC-non_topmed-nfe-XX=0;AN-non_topmed-nfe-XX=9250;AF-non_topmed-nfe-XX=0.00000;nhomalt-non_topmed-nfe-XX=0;AC-non_v2-mid=0;AN-non_v2-mid=308;AF-non_v2-mid=0.00000;nhomalt-non_v2-mid=0;AC-non_topmed-sas=0;AN-non_topmed-sas=4746;AF-non_topmed-sas=0.00000;nhomalt-non_topmed-sas=0;AC-non_cancer-eas-XX=0;AN-non_cancer-eas-XX=2152;AF-non_cancer-eas-XX=0.00000;nhomalt-non_cancer-eas-XX=0;AC-amr-XY=0;AN-amr-XY=8402;AF-amr-XY=0.00000;nhomalt-amr-XY=0;AC-non_v2-nfe-XX=0;AN-non_v2-nfe-XX=31698;AF-non_v2-nfe-XX=0.00000;nhomalt-non_v2-nfe-XX=0;AC-controls_and_biobanks-XY=0;AN-controls_and_biobanks-XY=19242;AF-controls_and_biobanks-XY=0.00000;nhomalt-controls_and_biobanks-XY=0;AC-non_neuro-asj-XY=0;AN-non_neuro-asj-XY=1548;AF-non_neuro-asj-XY=0.00000;nhomalt-non_neuro-asj-XY=0;AC-oth=0;AN-oth=2058;AF-oth=0.00000;nhomalt-oth=0;AC-non_topmed-mid-XY=0;AN-non_topmed-mid-XY=130;AF-non_topmed-mid-XY=0.00000;nhomalt-non_topmed-mid-XY=0;AC-non_cancer-asj-XX=0;AN-non_cancer-asj-XX=1736;AF-non_cancer-asj-XX=0.00000;nhomalt-non_cancer-asj-XX=0;AC-sas-XY=0;AN-sas-XY=3616;AF-sas-XY=0.00000;nhomalt-sas-XY=0;AC-non_neuro-fin=0;AN-non_neuro-fin=6944;AF-non_neuro-fin=0.00000;nhomalt-non_neuro-fin=0;AC-non_topmed-amr-XY=0;AN-non_topmed-amr-XY=7502;AF-non_topmed-amr-XY=0.00000;nhomalt-non_topmed-amr-XY=0;AC-non_neuro-XX=1;AN-non_neuro-XX=69968;AF-non_neuro-XX=1.42922e-05;nhomalt-non_neuro-XX=0;AC-fin-XX=0;AN-fin-XX=2558;AF-fin-XX=0.00000;nhomalt-fin-XX=0;AC-controls_and_biobanks-asj-XX=0;AN-controls_and_biobanks-asj-XX=82;AF-controls_and_biobanks-asj-XX=0.00000;nhomalt-controls_and_biobanks-asj-XX=0;AC-non_v2-raw=2;AN-non_v2-raw=114570;AF-non_v2-raw=1.74566e-05;nhomalt-non_v2-raw=0;AC-non_v2-asj=0;AN-non_v2-asj=3066;AF-non_v2-asj=0.00000;nhomalt-non_v2-asj=0;AC-nfe-XX=0;AN-nfe-XX=39218;AF-nfe-XX=0.00000;nhomalt-nfe-XX=0;AC-controls_and_biobanks-raw=1;AN-controls_and_biobanks-raw=32894;AF-controls_and_biobanks-raw=3.04007e-05;nhomalt-controls_and_biobanks-raw=0;AC-controls_and_biobanks-ami=0;AN-controls_and_biobanks-ami=60;AF-controls_and_biobanks-ami=0.00000;nhomalt-controls_and_biobanks-ami=0;AC-non_topmed-eas=0;AN-non_topmed-eas=3652;AF-non_topmed-eas=0.00000;nhomalt-non_topmed-eas=0;AC-non_v2-amr=0;AN-non_v2-amr=13588;AF-non_v2-amr=0.00000;nhomalt-non_v2-amr=0;AC-non_neuro-sas=0;AN-non_neuro-sas=4772;AF-non_neuro-sas=0.00000;nhomalt-non_neuro-sas=0;AC-non_cancer-fin-XY=0;AN-non_cancer-fin-XY=7992;AF-non_cancer-fin-XY=0.00000;nhomalt-non_cancer-fin-XY=0;AC-non_cancer-nfe-XY=0;AN-non_cancer-nfe-XY=26600;AF-non_cancer-nfe-XY=0.00000;nhomalt-non_cancer-nfe-XY=0;AC-non_v2-oth=0;AN-non_v2-oth=1832;AF-non_v2-oth=0.00000;nhomalt-non_v2-oth=0;AC-ami=0;AN-ami=910;AF-ami=0.00000;nhomalt-ami=0;AC-non_cancer-XY=1;AN-non_cancer-XY=71068;AF-non_cancer-XY=1.40710e-05;nhomalt-non_cancer-XY=0;AC-non_v2-sas=0;AN-non_v2-sas=3836;AF-non_v2-sas=0.00000;nhomalt-non_v2-sas=0;AC-non_topmed-afr-XX=1;AN-non_topmed-afr-XX=11566;AF-non_topmed-afr-XX=8.64603e-05;nhomalt-non_topmed-afr-XX=0;AC-sas=0;AN-sas=4774;AF-sas=0.00000;nhomalt-sas=0;AC-non_neuro-nfe-XX=0;AN-non_neuro-nfe-XX=37526;AF-non_neuro-nfe-XX=0.00000;nhomalt-non_neuro-nfe-XX=0;AC-non_topmed-ami-XX=0;AN-non_topmed-ami-XX=66;AF-non_topmed-ami-XX=0.00000;nhomalt-non_topmed-ami-XX=0;AC-ami-XY=0;AN-ami-XY=442;AF-ami-XY=0.00000;nhomalt-ami-XY=0;AC-oth-XX=0;AN-oth-XX=1016;AF-oth-XX=0.00000;nhomalt-oth-XX=0;AC-non_cancer-eas=0;AN-non_cancer-eas=4862;AF-non_cancer-eas=0.00000;nhomalt-non_cancer-eas=0;AC-non_topmed-XY=1;AN-non_topmed-XY=47186;AF-non_topmed-XY=2.11927e-05;nhomalt-non_topmed-XY=0;AC-non_v2-ami=0;AN-non_v2-ami=908;AF-non_v2-ami=0.00000;nhomalt-non_v2-ami=0;AC-non_neuro=2;AN-non_neuro=133672;AF-non_neuro=1.49620e-05;nhomalt-non_neuro=0;AC-amr-XX=0;AN-amr-XX=6720;AF-amr-XX=0.00000;nhomalt-amr-XX=0;AC-controls_and_biobanks-nfe-XY=0;AN-controls_and_biobanks-nfe-XY=3564;AF-controls_and_biobanks-nfe-XY=0.00000;nhomalt-controls_and_biobanks-nfe-XY=0;AC-controls_and_biobanks-eas=0;AN-controls_and_biobanks-eas=2416;AF-controls_and_biobanks-eas=0.00000;nhomalt-controls_and_biobanks-eas=0;AC-XX=2;AN-XX=77162;AF-XX=2.59195e-05;nhomalt-XX=0;AC-non_cancer-oth-XY=0;AN-non_cancer-oth-XY=996;AF-non_cancer-oth-XY=0.00000;nhomalt-non_cancer-oth-XY=0;AC-non_v2-XY=1;AN-non_v2-XY=53770;AF-non_v2-XY=1.85977e-05;nhomalt-non_v2-XY=0;AC-non_topmed-amr-XX=0;AN-non_topmed-amr-XX=5256;AF-non_topmed-amr-XX=0.00000;nhomalt-non_topmed-amr-XX=0;AC-fin=0;AN-fin=10550;AF-fin=0.00000;nhomalt-fin=0;AC-controls_and_biobanks-nfe-XX=0;AN-controls_and_biobanks-nfe-XX=3234;AF-controls_and_biobanks-nfe-XX=0.00000;nhomalt-controls_and_biobanks-nfe-XX=0;AC-controls_and_biobanks-afr=0;AN-controls_and_biobanks-afr=8878;AF-controls_and_biobanks-afr=0.00000;nhomalt-controls_and_biobanks-afr=0;AC-asj-XX=0;AN-asj-XX=1856;AF-asj-XX=0.00000;nhomalt-asj-XX=0;AC-non_topmed-mid=0;AN-non_topmed-mid=272;AF-non_topmed-mid=0.00000;nhomalt-non_topmed-mid=0;AC-non_cancer-sas-XY=0;AN-non_cancer-sas-XY=3602;AF-non_cancer-sas-XY=0.00000;nhomalt-non_cancer-sas-XY=0;AC-sas-XX=0;AN-sas-XX=1158;AF-sas-XX=0.00000;nhomalt-sas-XX=0;AC-non_topmed=2;AN-non_topmed=79442;AF-non_topmed=2.51756e-05;nhomalt-non_topmed=0;AC-non_v2-oth-XX=0;AN-non_v2-oth-XX=918;AF-non_v2-oth-XX=0.00000;nhomalt-non_v2-oth-XX=0;AC-non_neuro-ami-XY=0;AN-non_neuro-ami-XY=430;AF-non_neuro-ami-XY=0.00000;nhomalt-non_neuro-ami-XY=0;AC-controls_and_biobanks-afr-XY=0;AN-controls_and_biobanks-afr-XY=4272;AF-controls_and_biobanks-afr-XY=0.00000;nhomalt-controls_and_biobanks-afr-XY=0;AC-controls_and_biobanks-amr-XX=0;AN-controls_and_biobanks-amr-XX=2464;AF-controls_and_biobanks-amr-XX=0.00000;nhomalt-controls_and_biobanks-amr-XX=0;AC-non_topmed-amr=0;AN-non_topmed-amr=12758;AF-non_topmed-amr=0.00000;nhomalt-non_topmed-amr=0;AC-controls_and_biobanks-sas-XX=0;AN-controls_and_biobanks-sas-XX=840;AF-controls_and_biobanks-sas-XX=0.00000;nhomalt-controls_and_biobanks-sas-XX=0;AC-controls_and_biobanks-amr=0;AN-controls_and_biobanks-amr=4654;AF-controls_and_biobanks-amr=0.00000;nhomalt-controls_and_biobanks-amr=0;AC-non_neuro-fin-XX=0;AN-non_neuro-fin-XX=654;AF-non_neuro-fin-XX=0.00000;nhomalt-non_neuro-fin-XX=0;AC-non_cancer-raw=4;AN-non_cancer-raw=147934;AF-non_cancer-raw=2.70391e-05;nhomalt-non_cancer-raw=0;AC-non_neuro-mid=0;AN-non_neuro-mid=310;AF-non_neuro-mid=0.00000;nhomalt-non_neuro-mid=0;AC-non_v2-asj-XY=0;AN-non_v2-asj-XY=1382;AF-non_v2-asj-XY=0.00000;nhomalt-non_v2-asj-XY=0;AC-non_v2-afr=2;AN-non_v2-afr=28078;AF-non_v2-afr=7.12301e-05;nhomalt-non_v2-afr=0;AC-non_neuro-fin-XY=0;AN-non_neuro-fin-XY=6290;AF-non_neuro-fin-XY=0.00000;nhomalt-non_neuro-fin-XY=0;AC-non_cancer-afr=3;AN-non_cancer-afr=40360;AF-non_cancer-afr=7.43310e-05;nhomalt-non_cancer-afr=0;AC-non_topmed-sas-XY=0;AN-non_topmed-sas-XY=3594;AF-non_topmed-sas-XY=0.00000;nhomalt-non_topmed-sas-XY=0;AC-mid-XY=0;AN-mid-XY=152;AF-mid-XY=0.00000;nhomalt-mid-XY=0;AC-non_v2-oth-XY=0;AN-non_v2-oth-XY=914;AF-non_v2-oth-XY=0.00000;nhomalt-non_v2-oth-XY=0;AC-controls_and_biobanks-fin=0;AN-controls_and_biobanks-fin=5460;AF-controls_and_biobanks-fin=0.00000;nhomalt-controls_and_biobanks-fin=0;AC-non_neuro-eas-XY=0;AN-non_neuro-eas-XY=2838;AF-non_neuro-eas-XY=0.00000;nhomalt-non_neuro-eas-XY=0;AC-non_topmed-eas-XX=0;AN-non_topmed-eas-XX=1382;AF-non_topmed-eas-XX=0.00000;nhomalt-non_topmed-eas-XX=0;AC-non_v2-afr-XX=1;AN-non_v2-afr-XX=15472;AF-non_v2-afr-XX=6.46329e-05;nhomalt-non_v2-afr-XX=0;AC-non_neuro-amr-XX=0;AN-non_neuro-amr-XX=6534;AF-non_neuro-amr-XX=0.00000;nhomalt-non_neuro-amr-XX=0;AC-non_cancer-ami=0;AN-non_cancer-ami=910;AF-non_cancer-ami=0.00000;nhomalt-non_cancer-ami=0;AC-XY=1;AN-XY=73544;AF-XY=1.35973e-05;nhomalt-XY=0;AC-non_topmed-asj-XX=0;AN-non_topmed-asj-XX=270;AF-non_topmed-asj-XX=0.00000;nhomalt-non_topmed-asj-XX=0;AC-non_topmed-eas-XY=0;AN-non_topmed-eas-XY=2270;AF-non_topmed-eas-XY=0.00000;nhomalt-non_topmed-eas-XY=0;AC-non_v2-eas-XY=0;AN-non_v2-eas-XY=1438;AF-non_v2-eas-XY=0.00000;nhomalt-non_v2-eas-XY=0;AC-eas=0;AN-eas=5092;AF-eas=0.00000;nhomalt-eas=0;AC-asj-XY=0;AN-asj-XY=1594;AF-asj-XY=0.00000;nhomalt-asj-XY=0;AC-non_v2-eas-XX=0;AN-non_v2-eas-XX=1286;AF-non_v2-eas-XX=0.00000;nhomalt-non_v2-eas-XX=0;AC-controls_and_biobanks-mid-XY=0;AN-controls_and_biobanks-mid-XY=112;AF-controls_and_biobanks-mid-XY=0.00000;nhomalt-controls_and_biobanks-mid-XY=0;AC-fin-XY=0;AN-fin-XY=7992;AF-fin-XY=0.00000;nhomalt-fin-XY=0;AC-non_topmed-nfe=0;AN-non_topmed-nfe=20870;AF-non_topmed-nfe=0.00000;nhomalt-non_topmed-nfe=0;AC-amr=0;AN-amr=15122;AF-amr=0.00000;nhomalt-amr=0;AC-non_neuro-ami=0;AN-non_neuro-ami=862;AF-non_neuro-ami=0.00000;nhomalt-non_neuro-ami=0;AC-non_cancer-nfe-XX=0;AN-non_cancer-nfe-XX=37954;AF-non_cancer-nfe-XX=0.00000;nhomalt-non_cancer-nfe-XX=0;AC-non_cancer-mid=0;AN-non_cancer-mid=304;AF-non_cancer-mid=0.00000;nhomalt-non_cancer-mid=0;AC-non_v2-mid-XY=0;AN-non_v2-mid-XY=146;AF-non_v2-mid-XY=0.00000;nhomalt-non_v2-mid-XY=0;AC-controls_and_biobanks-amr-XY=0;AN-controls_and_biobanks-amr-XY=2190;AF-controls_and_biobanks-amr-XY=0.00000;nhomalt-controls_and_biobanks-amr-XY=0;AC-non_cancer-ami-XY=0;AN-non_cancer-ami-XY=442;AF-non_cancer-ami-XY=0.00000;nhomalt-non_cancer-ami-XY=0;AC-non_neuro-asj-XX=0;AN-non_neuro-asj-XX=1822;AF-non_neuro-asj-XX=0.00000;nhomalt-non_neuro-asj-XX=0;AC-afr=3;AN-afr=40676;AF-afr=7.37536e-05;nhomalt-afr=0;AC-non_v2-sas-XX=0;AN-non_v2-sas-XX=784;AF-non_v2-sas-XX=0.00000;nhomalt-non_v2-sas-XX=0;AC-non_neuro-afr-XX=1;AN-non_neuro-afr-XX=18444;AF-non_neuro-afr-XX=5.42182e-05;nhomalt-non_neuro-afr-XX=0;AC-non_cancer-sas=0;AN-non_cancer-sas=4742;AF-non_cancer-sas=0.00000;nhomalt-non_cancer-sas=0;AC-non_topmed-fin=0;AN-non_topmed-fin=10458;AF-non_topmed-fin=0.00000;nhomalt-non_topmed-fin=0;AC-non_cancer-asj-XY=0;AN-non_cancer-asj-XY=1550;AF-non_cancer-asj-XY=0.00000;nhomalt-non_cancer-asj-XY=0;AC-non_cancer-mid-XY=0;AN-non_cancer-mid-XY=144;AF-non_cancer-mid-XY=0.00000;nhomalt-non_cancer-mid-XY=0;AC-raw=4;AN-raw=152178;AF-raw=2.62850e-05;nhomalt-raw=0;AC-non_topmed-XX=1;AN-non_topmed-XX=32256;AF-non_topmed-XX=3.10020e-05;nhomalt-non_topmed-XX=0;AC-ami-XX=0;AN-ami-XX=468;AF-ami-XX=0.00000;nhomalt-ami-XX=0;AC-eas-XY=0;AN-eas-XY=2838;AF-eas-XY=0.00000;nhomalt-eas-XY=0;AC-controls_and_biobanks-mid=0;AN-controls_and_biobanks-mid=246;AF-controls_and_biobanks-mid=0.00000;nhomalt-controls_and_biobanks-mid=0;AC-non_v2-nfe-XY=0;AN-non_v2-nfe-XY=20096;AF-non_v2-nfe-XY=0.00000;nhomalt-non_v2-nfe-XY=0;AC-controls_and_biobanks-sas=0;AN-controls_and_biobanks-sas=3084;AF-controls_and_biobanks-sas=0.00000;nhomalt-controls_and_biobanks-sas=0;AC-non_v2-eas=0;AN-non_v2-eas=2724;AF-non_v2-eas=0.00000;nhomalt-non_v2-eas=0;AC-mid=0;AN-mid=316;AF-mid=0.00000;nhomalt-mid=0;AC-oth-XY=0;AN-oth-XY=1042;AF-oth-XY=0.00000;nhomalt-oth-XY=0;AC-non_cancer-nfe=0;AN-non_cancer-nfe=64554;AF-non_cancer-nfe=0.00000;nhomalt-non_cancer-nfe=0;AC-non_neuro-eas-XX=0;AN-non_neuro-eas-XX=2254;AF-non_neuro-eas-XX=0.00000;nhomalt-non_neuro-eas-XX=0;AC-non_neuro-sas-XY=0;AN-non_neuro-sas-XY=3614;AF-non_neuro-sas-XY=0.00000;nhomalt-non_neuro-sas-XY=0;AC-non_cancer-ami-XX=0;AN-non_cancer-ami-XX=468;AF-non_cancer-ami-XX=0.00000;nhomalt-non_cancer-ami-XX=0;AC-mid-XX=0;AN-mid-XX=164;AF-mid-XX=0.00000;nhomalt-mid-XX=0;AC-non_topmed-asj=0;AN-non_topmed-asj=982;AF-non_topmed-asj=0.00000;nhomalt-non_topmed-asj=0;AC-non_v2-asj-XX=0;AN-non_v2-asj-XX=1684;AF-non_v2-asj-XX=0.00000;nhomalt-non_v2-asj-XX=0;nhomalt=0;AC-non_v2-amr-XY=0;AN-non_v2-amr-XY=7628;AF-non_v2-amr-XY=0.00000;nhomalt-non_v2-amr-XY=0;AC-non_cancer-amr-XX=0;AN-non_cancer-amr-XX=6668;AF-non_cancer-amr-XX=0.00000;nhomalt-non_cancer-amr-XX=0;AC-controls_and_biobanks-afr-XX=0;AN-controls_and_biobanks-afr-XX=4606;AF-controls_and_biobanks-afr-XX=0.00000;nhomalt-controls_and_biobanks-afr-XX=0;AC-asj=0;AN-asj=3450;AF-asj=0.00000;nhomalt-asj=0;AC-non_topmed-asj-XY=0;AN-non_topmed-asj-XY=712;AF-non_topmed-asj-XY=0.00000;nhomalt-non_topmed-asj-XY=0;AC-non_v2-fin-XX=0;AN-non_v2-fin-XX=1192;AF-non_v2-fin-XX=0.00000;nhomalt-non_v2-fin-XX=0;AC-non_topmed-ami=0;AN-non_topmed-ami=110;AF-non_topmed-ami=0.00000;nhomalt-non_topmed-ami=0;AC-controls_and_biobanks-eas-XX=0;AN-controls_and_biobanks-eas-XX=1028;AF-controls_and_biobanks-eas-XX=0.00000;nhomalt-controls_and_biobanks-eas-XX=0;AC-controls_and_biobanks-fin-XX=0;AN-controls_and_biobanks-fin-XX=486;AF-controls_and_biobanks-fin-XX=0.00000;nhomalt-controls_and_biobanks-fin-XX=0;AC-non_topmed-raw=3;AN-non_topmed-raw=80736;AF-non_topmed-raw=3.71581e-05;nhomalt-non_topmed-raw=0;AC-non_cancer-eas-XY=0;AN-non_cancer-eas-XY=2710;AF-non_cancer-eas-XY=0.00000;nhomalt-non_cancer-eas-XY=0;AC-non_cancer=3;AN-non_cancer=146490;AF-non_cancer=2.04792e-05;nhomalt-non_cancer=0;AC-controls_and_biobanks-ami-XY=0;AN-controls_and_biobanks-ami-XY=30;AF-controls_and_biobanks-ami-XY=0.00000;nhomalt-controls_and_biobanks-ami-XY=0;AC-controls_and_biobanks-mid-XX=0;AN-controls_and_biobanks-mid-XX=134;AF-controls_and_biobanks-mid-XX=0.00000;nhomalt-controls_and_biobanks-mid-XX=0;AC-non_v2-afr-XY=1;AN-non_v2-afr-XY=12606;AF-non_v2-afr-XY=7.93273e-05;nhomalt-non_v2-afr-XY=0;AC-non_v2-sas-XY=0;AN-non_v2-sas-XY=3052;AF-non_v2-sas-XY=0.00000;nhomalt-non_v2-sas-XY=0;AC-non_v2-fin=0;AN-non_v2-fin=7260;AF-non_v2-fin=0.00000;nhomalt-non_v2-fin=0;AC-non_neuro-oth=0;AN-non_neuro-oth=1970;AF-non_neuro-oth=0.00000;nhomalt-non_neuro-oth=0;AC-non_cancer-sas-XX=0;AN-non_cancer-sas-XX=1140;AF-non_cancer-sas-XX=0.00000;nhomalt-non_cancer-sas-XX=0;AC-non_neuro-asj=0;AN-non_neuro-asj=3370;AF-non_neuro-asj=0.00000;nhomalt-non_neuro-asj=0;AC-non_topmed-afr=2;AN-non_topmed-afr=24110;AF-non_topmed-afr=8.29531e-05;nhomalt-non_topmed-afr=0;AC-non_topmed-afr-XY=1;AN-non_topmed-afr-XY=12544;AF-non_topmed-afr-XY=7.97194e-05;nhomalt-non_topmed-afr-XY=0;AC-non_neuro-eas=0;AN-non_neuro-eas=5092;AF-non_neuro-eas=0.00000;nhomalt-non_neuro-eas=0;AC-afr-XX=2;AN-afr-XX=21750;AF-afr-XX=9.19540e-05;nhomalt-afr-XX=0;AC-non_neuro-mid-XY=0;AN-non_neuro-mid-XY=146;AF-non_neuro-mid-XY=0.00000;nhomalt-non_neuro-mid-XY=0;AC-non_topmed-fin-XX=0;AN-non_topmed-fin-XX=2500;AF-non_topmed-fin-XX=0.00000;nhomalt-non_topmed-fin-XX=0;AC-non_cancer-amr=0;AN-non_cancer-amr=14934;AF-non_cancer-amr=0.00000;nhomalt-non_cancer-amr=0;AC-non_v2-ami-XX=0;AN-non_v2-ami-XX=468;AF-non_v2-ami-XX=0.00000;nhomalt-non_v2-ami-XX=0;AC-afr-XY=1;AN-afr-XY=18926;AF-afr-XY=5.28374e-05;nhomalt-afr-XY=0;AC-non_v2-mid-XX=0;AN-non_v2-mid-XX=162;AF-non_v2-mid-XX=0.00000;nhomalt-non_v2-mid-XX=0;AC-non_topmed-fin-XY=0;AN-non_topmed-fin-XY=7958;AF-non_topmed-fin-XY=0.00000;nhomalt-non_topmed-fin-XY=0;AC-non_neuro-amr-XY=0;AN-non_neuro-amr-XY=8148;AF-non_neuro-amr-XY=0.00000;nhomalt-non_neuro-amr-XY=0;AC-non_topmed-mid-XX=0;AN-non_topmed-mid-XX=142;AF-non_topmed-mid-XX=0.00000;nhomalt-non_topmed-mid-XX=0;AC-controls_and_biobanks-asj-XY=0;AN-controls_and_biobanks-asj-XY=52;AF-controls_and_biobanks-asj-XY=0.00000;nhomalt-controls_and_biobanks-asj-XY=0;AC-non_v2-fin-XY=0;AN-non_v2-fin-XY=6068;AF-non_v2-fin-XY=0.00000;nhomalt-non_v2-fin-XY=0;AC-controls_and_biobanks-ami-XX=0;AN-controls_and_biobanks-ami-XX=30;AF-controls_and_biobanks-ami-XX=0.00000;nhomalt-controls_and_biobanks-ami-XX=0;AC-eas-XX=0;AN-eas-XX=2254;AF-eas-XX=0.00000;nhomalt-eas-XX=0;AC-non_cancer-amr-XY=0;AN-non_cancer-amr-XY=8266;AF-non_cancer-amr-XY=0.00000;nhomalt-non_cancer-amr-XY=0;AC-non_neuro-ami-XX=0;AN-non_neuro-ami-XX=432;AF-non_neuro-ami-XX=0.00000;nhomalt-non_neuro-ami-XX=0;AC-controls_and_biobanks=0;AN-controls_and_biobanks=32504;AF-controls_and_biobanks=0.00000;nhomalt-controls_and_biobanks=0;AC-controls_and_biobanks-oth=0;AN-controls_and_biobanks-oth=774;AF-controls_and_biobanks-oth=0.00000;nhomalt-controls_and_biobanks-oth=0;AC-nfe-XY=0;AN-nfe-XY=28540;AF-nfe-XY=0.00000;nhomalt-nfe-XY=0;AC-non_cancer-afr-XX=2;AN-non_cancer-afr-XX=21594;AF-non_cancer-afr-XX=9.26183e-05;nhomalt-non_cancer-afr-XX=0;AC-controls_and_biobanks-sas-XY=0;AN-controls_and_biobanks-sas-XY=2244;AF-controls_and_biobanks-sas-XY=0.00000;nhomalt-controls_and_biobanks-sas-XY=0;AC-non_cancer-oth=0;AN-non_cancer-oth=1988;AF-non_cancer-oth=0.00000;nhomalt-non_cancer-oth=0;AC-non_topmed-oth=0;AN-non_topmed-oth=1484;AF-non_topmed-oth=0.00000;nhomalt-non_topmed-oth=0;AC-non_topmed-nfe-XY=0;AN-non_topmed-nfe-XY=11620;AF-non_topmed-nfe-XY=0.00000;nhomalt-non_topmed-nfe-XY=0;AC-non_topmed-sas-XX=0;AN-non_topmed-sas-XX=1152;AF-non_topmed-sas-XX=0.00000;nhomalt-non_topmed-sas-XX=0;AC-non_v2-nfe=0;AN-non_v2-nfe=51794;AF-non_v2-nfe=0.00000;nhomalt-non_v2-nfe=0;AC-non_topmed-oth-XX=0;AN-non_topmed-oth-XX=672;AF-non_topmed-oth-XX=0.00000;nhomalt-non_topmed-oth-XX=0;AC-non_cancer-mid-XX=0;AN-non_cancer-mid-XX=160;AF-non_cancer-mid-XX=0.00000;nhomalt-non_cancer-mid-XX=0;AC-controls_and_biobanks-nfe=0;AN-controls_and_biobanks-nfe=6798;AF-controls_and_biobanks-nfe=0.00000;nhomalt-controls_and_biobanks-nfe=0;AC-controls_and_biobanks-oth-XY=0;AN-controls_and_biobanks-oth-XY=416;AF-controls_and_biobanks-oth-XY=0.00000;nhomalt-controls_and_biobanks-oth-XY=0;AC-controls_and_biobanks-fin-XY=0;AN-controls_and_biobanks-fin-XY=4974;AF-controls_and_biobanks-fin-XY=0.00000;nhomalt-controls_and_biobanks-fin-XY=0;AC-non_v2-amr-XX=0;AN-non_v2-amr-XX=5960;AF-non_v2-amr-XX=0.00000;nhomalt-non_v2-amr-XX=0;AC-non_cancer-asj=0;AN-non_cancer-asj=3286;AF-non_cancer-asj=0.00000;nhomalt-non_cancer-asj=0;AC-non_cancer-oth-XX=0;AN-non_cancer-oth-XX=992;AF-non_cancer-oth-XX=0.00000;nhomalt-non_cancer-oth-XX=0;AC-non_neuro-amr=0;AN-non_neuro-amr=14682;AF-non_neuro-amr=0.00000;nhomalt-non_neuro-amr=0;AC-non_cancer-XX=2;AN-non_cancer-XX=75422;AF-non_cancer-XX=2.65175e-05;nhomalt-non_cancer-XX=0;AC-non_v2-ami-XY=0;AN-non_v2-ami-XY=440;AF-non_v2-ami-XY=0.00000;nhomalt-non_v2-ami-XY=0;AC-non_neuro-raw=3;AN-non_neuro-raw=134764;AF-non_neuro-raw=2.22611e-05;nhomalt-non_neuro-raw=0;AC-non_neuro-afr=2;AN-non_neuro-afr=31966;AF-non_neuro-afr=6.25665e-05;nhomalt-non_neuro-afr=0;AC-non_topmed-ami-XY=0;AN-non_topmed-ami-XY=44;AF-non_topmed-ami-XY=0.00000;nhomalt-non_topmed-ami-XY=0;AC-non_neuro-oth-XY=0;AN-non_neuro-oth-XY=990;AF-non_neuro-oth-XY=0.00000;nhomalt-non_neuro-oth-XY=0;AC-non_neuro-oth-XX=0;AN-non_neuro-oth-XX=980;AF-non_neuro-oth-XX=0.00000;nhomalt-non_neuro-oth-XX=0;AC-controls_and_biobanks-XX=0;AN-controls_and_biobanks-XX=13262;AF-controls_and_biobanks-XX=0.00000;nhomalt-controls_and_biobanks-XX=0;AC-non_cancer-afr-XY=1;AN-non_cancer-afr-XY=18766;AF-non_cancer-afr-XY=5.32879e-05;nhomalt-non_cancer-afr-XY=0;AC-non_cancer-fin=0;AN-non_cancer-fin=10550;AF-non_cancer-fin=0.00000;nhomalt-non_cancer-fin=0;AC-controls_and_biobanks-asj=0;AN-controls_and_biobanks-asj=134;AF-controls_and_biobanks-asj=0.00000;nhomalt-controls_and_biobanks-asj=0;AC-non_topmed-oth-XY=0;AN-non_topmed-oth-XY=812;AF-non_topmed-oth-XY=0.00000;nhomalt-non_topmed-oth-XY=0;AC-non_neuro-mid-XX=0;AN-non_neuro-mid-XX=164;AF-non_neuro-mid-XX=0.00000;nhomalt-non_neuro-mid-XX=0;AC-controls_and_biobanks-oth-XX=0;AN-controls_and_biobanks-oth-XX=358;AF-controls_and_biobanks-oth-XX=0.00000;nhomalt-controls_and_biobanks-oth-XX=0;AC-non_neuro-XY=1;AN-non_neuro-XY=63704;AF-non_neuro-XY=1.56976e-05;nhomalt-non_neuro-XY=0;AC-nfe=0;AN-nfe=67758;AF-nfe=0.00000;nhomalt-nfe=0;AC_popmax=3;AN_popmax=40676;AF_popmax=7.37536e-05;nhomalt_popmax=0;faf95-sas=0.00000;faf99-sas=0.00000;faf95-eas=0.00000;faf99-eas=0.00000;faf95-amr=0.00000;faf99-amr=0.00000;faf95-afr=1.95500e-05;faf99-afr=1.04200e-05;faf95=5.29000e-06;faf99=2.47000e-06;faf95-nfe=0.00000;faf99-nfe=0.00000;age_hist_het_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_het_n_smaller=1;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;FS=2.82574;MQ=29.6952;MQRankSum=0.0770000;QD=5.56039;ReadPosRankSum=0.405000;VarDP=207;QUALapprox=1151;AS_FS=4.28972;AS_MQ=29.6880;AS_MQRankSum=-0.429000;AS_pab_max=0.332306;AS_QD=5.44792;AS_ReadPosRankSum=1.14700;AS_SOR=1.84939;InbreedingCoeff=-2.63453e-05;AS_VQSLOD=-3.01260;AS_culprit=AS_MQ;allele_type=snv;n_alt_alleles=2;variant_type=multi-snv;segdup;gq_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|3;gq_hist_all_bin_freq=0|0|0|0|39783|15257|9716|4889|2468|1467|768|285|210|143|89|82|54|34|32|76;dp_hist_alt_bin_freq=0|0|0|1|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=0|0|5915|31045|20271|12374|4976|552|128|35|24|9|9|5|3|2|2|0|3|0;dp_hist_all_n_smaller=0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|0|0|2|0|1|0|0|0|0|0|0|0|0|0|0|0|0;cadd_raw_score=0.495494;cadd_phred=6.45700;vep=A|upstream_gene_variant|MODIFIER|OR11H1|ENSG00000130538|Transcript|ENST00000252835|protein_coding||||||||||1|22|1|SNV||HGNC|HGNC:15404|YES||P2|CCDS74807.1|ENSP00000252835||||||||||||||||,A|upstream_gene_variant|MODIFIER|OR11H1|ENSG00000130538|Transcript|ENST00000643195|protein_coding||||||||||1|55|1|SNV||HGNC|HGNC:15404|||A2||ENSP00000495403||||||||||||||||,A|upstream_gene_variant|MODIFIER|OR11H1|81061|Transcript|NM_001005239.1|protein_coding||||||||||1|22|1|SNV||EntrezGene|HGNC:15404|YES||||NP_001005239.1||||||||||||||||"); writer.Flush(); stream.Position = 0; return stream; } /*[Fact] public void GetMergedItems() { var sequence = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "TTATCTTCTCTTCATTCTTAAAAAAGGAACACATTTTATA", 15528100 - VariantUtils.MaxUpstreamLength); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); var gnomadReader = new GnomadSnvReader(new StreamReader(GetChr22GenomeStream()), new StreamReader(GetChr22ExomeStream()), sequenceProvider); var items = gnomadReader.GetCombinedItems().ToList(); Assert.Equal(7, items.Count); Assert.Equal(15528101, items[0].Position); Assert.Equal(15528106, items[1].Position); Assert.Equal(15528107, items[2].Position); Assert.Equal(15528109, items[3].Position);//merged item Assert.Equal(15528126, items[4].Position); Assert.Equal(15528135, items[5].Position); Assert.Equal(15528137, items[6].Position); //this is the merged item. Assert.Equal("\"coverage\":23,\"failedFilter\":true,\"allAf\":0.000476,\"allAn\":86114,\"allAc\":41,\"allHc\":0,\"afrAf\":0.003149,\"afrAn\":13018,\"afrAc\":41,\"afrHc\":0,\"amrAf\":0,\"amrAn\":11274,\"amrAc\":0,\"amrHc\":0,\"easAf\":0,\"easAn\":9354,\"easAc\":0,\"easHc\":0,\"finAf\":0,\"finAn\":5344,\"finAc\":0,\"finHc\":0,\"nfeAf\":0,\"nfeAn\":36372,\"nfeAc\":0,\"nfeHc\":0,\"asjAf\":0,\"asjAn\":1780,\"asjAc\":0,\"asjHc\":0,\"sasAf\":0,\"sasAn\":6060,\"sasAc\":0,\"sasHc\":0,\"othAf\":0,\"othAn\":2912,\"othAc\":0,\"othHc\":0,\"maleAf\":0.000335,\"maleAn\":44796,\"maleAc\":15,\"maleHc\":0,\"femaleAf\":0.000629,\"femaleAn\":41318,\"femaleAc\":26,\"femaleHc\":0,\"controlsAllAf\":0.000476,\"controlsAllAn\":33612,\"controlsAllAc\":16", items[3].GetJsonString()); } */ private static Stream GetChr22_21006257_genome() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("22\t21006258\trs992970331\tTAAA\tT\t8.74487e+06\tPASS\tAC=8785;AN=23616;AF=0.371994;rf_tp_probability=0.780993;FS=3.613;InbreedingCoeff=-0.0098;MQ=59.37;MQRankSum=0.038;QD=27.94;ReadPosRankSum=0.328;SOR=0.737;VQSR_POSITIVE_TRAIN_SITE;BaseQRankSum=0.204;ClippingRankSum=0;DP=452644;VQSLOD=1.05;VQSR_culprit=FS;lcr;rf_positive_label;rf_label=TP;rf_train;variant_type=mixed;allele_type=del;n_alt_alleles=6;was_mixed;pab_max=1;gq_hist_alt_bin_freq=222|154|167|171|189|203|185|250|180|248|248|215|290|269|209|244|214|136|184|6868;gq_hist_all_bin_freq=1339|303|316|362|449|460|480|587|494|564|572|457|589|544|414|425|443|260|300|10317;dp_hist_alt_bin_freq=40|850|2172|2746|2354|1452|750|287|129|38|8|8|5|5|0|1|0|1|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=279|2039|4133|4783|4115|2626|1400|572|236|74|18|11|9|7|1|4|0|2|0|1;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=85|7|20|47|139|324|609|709|1183|809|1365|834|713|425|229|268|236|171|65|63;AC_nfe_seu=27;AN_nfe_seu=74;AF_nfe_seu=0.364865;nhomalt_nfe_seu=3;controls_AC_afr_male=236;controls_AN_afr_male=1098;controls_AF_afr_male=0.214936;controls_nhomalt_afr_male=27;non_topmed_AC_amr=281;non_topmed_AN_amr=548;non_topmed_AF_amr=0.512774;non_topmed_nhomalt_amr=65;AC_raw=10257;AN_raw=30508;AF_raw=0.336207;nhomalt_raw=1994;AC_fin_female=250;AN_fin_female=780;AF_fin_female=0.320513;nhomalt_fin_female=38;non_neuro_AC_asj_female=17;non_neuro_AN_asj_female=50;non_neuro_AF_asj_female=0.34;non_neuro_nhomalt_asj_female=3;non_neuro_AC_afr_male=266;non_neuro_AN_afr_male=1272;non_neuro_AF_afr_male=0.209119;non_neuro_nhomalt_afr_male=29;AC_afr_male=947;AN_afr_male=4094;AF_afr_male=0.231314;nhomalt_afr_male=114;AC_afr=1641;AN_afr=7126;AF_afr=0.230283;nhomalt_afr=183;non_neuro_AC_afr_female=333;non_neuro_AN_afr_female=1468;non_neuro_AF_afr_female=0.226839;non_neuro_nhomalt_afr_female=34;non_topmed_AC_amr_female=128;non_topmed_AN_amr_female=270;non_topmed_AF_amr_female=0.474074;non_topmed_nhomalt_amr_female=24;non_topmed_AC_oth_female=144;non_topmed_AN_oth_female=354;non_topmed_AF_oth_female=0.40678;non_topmed_nhomalt_oth_female=29;AC_eas_female=75;AN_eas_female=464;AF_eas_female=0.161638;nhomalt_eas_female=6;AC_afr_female=694;AN_afr_female=3032;AF_afr_female=0.228892;nhomalt_afr_female=69;non_neuro_AC_female=2837;non_neuro_AN_female=7250;non_neuro_AF_female=0.39131;non_neuro_nhomalt_female=587;controls_AC_afr=459;controls_AN_afr=2006;controls_AF_afr=0.228814;controls_nhomalt_afr=51;AC_nfe_onf=831;AN_nfe_onf=1714;AF_nfe_onf=0.484831;nhomalt_nfe_onf=205;controls_AC_fin_male=76;controls_AN_fin_male=236;controls_AF_fin_male=0.322034;controls_nhomalt_fin_male=8;non_neuro_AC_nfe_nwe=3319;non_neuro_AN_nfe_nwe=6584;non_neuro_AF_nfe_nwe=0.504101;non_neuro_nhomalt_nfe_nwe=795;AC_fin_male=243;AN_fin_male=724;AF_fin_male=0.335635;nhomalt_fin_male=32;AC_nfe_female=2452;AN_nfe_female=5278;AF_nfe_female=0.46457;nhomalt_nfe_female=552;AC_amr=302;AN_amr=582;AF_amr=0.5189;nhomalt_amr=71;non_topmed_AC_nfe_male=1863;non_topmed_AN_nfe_male=3966;non_topmed_AF_nfe_male=0.469743;non_topmed_nhomalt_nfe_male=437;AC_eas=206;AN_eas=1372;AF_eas=0.150146;nhomalt_eas=17;nhomalt=1732;non_neuro_AC_nfe_female=2132;non_neuro_AN_nfe_female=4586;non_neuro_AF_nfe_female=0.464893;non_neuro_nhomalt_nfe_female=489;non_neuro_AC_afr=599;non_neuro_AN_afr=2740;non_neuro_AF_afr=0.218613;non_neuro_nhomalt_afr=63;controls_AC_raw=3012;controls_AN_raw=10336;controls_AF_raw=0.291409;controls_nhomalt_raw=537;controls_AC_male=1253;controls_AN_male=3856;controls_AF_male=0.324948;controls_nhomalt_male=225;non_topmed_AC_male=3484;non_topmed_AN_male=10244;non_topmed_AF_male=0.340102;non_topmed_nhomalt_male=665;controls_AC_nfe_female=642;controls_AN_nfe_female=1538;controls_AF_nfe_female=0.417425;controls_nhomalt_nfe_female=131;non_neuro_AC_amr=220;non_neuro_AN_amr=418;non_neuro_AF_amr=0.526316;non_neuro_nhomalt_amr=53;non_neuro_AC_eas_female=75;non_neuro_AN_eas_female=464;non_neuro_AF_eas_female=0.161638;non_neuro_nhomalt_eas_female=6;AC_asj_male=90;AN_asj_male=182;AF_asj_male=0.494505;nhomalt_asj_male=20;controls_AC_nfe_male=766;controls_AN_nfe_male=1798;controls_AF_nfe_male=0.426029;controls_nhomalt_nfe_male=160;non_neuro_AC_fin=139;non_neuro_AN_fin=438;non_neuro_AF_fin=0.317352;non_neuro_nhomalt_fin=18;AC_oth_female=149;AN_oth_female=366;AF_oth_female=0.407104;nhomalt_oth_female=30;controls_AC_nfe=1408;controls_AN_nfe=3336;controls_AF_nfe=0.422062;controls_nhomalt_nfe=291;controls_AC_oth_female=42;controls_AN_oth_female=122;controls_AF_oth_female=0.344262;controls_nhomalt_oth_female=6;controls_AC_asj=8;controls_AN_asj=28;controls_AF_asj=0.285714;controls_nhomalt_asj=0;non_neuro_AC_amr_male=102;non_neuro_AN_amr_male=186;non_neuro_AF_amr_male=0.548387;non_neuro_nhomalt_amr_male=27;controls_AC_nfe_nwe=239;controls_AN_nfe_nwe=510;controls_AF_nfe_nwe=0.468627;controls_nhomalt_nfe_nwe=54;AC_nfe_nwe=3663;AN_nfe_nwe=7306;AF_nfe_nwe=0.501369;nhomalt_nfe_nwe=862;controls_AC_nfe_seu=15;controls_AN_nfe_seu=40;controls_AF_nfe_seu=0.375;controls_nhomalt_nfe_seu=3;non_neuro_AC_amr_female=118;non_neuro_AN_amr_female=232;non_neuro_AF_amr_female=0.508621;non_neuro_nhomalt_amr_female=26;non_neuro_AC_nfe_onf=673;non_neuro_AN_nfe_onf=1414;non_neuro_AF_nfe_onf=0.475955;non_neuro_nhomalt_nfe_onf=155;non_topmed_AC_eas_male=130;non_topmed_AN_eas_male=892;non_topmed_AF_eas_male=0.14574;non_topmed_nhomalt_eas_male=11;controls_AC_amr_female=28;controls_AN_amr_female=60;controls_AF_amr_female=0.466667;controls_nhomalt_amr_female=5;non_neuro_AC_fin_male=76;non_neuro_AN_fin_male=236;non_neuro_AF_fin_male=0.322034;non_neuro_nhomalt_fin_male=8;AC_female=3780;AN_female=10272;AF_female=0.367991;nhomalt_female=726;non_neuro_AC_oth_male=108;non_neuro_AN_oth_male=274;non_neuro_AF_oth_male=0.394161;non_neuro_nhomalt_oth_male=21;non_topmed_AC_nfe_est=1206;non_topmed_AN_nfe_est=2938;non_topmed_AF_nfe_est=0.410483;non_topmed_nhomalt_nfe_est=240;non_topmed_AC_nfe_nwe=2010;non_topmed_AN_nfe_nwe=4018;non_topmed_AF_nfe_nwe=0.500249;non_topmed_nhomalt_nfe_nwe=488;non_topmed_AC_amr_male=153;non_topmed_AN_amr_male=278;non_topmed_AF_amr_male=0.55036;non_topmed_nhomalt_amr_male=41;non_topmed_AC_nfe_onf=581;non_topmed_AN_nfe_onf=1172;non_topmed_AF_nfe_onf=0.495734;non_topmed_nhomalt_nfe_onf=152;controls_AC_eas_male=74;controls_AN_eas_male=514;controls_AF_eas_male=0.143969;controls_nhomalt_eas_male=6;controls_AC_oth_male=55;controls_AN_oth_male=124;controls_AF_oth_male=0.443548;controls_nhomalt_oth_male=12;non_topmed_AC=6738;non_topmed_AN=19350;non_topmed_AF=0.348217;non_topmed_nhomalt=1276;controls_AC_fin=139;controls_AN_fin=436;controls_AF_fin=0.318807;controls_nhomalt_fin=18;non_neuro_AC_nfe=5059;non_neuro_AN_nfe=10588;non_neuro_AF_nfe=0.477805;non_neuro_nhomalt_nfe=1167;non_neuro_AC_fin_female=63;non_neuro_AN_fin_female=202;non_neuro_AF_fin_female=0.311881;non_neuro_nhomalt_fin_female=10;non_topmed_AC_nfe_seu=27;non_topmed_AN_nfe_seu=74;non_topmed_AF_nfe_seu=0.364865;non_topmed_nhomalt_nfe_seu=3;controls_AC_eas_female=49;controls_AN_eas_female=298;controls_AF_eas_female=0.16443;controls_nhomalt_eas_female=3;non_topmed_AC_asj=50;non_topmed_AN_asj=110;non_topmed_AF_asj=0.454545;non_topmed_nhomalt_asj=9;controls_AC_nfe_onf=118;controls_AN_nfe_onf=266;controls_AF_nfe_onf=0.443609;controls_nhomalt_nfe_onf=23;non_neuro_AC=6526;non_neuro_AN=16292;non_neuro_AF=0.400565;non_neuro_nhomalt=1378;non_topmed_AC_nfe=3824;non_topmed_AN_nfe=8202;non_topmed_AF_nfe=0.466228;non_topmed_nhomalt_nfe=883;non_topmed_AC_raw=8100;non_topmed_AN_raw=25734;non_topmed_AF_raw=0.314759;non_topmed_nhomalt_raw=1515;non_neuro_AC_nfe_est=1052;non_neuro_AN_nfe_est=2552;non_neuro_AF_nfe_est=0.412226;non_neuro_nhomalt_nfe_est=214;non_topmed_AC_oth_male=130;non_topmed_AN_oth_male=314;non_topmed_AF_oth_male=0.414013;non_topmed_nhomalt_oth_male=25;AC_nfe_est=1215;AN_nfe_est=2958;AF_nfe_est=0.410751;nhomalt_nfe_est=241;non_topmed_AC_afr_male=930;non_topmed_AN_afr_male=4008;non_topmed_AF_afr_male=0.232036;non_topmed_nhomalt_afr_male=111;AC_eas_male=131;AN_eas_male=908;AF_eas_male=0.144273;nhomalt_eas_male=11;controls_AC_eas=123;controls_AN_eas=812;controls_AF_eas=0.151478;controls_nhomalt_eas=9;non_neuro_AC_eas_male=131;non_neuro_AN_eas_male=908;non_neuro_AF_eas_male=0.144273;non_neuro_nhomalt_eas_male=11;non_neuro_AC_asj_male=79;non_neuro_AN_asj_male=164;non_neuro_AF_asj_male=0.481707;non_neuro_nhomalt_asj_male=17;controls_AC_oth=97;controls_AN_oth=246;controls_AF_oth=0.394309;controls_nhomalt_oth=18;AC_nfe=5736;AN_nfe=12052;AF_nfe=0.475938;nhomalt_nfe=1311;non_topmed_AC_female=3254;non_topmed_AN_female=9106;non_topmed_AF_female=0.357347;non_topmed_nhomalt_female=611;non_neuro_AC_asj=96;non_neuro_AN_asj=214;non_neuro_AF_asj=0.448598;non_neuro_nhomalt_asj=20;non_topmed_AC_eas_female=74;non_topmed_AN_eas_female=448;non_topmed_AF_eas_female=0.165179;non_topmed_nhomalt_eas_female=6;non_neuro_AC_raw=7450;non_neuro_AN_raw=20680;non_neuro_AF_raw=0.360251;non_neuro_nhomalt_raw=1557;non_topmed_AC_eas=204;non_topmed_AN_eas=1340;non_topmed_AF_eas=0.152239;non_topmed_nhomalt_eas=17;non_topmed_AC_fin_male=243;non_topmed_AN_fin_male=724;non_topmed_AF_fin_male=0.335635;non_topmed_nhomalt_fin_male=32;AC_fin=493;AN_fin=1504;AF_fin=0.327793;nhomalt_fin=70;AC_nfe_male=3284;AN_nfe_male=6774;AF_nfe_male=0.484795;nhomalt_nfe_male=759;controls_AC_amr_male=42;controls_AN_amr_male=76;controls_AF_amr_male=0.552632;controls_nhomalt_amr_male=12;controls_AC_afr_female=223;controls_AN_afr_female=908;controls_AF_afr_female=0.245595;controls_nhomalt_afr_female=24;controls_AC_amr=70;controls_AN_amr=136;controls_AF_amr=0.514706;controls_nhomalt_amr=17;AC_asj_female=21;AN_asj_female=64;AF_asj_female=0.328125;nhomalt_asj_female=3;non_neuro_AC_eas=206;non_neuro_AN_eas=1372;non_neuro_AF_eas=0.150146;non_neuro_nhomalt_eas=17;non_neuro_AC_male=3689;non_neuro_AN_male=9042;non_neuro_AF_male=0.407985;non_neuro_nhomalt_male=791;AC_asj=111;AN_asj=246;AF_asj=0.45122;nhomalt_asj=23;controls_AC_nfe_est=1036;controls_AN_nfe_est=2520;controls_AF_nfe_est=0.411111;controls_nhomalt_nfe_est=211;non_topmed_AC_asj_female=15;non_topmed_AN_asj_female=48;non_topmed_AF_asj_female=0.3125;non_topmed_nhomalt_asj_female=1;non_topmed_AC_oth=274;non_topmed_AN_oth=668;non_topmed_AF_oth=0.41018;non_topmed_nhomalt_oth=54;non_topmed_AC_fin_female=250;non_topmed_AN_fin_female=780;non_topmed_AF_fin_female=0.320513;non_topmed_nhomalt_fin_female=38;AC_oth=296;AN_oth=734;AF_oth=0.40327;nhomalt_oth=57;non_neuro_AC_nfe_male=2927;non_neuro_AN_nfe_male=6002;non_neuro_AF_nfe_male=0.487671;non_neuro_nhomalt_nfe_male=678;controls_AC_female=1051;controls_AN_female=3144;controls_AF_female=0.334288;controls_nhomalt_female=179;non_topmed_AC_fin=493;non_topmed_AN_fin=1504;non_topmed_AF_fin=0.327793;non_topmed_nhomalt_fin=70;non_topmed_AC_nfe_female=1961;non_topmed_AN_nfe_female=4236;non_topmed_AF_nfe_female=0.462937;non_topmed_nhomalt_nfe_female=446;controls_AC_asj_male=4;controls_AN_asj_male=10;controls_AF_asj_male=0.4;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=35;non_topmed_AN_asj_male=62;non_topmed_AF_asj_male=0.564516;non_topmed_nhomalt_asj_male=8;non_neuro_AC_oth=207;non_neuro_AN_oth=522;non_neuro_AF_oth=0.396552;non_neuro_nhomalt_oth=40;AC_male=5005;AN_male=13344;AF_male=0.375075;nhomalt_male=1006;controls_AC_fin_female=63;controls_AN_fin_female=200;controls_AF_fin_female=0.315;controls_nhomalt_fin_female=10;controls_AC_asj_female=4;controls_AN_asj_female=18;controls_AF_asj_female=0.222222;controls_nhomalt_asj_female=0;AC_amr_male=163;AN_amr_male=294;AF_amr_male=0.554422;nhomalt_amr_male=43;AC_amr_female=139;AN_amr_female=288;AF_amr_female=0.482639;nhomalt_amr_female=28;AC_oth_male=147;AN_oth_male=368;AF_oth_male=0.399457;nhomalt_oth_male=27;non_neuro_AC_nfe_seu=15;non_neuro_AN_nfe_seu=38;non_neuro_AF_nfe_seu=0.394737;non_neuro_nhomalt_nfe_seu=3;non_topmed_AC_afr_female=682;non_topmed_AN_afr_female=2970;non_topmed_AF_afr_female=0.22963;non_topmed_nhomalt_afr_female=67;non_topmed_AC_afr=1612;non_topmed_AN_afr=6978;non_topmed_AF_afr=0.231012;non_topmed_nhomalt_afr=178;controls_AC=2304;controls_AN=7000;controls_AF=0.329143;controls_nhomalt=404;non_neuro_AC_oth_female=99;non_neuro_AN_oth_female=248;non_neuro_AF_oth_female=0.399194;non_neuro_nhomalt_oth_female=19;non_topmed_faf95_amr=0.463516;non_topmed_faf99_amr=0.463517;faf95_afr=0.221013;faf99_afr=0.221013;controls_faf95_afr=0.211534;controls_faf99_afr=0.211534;faf95_amr=0.470781;faf99_amr=0.470782;faf95_eas=0.133362;faf99_eas=0.133363;faf95=0.36549;faf99=0.365489;non_neuro_faf95_afr=0.204131;non_neuro_faf99_afr=0.20413;non_neuro_faf95_amr=0.46934;non_neuro_faf99_amr=0.469341;controls_faf95_nfe=0.403732;controls_faf99_nfe=0.403733;non_topmed_faf95=0.341269;non_topmed_faf99=0.341269;non_neuro_faf95_nfe=0.466809;non_neuro_faf99_nfe=0.466809;non_neuro_faf95=0.392443;non_neuro_faf99=0.392443;non_topmed_faf95_nfe=0.453896;non_topmed_faf99_nfe=0.453895;controls_faf95_eas=0.129733;controls_faf99_eas=0.129733;faf95_nfe=0.465648;faf99_nfe=0.465648;non_topmed_faf95_eas=0.13514;non_topmed_faf99_eas=0.135141;controls_faf95_amr=0.417865;controls_faf99_amr=0.417865;non_neuro_faf95_eas=0.133362;non_neuro_faf99_eas=0.133363;non_topmed_faf95_afr=0.221629;non_topmed_faf99_afr=0.221629;controls_faf95=0.317945;controls_faf99=0.317945;controls_popmax=amr;controls_AC_popmax=70;controls_AN_popmax=136;controls_AF_popmax=0.514706;controls_nhomalt_popmax=17;popmax=amr;AC_popmax=302;AN_popmax=582;AF_popmax=0.5189;nhomalt_popmax=71;age_hist_het_bin_freq=187|208|358|521|663|483|366|190|104|64;age_hist_het_n_smaller=786;age_hist_het_n_larger=21;age_hist_hom_bin_freq=59|80|122|225|270|191|128|86|31|9;age_hist_hom_n_smaller=184;age_hist_hom_n_larger=5;non_neuro_popmax=amr;non_neuro_AC_popmax=220;non_neuro_AN_popmax=418;non_neuro_AF_popmax=0.526316;non_neuro_nhomalt_popmax=53;non_topmed_popmax=amr;non_topmed_AC_popmax=281;non_topmed_AN_popmax=548;non_topmed_AF_popmax=0.512774;non_topmed_nhomalt_popmax=65"); writer.Flush(); stream.Position = 0; return stream; } private static Stream GetChr22_21006257_exome() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("22\t21006258\trs992970331\tTAAA\tT\t134845\tAC0\tAC=0;AN=0;rf_tp_probability=0.858094;FS=1.002;InbreedingCoeff=0.7272;MQ=60;MQRankSum=0.727;QD=37.76;ReadPosRankSum=0.736;SOR=1.029;VQSR_POSITIVE_TRAIN_SITE;BaseQRankSum=0.727;ClippingRankSum=0.731;DP=33139;VQSLOD=2.99;VQSR_culprit=MQRankSum;lcr;rf_positive_label;rf_label=TP;rf_train;variant_type=mixed;allele_type=del;n_alt_alleles=6;was_mixed;pab_max=1;gq_hist_alt_bin_freq=23|313|25|5|4|2|1|1|2|1|0|1|1|0|1|0|0|0|0|1;gq_hist_all_bin_freq=4777|1217|53|11|6|11|6|1|4|1|0|1|2|2|4|0|1|0|0|1;dp_hist_alt_bin_freq=373|8|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=164272|59|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|3|0|1|15|0|3|1|0|0|1;AC_nfe_seu=0;AN_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=0;controls_nhomalt_afr_male=0;non_neuro_AC_eas_kor=0;non_neuro_AN_eas_kor=0;non_neuro_nhomalt_eas_kor=0;non_topmed_AC_amr=0;non_topmed_AN_amr=0;non_topmed_nhomalt_amr=0;non_cancer_AC_asj_female=0;non_cancer_AN_asj_female=0;non_cancer_nhomalt_asj_female=0;AC_raw=578;AN_raw=9596;AF_raw=0.0602334;nhomalt_raw=279;AC_fin_female=0;AN_fin_female=0;nhomalt_fin_female=0;non_cancer_AC_oth_female=0;non_cancer_AN_oth_female=0;non_cancer_nhomalt_oth_female=0;AC_nfe_bgr=0;AN_nfe_bgr=0;nhomalt_nfe_bgr=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=0;non_neuro_nhomalt_asj_female=0;AC_sas_male=0;AN_sas_male=0;nhomalt_sas_male=0;non_neuro_AC_afr_male=0;non_neuro_AN_afr_male=0;non_neuro_nhomalt_afr_male=0;AC_afr_male=0;AN_afr_male=0;nhomalt_afr_male=0;AC_afr=0;AN_afr=0;nhomalt_afr=0;controls_AC_nfe_swe=0;controls_AN_nfe_swe=0;controls_nhomalt_nfe_swe=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=0;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=0;non_topmed_AN_amr_female=0;non_topmed_nhomalt_amr_female=0;non_cancer_AC_female=0;non_cancer_AN_female=0;non_cancer_nhomalt_female=0;non_cancer_AC_nfe_onf=0;non_cancer_AN_nfe_onf=0;non_cancer_nhomalt_nfe_onf=0;non_cancer_AC_male=0;non_cancer_AN_male=0;non_cancer_nhomalt_male=0;non_topmed_AC_oth_female=0;non_topmed_AN_oth_female=0;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=0;nhomalt_eas_female=0;non_cancer_AC_sas_female=0;non_cancer_AN_sas_female=0;non_cancer_nhomalt_sas_female=0;AC_afr_female=0;AN_afr_female=0;nhomalt_afr_female=0;AC_sas=0;AN_sas=0;nhomalt_sas=0;non_neuro_AC_female=0;non_neuro_AN_female=0;non_neuro_nhomalt_female=0;controls_AC_afr=0;controls_AN_afr=0;controls_nhomalt_afr=0;non_neuro_AC_eas_jpn=0;non_neuro_AN_eas_jpn=0;non_neuro_nhomalt_eas_jpn=0;AC_nfe_onf=0;AN_nfe_onf=0;nhomalt_nfe_onf=0;non_cancer_AC_amr_male=0;non_cancer_AN_amr_male=0;non_cancer_nhomalt_amr_male=0;controls_AC_fin_male=0;controls_AN_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=0;non_neuro_AN_nfe_nwe=0;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=0;AN_nfe_female=0;nhomalt_nfe_female=0;AC_amr=0;AN_amr=0;nhomalt_amr=0;non_topmed_AC_nfe_male=0;non_topmed_AN_nfe_male=0;non_topmed_nhomalt_nfe_male=0;non_neuro_AC_sas=0;non_neuro_AN_sas=0;non_neuro_nhomalt_sas=0;non_cancer_AC_fin_male=0;non_cancer_AN_fin_male=0;non_cancer_nhomalt_fin_male=0;non_cancer_AC_nfe_seu=0;non_cancer_AN_nfe_seu=0;non_cancer_nhomalt_nfe_seu=0;AC_eas=0;AN_eas=0;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=0;non_neuro_AN_nfe_female=0;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=0;non_neuro_AN_afr=0;non_neuro_nhomalt_afr=0;controls_AC_raw=246;controls_AN_raw=4364;controls_AF_raw=0.0563703;controls_nhomalt_raw=119;non_cancer_AC_eas=0;non_cancer_AN_eas=0;non_cancer_nhomalt_eas=0;non_cancer_AC_amr_female=0;non_cancer_AN_amr_female=0;non_cancer_nhomalt_amr_female=0;non_neuro_AC_nfe_swe=0;non_neuro_AN_nfe_swe=0;non_neuro_nhomalt_nfe_swe=0;controls_AC_male=0;controls_AN_male=0;controls_nhomalt_male=0;non_topmed_AC_male=0;non_topmed_AN_male=0;non_topmed_nhomalt_male=0;controls_AC_eas_jpn=0;controls_AN_eas_jpn=0;controls_nhomalt_eas_jpn=0;controls_AC_nfe_female=0;controls_AN_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=0;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=0;non_neuro_nhomalt_fin=0;non_topmed_AC_sas=0;non_topmed_AN_sas=0;non_topmed_nhomalt_sas=0;non_cancer_AC_nfe_female=0;non_cancer_AN_nfe_female=0;non_cancer_nhomalt_nfe_female=0;AC_oth_female=0;AN_oth_female=0;nhomalt_oth_female=0;non_cancer_AC_asj=0;non_cancer_AN_asj=0;non_cancer_nhomalt_asj=0;AC_nfe_swe=0;AN_nfe_swe=0;nhomalt_nfe_swe=0;controls_AC_nfe=0;controls_AN_nfe=0;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=0;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=0;AN_nfe_nwe=0;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=0;controls_nhomalt_nfe_seu=0;controls_AC_sas_female=0;controls_AN_sas_female=0;controls_nhomalt_sas_female=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=0;non_neuro_nhomalt_amr_female=0;non_cancer_AC_eas_jpn=0;non_cancer_AN_eas_jpn=0;non_cancer_nhomalt_eas_jpn=0;non_neuro_AC_nfe_onf=0;non_neuro_AN_nfe_onf=0;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=0;non_topmed_nhomalt_eas_male=0;AC_eas_jpn=0;AN_eas_jpn=0;nhomalt_eas_jpn=0;non_cancer_AC_afr_male=0;non_cancer_AN_afr_male=0;non_cancer_nhomalt_afr_male=0;non_cancer_AC_afr=0;non_cancer_AN_afr=0;non_cancer_nhomalt_afr=0;controls_AC_amr_female=0;controls_AN_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=0;AN_female=0;nhomalt_female=0;non_neuro_AC_nfe_bgr=0;non_neuro_AN_nfe_bgr=0;non_neuro_nhomalt_nfe_bgr=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=0;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=0;non_topmed_AN_nfe_nwe=0;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=0;non_topmed_nhomalt_amr_male=0;non_cancer_AC_amr=0;non_cancer_AN_amr=0;non_cancer_nhomalt_amr=0;non_topmed_AC_nfe_swe=0;non_topmed_AN_nfe_swe=0;non_topmed_nhomalt_nfe_swe=0;non_topmed_AC_nfe_onf=0;non_topmed_AN_nfe_onf=0;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_kor=0;controls_AN_eas_kor=0;controls_nhomalt_eas_kor=0;non_topmed_AC_eas_oea=0;non_topmed_AN_eas_oea=0;non_topmed_nhomalt_eas_oea=0;controls_AC_eas_male=0;controls_AN_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=0;non_topmed_AN=0;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=0;controls_nhomalt_fin=0;AC_eas_kor=0;AN_eas_kor=0;nhomalt_eas_kor=0;non_neuro_AC_nfe=0;non_neuro_AN_nfe=0;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=0;non_neuro_nhomalt_fin_female=0;non_cancer_AC_nfe_male=0;non_cancer_AN_nfe_male=0;non_cancer_nhomalt_nfe_male=0;controls_AC_eas_oea=0;controls_AN_eas_oea=0;controls_nhomalt_eas_oea=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=0;non_neuro_AN=0;non_neuro_nhomalt=0;AC_eas_oea=0;AN_eas_oea=0;nhomalt_eas_oea=0;non_topmed_AC_nfe=0;non_topmed_AN_nfe=0;non_topmed_nhomalt_nfe=0;non_cancer_AC_oth=0;non_cancer_AN_oth=0;non_cancer_nhomalt_oth=0;non_topmed_AC_raw=574;non_topmed_AN_raw=9488;non_topmed_AF_raw=0.0604975;non_topmed_nhomalt_raw=277;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=0;non_topmed_nhomalt_oth_male=0;non_cancer_AC_oth_male=0;non_cancer_AN_oth_male=0;non_cancer_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=0;nhomalt_nfe_est=0;non_cancer_AC_afr_female=0;non_cancer_AN_afr_female=0;non_cancer_nhomalt_afr_female=0;non_topmed_AC_afr_male=0;non_topmed_AN_afr_male=0;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=0;non_neuro_nhomalt_eas_male=0;non_cancer_AC_nfe_nwe=0;non_cancer_AN_nfe_nwe=0;non_cancer_nhomalt_nfe_nwe=0;controls_AC_sas=0;controls_AN_sas=0;controls_nhomalt_sas=0;non_neuro_AC_sas_male=0;non_neuro_AN_sas_male=0;non_neuro_nhomalt_sas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=0;non_neuro_nhomalt_asj_male=0;non_cancer_AC_nfe_bgr=0;non_cancer_AN_nfe_bgr=0;non_cancer_nhomalt_nfe_bgr=0;controls_AC_oth=0;controls_AN_oth=0;controls_nhomalt_oth=0;non_cancer_AC_eas_female=0;non_cancer_AN_eas_female=0;non_cancer_nhomalt_eas_female=0;AC_nfe=0;AN_nfe=0;nhomalt_nfe=0;non_topmed_AC_female=0;non_topmed_AN_female=0;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=457;non_neuro_AN_raw=8156;non_neuro_AF_raw=0.0560324;non_neuro_nhomalt_raw=220;non_topmed_AC_eas=0;non_topmed_AN_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=0;non_topmed_nhomalt_fin_male=0;non_cancer_AC_asj_male=0;non_cancer_AN_asj_male=0;non_cancer_nhomalt_asj_male=0;AC_fin=0;AN_fin=0;nhomalt_fin=0;AC_nfe_male=0;AN_nfe_male=0;nhomalt_nfe_male=0;non_topmed_AC_eas_kor=0;non_topmed_AN_eas_kor=0;non_topmed_nhomalt_eas_kor=0;controls_AC_amr_male=0;controls_AN_amr_male=0;controls_nhomalt_amr_male=0;non_neuro_AC_eas_oea=0;non_neuro_AN_eas_oea=0;non_neuro_nhomalt_eas_oea=0;AC_sas_female=0;AN_sas_female=0;nhomalt_sas_female=0;controls_AC_afr_female=0;controls_AN_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=0;controls_nhomalt_amr=0;non_topmed_AC_eas_jpn=0;non_topmed_AN_eas_jpn=0;non_topmed_nhomalt_eas_jpn=0;AC_asj_female=0;AN_asj_female=0;nhomalt_asj_female=0;non_topmed_AC_nfe_bgr=0;non_topmed_AN_nfe_bgr=0;non_topmed_nhomalt_nfe_bgr=0;non_cancer_AC_nfe_est=0;non_cancer_AN_nfe_est=0;non_cancer_nhomalt_nfe_est=0;non_neuro_AC_eas=0;non_neuro_AN_eas=0;non_neuro_nhomalt_eas=0;non_cancer_AC_nfe=0;non_cancer_AN_nfe=0;non_cancer_nhomalt_nfe=0;non_neuro_AC_male=0;non_neuro_AN_male=0;non_neuro_nhomalt_male=0;non_neuro_AC_sas_female=0;non_neuro_AN_sas_female=0;non_neuro_nhomalt_sas_female=0;AC_asj=0;AN_asj=0;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=0;non_topmed_nhomalt_asj_female=0;non_cancer_AC_nfe_swe=0;non_cancer_AN_nfe_swe=0;non_cancer_nhomalt_nfe_swe=0;non_cancer_AC=0;non_cancer_AN=0;non_cancer_nhomalt=0;non_topmed_AC_oth=0;non_topmed_AN_oth=0;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=0;non_topmed_nhomalt_fin_female=0;non_cancer_AC_fin_female=0;non_cancer_AN_fin_female=0;non_cancer_nhomalt_fin_female=0;AC_oth=0;AN_oth=0;nhomalt_oth=0;non_neuro_AC_nfe_male=0;non_neuro_AN_nfe_male=0;non_neuro_nhomalt_nfe_male=0;controls_AC_female=0;controls_AN_female=0;controls_nhomalt_female=0;non_cancer_AC_fin=0;non_cancer_AN_fin=0;non_cancer_nhomalt_fin=0;non_topmed_AC_fin=0;non_topmed_AN_fin=0;non_topmed_nhomalt_fin=0;non_cancer_AC_eas_oea=0;non_cancer_AN_eas_oea=0;non_cancer_nhomalt_eas_oea=0;non_topmed_AC_nfe_female=0;non_topmed_AN_nfe_female=0;non_topmed_nhomalt_nfe_female=0;non_cancer_AC_sas_male=0;non_cancer_AN_sas_male=0;non_cancer_nhomalt_sas_male=0;controls_AC_asj_male=0;controls_AN_asj_male=0;controls_nhomalt_asj_male=0;non_cancer_AC_raw=562;non_cancer_AN_raw=9344;non_cancer_AF_raw=0.0601455;non_cancer_nhomalt_raw=272;non_cancer_AC_eas_male=0;non_cancer_AN_eas_male=0;non_cancer_nhomalt_eas_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=0;non_neuro_AN_oth=0;non_neuro_nhomalt_oth=0;AC_male=0;AN_male=0;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_nfe_bgr=0;controls_AN_nfe_bgr=0;controls_nhomalt_nfe_bgr=0;controls_AC_asj_female=0;controls_AN_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=0;nhomalt_amr_male=0;AC_amr_female=0;AN_amr_female=0;nhomalt_amr_female=0;non_topmed_AC_sas_male=0;non_topmed_AN_sas_male=0;non_topmed_nhomalt_sas_male=0;AC_oth_male=0;AN_oth_male=0;nhomalt_oth_male=0;non_cancer_AC_sas=0;non_cancer_AN_sas=0;non_cancer_nhomalt_sas=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_cancer_AC_eas_kor=0;non_cancer_AN_eas_kor=0;non_cancer_nhomalt_eas_kor=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=0;non_topmed_nhomalt_afr_female=0;controls_AC_sas_male=0;controls_AN_sas_male=0;controls_nhomalt_sas_male=0;non_topmed_AC_sas_female=0;non_topmed_AN_sas_female=0;non_topmed_nhomalt_sas_female=0;non_topmed_AC_afr=0;non_topmed_AN_afr=0;non_topmed_nhomalt_afr=0;controls_AC=0;controls_AN=0;controls_nhomalt=0;non_neuro_AC_oth_female=0;non_neuro_AN_oth_female=0;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0;non_topmed_faf99_amr=0;faf95_afr=0;faf99_afr=0;faf95_sas=0;faf99_sas=0;controls_faf95_afr=0;controls_faf99_afr=0;faf95_amr=0;faf99_amr=0;non_neuro_faf95_sas=0;non_neuro_faf99_sas=0;faf95_eas=0;faf99_eas=0;faf95=0;faf99=0;non_neuro_faf95_afr=0;non_neuro_faf99_afr=0;non_cancer_faf95_eas=0;non_cancer_faf99_eas=0;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;non_topmed_faf95_sas=0;non_topmed_faf99_sas=0;controls_faf95_nfe=0;controls_faf99_nfe=0;non_cancer_faf95_afr=0;non_cancer_faf99_afr=0;non_cancer_faf95_amr=0;non_cancer_faf99_amr=0;non_topmed_faf95=0;non_topmed_faf99=0;non_neuro_faf95_nfe=0;non_neuro_faf99_nfe=0;non_neuro_faf95=0;non_neuro_faf99=0;non_topmed_faf95_nfe=0;non_topmed_faf99_nfe=0;controls_faf95_eas=0;controls_faf99_eas=0;controls_faf95_sas=0;controls_faf99_sas=0;faf95_nfe=0;faf99_nfe=0;non_topmed_faf95_eas=0;non_topmed_faf99_eas=0;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=0;non_neuro_faf99_eas=0;non_cancer_faf95_nfe=0;non_cancer_faf99_nfe=0;non_cancer_faf95=0;non_cancer_faf99=0;non_cancer_faf95_sas=0;non_cancer_faf99_sas=0;non_topmed_faf95_afr=0;non_topmed_faf99_afr=0;controls_faf95=0;controls_faf99=0;age_hist_het_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_het_n_smaller=0;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0"); writer.Flush(); stream.Position = 0; return stream; } /*[Fact] public void CombineCoverage_when_one_AN_is_zero() { var sequence = new SimpleSequence(new string('G', VariantUtils.MaxUpstreamLength) + "TAAA", 21006257 - VariantUtils.MaxUpstreamLength); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); var gnomadReader = new GnomadSnvReader(new StreamReader(GetChr22_21006257_genome()), new StreamReader(GetChr22_21006257_exome()), sequenceProvider); var items = gnomadReader.GetCombinedItems().ToList(); Assert.Single(items); Assert.Equal(21, items[0].Coverage); }*/ private static Stream GetChr22_22055876_genome() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("22\t22055876\trs113132860\tG\tT\t2.28838e+06\tPASS\tAC=228;AN=18686;AF=0.0122016;rf_tp_probability=0.374872;FS=2.447;InbreedingCoeff=0.0575;MQ=59.76;MQRankSum=-0.152;QD=15.22;ReadPosRankSum=-0.259;SOR=0.701;BaseQRankSum=-0.771;ClippingRankSum=0;DP=490478;VQSLOD=0.36;VQSR_culprit=FS;lcr;variant_type=mixed;allele_type=snv;n_alt_alleles=5;was_mixed;has_star;pab_max=1;gq_hist_alt_bin_freq=79|95|60|65|65|46|37|41|28|38|24|19|20|18|14|11|15|8|4|103;gq_hist_all_bin_freq=2799|865|616|841|825|614|753|768|561|775|568|463|553|397|310|247|235|179|221|5310;dp_hist_alt_bin_freq=18|144|272|199|78|56|18|5|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=95|956|2823|4449|4634|3537|2105|978|396|166|82|35|24|11|5|4|2|1|0|1;dp_hist_all_n_larger=6;ab_hist_alt_bin_freq=0|18|83|129|138|140|103|45|48|13|28|14|13|9|0|3|0|0|0|0;AC_nfe_seu=2;AN_nfe_seu=52;AF_nfe_seu=0.0384615;nhomalt_nfe_seu=0;controls_AC_afr_male=5;controls_AN_afr_male=806;controls_AF_afr_male=0.00620347;controls_nhomalt_afr_male=0;non_topmed_AC_amr=1;non_topmed_AN_amr=426;non_topmed_AF_amr=0.00234742;non_topmed_nhomalt_amr=0;AC_raw=559;AN_raw=27704;AF_raw=0.0201776;nhomalt_raw=4;AC_fin_female=32;AN_fin_female=618;AF_fin_female=0.0517799;nhomalt_fin_female=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=48;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=5;non_neuro_AN_afr_male=914;non_neuro_AF_afr_male=0.00547046;non_neuro_nhomalt_afr_male=0;AC_afr_male=15;AN_afr_male=2982;AF_afr_male=0.00503018;nhomalt_afr_male=0;AC_afr=25;AN_afr=5118;AF_afr=0.00488472;nhomalt_afr=0;non_neuro_AC_afr_female=3;non_neuro_AN_afr_female=1086;non_neuro_AF_afr_female=0.00276243;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=1;non_topmed_AN_amr_female=200;non_topmed_AF_amr_female=0.005;non_topmed_nhomalt_amr_female=0;non_topmed_AC_oth_female=7;non_topmed_AN_oth_female=260;non_topmed_AF_oth_female=0.0269231;non_topmed_nhomalt_oth_female=0;AC_eas_female=2;AN_eas_female=486;AF_eas_female=0.00411523;nhomalt_eas_female=0;AC_afr_female=10;AN_afr_female=2136;AF_afr_female=0.00468165;nhomalt_afr_female=0;non_neuro_AC_female=70;non_neuro_AN_female=5902;non_neuro_AF_female=0.0118604;non_neuro_nhomalt_female=0;controls_AC_afr=8;controls_AN_afr=1498;controls_AF_afr=0.00534045;controls_nhomalt_afr=0;AC_nfe_onf=11;AN_nfe_onf=1346;AF_nfe_onf=0.00817236;nhomalt_nfe_onf=0;controls_AC_fin_male=10;controls_AN_fin_male=180;controls_AF_fin_male=0.0555556;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=39;non_neuro_AN_nfe_nwe=5416;non_neuro_AF_nfe_nwe=0.00720089;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=26;AN_fin_male=508;AF_fin_male=0.0511811;nhomalt_fin_male=0;AC_nfe_female=54;AN_nfe_female=4252;AF_nfe_female=0.0126999;nhomalt_nfe_female=0;AC_amr=1;AN_amr=450;AF_amr=0.00222222;nhomalt_amr=0;non_topmed_AC_nfe_male=52;non_topmed_AN_nfe_male=3174;non_topmed_AF_nfe_male=0.0163831;non_topmed_nhomalt_nfe_male=0;AC_eas=4;AN_eas=1398;AF_eas=0.00286123;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=52;non_neuro_AN_nfe_female=3740;non_neuro_AF_nfe_female=0.0139037;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=8;non_neuro_AN_afr=2000;non_neuro_AF_afr=0.004;non_neuro_nhomalt_afr=0;controls_AC_raw=303;controls_AN_raw=9668;controls_AF_raw=0.0313405;controls_nhomalt_raw=0;controls_AC_male=65;controls_AN_male=3126;controls_AF_male=0.0207933;controls_nhomalt_male=0;non_topmed_AC_male=107;non_topmed_AN_male=8010;non_topmed_AF_male=0.0133583;non_topmed_nhomalt_male=0;controls_AC_nfe_female=40;controls_AN_nfe_female=1234;controls_AF_nfe_female=0.0324149;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=320;non_neuro_AF_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=2;non_neuro_AN_eas_female=486;non_neuro_AF_eas_female=0.00411523;non_neuro_nhomalt_eas_female=0;AC_asj_male=3;AN_asj_male=156;AF_asj_male=0.0192308;nhomalt_asj_male=0;controls_AC_nfe_male=41;controls_AN_nfe_male=1460;controls_AF_nfe_male=0.0280822;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=21;non_neuro_AN_fin=350;non_neuro_AF_fin=0.06;non_neuro_nhomalt_fin=0;AC_oth_female=7;AN_oth_female=274;AF_oth_female=0.0255474;nhomalt_oth_female=0;controls_AC_nfe=81;controls_AN_nfe=2694;controls_AF_nfe=0.0300668;controls_nhomalt_nfe=0;controls_AC_oth_female=2;controls_AN_oth_female=102;controls_AF_oth_female=0.0196078;controls_nhomalt_oth_female=0;controls_AC_asj=1;controls_AN_asj=18;controls_AF_asj=0.0555556;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=146;non_neuro_AF_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=13;controls_AN_nfe_nwe=342;controls_AF_nfe_nwe=0.0380117;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=47;AN_nfe_nwe=5898;AF_nfe_nwe=0.0079688;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=2;controls_AN_nfe_seu=28;controls_AF_nfe_seu=0.0714286;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=174;non_neuro_AF_amr_female=0;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=8;non_neuro_AN_nfe_onf=1138;non_neuro_AF_nfe_onf=0.00702988;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=2;non_topmed_AN_eas_male=894;non_topmed_AF_eas_male=0.00223714;non_topmed_nhomalt_eas_male=0;controls_AC_amr_female=0;controls_AN_amr_female=48;controls_AF_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=10;non_neuro_AN_fin_male=180;non_neuro_AF_fin_male=0.0555556;non_neuro_nhomalt_fin_male=0;AC_female=106;AN_female=8036;AF_female=0.0131906;nhomalt_female=0;non_neuro_AC_oth_male=6;non_neuro_AN_oth_male=222;non_neuro_AF_oth_male=0.027027;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=61;non_topmed_AN_nfe_est=2506;non_topmed_AF_nfe_est=0.0243416;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=28;non_topmed_AN_nfe_nwe=3100;non_topmed_AF_nfe_nwe=0.00903226;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=226;non_topmed_AF_amr_male=0;non_topmed_nhomalt_amr_male=0;non_topmed_AC_nfe_onf=9;non_topmed_AN_nfe_onf=898;non_topmed_AF_nfe_onf=0.0100223;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=2;controls_AN_eas_male=514;controls_AF_eas_male=0.00389105;controls_nhomalt_eas_male=0;controls_AC_oth_male=6;controls_AN_oth_male=94;controls_AF_oth_male=0.0638298;controls_nhomalt_oth_male=0;non_topmed_AC=206;non_topmed_AN=15064;non_topmed_AF=0.013675;non_topmed_nhomalt=0;controls_AC_fin=21;controls_AN_fin=348;controls_AF_fin=0.0603448;controls_nhomalt_fin=0;non_neuro_AC_nfe=110;non_neuro_AN_nfe=8768;non_neuro_AF_nfe=0.0125456;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=11;non_neuro_AN_fin_female=170;non_neuro_AF_fin_female=0.0647059;non_neuro_nhomalt_fin_female=0;non_topmed_AC_nfe_seu=2;non_topmed_AN_nfe_seu=52;non_topmed_AF_nfe_seu=0.0384615;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=2;controls_AN_eas_female=314;controls_AF_eas_female=0.00636943;controls_nhomalt_eas_female=0;non_topmed_AC_asj=3;non_topmed_AN_asj=78;non_topmed_AF_asj=0.0384615;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=5;controls_AN_nfe_onf=164;controls_AF_nfe_onf=0.0304878;controls_nhomalt_nfe_onf=0;non_neuro_AC=152;non_neuro_AN=13452;non_neuro_AF=0.0112994;non_neuro_nhomalt=0;non_topmed_AC_nfe=100;non_topmed_AN_nfe=6556;non_topmed_AF_nfe=0.0152532;non_topmed_nhomalt_nfe=0;non_topmed_AC_raw=512;non_topmed_AN_raw=23304;non_topmed_AF_raw=0.0219705;non_topmed_nhomalt_raw=4;non_neuro_AC_nfe_est=61;non_neuro_AN_nfe_est=2188;non_neuro_AF_nfe_est=0.0278793;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=9;non_topmed_AN_oth_male=248;non_topmed_AF_oth_male=0.0362903;non_topmed_nhomalt_oth_male=0;AC_nfe_est=61;AN_nfe_est=2522;AF_nfe_est=0.0241872;nhomalt_nfe_est=0;non_topmed_AC_afr_male=15;non_topmed_AN_afr_male=2918;non_topmed_AF_afr_male=0.00514051;non_topmed_nhomalt_afr_male=0;AC_eas_male=2;AN_eas_male=912;AF_eas_male=0.00219298;nhomalt_eas_male=0;controls_AC_eas=4;controls_AN_eas=828;controls_AF_eas=0.00483092;controls_nhomalt_eas=0;non_neuro_AC_eas_male=2;non_neuro_AN_eas_male=912;non_neuro_AF_eas_male=0.00219298;non_neuro_nhomalt_eas_male=0;non_neuro_AC_asj_male=1;non_neuro_AN_asj_male=148;non_neuro_AF_asj_male=0.00675676;non_neuro_nhomalt_asj_male=0;controls_AC_oth=8;controls_AN_oth=196;controls_AF_oth=0.0408163;controls_nhomalt_oth=0;AC_nfe=121;AN_nfe=9818;AF_nfe=0.0123243;nhomalt_nfe=0;non_topmed_AC_female=99;non_topmed_AN_female=7054;non_topmed_AF_female=0.0140346;non_topmed_nhomalt_female=0;non_neuro_AC_asj=1;non_neuro_AN_asj=196;non_neuro_AF_asj=0.00510204;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=2;non_topmed_AN_eas_female=470;non_topmed_AF_eas_female=0.00425532;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=362;non_neuro_AN_raw=19100;non_neuro_AF_raw=0.0189529;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=4;non_topmed_AN_eas=1364;non_topmed_AF_eas=0.00293255;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=26;non_topmed_AN_fin_male=508;non_topmed_AF_fin_male=0.0511811;non_topmed_nhomalt_fin_male=0;AC_fin=58;AN_fin=1126;AF_fin=0.0515098;nhomalt_fin=0;AC_nfe_male=67;AN_nfe_male=5566;AF_nfe_male=0.0120374;nhomalt_nfe_male=0;controls_AC_amr_male=0;controls_AN_amr_male=68;controls_AF_amr_male=0;controls_nhomalt_amr_male=0;controls_AC_afr_female=3;controls_AN_afr_female=692;controls_AF_afr_female=0.00433526;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=116;controls_AF_amr=0;controls_nhomalt_amr=0;AC_asj_female=0;AN_asj_female=54;AF_asj_female=0;nhomalt_asj_female=0;non_neuro_AC_eas=4;non_neuro_AN_eas=1398;non_neuro_AF_eas=0.00286123;non_neuro_nhomalt_eas=0;non_neuro_AC_male=82;non_neuro_AN_male=7550;non_neuro_AF_male=0.0108609;non_neuro_nhomalt_male=0;AC_asj=3;AN_asj=210;AF_asj=0.0142857;nhomalt_asj=0;controls_AC_nfe_est=61;controls_AN_nfe_est=2160;controls_AF_nfe_est=0.0282407;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=36;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=16;non_topmed_AN_oth=508;non_topmed_AF_oth=0.0314961;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=32;non_topmed_AN_fin_female=618;non_topmed_AF_fin_female=0.0517799;non_topmed_nhomalt_fin_female=0;AC_oth=16;AN_oth=566;AF_oth=0.0282686;nhomalt_oth=0;non_neuro_AC_nfe_male=58;non_neuro_AN_nfe_male=5028;non_neuro_AF_nfe_male=0.0115354;non_neuro_nhomalt_nfe_male=0;controls_AC_female=58;controls_AN_female=2572;controls_AF_female=0.0225505;controls_nhomalt_female=0;non_topmed_AC_fin=58;non_topmed_AN_fin=1126;non_topmed_AF_fin=0.0515098;non_topmed_nhomalt_fin=0;non_topmed_AC_nfe_female=48;non_topmed_AN_nfe_female=3382;non_topmed_AF_nfe_female=0.0141928;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=1;controls_AN_asj_male=4;controls_AF_asj_male=0.25;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=3;non_topmed_AN_asj_male=42;non_topmed_AF_asj_male=0.0714286;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=8;non_neuro_AN_oth=420;non_neuro_AF_oth=0.0190476;non_neuro_nhomalt_oth=0;AC_male=122;AN_male=10650;AF_male=0.0114554;nhomalt_male=0;controls_AC_fin_female=11;controls_AN_fin_female=168;controls_AF_fin_female=0.0654762;controls_nhomalt_fin_female=0;controls_AC_asj_female=0;controls_AN_asj_female=14;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=234;AF_amr_male=0;nhomalt_amr_male=0;AC_amr_female=1;AN_amr_female=216;AF_amr_female=0.00462963;nhomalt_amr_female=0;AC_oth_male=9;AN_oth_male=292;AF_oth_male=0.0308219;nhomalt_oth_male=0;non_neuro_AC_nfe_seu=2;non_neuro_AN_nfe_seu=26;non_neuro_AF_nfe_seu=0.0769231;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=9;non_topmed_AN_afr_female=2088;non_topmed_AF_afr_female=0.00431034;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=24;non_topmed_AN_afr=5006;non_topmed_AF_afr=0.00479425;non_topmed_nhomalt_afr=0;controls_AC=123;controls_AN=5698;controls_AF=0.0215865;controls_nhomalt=0;non_neuro_AC_oth_female=2;non_neuro_AN_oth_female=198;non_neuro_AF_oth_female=0.010101;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0.00012;non_topmed_faf99_amr=0.00012;faf95_afr=0.00339534;faf99_afr=0.00339557;controls_faf95_afr=0.00265678;controls_faf99_afr=0.00265672;faf95_amr=0.000113;faf99_amr=0.000113;faf95_eas=0.00097636;faf99_eas=0.00097723;faf95=0.0109033;faf99=0.0109026;non_neuro_faf95_afr=0.00199003;non_neuro_faf99_afr=0.00198958;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;controls_faf95_nfe=0.0247904;controls_faf99_nfe=0.0247903;non_topmed_faf95=0.0121462;non_topmed_faf99=0.0121458;non_neuro_faf95_nfe=0.0106444;non_neuro_faf99_nfe=0.0106446;non_neuro_faf95=0.00983491;non_neuro_faf99=0.00983455;non_topmed_faf95_nfe=0.0128339;non_topmed_faf99_nfe=0.0128338;controls_faf95_eas=0.00164922;controls_faf99_eas=0.00164971;faf95_nfe=0.0105404;faf99_nfe=0.0105404;non_topmed_faf95_eas=0.00100131;non_topmed_faf99_eas=0.00100135;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=0.00097636;non_neuro_faf99_eas=0.00097723;non_topmed_faf95_afr=0.00330485;non_topmed_faf99_afr=0.0033051;controls_faf95=0.018487;controls_faf99=0.0184878;controls_popmax=nfe;controls_AC_popmax=81;controls_AN_popmax=2694;controls_AF_popmax=0.0300668;controls_nhomalt_popmax=0;popmax=nfe;AC_popmax=121;AN_popmax=9818;AF_popmax=0.0123243;nhomalt_popmax=0;age_hist_het_bin_freq=12|8|16|15|13|24|18|13|9|2;age_hist_het_n_smaller=33;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=nfe;non_neuro_AC_popmax=110;non_neuro_AN_popmax=8768;non_neuro_AF_popmax=0.0125456;non_neuro_nhomalt_popmax=0;non_topmed_popmax=nfe;non_topmed_AC_popmax=100;non_topmed_AN_popmax=6556;non_topmed_AF_popmax=0.0152532;non_topmed_nhomalt_popmax=0"); writer.WriteLine("22\t22055876\trs78003688\tG\tT\t4.27261e+06\tRF\tAC=195;AN=21390;AF=0.00911641;rf_tp_probability=0.0468734;FS=4.051;InbreedingCoeff=-0.0363;MQ=59.94;MQRankSum=-0.258;QD=21.64;ReadPosRankSum=-1.148;SOR=0.757;VQSR_NEGATIVE_TRAIN_SITE;BaseQRankSum=-1.472;ClippingRankSum=-0.047;DP=494695;VQSLOD=-1.473;VQSR_culprit=MQ;lcr;variant_type=snv;allele_type=snv;n_alt_alleles=1;has_star;pab_max=1;gq_hist_alt_bin_freq=88|80|83|69|65|58|49|34|24|24|28|14|24|9|14|6|7|6|5|40;gq_hist_all_bin_freq=4274|544|589|606|608|617|697|698|573|702|686|478|749|320|385|273|334|114|204|6729;dp_hist_alt_bin_freq=8|90|181|166|124|81|50|15|7|2|2|0|0|0|0|0|1|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=52|524|2070|3958|4924|4228|2550|1134|487|196|78|51|20|11|13|5|3|1|2|1;dp_hist_all_n_larger=2;ab_hist_alt_bin_freq=1|17|92|144|168|106|89|39|25|10|23|6|1|3|1|1|1|0|0|0;AC_nfe_seu=0;AN_nfe_seu=80;AF_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=2;controls_AN_afr_male=1042;controls_AF_afr_male=0.00191939;controls_nhomalt_afr_male=0;non_topmed_AC_amr=3;non_topmed_AN_amr=558;non_topmed_AF_amr=0.00537634;non_topmed_nhomalt_amr=0;AC_raw=553;AN_raw=31242;AF_raw=0.0177005;nhomalt_raw=0;AC_fin_female=11;AN_fin_female=954;AF_fin_female=0.0115304;nhomalt_fin_female=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=42;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=2;non_neuro_AN_afr_male=1210;non_neuro_AF_afr_male=0.00165289;non_neuro_nhomalt_afr_male=0;AC_afr_male=18;AN_afr_male=3792;AF_afr_male=0.00474684;nhomalt_afr_male=0;AC_afr=40;AN_afr=6628;AF_afr=0.006035;nhomalt_afr=0;non_neuro_AC_afr_female=10;non_neuro_AN_afr_female=1392;non_neuro_AF_afr_female=0.00718391;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=0;non_topmed_AN_amr_female=278;non_topmed_AF_amr_female=0;non_topmed_nhomalt_amr_female=0;non_topmed_AC_oth_female=1;non_topmed_AN_oth_female=334;non_topmed_AF_oth_female=0.00299401;non_topmed_nhomalt_oth_female=0;AC_eas_female=3;AN_eas_female=340;AF_eas_female=0.00882353;nhomalt_eas_female=0;AC_afr_female=22;AN_afr_female=2836;AF_afr_female=0.0077574;nhomalt_afr_female=0;non_neuro_AC_female=66;non_neuro_AN_female=6530;non_neuro_AF_female=0.0101072;non_neuro_nhomalt_female=0;controls_AC_afr=11;controls_AN_afr=1910;controls_AF_afr=0.00575916;controls_nhomalt_afr=0;AC_nfe_onf=13;AN_nfe_onf=1572;AF_nfe_onf=0.00826972;nhomalt_nfe_onf=0;controls_AC_fin_male=4;controls_AN_fin_male=328;controls_AF_fin_male=0.0121951;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=49;non_neuro_AN_nfe_nwe=5624;non_neuro_AF_nfe_nwe=0.00871266;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=12;AN_fin_male=808;AF_fin_male=0.0148515;nhomalt_fin_male=0;AC_nfe_female=50;AN_nfe_female=4632;AF_nfe_female=0.0107945;nhomalt_nfe_female=0;AC_amr=3;AN_amr=590;AF_amr=0.00508475;nhomalt_amr=0;non_topmed_AC_nfe_male=42;non_topmed_AN_nfe_male=3640;non_topmed_AF_nfe_male=0.0115385;non_topmed_nhomalt_nfe_male=0;AC_eas=8;AN_eas=916;AF_eas=0.00873362;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=49;non_neuro_AN_nfe_female=4026;non_neuro_AF_nfe_female=0.0121709;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=12;non_neuro_AN_afr=2602;non_neuro_AF_afr=0.00461184;non_neuro_nhomalt_afr=0;controls_AC_raw=230;controls_AN_raw=10790;controls_AF_raw=0.021316;controls_nhomalt_raw=0;controls_AC_male=44;controls_AN_male=3586;controls_AF_male=0.0122699;controls_nhomalt_male=0;non_topmed_AC_male=84;non_topmed_AN_male=9378;non_topmed_AF_male=0.00895713;non_topmed_nhomalt_male=0;controls_AC_nfe_female=25;controls_AN_nfe_female=1444;controls_AF_nfe_female=0.017313;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=2;non_neuro_AN_amr=390;non_neuro_AF_amr=0.00512821;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=3;non_neuro_AN_eas_female=340;non_neuro_AF_eas_female=0.00882353;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=154;AF_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=33;controls_AN_nfe_male=1688;controls_AF_nfe_male=0.0195498;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=7;non_neuro_AN_fin=602;non_neuro_AF_fin=0.0116279;non_neuro_nhomalt_fin=0;AC_oth_female=1;AN_oth_female=344;AF_oth_female=0.00290698;nhomalt_oth_female=0;controls_AC_nfe=58;controls_AN_nfe=3132;controls_AF_nfe=0.0185185;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=122;controls_AF_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=28;controls_AF_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=2;non_neuro_AN_amr_male=162;non_neuro_AF_amr_male=0.0123457;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=5;controls_AN_nfe_nwe=578;controls_AF_nfe_nwe=0.00865052;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=51;AN_nfe_nwe=6324;AF_nfe_nwe=0.00806452;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=44;controls_AF_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=228;non_neuro_AF_amr_female=0;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=12;non_neuro_AN_nfe_onf=1254;non_neuro_AF_nfe_onf=0.00956938;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=5;non_topmed_AN_eas_male=564;non_topmed_AF_eas_male=0.00886525;non_topmed_nhomalt_eas_male=0;controls_AC_amr_female=0;controls_AN_amr_female=80;controls_AF_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=4;non_neuro_AN_fin_male=328;non_neuro_AF_fin_male=0.0121951;non_neuro_nhomalt_fin_male=0;AC_female=87;AN_female=9456;AF_female=0.00920051;nhomalt_female=0;non_neuro_AC_oth_male=3;non_neuro_AN_oth_male=250;non_neuro_AF_oth_male=0.012;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=52;non_topmed_AN_nfe_est=2578;non_topmed_AF_nfe_est=0.0201707;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=26;non_topmed_AN_nfe_nwe=3644;non_topmed_AF_nfe_nwe=0.00713502;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=3;non_topmed_AN_amr_male=280;non_topmed_AF_amr_male=0.0107143;non_topmed_nhomalt_amr_male=0;non_topmed_AC_nfe_onf=7;non_topmed_AN_nfe_onf=1098;non_topmed_AF_nfe_onf=0.00637523;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=2;controls_AN_eas_male=320;controls_AF_eas_male=0.00625;controls_nhomalt_eas_male=0;controls_AC_oth_male=1;controls_AN_oth_male=126;controls_AF_oth_male=0.00793651;controls_nhomalt_oth_male=0;non_topmed_AC=164;non_topmed_AN=17866;non_topmed_AF=0.00917945;non_topmed_nhomalt=0;controls_AC_fin=7;controls_AN_fin=600;controls_AF_fin=0.0116667;controls_nhomalt_fin=0;non_neuro_AC_nfe=112;non_neuro_AN_nfe=9188;non_neuro_AF_nfe=0.0121898;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=3;non_neuro_AN_fin_female=274;non_neuro_AF_fin_female=0.0109489;non_neuro_nhomalt_fin_female=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=80;non_topmed_AF_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=1;controls_AN_eas_female=212;controls_AF_eas_female=0.00471698;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=100;non_topmed_AF_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=2;controls_AN_nfe_onf=274;controls_AF_nfe_onf=0.00729927;controls_nhomalt_nfe_onf=0;non_neuro_AC=145;non_neuro_AN=14356;non_neuro_AF=0.0101003;non_neuro_nhomalt=0;non_topmed_AC_nfe=85;non_topmed_AN_nfe=7400;non_topmed_AF_nfe=0.0114865;non_topmed_nhomalt_nfe=0;non_topmed_AC_raw=461;non_topmed_AN_raw=26444;non_topmed_AF_raw=0.0174331;non_topmed_nhomalt_raw=0;non_neuro_AC_nfe_est=51;non_neuro_AN_nfe_est=2268;non_neuro_AF_nfe_est=0.0224868;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=4;non_topmed_AN_oth_male=320;non_topmed_AF_oth_male=0.0125;non_topmed_nhomalt_oth_male=0;AC_nfe_est=52;AN_nfe_est=2596;AF_nfe_est=0.0200308;nhomalt_nfe_est=0;non_topmed_AC_afr_male=18;non_topmed_AN_afr_male=3710;non_topmed_AF_afr_male=0.00485175;non_topmed_nhomalt_afr_male=0;AC_eas_male=5;AN_eas_male=576;AF_eas_male=0.00868056;nhomalt_eas_male=0;controls_AC_eas=3;controls_AN_eas=532;controls_AF_eas=0.0056391;controls_nhomalt_eas=0;non_neuro_AC_eas_male=5;non_neuro_AN_eas_male=576;non_neuro_AF_eas_male=0.00868056;non_neuro_nhomalt_eas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=138;non_neuro_AF_asj_male=0;non_neuro_nhomalt_asj_male=0;controls_AC_oth=1;controls_AN_oth=248;controls_AF_oth=0.00403226;controls_nhomalt_oth=0;AC_nfe=116;AN_nfe=10572;AF_nfe=0.0109724;nhomalt_nfe=0;non_topmed_AC_female=80;non_topmed_AN_female=8488;non_topmed_AF_female=0.00942507;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=180;non_neuro_AF_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=3;non_topmed_AN_eas_female=332;non_topmed_AF_eas_female=0.00903614;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=396;non_neuro_AN_raw=21158;non_neuro_AF_raw=0.0187163;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=8;non_topmed_AN_eas=896;non_topmed_AF_eas=0.00892857;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=12;non_topmed_AN_fin_male=808;non_topmed_AF_fin_male=0.0148515;non_topmed_nhomalt_fin_male=0;AC_fin=23;AN_fin=1762;AF_fin=0.0130533;nhomalt_fin=0;AC_nfe_male=66;AN_nfe_male=5940;AF_nfe_male=0.0111111;nhomalt_nfe_male=0;controls_AC_amr_male=2;controls_AN_amr_male=70;controls_AF_amr_male=0.0285714;controls_nhomalt_amr_male=0;controls_AC_afr_female=9;controls_AN_afr_female=868;controls_AF_afr_female=0.0103687;controls_nhomalt_afr_female=0;controls_AC_amr=2;controls_AN_amr=150;controls_AF_amr=0.0133333;controls_nhomalt_amr=0;AC_asj_female=0;AN_asj_female=56;AF_asj_female=0;nhomalt_asj_female=0;non_neuro_AC_eas=8;non_neuro_AN_eas=916;non_neuro_AF_eas=0.00873362;non_neuro_nhomalt_eas=0;non_neuro_AC_male=79;non_neuro_AN_male=7826;non_neuro_AF_male=0.0100946;non_neuro_nhomalt_male=0;AC_asj=0;AN_asj=210;AF_asj=0;nhomalt_asj=0;controls_AC_nfe_est=51;controls_AN_nfe_est=2236;controls_AF_nfe_est=0.0228086;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=44;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=5;non_topmed_AN_oth=654;non_topmed_AF_oth=0.00764526;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=11;non_topmed_AN_fin_female=954;non_topmed_AF_fin_female=0.0115304;non_topmed_nhomalt_fin_female=0;AC_oth=5;AN_oth=712;AF_oth=0.00702247;nhomalt_oth=0;non_neuro_AC_nfe_male=63;non_neuro_AN_nfe_male=5162;non_neuro_AF_nfe_male=0.0122046;non_neuro_nhomalt_nfe_male=0;controls_AC_female=38;controls_AN_female=3014;controls_AF_female=0.0126078;controls_nhomalt_female=0;non_topmed_AC_fin=23;non_topmed_AN_fin=1762;non_topmed_AF_fin=0.0130533;non_topmed_nhomalt_fin=0;non_topmed_AC_nfe_female=43;non_topmed_AN_nfe_female=3760;non_topmed_AF_nfe_female=0.0114362;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=12;controls_AF_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=56;non_topmed_AF_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=4;non_neuro_AN_oth=478;non_neuro_AF_oth=0.0083682;non_neuro_nhomalt_oth=0;AC_male=108;AN_male=11934;AF_male=0.00904977;nhomalt_male=0;controls_AC_fin_female=3;controls_AN_fin_female=272;controls_AF_fin_female=0.0110294;controls_nhomalt_fin_female=0;controls_AC_asj_female=0;controls_AN_asj_female=16;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=3;AN_amr_male=296;AF_amr_male=0.0101351;nhomalt_amr_male=0;AC_amr_female=0;AN_amr_female=294;AF_amr_female=0;nhomalt_amr_female=0;AC_oth_male=4;AN_oth_male=368;AF_oth_male=0.0108696;nhomalt_oth_male=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=42;non_neuro_AF_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=22;non_topmed_AN_afr_female=2786;non_topmed_AF_afr_female=0.00789663;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=40;non_topmed_AN_afr=6496;non_topmed_AF_afr=0.00615764;non_topmed_nhomalt_afr=0;controls_AC=82;controls_AN=6600;controls_AF=0.0124242;controls_nhomalt=0;non_neuro_AC_oth_female=1;non_neuro_AN_oth_female=228;non_neuro_AF_oth_female=0.00438596;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0.00146497;non_topmed_faf99_amr=0.00146446;faf95_afr=0.00455485;faf99_afr=0.0045552;controls_faf95_afr=0.0032293;controls_faf99_afr=0.00322926;faf95_amr=0.00138541;faf99_amr=0.00138591;faf95_eas=0.00434534;faf99_eas=0.00434526;faf95=0.00806925;faf99=0.00806959;non_neuro_faf95_afr=0.00266046;non_neuro_faf99_afr=0.00266027;non_neuro_faf95_amr=0.0009111;non_neuro_faf99_amr=0.00091051;controls_faf95_nfe=0.0147076;controls_faf99_nfe=0.014708;non_topmed_faf95=0.00803255;non_topmed_faf99=0.00803289;non_neuro_faf95_nfe=0.010359;non_neuro_faf99_nfe=0.0103585;non_neuro_faf95=0.00876127;non_neuro_faf99=0.00876123;non_topmed_faf95_nfe=0.00951637;non_topmed_faf99_nfe=0.00951633;controls_faf95_eas=0.00153645;controls_faf99_eas=0.00153604;faf95_nfe=0.00935211;faf99_nfe=0.00935142;non_topmed_faf95_eas=0.00444224;non_topmed_faf99_eas=0.00444266;controls_faf95_amr=0.00236834;controls_faf99_amr=0.00236814;non_neuro_faf95_eas=0.00434534;non_neuro_faf99_eas=0.00434526;non_topmed_faf95_afr=0.00464745;non_topmed_faf99_afr=0.0046478;controls_faf95=0.0102559;controls_faf99=0.0102563;controls_popmax=nfe;controls_AC_popmax=58;controls_AN_popmax=3132;controls_AF_popmax=0.0185185;controls_nhomalt_popmax=0;popmax=nfe;AC_popmax=116;AN_popmax=10572;AF_popmax=0.0109724;nhomalt_popmax=0;age_hist_het_bin_freq=14|10|14|12|23|18|16|7|5|4;age_hist_het_n_smaller=30;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=nfe;non_neuro_AC_popmax=112;non_neuro_AN_popmax=9188;non_neuro_AF_popmax=0.0121898;non_neuro_nhomalt_popmax=0;non_topmed_popmax=nfe;non_topmed_AC_popmax=85;non_topmed_AN_popmax=7400;non_topmed_AF_popmax=0.0114865;non_topmed_nhomalt_popmax=0"); writer.Flush(); stream.Position = 0; return stream; } private static Stream GetChr22_22055876_exome() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("22\t22055876\trs113132860\tG\tT\t2.28838e+06\tPASS\tAC=228;AN=18686;AF=0.0122016;rf_tp_probability=0.374872;FS=2.447;InbreedingCoeff=0.0575;MQ=59.76;MQRankSum=-0.152;QD=15.22;ReadPosRankSum=-0.259;SOR=0.701;BaseQRankSum=-0.771;ClippingRankSum=0;DP=490478;VQSLOD=0.36;VQSR_culprit=FS;lcr;variant_type=mixed;allele_type=snv;n_alt_alleles=5;was_mixed;has_star;pab_max=1;gq_hist_alt_bin_freq=79|95|60|65|65|46|37|41|28|38|24|19|20|18|14|11|15|8|4|103;gq_hist_all_bin_freq=2799|865|616|841|825|614|753|768|561|775|568|463|553|397|310|247|235|179|221|5310;dp_hist_alt_bin_freq=18|144|272|199|78|56|18|5|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=95|956|2823|4449|4634|3537|2105|978|396|166|82|35|24|11|5|4|2|1|0|1;dp_hist_all_n_larger=6;ab_hist_alt_bin_freq=0|18|83|129|138|140|103|45|48|13|28|14|13|9|0|3|0|0|0|0;AC_nfe_seu=2;AN_nfe_seu=52;AF_nfe_seu=0.0384615;nhomalt_nfe_seu=0;controls_AC_afr_male=5;controls_AN_afr_male=806;controls_AF_afr_male=0.00620347;controls_nhomalt_afr_male=0;non_topmed_AC_amr=1;non_topmed_AN_amr=426;non_topmed_AF_amr=0.00234742;non_topmed_nhomalt_amr=0;AC_raw=559;AN_raw=27704;AF_raw=0.0201776;nhomalt_raw=4;AC_fin_female=32;AN_fin_female=618;AF_fin_female=0.0517799;nhomalt_fin_female=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=48;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=5;non_neuro_AN_afr_male=914;non_neuro_AF_afr_male=0.00547046;non_neuro_nhomalt_afr_male=0;AC_afr_male=15;AN_afr_male=2982;AF_afr_male=0.00503018;nhomalt_afr_male=0;AC_afr=25;AN_afr=5118;AF_afr=0.00488472;nhomalt_afr=0;non_neuro_AC_afr_female=3;non_neuro_AN_afr_female=1086;non_neuro_AF_afr_female=0.00276243;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=1;non_topmed_AN_amr_female=200;non_topmed_AF_amr_female=0.005;non_topmed_nhomalt_amr_female=0;non_topmed_AC_oth_female=7;non_topmed_AN_oth_female=260;non_topmed_AF_oth_female=0.0269231;non_topmed_nhomalt_oth_female=0;AC_eas_female=2;AN_eas_female=486;AF_eas_female=0.00411523;nhomalt_eas_female=0;AC_afr_female=10;AN_afr_female=2136;AF_afr_female=0.00468165;nhomalt_afr_female=0;non_neuro_AC_female=70;non_neuro_AN_female=5902;non_neuro_AF_female=0.0118604;non_neuro_nhomalt_female=0;controls_AC_afr=8;controls_AN_afr=1498;controls_AF_afr=0.00534045;controls_nhomalt_afr=0;AC_nfe_onf=11;AN_nfe_onf=1346;AF_nfe_onf=0.00817236;nhomalt_nfe_onf=0;controls_AC_fin_male=10;controls_AN_fin_male=180;controls_AF_fin_male=0.0555556;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=39;non_neuro_AN_nfe_nwe=5416;non_neuro_AF_nfe_nwe=0.00720089;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=26;AN_fin_male=508;AF_fin_male=0.0511811;nhomalt_fin_male=0;AC_nfe_female=54;AN_nfe_female=4252;AF_nfe_female=0.0126999;nhomalt_nfe_female=0;AC_amr=1;AN_amr=450;AF_amr=0.00222222;nhomalt_amr=0;non_topmed_AC_nfe_male=52;non_topmed_AN_nfe_male=3174;non_topmed_AF_nfe_male=0.0163831;non_topmed_nhomalt_nfe_male=0;AC_eas=4;AN_eas=1398;AF_eas=0.00286123;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=52;non_neuro_AN_nfe_female=3740;non_neuro_AF_nfe_female=0.0139037;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=8;non_neuro_AN_afr=2000;non_neuro_AF_afr=0.004;non_neuro_nhomalt_afr=0;controls_AC_raw=303;controls_AN_raw=9668;controls_AF_raw=0.0313405;controls_nhomalt_raw=0;controls_AC_male=65;controls_AN_male=3126;controls_AF_male=0.0207933;controls_nhomalt_male=0;non_topmed_AC_male=107;non_topmed_AN_male=8010;non_topmed_AF_male=0.0133583;non_topmed_nhomalt_male=0;controls_AC_nfe_female=40;controls_AN_nfe_female=1234;controls_AF_nfe_female=0.0324149;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=320;non_neuro_AF_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=2;non_neuro_AN_eas_female=486;non_neuro_AF_eas_female=0.00411523;non_neuro_nhomalt_eas_female=0;AC_asj_male=3;AN_asj_male=156;AF_asj_male=0.0192308;nhomalt_asj_male=0;controls_AC_nfe_male=41;controls_AN_nfe_male=1460;controls_AF_nfe_male=0.0280822;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=21;non_neuro_AN_fin=350;non_neuro_AF_fin=0.06;non_neuro_nhomalt_fin=0;AC_oth_female=7;AN_oth_female=274;AF_oth_female=0.0255474;nhomalt_oth_female=0;controls_AC_nfe=81;controls_AN_nfe=2694;controls_AF_nfe=0.0300668;controls_nhomalt_nfe=0;controls_AC_oth_female=2;controls_AN_oth_female=102;controls_AF_oth_female=0.0196078;controls_nhomalt_oth_female=0;controls_AC_asj=1;controls_AN_asj=18;controls_AF_asj=0.0555556;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=146;non_neuro_AF_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=13;controls_AN_nfe_nwe=342;controls_AF_nfe_nwe=0.0380117;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=47;AN_nfe_nwe=5898;AF_nfe_nwe=0.0079688;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=2;controls_AN_nfe_seu=28;controls_AF_nfe_seu=0.0714286;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=174;non_neuro_AF_amr_female=0;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=8;non_neuro_AN_nfe_onf=1138;non_neuro_AF_nfe_onf=0.00702988;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=2;non_topmed_AN_eas_male=894;non_topmed_AF_eas_male=0.00223714;non_topmed_nhomalt_eas_male=0;controls_AC_amr_female=0;controls_AN_amr_female=48;controls_AF_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=10;non_neuro_AN_fin_male=180;non_neuro_AF_fin_male=0.0555556;non_neuro_nhomalt_fin_male=0;AC_female=106;AN_female=8036;AF_female=0.0131906;nhomalt_female=0;non_neuro_AC_oth_male=6;non_neuro_AN_oth_male=222;non_neuro_AF_oth_male=0.027027;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=61;non_topmed_AN_nfe_est=2506;non_topmed_AF_nfe_est=0.0243416;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=28;non_topmed_AN_nfe_nwe=3100;non_topmed_AF_nfe_nwe=0.00903226;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=226;non_topmed_AF_amr_male=0;non_topmed_nhomalt_amr_male=0;non_topmed_AC_nfe_onf=9;non_topmed_AN_nfe_onf=898;non_topmed_AF_nfe_onf=0.0100223;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=2;controls_AN_eas_male=514;controls_AF_eas_male=0.00389105;controls_nhomalt_eas_male=0;controls_AC_oth_male=6;controls_AN_oth_male=94;controls_AF_oth_male=0.0638298;controls_nhomalt_oth_male=0;non_topmed_AC=206;non_topmed_AN=15064;non_topmed_AF=0.013675;non_topmed_nhomalt=0;controls_AC_fin=21;controls_AN_fin=348;controls_AF_fin=0.0603448;controls_nhomalt_fin=0;non_neuro_AC_nfe=110;non_neuro_AN_nfe=8768;non_neuro_AF_nfe=0.0125456;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=11;non_neuro_AN_fin_female=170;non_neuro_AF_fin_female=0.0647059;non_neuro_nhomalt_fin_female=0;non_topmed_AC_nfe_seu=2;non_topmed_AN_nfe_seu=52;non_topmed_AF_nfe_seu=0.0384615;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=2;controls_AN_eas_female=314;controls_AF_eas_female=0.00636943;controls_nhomalt_eas_female=0;non_topmed_AC_asj=3;non_topmed_AN_asj=78;non_topmed_AF_asj=0.0384615;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=5;controls_AN_nfe_onf=164;controls_AF_nfe_onf=0.0304878;controls_nhomalt_nfe_onf=0;non_neuro_AC=152;non_neuro_AN=13452;non_neuro_AF=0.0112994;non_neuro_nhomalt=0;non_topmed_AC_nfe=100;non_topmed_AN_nfe=6556;non_topmed_AF_nfe=0.0152532;non_topmed_nhomalt_nfe=0;non_topmed_AC_raw=512;non_topmed_AN_raw=23304;non_topmed_AF_raw=0.0219705;non_topmed_nhomalt_raw=4;non_neuro_AC_nfe_est=61;non_neuro_AN_nfe_est=2188;non_neuro_AF_nfe_est=0.0278793;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=9;non_topmed_AN_oth_male=248;non_topmed_AF_oth_male=0.0362903;non_topmed_nhomalt_oth_male=0;AC_nfe_est=61;AN_nfe_est=2522;AF_nfe_est=0.0241872;nhomalt_nfe_est=0;non_topmed_AC_afr_male=15;non_topmed_AN_afr_male=2918;non_topmed_AF_afr_male=0.00514051;non_topmed_nhomalt_afr_male=0;AC_eas_male=2;AN_eas_male=912;AF_eas_male=0.00219298;nhomalt_eas_male=0;controls_AC_eas=4;controls_AN_eas=828;controls_AF_eas=0.00483092;controls_nhomalt_eas=0;non_neuro_AC_eas_male=2;non_neuro_AN_eas_male=912;non_neuro_AF_eas_male=0.00219298;non_neuro_nhomalt_eas_male=0;non_neuro_AC_asj_male=1;non_neuro_AN_asj_male=148;non_neuro_AF_asj_male=0.00675676;non_neuro_nhomalt_asj_male=0;controls_AC_oth=8;controls_AN_oth=196;controls_AF_oth=0.0408163;controls_nhomalt_oth=0;AC_nfe=121;AN_nfe=9818;AF_nfe=0.0123243;nhomalt_nfe=0;non_topmed_AC_female=99;non_topmed_AN_female=7054;non_topmed_AF_female=0.0140346;non_topmed_nhomalt_female=0;non_neuro_AC_asj=1;non_neuro_AN_asj=196;non_neuro_AF_asj=0.00510204;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=2;non_topmed_AN_eas_female=470;non_topmed_AF_eas_female=0.00425532;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=362;non_neuro_AN_raw=19100;non_neuro_AF_raw=0.0189529;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=4;non_topmed_AN_eas=1364;non_topmed_AF_eas=0.00293255;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=26;non_topmed_AN_fin_male=508;non_topmed_AF_fin_male=0.0511811;non_topmed_nhomalt_fin_male=0;AC_fin=58;AN_fin=1126;AF_fin=0.0515098;nhomalt_fin=0;AC_nfe_male=67;AN_nfe_male=5566;AF_nfe_male=0.0120374;nhomalt_nfe_male=0;controls_AC_amr_male=0;controls_AN_amr_male=68;controls_AF_amr_male=0;controls_nhomalt_amr_male=0;controls_AC_afr_female=3;controls_AN_afr_female=692;controls_AF_afr_female=0.00433526;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=116;controls_AF_amr=0;controls_nhomalt_amr=0;AC_asj_female=0;AN_asj_female=54;AF_asj_female=0;nhomalt_asj_female=0;non_neuro_AC_eas=4;non_neuro_AN_eas=1398;non_neuro_AF_eas=0.00286123;non_neuro_nhomalt_eas=0;non_neuro_AC_male=82;non_neuro_AN_male=7550;non_neuro_AF_male=0.0108609;non_neuro_nhomalt_male=0;AC_asj=3;AN_asj=210;AF_asj=0.0142857;nhomalt_asj=0;controls_AC_nfe_est=61;controls_AN_nfe_est=2160;controls_AF_nfe_est=0.0282407;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=36;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=16;non_topmed_AN_oth=508;non_topmed_AF_oth=0.0314961;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=32;non_topmed_AN_fin_female=618;non_topmed_AF_fin_female=0.0517799;non_topmed_nhomalt_fin_female=0;AC_oth=16;AN_oth=566;AF_oth=0.0282686;nhomalt_oth=0;non_neuro_AC_nfe_male=58;non_neuro_AN_nfe_male=5028;non_neuro_AF_nfe_male=0.0115354;non_neuro_nhomalt_nfe_male=0;controls_AC_female=58;controls_AN_female=2572;controls_AF_female=0.0225505;controls_nhomalt_female=0;non_topmed_AC_fin=58;non_topmed_AN_fin=1126;non_topmed_AF_fin=0.0515098;non_topmed_nhomalt_fin=0;non_topmed_AC_nfe_female=48;non_topmed_AN_nfe_female=3382;non_topmed_AF_nfe_female=0.0141928;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=1;controls_AN_asj_male=4;controls_AF_asj_male=0.25;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=3;non_topmed_AN_asj_male=42;non_topmed_AF_asj_male=0.0714286;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=8;non_neuro_AN_oth=420;non_neuro_AF_oth=0.0190476;non_neuro_nhomalt_oth=0;AC_male=122;AN_male=10650;AF_male=0.0114554;nhomalt_male=0;controls_AC_fin_female=11;controls_AN_fin_female=168;controls_AF_fin_female=0.0654762;controls_nhomalt_fin_female=0;controls_AC_asj_female=0;controls_AN_asj_female=14;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=234;AF_amr_male=0;nhomalt_amr_male=0;AC_amr_female=1;AN_amr_female=216;AF_amr_female=0.00462963;nhomalt_amr_female=0;AC_oth_male=9;AN_oth_male=292;AF_oth_male=0.0308219;nhomalt_oth_male=0;non_neuro_AC_nfe_seu=2;non_neuro_AN_nfe_seu=26;non_neuro_AF_nfe_seu=0.0769231;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=9;non_topmed_AN_afr_female=2088;non_topmed_AF_afr_female=0.00431034;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=24;non_topmed_AN_afr=5006;non_topmed_AF_afr=0.00479425;non_topmed_nhomalt_afr=0;controls_AC=123;controls_AN=5698;controls_AF=0.0215865;controls_nhomalt=0;non_neuro_AC_oth_female=2;non_neuro_AN_oth_female=198;non_neuro_AF_oth_female=0.010101;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0.00012;non_topmed_faf99_amr=0.00012;faf95_afr=0.00339534;faf99_afr=0.00339557;controls_faf95_afr=0.00265678;controls_faf99_afr=0.00265672;faf95_amr=0.000113;faf99_amr=0.000113;faf95_eas=0.00097636;faf99_eas=0.00097723;faf95=0.0109033;faf99=0.0109026;non_neuro_faf95_afr=0.00199003;non_neuro_faf99_afr=0.00198958;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;controls_faf95_nfe=0.0247904;controls_faf99_nfe=0.0247903;non_topmed_faf95=0.0121462;non_topmed_faf99=0.0121458;non_neuro_faf95_nfe=0.0106444;non_neuro_faf99_nfe=0.0106446;non_neuro_faf95=0.00983491;non_neuro_faf99=0.00983455;non_topmed_faf95_nfe=0.0128339;non_topmed_faf99_nfe=0.0128338;controls_faf95_eas=0.00164922;controls_faf99_eas=0.00164971;faf95_nfe=0.0105404;faf99_nfe=0.0105404;non_topmed_faf95_eas=0.00100131;non_topmed_faf99_eas=0.00100135;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=0.00097636;non_neuro_faf99_eas=0.00097723;non_topmed_faf95_afr=0.00330485;non_topmed_faf99_afr=0.0033051;controls_faf95=0.018487;controls_faf99=0.0184878;controls_popmax=nfe;controls_AC_popmax=81;controls_AN_popmax=2694;controls_AF_popmax=0.0300668;controls_nhomalt_popmax=0;popmax=nfe;AC_popmax=121;AN_popmax=9818;AF_popmax=0.0123243;nhomalt_popmax=0;age_hist_het_bin_freq=12|8|16|15|13|24|18|13|9|2;age_hist_het_n_smaller=33;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=nfe;non_neuro_AC_popmax=110;non_neuro_AN_popmax=8768;non_neuro_AF_popmax=0.0125456;non_neuro_nhomalt_popmax=0;non_topmed_popmax=nfe;non_topmed_AC_popmax=100;non_topmed_AN_popmax=6556;non_topmed_AF_popmax=0.0152532;non_topmed_nhomalt_popmax=0"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void RemoveConflictingItems() { var sequence = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "G", 22055875 - VariantUtils.MaxUpstreamLength); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); var gnomadReader = new GnomadSnvReader(new StreamReader(GetChr22_22055876_genome()), new StreamReader(GetChr22_22055876_exome()), sequenceProvider); var items = gnomadReader.GetCombinedItems().ToList(); Assert.Single(items); //the genome items result in a conflict. Only the exome item should be reported back Assert.Equal(18686, items[0].AllAlleleNumber); } private static Stream GetChr22_16689800_16689902_genome() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine("22\t16689820\trs1302088526\tC\tG\t30.01\tAC0;RF\tAC=0;AN=0;rf_tp_probability=0.0655868;FS=0;InbreedingCoeff=-0.1286;MQ=27;QD=10;SOR=2.833;DP=19893;VQSLOD=-82.37;VQSR_culprit=MQ;segdup;rf_negative_label;rf_label=FP;rf_train;variant_type=snv;allele_type=snv;n_alt_alleles=1;gq_hist_alt_bin_freq=0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;gq_hist_all_bin_freq=4957|4490|585|258|29|1|2|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_bin_freq=1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=19964|344|2|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=0;controls_nhomalt_afr_male=0;non_topmed_AC_amr=0;non_topmed_AN_amr=0;non_topmed_nhomalt_amr=0;AC_raw=2;AN_raw=17984;AF_raw=0.00011121;nhomalt_raw=1;AC_fin_female=0;AN_fin_female=0;nhomalt_fin_female=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=0;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=0;non_neuro_AN_afr_male=0;non_neuro_nhomalt_afr_male=0;AC_afr_male=0;AN_afr_male=0;nhomalt_afr_male=0;AC_afr=0;AN_afr=0;nhomalt_afr=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=0;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=0;non_topmed_AN_amr_female=0;non_topmed_nhomalt_amr_female=0;non_topmed_AC_oth_female=0;non_topmed_AN_oth_female=0;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=0;nhomalt_eas_female=0;AC_afr_female=0;AN_afr_female=0;nhomalt_afr_female=0;non_neuro_AC_female=0;non_neuro_AN_female=0;non_neuro_nhomalt_female=0;controls_AC_afr=0;controls_AN_afr=0;controls_nhomalt_afr=0;AC_nfe_onf=0;AN_nfe_onf=0;nhomalt_nfe_onf=0;controls_AC_fin_male=0;controls_AN_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=0;non_neuro_AN_nfe_nwe=0;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=0;AN_nfe_female=0;nhomalt_nfe_female=0;AC_amr=0;AN_amr=0;nhomalt_amr=0;non_topmed_AC_nfe_male=0;non_topmed_AN_nfe_male=0;non_topmed_nhomalt_nfe_male=0;AC_eas=0;AN_eas=0;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=0;non_neuro_AN_nfe_female=0;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=0;non_neuro_AN_afr=0;non_neuro_nhomalt_afr=0;controls_AC_raw=0;controls_AN_raw=5542;controls_AF_raw=0;controls_nhomalt_raw=0;controls_AC_male=0;controls_AN_male=0;controls_nhomalt_male=0;non_topmed_AC_male=0;non_topmed_AN_male=0;non_topmed_nhomalt_male=0;controls_AC_nfe_female=0;controls_AN_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=0;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=0;non_neuro_nhomalt_fin=0;AC_oth_female=0;AN_oth_female=0;nhomalt_oth_female=0;controls_AC_nfe=0;controls_AN_nfe=0;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=0;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=0;AN_nfe_nwe=0;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=0;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=0;non_neuro_AN_nfe_onf=0;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=0;non_topmed_nhomalt_eas_male=0;controls_AC_amr_female=0;controls_AN_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=0;AN_female=0;nhomalt_female=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=0;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=0;non_topmed_AN_nfe_nwe=0;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=0;non_topmed_nhomalt_amr_male=0;non_topmed_AC_nfe_onf=0;non_topmed_AN_nfe_onf=0;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=0;controls_AN_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=0;non_topmed_AN=0;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=0;controls_nhomalt_fin=0;non_neuro_AC_nfe=0;non_neuro_AN_nfe=0;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=0;non_neuro_nhomalt_fin_female=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=0;non_neuro_AN=0;non_neuro_nhomalt=0;non_topmed_AC_nfe=0;non_topmed_AN_nfe=0;non_topmed_nhomalt_nfe=0;non_topmed_AC_raw=2;non_topmed_AN_raw=14534;non_topmed_AF_raw=0.000137608;non_topmed_nhomalt_raw=1;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=0;non_topmed_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=0;nhomalt_nfe_est=0;non_topmed_AC_afr_male=0;non_topmed_AN_afr_male=0;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=0;non_neuro_nhomalt_eas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=0;non_neuro_nhomalt_asj_male=0;controls_AC_oth=0;controls_AN_oth=0;controls_nhomalt_oth=0;AC_nfe=0;AN_nfe=0;nhomalt_nfe=0;non_topmed_AC_female=0;non_topmed_AN_female=0;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=0;non_neuro_AN_raw=13036;non_neuro_AF_raw=0;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=0;non_topmed_AN_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=0;non_topmed_nhomalt_fin_male=0;AC_fin=0;AN_fin=0;nhomalt_fin=0;AC_nfe_male=0;AN_nfe_male=0;nhomalt_nfe_male=0;controls_AC_amr_male=0;controls_AN_amr_male=0;controls_nhomalt_amr_male=0;controls_AC_afr_female=0;controls_AN_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=0;controls_nhomalt_amr=0;AC_asj_female=0;AN_asj_female=0;nhomalt_asj_female=0;non_neuro_AC_eas=0;non_neuro_AN_eas=0;non_neuro_nhomalt_eas=0;non_neuro_AC_male=0;non_neuro_AN_male=0;non_neuro_nhomalt_male=0;AC_asj=0;AN_asj=0;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=0;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=0;non_topmed_AN_oth=0;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=0;non_topmed_nhomalt_fin_female=0;AC_oth=0;AN_oth=0;nhomalt_oth=0;non_neuro_AC_nfe_male=0;non_neuro_AN_nfe_male=0;non_neuro_nhomalt_nfe_male=0;controls_AC_female=0;controls_AN_female=0;controls_nhomalt_female=0;non_topmed_AC_fin=0;non_topmed_AN_fin=0;non_topmed_nhomalt_fin=0;non_topmed_AC_nfe_female=0;non_topmed_AN_nfe_female=0;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=0;non_neuro_AN_oth=0;non_neuro_nhomalt_oth=0;AC_male=0;AN_male=0;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_asj_female=0;controls_AN_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=0;nhomalt_amr_male=0;AC_amr_female=0;AN_amr_female=0;nhomalt_amr_female=0;AC_oth_male=0;AN_oth_male=0;nhomalt_oth_male=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=0;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=0;non_topmed_AN_afr=0;non_topmed_nhomalt_afr=0;controls_AC=0;controls_AN=0;controls_nhomalt=0;non_neuro_AC_oth_female=0;non_neuro_AN_oth_female=0;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0;non_topmed_faf99_amr=0;faf95_afr=0;faf99_afr=0;controls_faf95_afr=0;controls_faf99_afr=0;faf95_amr=0;faf99_amr=0;faf95_eas=0;faf99_eas=0;faf95=0;faf99=0;non_neuro_faf95_afr=0;non_neuro_faf99_afr=0;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;controls_faf95_nfe=0;controls_faf99_nfe=0;non_topmed_faf95=0;non_topmed_faf99=0;non_neuro_faf95_nfe=0;non_neuro_faf99_nfe=0;non_neuro_faf95=0;non_neuro_faf99=0;non_topmed_faf95_nfe=0;non_topmed_faf99_nfe=0;controls_faf95_eas=0;controls_faf99_eas=0;faf95_nfe=0;faf99_nfe=0;non_topmed_faf95_eas=0;non_topmed_faf99_eas=0;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=0;non_neuro_faf99_eas=0;non_topmed_faf95_afr=0;non_topmed_faf99_afr=0;controls_faf95=0;controls_faf99=0;age_hist_het_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_het_n_smaller=0;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0"); writer.WriteLine("22\t16689902\trs7289731\tG\tA\t3096.01\tAC0;RF\tAC=0;AN=0;rf_tp_probability=0.0665037;FS=0;InbreedingCoeff=-0.0964;MQ=27;MQRankSum=0.736;QD=15.8;ReadPosRankSum=0.736;SOR=3.525;BaseQRankSum=0.736;ClippingRankSum=0.736;DP=7972;VQSLOD=-105;VQSR_culprit=MQ;segdup;rf_negative_label;rf_label=FP;rf_train;variant_type=snv;allele_type=snv;n_alt_alleles=1;pab_max=1;gq_hist_alt_bin_freq=7|59|2|2|14|0|0|0|0|3|1|0|0|0|0|0|0|0|0|0;gq_hist_all_bin_freq=4019|1333|47|9|15|0|0|0|0|3|1|0|0|0|0|0|0|0|0|0;dp_hist_alt_bin_freq=86|2|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=20297|13|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|0|0|1|0|0|1|0|5|0|0|13|0|1|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=0;controls_nhomalt_afr_male=0;non_topmed_AC_amr=0;non_topmed_AN_amr=0;non_topmed_nhomalt_amr=0;AC_raw=133;AN_raw=9506;AF_raw=0.0139912;nhomalt_raw=59;AC_fin_female=0;AN_fin_female=0;nhomalt_fin_female=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=0;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=0;non_neuro_AN_afr_male=0;non_neuro_nhomalt_afr_male=0;AC_afr_male=0;AN_afr_male=0;nhomalt_afr_male=0;AC_afr=0;AN_afr=0;nhomalt_afr=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=0;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=0;non_topmed_AN_amr_female=0;non_topmed_nhomalt_amr_female=0;non_topmed_AC_oth_female=0;non_topmed_AN_oth_female=0;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=0;nhomalt_eas_female=0;AC_afr_female=0;AN_afr_female=0;nhomalt_afr_female=0;non_neuro_AC_female=0;non_neuro_AN_female=0;non_neuro_nhomalt_female=0;controls_AC_afr=0;controls_AN_afr=0;controls_nhomalt_afr=0;AC_nfe_onf=0;AN_nfe_onf=0;nhomalt_nfe_onf=0;controls_AC_fin_male=0;controls_AN_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=0;non_neuro_AN_nfe_nwe=0;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=0;AN_nfe_female=0;nhomalt_nfe_female=0;AC_amr=0;AN_amr=0;nhomalt_amr=0;non_topmed_AC_nfe_male=0;non_topmed_AN_nfe_male=0;non_topmed_nhomalt_nfe_male=0;AC_eas=0;AN_eas=0;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=0;non_neuro_AN_nfe_female=0;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=0;non_neuro_AN_afr=0;non_neuro_nhomalt_afr=0;controls_AC_raw=28;controls_AN_raw=2820;controls_AF_raw=0.00992908;controls_nhomalt_raw=12;controls_AC_male=0;controls_AN_male=0;controls_nhomalt_male=0;non_topmed_AC_male=0;non_topmed_AN_male=0;non_topmed_nhomalt_male=0;controls_AC_nfe_female=0;controls_AN_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=0;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=0;non_neuro_nhomalt_fin=0;AC_oth_female=0;AN_oth_female=0;nhomalt_oth_female=0;controls_AC_nfe=0;controls_AN_nfe=0;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=0;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=0;AN_nfe_nwe=0;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=0;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=0;non_neuro_AN_nfe_onf=0;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=0;non_topmed_nhomalt_eas_male=0;controls_AC_amr_female=0;controls_AN_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=0;AN_female=0;nhomalt_female=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=0;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=0;non_topmed_AN_nfe_nwe=0;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=0;non_topmed_nhomalt_amr_male=0;non_topmed_AC_nfe_onf=0;non_topmed_AN_nfe_onf=0;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=0;controls_AN_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=0;non_topmed_AN=0;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=0;controls_nhomalt_fin=0;non_neuro_AC_nfe=0;non_neuro_AN_nfe=0;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=0;non_neuro_nhomalt_fin_female=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=0;non_neuro_AN=0;non_neuro_nhomalt=0;non_topmed_AC_nfe=0;non_topmed_AN_nfe=0;non_topmed_nhomalt_nfe=0;non_topmed_AC_raw=86;non_topmed_AN_raw=7812;non_topmed_AF_raw=0.0110087;non_topmed_nhomalt_raw=38;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=0;non_topmed_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=0;nhomalt_nfe_est=0;non_topmed_AC_afr_male=0;non_topmed_AN_afr_male=0;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=0;non_neuro_nhomalt_eas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=0;non_neuro_nhomalt_asj_male=0;controls_AC_oth=0;controls_AN_oth=0;controls_nhomalt_oth=0;AC_nfe=0;AN_nfe=0;nhomalt_nfe=0;non_topmed_AC_female=0;non_topmed_AN_female=0;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=106;non_neuro_AN_raw=6790;non_neuro_AF_raw=0.0156112;non_neuro_nhomalt_raw=47;non_topmed_AC_eas=0;non_topmed_AN_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=0;non_topmed_nhomalt_fin_male=0;AC_fin=0;AN_fin=0;nhomalt_fin=0;AC_nfe_male=0;AN_nfe_male=0;nhomalt_nfe_male=0;controls_AC_amr_male=0;controls_AN_amr_male=0;controls_nhomalt_amr_male=0;controls_AC_afr_female=0;controls_AN_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=0;controls_nhomalt_amr=0;AC_asj_female=0;AN_asj_female=0;nhomalt_asj_female=0;non_neuro_AC_eas=0;non_neuro_AN_eas=0;non_neuro_nhomalt_eas=0;non_neuro_AC_male=0;non_neuro_AN_male=0;non_neuro_nhomalt_male=0;AC_asj=0;AN_asj=0;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=0;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=0;non_topmed_AN_oth=0;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=0;non_topmed_nhomalt_fin_female=0;AC_oth=0;AN_oth=0;nhomalt_oth=0;non_neuro_AC_nfe_male=0;non_neuro_AN_nfe_male=0;non_neuro_nhomalt_nfe_male=0;controls_AC_female=0;controls_AN_female=0;controls_nhomalt_female=0;non_topmed_AC_fin=0;non_topmed_AN_fin=0;non_topmed_nhomalt_fin=0;non_topmed_AC_nfe_female=0;non_topmed_AN_nfe_female=0;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=0;non_neuro_AN_oth=0;non_neuro_nhomalt_oth=0;AC_male=0;AN_male=0;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_asj_female=0;controls_AN_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=0;nhomalt_amr_male=0;AC_amr_female=0;AN_amr_female=0;nhomalt_amr_female=0;AC_oth_male=0;AN_oth_male=0;nhomalt_oth_male=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=0;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=0;non_topmed_AN_afr=0;non_topmed_nhomalt_afr=0;controls_AC=0;controls_AN=0;controls_nhomalt=0;non_neuro_AC_oth_female=0;non_neuro_AN_oth_female=0;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0;non_topmed_faf99_amr=0;faf95_afr=0;faf99_afr=0;controls_faf95_afr=0;controls_faf99_afr=0;faf95_amr=0;faf99_amr=0;faf95_eas=0;faf99_eas=0;faf95=0;faf99=0;non_neuro_faf95_afr=0;non_neuro_faf99_afr=0;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;controls_faf95_nfe=0;controls_faf99_nfe=0;non_topmed_faf95=0;non_topmed_faf99=0;non_neuro_faf95_nfe=0;non_neuro_faf99_nfe=0;non_neuro_faf95=0;non_neuro_faf99=0;non_topmed_faf95_nfe=0;non_topmed_faf99_nfe=0;controls_faf95_eas=0;controls_faf99_eas=0;faf95_nfe=0;faf99_nfe=0;non_topmed_faf95_eas=0;non_topmed_faf99_eas=0;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=0;non_neuro_faf99_eas=0;non_topmed_faf95_afr=0;non_topmed_faf99_afr=0;controls_faf95=0;controls_faf99=0;age_hist_het_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_het_n_smaller=0;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0"); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void DoNotReportCoverage_whenAnZero() { var sequence = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "C" + new string('G', 16689902 - 16689820) + "GGGGA", 16689820 - 1 - VariantUtils.MaxUpstreamLength); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); var gnomadReader = new GnomadSnvReader(new StreamReader(GetChr22_16689800_16689902_genome()), null, sequenceProvider); var items = gnomadReader.GetCombinedItems().ToList(); Assert.Empty(items); } [Theory] [InlineData(null, null, null)] [InlineData(null, 1, 1)] [InlineData(1, null, 1)] [InlineData(1, 1, 2)] public void AddNullableInts(int? x, int? y, int? sum) { // I was not expecting null + 1 == null by default!! //var z = x + y; //Assert.Equal(sum, z); Assert.Equal(sum, SaParseUtilities.Add(x, y)); } //chr1 76226858 G GCTAGAATGAGTTA Sas_An=30614 Sas_An= } } ================================================ FILE: UnitTests/SAUtils/gnomAD/GnomadSvBedParserTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using SAUtils.DataStructures; using SAUtils.gnomAD; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.gnomAD; public sealed class GnomadSvBedParserTests { private static Stream GetStreamData(string dataString) { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.Write(dataString); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void TestGnomadSvBedParser() { const string bedData = "#chrom\tstart\tend\tname\tsvtype\tALGORITHMS\tBOTHSIDES_SUPPORT\tCHR2\tCPX_INTERVALS\tCPX_TYPE\tEND2\tEND\tEVIDENCE\tHIGH_SR_BACKGROUND\tPCRPLUS_DEPLETED\tPESR_GT_OVERDISPERSION\tPOS2\tPROTEIN_CODING__COPY_GAIN\tPROTEIN_CODING__DUP_LOF\tPROTEIN_CODING__DUP_PARTIAL\tPROTEIN_CODING__INTERGENIC\tPROTEIN_CODING__INTRONIC\tPROTEIN_CODING__INV_SPAN\tPROTEIN_CODING__LOF\tPROTEIN_CODING__MSV_EXON_OVR\tPROTEIN_CODING__NEAREST_TSS\tPROTEIN_CODING__PROMOTER\tPROTEIN_CODING__UTR\tSOURCE\tSTRANDS\tSVLEN\tSVTYPE\tUNRESOLVED_TYPE\tUNSTABLE_AF_PCRPLUS\tVARIABLE_ACROSS_BATCHES\tAN\tAC\tAF\tN_BI_GENOS\tN_HOMREF\tN_HET\tN_HOMALT\tFREQ_HOMREF\tFREQ_HET\tFREQ_HOMALT\tMALE_AN\tMALE_AC\tMALE_AF\tMALE_N_BI_GENOS\tMALE_N_HOMREF\tMALE_N_HET\tMALE_N_HOMALT\tMALE_FREQ_HOMREF\tMALE_FREQ_HET\tMALE_FREQ_HOMALT\tMALE_N_HEMIREF\tMALE_N_HEMIALT\tMALE_FREQ_HEMIREF\tMALE_FREQ_HEMIALT\tPAR\tFEMALE_AN\tFEMALE_AC\tFEMALE_AF\tFEMALE_N_BI_GENOS\tFEMALE_N_HOMREF\tFEMALE_N_HET\tFEMALE_N_HOMALT\tFEMALE_FREQ_HOMREF\tFEMALE_FREQ_HET\tFEMALE_FREQ_HOMALT\tPOPMAX_AF\tAFR_AN\tAFR_AC\tAFR_AF\tAFR_N_BI_GENOS\tAFR_N_HOMREF\tAFR_N_HET\tAFR_N_HOMALT\tAFR_FREQ_HOMREF\tAFR_FREQ_HET\tAFR_FREQ_HOMALT\tAFR_MALE_AN\tAFR_MALE_AC\tAFR_MALE_AF\tAFR_MALE_N_BI_GENOS\tAFR_MALE_N_HOMREF\tAFR_MALE_N_HET\tAFR_MALE_N_HOMALT\tAFR_MALE_FREQ_HOMREF\tAFR_MALE_FREQ_HET\tAFR_MALE_FREQ_HOMALT\tAFR_MALE_N_HEMIREF\tAFR_MALE_N_HEMIALT\tAFR_MALE_FREQ_HEMIREF\tAFR_MALE_FREQ_HEMIALT\tAFR_FEMALE_AN\tAFR_FEMALE_AC\tAFR_FEMALE_AF\tAFR_FEMALE_N_BI_GENOS\tAFR_FEMALE_N_HOMREF\tAFR_FEMALE_N_HET\tAFR_FEMALE_N_HOMALT\tAFR_FEMALE_FREQ_HOMREF\tAFR_FEMALE_FREQ_HET\tAFR_FEMALE_FREQ_HOMALT\tAMR_AN\tAMR_AC\tAMR_AF\tAMR_N_BI_GENOS\tAMR_N_HOMREF\tAMR_N_HET\tAMR_N_HOMALT\tAMR_FREQ_HOMREF\tAMR_FREQ_HET\tAMR_FREQ_HOMALT\tAMR_MALE_AN\tAMR_MALE_AC\tAMR_MALE_AF\tAMR_MALE_N_BI_GENOS\tAMR_MALE_N_HOMREF\tAMR_MALE_N_HET\tAMR_MALE_N_HOMALT\tAMR_MALE_FREQ_HOMREF\tAMR_MALE_FREQ_HET\tAMR_MALE_FREQ_HOMALT\tAMR_MALE_N_HEMIREF\tAMR_MALE_N_HEMIALT\tAMR_MALE_FREQ_HEMIREF\tAMR_MALE_FREQ_HEMIALT\tAMR_FEMALE_AN\tAMR_FEMALE_AC\tAMR_FEMALE_AF\tAMR_FEMALE_N_BI_GENOS\tAMR_FEMALE_N_HOMREF\tAMR_FEMALE_N_HET\tAMR_FEMALE_N_HOMALT\tAMR_FEMALE_FREQ_HOMREF\tAMR_FEMALE_FREQ_HET\tAMR_FEMALE_FREQ_HOMALT\tEAS_AN\tEAS_AC\tEAS_AF\tEAS_N_BI_GENOS\tEAS_N_HOMREF\tEAS_N_HET\tEAS_N_HOMALT\tEAS_FREQ_HOMREF\tEAS_FREQ_HET\tEAS_FREQ_HOMALT\tEAS_MALE_AN\tEAS_MALE_AC\tEAS_MALE_AF\tEAS_MALE_N_BI_GENOS\tEAS_MALE_N_HOMREF\tEAS_MALE_N_HET\tEAS_MALE_N_HOMALT\tEAS_MALE_FREQ_HOMREF\tEAS_MALE_FREQ_HET\tEAS_MALE_FREQ_HOMALT\tEAS_MALE_N_HEMIREF\tEAS_MALE_N_HEMIALT\tEAS_MALE_FREQ_HEMIREF\tEAS_MALE_FREQ_HEMIALT\tEAS_FEMALE_AN\tEAS_FEMALE_AC\tEAS_FEMALE_AF\tEAS_FEMALE_N_BI_GENOS\tEAS_FEMALE_N_HOMREF\tEAS_FEMALE_N_HET\tEAS_FEMALE_N_HOMALT\tEAS_FEMALE_FREQ_HOMREF\tEAS_FEMALE_FREQ_HET\tEAS_FEMALE_FREQ_HOMALT\tEUR_AN\tEUR_AC\tEUR_AF\tEUR_N_BI_GENOS\tEUR_N_HOMREF\tEUR_N_HET\tEUR_N_HOMALT\tEUR_FREQ_HOMREF\tEUR_FREQ_HET\tEUR_FREQ_HOMALT\tEUR_MALE_AN\tEUR_MALE_AC\tEUR_MALE_AF\tEUR_MALE_N_BI_GENOS\tEUR_MALE_N_HOMREF\tEUR_MALE_N_HET\tEUR_MALE_N_HOMALT\tEUR_MALE_FREQ_HOMREF\tEUR_MALE_FREQ_HET\tEUR_MALE_FREQ_HOMALT\tEUR_MALE_N_HEMIREF\tEUR_MALE_N_HEMIALT\tEUR_MALE_FREQ_HEMIREF\tEUR_MALE_FREQ_HEMIALT\tEUR_FEMALE_AN\tEUR_FEMALE_AC\tEUR_FEMALE_AF\tEUR_FEMALE_N_BI_GENOS\tEUR_FEMALE_N_HOMREF\tEUR_FEMALE_N_HET\tEUR_FEMALE_N_HOMALT\tEUR_FEMALE_FREQ_HOMREF\tEUR_FEMALE_FREQ_HET\tEUR_FEMALE_FREQ_HOMALT\tOTH_AN\tOTH_AC\tOTH_AF\tOTH_N_BI_GENOS\tOTH_N_HOMREF\tOTH_N_HET\tOTH_N_HOMALT\tOTH_FREQ_HOMREF\tOTH_FREQ_HET\tOTH_FREQ_HOMALT\tOTH_MALE_AN\tOTH_MALE_AC\tOTH_MALE_AF\tOTH_MALE_N_BI_GENOS\tOTH_MALE_N_HOMREF\tOTH_MALE_N_HET\tOTH_MALE_N_HOMALT\tOTH_MALE_FREQ_HOMREF\tOTH_MALE_FREQ_HET\tOTH_MALE_FREQ_HOMALT\tOTH_MALE_N_HEMIREF\tOTH_MALE_N_HEMIALT\tOTH_MALE_FREQ_HEMIREF\tOTH_MALE_FREQ_HEMIALT\tOTH_FEMALE_AN\tOTH_FEMALE_AC\tOTH_FEMALE_AF\tOTH_FEMALE_N_BI_GENOS\tOTH_FEMALE_N_HOMREF\tOTH_FEMALE_N_HET\tOTH_FEMALE_N_HOMALT\tOTH_FEMALE_FREQ_HOMREF\tOTH_FEMALE_FREQ_HET\tOTH_FEMALE_FREQ_HOMALT\tFILTER\n" + "1\t10641\t10642\tgnomAD-SV_v2.1_BND_1_1\tBND\tmanta\tFalse\t15\tNA\tNA\t10643\t10643\tPE,SR\tFalse\tFalse\tTrue\t10642\tNA\tNA\tNA\tFalse\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\t-1\tBND\tSINGLE_ENDER_--\tFalse\tFalse\t21366\t145\t0.006785999983549118\t10683\t10543\t135\t5\t0.9868950247764587\t0.012636899948120117\t0.00046803298755548894\t10866\t69\t0.00634999992325902\t5433\t5366\t65\t2\t0.987667977809906\t0.011963900178670883\t0.000368120992789045\tNA\tNA\tNA\tNA\tFalse\t10454\t76\t0.007269999943673611\t5227\t5154\t70\t3\t0.9860339760780334\t0.013392000459134579\t0.0005739430198445916\t0.015956999734044075\t9398\t72\t0.007660999894142151\t4699\t4629\t68\t2\t0.9851030111312866\t0.014471200294792652\t0.0004256220126990229\t5154\t33\t0.006403000093996525\t2577\t2544\t33\t0\t0.9871940016746521\t0.012805599719285965\t0.0\tNA\tNA\tNA\tNA\t4232\t39\t0.009216000325977802\t2116\t2079\t35\t2\t0.9825140237808228\t0.01654059998691082\t0.0009451800142414868\t1910\t7\t0.003664999967440963\t955\t949\t5\t1\t0.9937170147895813\t0.00523559981957078\t0.001047119963914156\t950\t4\t0.004211000166833401\t475\t472\t2\t1\t0.9936839938163757\t0.00421052984893322\t0.0021052600350230932\tNA\tNA\tNA\tNA\t952\t3\t0.0031510000117123127\t476\t473\t3\t0\t0.9936969876289368\t0.006302520167082548\t0.0\t2296\t31\t0.013501999899744987\t1148\t1117\t31\t0\t0.9729970097541809\t0.02700350061058998\t0.0\t1312\t13\t0.009909000247716904\t656\t643\t13\t0\t0.9801830053329468\t0.01981710083782673\t0.0\tNA\tNA\tNA\tNA\t976\t18\t0.018442999571561813\t488\t470\t18\t0\t0.9631149768829346\t0.03688519820570946\t0.0\t7574\t32\t0.004224999807775021\t3787\t3757\t28\t2\t0.9920780062675476\t0.007393720094114542\t0.0005281229969114065\t3374\t17\t0.005038999952375889\t1687\t1671\t15\t1\t0.9905160069465637\t0.008891520090401173\t0.000592768017668277\tNA\tNA\tNA\tNA\t4182\t15\t0.003587000072002411\t2091\t2077\t13\t1\t0.9933050274848938\t0.006217120215296745\t0.00047823999193497\t188\t3\t0.015956999734044075\t94\t91\t3\t0\t0.968084990978241\t0.03191490098834038\t0.0\t76\t2\t0.026316000148653984\t38\t36\t2\t0\t0.9473680257797241\t0.05263160169124603\t0.0\tNA\tNA\tNA\tNA\t112\t1\t0.008929000236093998\t56\t55\t1\t0\t0.982142984867096\t0.017857100814580917\t0.0\tUNRESOLVED\n" + "1\t20999\t26000\tgnomAD-SV_v2.1_DEL_1_1\tDEL\tdepth\tFalse\tNA\tNA\tNA\tNA\t26000\tRD\tFalse\tFalse\tFalse\tNA\tNA\tNA\tNA\tTrue\tNA\tNA\tNA\tNA\tOR4F5\tNA\tNA\tNA\tNA\t5000\tDEL\tNA\tFalse\tFalse\t8586\t138\t0.01607299968600273\t4293\t4155\t138\t0\t0.9678549766540527\t0.03214539960026741\t0.0\t4302\t69\t0.01603900082409382\t2151\t2082\t69\t0\t0.9679219722747803\t0.0320780985057354\t0.0\tNA\tNA\tNA\tNA\tFalse\t4272\t68\t0.015917999669909477\t2136\t2068\t68\t0\t0.9681649804115295\t0.031835198402404785\t0.0\t0.07199999690055847\t3718\t27\t0.007261999882757664\t1859\t1832\t27\t0\t0.985476016998291\t0.014523900113999844\t0.0\t2016\t16\t0.007937000133097172\t1008\t992\t16\t0\t0.9841269850730896\t0.015873000025749207\t0.0\tNA\tNA\tNA\tNA\t1702\t11\t0.00646300008520484\t851\t840\t11\t0\t0.9870740175247192\t0.01292600017040968\t0.0\t684\t8\t0.011695999652147293\t342\t334\t8\t0\t0.9766079783439636\t0.02339180000126362\t0.0\t326\t2\t0.006134999915957451\t163\t161\t2\t0\t0.9877300262451172\t0.012269900180399418\t0.0\tNA\tNA\tNA\tNA\t358\t6\t0.016759999096393585\t179\t173\t6\t0\t0.966480016708374\t0.033519599586725235\t0.0\t750\t54\t0.07199999690055847\t375\t321\t54\t0\t0.8560000061988831\t0.14399999380111694\t0.0\t418\t28\t0.06698600202798843\t209\t181\t28\t0\t0.8660290241241455\t0.13397100567817688\t0.0\tNA\tNA\tNA\tNA\t328\t25\t0.07621999830007553\t164\t139\t25\t0\t0.8475610017776489\t0.15243899822235107\t0.0\t3346\t48\t0.014344999566674232\t1673\t1625\t48\t0\t0.9713090062141418\t0.028690999373793602\t0.0\t1498\t22\t0.014685999602079391\t749\t727\t22\t0\t0.9706270098686218\t0.029372500255703926\t0.0\tNA\tNA\tNA\tNA\t1840\t26\t0.01413000002503395\t920\t894\t26\t0\t0.9717389941215515\t0.02826089970767498\t0.0\t88\t1\t0.011363999918103218\t44\t43\t1\t0\t0.9772729873657227\t0.022727299481630325\t0.0\t44\t1\t0.0227269995957613\t22\t21\t1\t0\t0.9545450210571289\t0.04545449838042259\t0.0\tNA\tNA\tNA\tNA\t44\t0\t0.0\t22\t22\t0\t0\t1.0\t0.0\t0.0\tUNSTABLE_AF_PCRMINUS,LOW_CALL_RATE\n" + "1\t39999\t47200\tgnomAD-SV_v2.1_DUP_1_1\tDUP\tdepth\tFalse\tNA\tNA\tNA\tNA\t47200\tRD\tFalse\tFalse\tFalse\tNA\tNA\tNA\tNA\tTrue\tNA\tNA\tNA\tNA\tOR4F5\tNA\tNA\tNA\tNA\t7200\tDUP\tNA\tFalse\tFalse\t13674\t943\t0.06896299868822098\t6837\t5985\t761\t91\t0.8753839731216431\t0.11130599677562714\t0.013309899717569351\t6878\t499\t0.07254999876022339\t3439\t2987\t405\t47\t0.8685659766197205\t0.11776699870824814\t0.01366680022329092\tNA\tNA\tNA\tNA\tFalse\t6770\t442\t0.0652879998087883\t3385\t2987\t354\t44\t0.8824219703674316\t0.10457900166511536\t0.012998499907553196\t0.1356939971446991\t6382\t866\t0.1356939971446991\t3191\t2415\t686\t90\t0.756816029548645\t0.21498000621795654\t0.0282042995095253\t3470\t460\t0.13256500661373138\t1735\t1322\t366\t47\t0.7619600296020508\t0.21095100045204163\t0.027089299634099007\tNA\tNA\tNA\tNA\t2904\t404\t0.1391180008649826\t1452\t1091\t318\t43\t0.7513769865036011\t0.21900799870491028\t0.029614299535751343\t918\t21\t0.02287600003182888\t459\t439\t19\t1\t0.9564269781112671\t0.041394300758838654\t0.0021786498837172985\t426\t12\t0.028169000521302223\t213\t201\t12\t0\t0.9436619877815247\t0.056338001042604446\t0.0\tNA\tNA\tNA\tNA\t490\t9\t0.018366999924182892\t245\t237\t7\t1\t0.9673470258712769\t0.02857140079140663\t0.004081630147993565\t1544\t17\t0.011009999550879002\t772\t755\t17\t0\t0.9779790043830872\t0.022020699456334114\t0.0\t902\t11\t0.012195000424981117\t451\t440\t11\t0\t0.9756100177764893\t0.024390200152993202\t0.0\tNA\tNA\tNA\tNA\t638\t6\t0.009403999894857407\t319\t313\t6\t0\t0.9811909794807434\t0.018808800727128983\t0.0\t4716\t37\t0.007845999673008919\t2358\t2321\t37\t0\t0.9843090176582336\t0.015691300854086876\t0.0\t2034\t15\t0.007375000044703484\t1017\t1002\t15\t0\t0.9852510094642639\t0.014749299734830856\t0.0\tNA\tNA\tNA\tNA\t2670\t22\t0.00824000034481287\t1335\t1313\t22\t0\t0.9835209846496582\t0.016479400917887688\t0.0\t114\t2\t0.01754399947822094\t57\t55\t2\t0\t0.9649119973182678\t0.035087700933218\t0.0\t46\t1\t0.02173900045454502\t23\t22\t1\t0\t0.9565219879150391\t0.04347829893231392\t0.0\tNA\tNA\tNA\tNA\t68\t1\t0.014705999754369259\t34\t33\t1\t0\t0.9705880284309387\t0.02941180020570755\t0.0\tUNSTABLE_AF_PCRMINUS,LOW_CALL_RATE\n" + "1\t54664\t54666\tgnomAD-SV_v2.1_INS_1_1\tINS\tmanta\tFalse\t1\tNA\tNA\t54717\t54666\tSR\tTrue\tFalse\tFalse\t54716\tNA\tNA\tNA\tTrue\tNA\tNA\tNA\tNA\tOR4F5\tNA\tNA\tNA\tNA\t52\tINS\tNA\tFalse\tFalse\t21306\t2\t9.40000027185306e-05\t10653\t10651\t2\t0\t0.9998120069503784\t0.0001877409958979115\t0.0\t10870\t1\t9.200000204145908e-05\t5435\t5434\t1\t0\t0.9998160004615784\t0.00018399300461169332\t0.0\tNA\tNA\tNA\tNA\tFalse\t10390\t1\t9.600000339560211e-05\t5195\t5194\t1\t0\t0.9998080134391785\t0.00019249299657531083\t0.0\t0.0001340000017080456\t9380\t1\t0.00010699999984353781\t4690\t4689\t1\t0\t0.9997869729995728\t0.00021322000247891992\t0.0\t5148\t1\t0.0001939999929163605\t2574\t2573\t1\t0\t0.9996110200881958\t0.0003884999896399677\t0.0\tNA\tNA\tNA\tNA\t4220\t0\t0.0\t2110\t2110\t0\t0\t1.0\t0.0\t0.0\t1908\t0\t0.0\t954\t954\t0\t0\t1.0\t0.0\t0.0\t952\t0\t0.0\t476\t476\t0\t0\t1.0\t0.0\t0.0\tNA\tNA\tNA\tNA\t948\t0\t0.0\t474\t474\t0\t0\t1.0\t0.0\t0.0\t2366\t0\t0.0\t1183\t1183\t0\t0\t1.0\t0.0\t0.0\t1366\t0\t0.0\t683\t683\t0\t0\t1.0\t0.0\t0.0\tNA\tNA\tNA\tNA\t992\t0\t0.0\t496\t496\t0\t0\t1.0\t0.0\t0.0\t7462\t1\t0.0001340000017080456\t3731\t3730\t1\t0\t0.9997320175170898\t0.00026802500360645354\t0.0\t3328\t0\t0.0\t1664\t1664\t0\t0\t1.0\t0.0\t0.0\tNA\tNA\tNA\tNA\t4116\t1\t0.00024300000222865492\t2058\t2057\t1\t0\t0.9995139837265015\t0.00048590899677947164\t0.0\t190\t0\t0.0\t95\t95\t0\t0\t1.0\t0.0\t0.0\t76\t0\t0.0\t38\t38\t0\t0\t1.0\t0.0\t0.0\tNA\tNA\tNA\tNA\t114\t0\t0.0\t57\t57\t0\t0\t1.0\t0.0\t0.0\tPASS\n"; using var reader = new StreamReader(GetStreamData(bedData)); using var gnomadSvParser = new GnomadSvBedParser(reader, ChromosomeUtilities.RefNameToChromosome); List svItemList = gnomadSvParser.GetItems().ToList(); // Count is 3 becuase breakends are skipped Assert.Equal(3, svItemList.Count); Assert.Equal( "\"chromosome\":\"1\",\"begin\":21001,\"end\":26000,\"variantId\":\"gnomAD-SV_v2.1_DEL_1_1\",\"variantType\":\"deletion\",\"failedFilter\":true,\"allAf\":0.016073,\"afrAf\":0.007262,\"amrAf\":0.011696,\"easAf\":0.072,\"eurAf\":0.014345,\"othAf\":0.011364,\"femaleAf\":0.015918,\"maleAf\":0.016039,\"allAc\":138,\"afrAc\":27,\"amrAc\":8,\"easAc\":54,\"eurAc\":48,\"othAc\":1,\"femaleAc\":68,\"maleAc\":69,\"allAn\":8586,\"afrAn\":3718,\"amrAn\":684,\"easAn\":750,\"eurAn\":3346,\"othAn\":88,\"femaleAn\":4272,\"maleAn\":4302,\"allHc\":0,\"afrHc\":0,\"amrHc\":0,\"easHc\":0,\"eurHc\":0,\"othHc\":43,\"femaleHc\":0,\"maleHc\":0", svItemList[0].GetJsonString() ); Assert.Equal( "\"chromosome\":\"1\",\"begin\":40001,\"end\":47200,\"variantId\":\"gnomAD-SV_v2.1_DUP_1_1\",\"variantType\":\"duplication\",\"failedFilter\":true,\"allAf\":0.068963,\"afrAf\":0.135694,\"amrAf\":0.022876,\"easAf\":0.01101,\"eurAf\":0.007846,\"othAf\":0.017544,\"femaleAf\":0.065288,\"maleAf\":0.07255,\"allAc\":943,\"afrAc\":866,\"amrAc\":21,\"easAc\":17,\"eurAc\":37,\"othAc\":2,\"femaleAc\":442,\"maleAc\":499,\"allAn\":13674,\"afrAn\":6382,\"amrAn\":918,\"easAn\":1544,\"eurAn\":4716,\"othAn\":114,\"femaleAn\":6770,\"maleAn\":6878,\"allHc\":91,\"afrHc\":90,\"amrHc\":1,\"easHc\":0,\"eurHc\":0,\"othHc\":55,\"femaleHc\":44,\"maleHc\":47", svItemList[1].GetJsonString() ); Assert.Equal( "\"chromosome\":\"1\",\"begin\":54666,\"end\":54665,\"variantId\":\"gnomAD-SV_v2.1_INS_1_1\",\"variantType\":\"insertion\",\"allAf\":0.000094,\"afrAf\":0.000107,\"amrAf\":0,\"easAf\":0,\"eurAf\":0.000134,\"othAf\":0,\"femaleAf\":0.000096,\"maleAf\":0.000092,\"allAc\":2,\"afrAc\":1,\"amrAc\":0,\"easAc\":0,\"eurAc\":1,\"othAc\":0,\"femaleAc\":1,\"maleAc\":1,\"allAn\":21306,\"afrAn\":9380,\"amrAn\":1908,\"easAn\":2366,\"eurAn\":7462,\"othAn\":190,\"femaleAn\":10390,\"maleAn\":10870,\"allHc\":0,\"afrHc\":0,\"amrHc\":0,\"easHc\":0,\"eurHc\":0,\"othHc\":95,\"femaleHc\":0,\"maleHc\":0", svItemList[2].GetJsonString() ); } [Fact] public void TestUnknownChromosome() { const string bedData = "#chrom\tstart\tend\tname\tsvtype\tALGORITHMS\tBOTHSIDES_SUPPORT\tCHR2\tCPX_INTERVALS\tCPX_TYPE\tEND2\tEND\tEVIDENCE\tHIGH_SR_BACKGROUND\tPCRPLUS_DEPLETED\tPESR_GT_OVERDISPERSION\tPOS2\tPROTEIN_CODING__COPY_GAIN\tPROTEIN_CODING__DUP_LOF\tPROTEIN_CODING__DUP_PARTIAL\tPROTEIN_CODING__INTERGENIC\tPROTEIN_CODING__INTRONIC\tPROTEIN_CODING__INV_SPAN\tPROTEIN_CODING__LOF\tPROTEIN_CODING__MSV_EXON_OVR\tPROTEIN_CODING__NEAREST_TSS\tPROTEIN_CODING__PROMOTER\tPROTEIN_CODING__UTR\tSOURCE\tSTRANDS\tSVLEN\tSVTYPE\tUNRESOLVED_TYPE\tUNSTABLE_AF_PCRPLUS\tVARIABLE_ACROSS_BATCHES\tAN\tAC\tAF\tN_BI_GENOS\tN_HOMREF\tN_HET\tN_HOMALT\tFREQ_HOMREF\tFREQ_HET\tFREQ_HOMALT\tMALE_AN\tMALE_AC\tMALE_AF\tMALE_N_BI_GENOS\tMALE_N_HOMREF\tMALE_N_HET\tMALE_N_HOMALT\tMALE_FREQ_HOMREF\tMALE_FREQ_HET\tMALE_FREQ_HOMALT\tMALE_N_HEMIREF\tMALE_N_HEMIALT\tMALE_FREQ_HEMIREF\tMALE_FREQ_HEMIALT\tPAR\tFEMALE_AN\tFEMALE_AC\tFEMALE_AF\tFEMALE_N_BI_GENOS\tFEMALE_N_HOMREF\tFEMALE_N_HET\tFEMALE_N_HOMALT\tFEMALE_FREQ_HOMREF\tFEMALE_FREQ_HET\tFEMALE_FREQ_HOMALT\tPOPMAX_AF\tAFR_AN\tAFR_AC\tAFR_AF\tAFR_N_BI_GENOS\tAFR_N_HOMREF\tAFR_N_HET\tAFR_N_HOMALT\tAFR_FREQ_HOMREF\tAFR_FREQ_HET\tAFR_FREQ_HOMALT\tAFR_MALE_AN\tAFR_MALE_AC\tAFR_MALE_AF\tAFR_MALE_N_BI_GENOS\tAFR_MALE_N_HOMREF\tAFR_MALE_N_HET\tAFR_MALE_N_HOMALT\tAFR_MALE_FREQ_HOMREF\tAFR_MALE_FREQ_HET\tAFR_MALE_FREQ_HOMALT\tAFR_MALE_N_HEMIREF\tAFR_MALE_N_HEMIALT\tAFR_MALE_FREQ_HEMIREF\tAFR_MALE_FREQ_HEMIALT\tAFR_FEMALE_AN\tAFR_FEMALE_AC\tAFR_FEMALE_AF\tAFR_FEMALE_N_BI_GENOS\tAFR_FEMALE_N_HOMREF\tAFR_FEMALE_N_HET\tAFR_FEMALE_N_HOMALT\tAFR_FEMALE_FREQ_HOMREF\tAFR_FEMALE_FREQ_HET\tAFR_FEMALE_FREQ_HOMALT\tAMR_AN\tAMR_AC\tAMR_AF\tAMR_N_BI_GENOS\tAMR_N_HOMREF\tAMR_N_HET\tAMR_N_HOMALT\tAMR_FREQ_HOMREF\tAMR_FREQ_HET\tAMR_FREQ_HOMALT\tAMR_MALE_AN\tAMR_MALE_AC\tAMR_MALE_AF\tAMR_MALE_N_BI_GENOS\tAMR_MALE_N_HOMREF\tAMR_MALE_N_HET\tAMR_MALE_N_HOMALT\tAMR_MALE_FREQ_HOMREF\tAMR_MALE_FREQ_HET\tAMR_MALE_FREQ_HOMALT\tAMR_MALE_N_HEMIREF\tAMR_MALE_N_HEMIALT\tAMR_MALE_FREQ_HEMIREF\tAMR_MALE_FREQ_HEMIALT\tAMR_FEMALE_AN\tAMR_FEMALE_AC\tAMR_FEMALE_AF\tAMR_FEMALE_N_BI_GENOS\tAMR_FEMALE_N_HOMREF\tAMR_FEMALE_N_HET\tAMR_FEMALE_N_HOMALT\tAMR_FEMALE_FREQ_HOMREF\tAMR_FEMALE_FREQ_HET\tAMR_FEMALE_FREQ_HOMALT\tEAS_AN\tEAS_AC\tEAS_AF\tEAS_N_BI_GENOS\tEAS_N_HOMREF\tEAS_N_HET\tEAS_N_HOMALT\tEAS_FREQ_HOMREF\tEAS_FREQ_HET\tEAS_FREQ_HOMALT\tEAS_MALE_AN\tEAS_MALE_AC\tEAS_MALE_AF\tEAS_MALE_N_BI_GENOS\tEAS_MALE_N_HOMREF\tEAS_MALE_N_HET\tEAS_MALE_N_HOMALT\tEAS_MALE_FREQ_HOMREF\tEAS_MALE_FREQ_HET\tEAS_MALE_FREQ_HOMALT\tEAS_MALE_N_HEMIREF\tEAS_MALE_N_HEMIALT\tEAS_MALE_FREQ_HEMIREF\tEAS_MALE_FREQ_HEMIALT\tEAS_FEMALE_AN\tEAS_FEMALE_AC\tEAS_FEMALE_AF\tEAS_FEMALE_N_BI_GENOS\tEAS_FEMALE_N_HOMREF\tEAS_FEMALE_N_HET\tEAS_FEMALE_N_HOMALT\tEAS_FEMALE_FREQ_HOMREF\tEAS_FEMALE_FREQ_HET\tEAS_FEMALE_FREQ_HOMALT\tEUR_AN\tEUR_AC\tEUR_AF\tEUR_N_BI_GENOS\tEUR_N_HOMREF\tEUR_N_HET\tEUR_N_HOMALT\tEUR_FREQ_HOMREF\tEUR_FREQ_HET\tEUR_FREQ_HOMALT\tEUR_MALE_AN\tEUR_MALE_AC\tEUR_MALE_AF\tEUR_MALE_N_BI_GENOS\tEUR_MALE_N_HOMREF\tEUR_MALE_N_HET\tEUR_MALE_N_HOMALT\tEUR_MALE_FREQ_HOMREF\tEUR_MALE_FREQ_HET\tEUR_MALE_FREQ_HOMALT\tEUR_MALE_N_HEMIREF\tEUR_MALE_N_HEMIALT\tEUR_MALE_FREQ_HEMIREF\tEUR_MALE_FREQ_HEMIALT\tEUR_FEMALE_AN\tEUR_FEMALE_AC\tEUR_FEMALE_AF\tEUR_FEMALE_N_BI_GENOS\tEUR_FEMALE_N_HOMREF\tEUR_FEMALE_N_HET\tEUR_FEMALE_N_HOMALT\tEUR_FEMALE_FREQ_HOMREF\tEUR_FEMALE_FREQ_HET\tEUR_FEMALE_FREQ_HOMALT\tOTH_AN\tOTH_AC\tOTH_AF\tOTH_N_BI_GENOS\tOTH_N_HOMREF\tOTH_N_HET\tOTH_N_HOMALT\tOTH_FREQ_HOMREF\tOTH_FREQ_HET\tOTH_FREQ_HOMALT\tOTH_MALE_AN\tOTH_MALE_AC\tOTH_MALE_AF\tOTH_MALE_N_BI_GENOS\tOTH_MALE_N_HOMREF\tOTH_MALE_N_HET\tOTH_MALE_N_HOMALT\tOTH_MALE_FREQ_HOMREF\tOTH_MALE_FREQ_HET\tOTH_MALE_FREQ_HOMALT\tOTH_MALE_N_HEMIREF\tOTH_MALE_N_HEMIALT\tOTH_MALE_FREQ_HEMIREF\tOTH_MALE_FREQ_HEMIALT\tOTH_FEMALE_AN\tOTH_FEMALE_AC\tOTH_FEMALE_AF\tOTH_FEMALE_N_BI_GENOS\tOTH_FEMALE_N_HOMREF\tOTH_FEMALE_N_HET\tOTH_FEMALE_N_HOMALT\tOTH_FEMALE_FREQ_HOMREF\tOTH_FEMALE_FREQ_HET\tOTH_FEMALE_FREQ_HOMALT\tFILTER\n" + "InvalidNumber\t10641\t10642\tgnomAD-SV_v2.1_BND_1_1\tBND\tmanta\tFalse\t15\tNA\tNA\t10643\t10643\tPE,SR\tFalse\tFalse\tTrue\t10642\tNA\tNA\tNA\tFalse\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\t-1\tBND\tSINGLE_ENDER_--\tFalse\tFalse\t21366\t145\t0.006785999983549118\t10683\t10543\t135\t5\t0.9868950247764587\t0.012636899948120117\t0.00046803298755548894\t10866\t69\t0.00634999992325902\t5433\t5366\t65\t2\t0.987667977809906\t0.011963900178670883\t0.000368120992789045\tNA\tNA\tNA\tNA\tFalse\t10454\t76\t0.007269999943673611\t5227\t5154\t70\t3\t0.9860339760780334\t0.013392000459134579\t0.0005739430198445916\t0.015956999734044075\t9398\t72\t0.007660999894142151\t4699\t4629\t68\t2\t0.9851030111312866\t0.014471200294792652\t0.0004256220126990229\t5154\t33\t0.006403000093996525\t2577\t2544\t33\t0\t0.9871940016746521\t0.012805599719285965\t0.0\tNA\tNA\tNA\tNA\t4232\t39\t0.009216000325977802\t2116\t2079\t35\t2\t0.9825140237808228\t0.01654059998691082\t0.0009451800142414868\t1910\t7\t0.003664999967440963\t955\t949\t5\t1\t0.9937170147895813\t0.00523559981957078\t0.001047119963914156\t950\t4\t0.004211000166833401\t475\t472\t2\t1\t0.9936839938163757\t0.00421052984893322\t0.0021052600350230932\tNA\tNA\tNA\tNA\t952\t3\t0.0031510000117123127\t476\t473\t3\t0\t0.9936969876289368\t0.006302520167082548\t0.0\t2296\t31\t0.013501999899744987\t1148\t1117\t31\t0\t0.9729970097541809\t0.02700350061058998\t0.0\t1312\t13\t0.009909000247716904\t656\t643\t13\t0\t0.9801830053329468\t0.01981710083782673\t0.0\tNA\tNA\tNA\tNA\t976\t18\t0.018442999571561813\t488\t470\t18\t0\t0.9631149768829346\t0.03688519820570946\t0.0\t7574\t32\t0.004224999807775021\t3787\t3757\t28\t2\t0.9920780062675476\t0.007393720094114542\t0.0005281229969114065\t3374\t17\t0.005038999952375889\t1687\t1671\t15\t1\t0.9905160069465637\t0.008891520090401173\t0.000592768017668277\tNA\tNA\tNA\tNA\t4182\t15\t0.003587000072002411\t2091\t2077\t13\t1\t0.9933050274848938\t0.006217120215296745\t0.00047823999193497\t188\t3\t0.015956999734044075\t94\t91\t3\t0\t0.968084990978241\t0.03191490098834038\t0.0\t76\t2\t0.026316000148653984\t38\t36\t2\t0\t0.9473680257797241\t0.05263160169124603\t0.0\tNA\tNA\tNA\tNA\t112\t1\t0.008929000236093998\t56\t55\t1\t0\t0.982142984867096\t0.017857100814580917\t0.0\tUNRESOLVED\n" + "1\t20999\t26000\tgnomAD-SV_v2.1_DEL_1_1\tDEL\tdepth\tFalse\tNA\tNA\tNA\tNA\t26000\tRD\tFalse\tFalse\tFalse\tNA\tNA\tNA\tNA\tTrue\tNA\tNA\tNA\tNA\tOR4F5\tNA\tNA\tNA\tNA\t5000\tDEL\tNA\tFalse\tFalse\t8586\t138\t0.01607299968600273\t4293\t4155\t138\t0\t0.9678549766540527\t0.03214539960026741\t0.0\t4302\t69\t0.01603900082409382\t2151\t2082\t69\t0\t0.9679219722747803\t0.0320780985057354\t0.0\tNA\tNA\tNA\tNA\tFalse\t4272\t68\t0.015917999669909477\t2136\t2068\t68\t0\t0.9681649804115295\t0.031835198402404785\t0.0\t0.07199999690055847\t3718\t27\t0.007261999882757664\t1859\t1832\t27\t0\t0.985476016998291\t0.014523900113999844\t0.0\t2016\t16\t0.007937000133097172\t1008\t992\t16\t0\t0.9841269850730896\t0.015873000025749207\t0.0\tNA\tNA\tNA\tNA\t1702\t11\t0.00646300008520484\t851\t840\t11\t0\t0.9870740175247192\t0.01292600017040968\t0.0\t684\t8\t0.011695999652147293\t342\t334\t8\t0\t0.9766079783439636\t0.02339180000126362\t0.0\t326\t2\t0.006134999915957451\t163\t161\t2\t0\t0.9877300262451172\t0.012269900180399418\t0.0\tNA\tNA\tNA\tNA\t358\t6\t0.016759999096393585\t179\t173\t6\t0\t0.966480016708374\t0.033519599586725235\t0.0\t750\t54\t0.07199999690055847\t375\t321\t54\t0\t0.8560000061988831\t0.14399999380111694\t0.0\t418\t28\t0.06698600202798843\t209\t181\t28\t0\t0.8660290241241455\t0.13397100567817688\t0.0\tNA\tNA\tNA\tNA\t328\t25\t0.07621999830007553\t164\t139\t25\t0\t0.8475610017776489\t0.15243899822235107\t0.0\t3346\t48\t0.014344999566674232\t1673\t1625\t48\t0\t0.9713090062141418\t0.028690999373793602\t0.0\t1498\t22\t0.014685999602079391\t749\t727\t22\t0\t0.9706270098686218\t0.029372500255703926\t0.0\tNA\tNA\tNA\tNA\t1840\t26\t0.01413000002503395\t920\t894\t26\t0\t0.9717389941215515\t0.02826089970767498\t0.0\t88\t1\t0.011363999918103218\t44\t43\t1\t0\t0.9772729873657227\t0.022727299481630325\t0.0\t44\t1\t0.0227269995957613\t22\t21\t1\t0\t0.9545450210571289\t0.04545449838042259\t0.0\tNA\tNA\tNA\tNA\t44\t0\t0.0\t22\t22\t0\t0\t1.0\t0.0\t0.0\tUNSTABLE_AF_PCRMINUS,LOW_CALL_RATE\n"; using var reader = new StreamReader(GetStreamData(bedData)); using var gnomadSvParser = new GnomadSvBedParser(reader, ChromosomeUtilities.RefNameToChromosome); List svItemList = gnomadSvParser.GetItems().ToList(); Assert.Single(svItemList); Assert.Equal( "\"chromosome\":\"1\",\"begin\":21001,\"end\":26000,\"variantId\":\"gnomAD-SV_v2.1_DEL_1_1\",\"variantType\":\"deletion\",\"failedFilter\":true,\"allAf\":0.016073,\"afrAf\":0.007262,\"amrAf\":0.011696,\"easAf\":0.072,\"eurAf\":0.014345,\"othAf\":0.011364,\"femaleAf\":0.015918,\"maleAf\":0.016039,\"allAc\":138,\"afrAc\":27,\"amrAc\":8,\"easAc\":54,\"eurAc\":48,\"othAc\":1,\"femaleAc\":68,\"maleAc\":69,\"allAn\":8586,\"afrAn\":3718,\"amrAn\":684,\"easAn\":750,\"eurAn\":3346,\"othAn\":88,\"femaleAn\":4272,\"maleAn\":4302,\"allHc\":0,\"afrHc\":0,\"amrHc\":0,\"easHc\":0,\"eurHc\":0,\"othHc\":43,\"femaleHc\":0,\"maleHc\":0", svItemList[0].GetJsonString() ); } [Fact] public void TestInvalidStart() { const string bedData = "#chrom\tstart\tend\tname\tsvtype\tALGORITHMS\tBOTHSIDES_SUPPORT\tCHR2\tCPX_INTERVALS\tCPX_TYPE\tEND2\tEND\tEVIDENCE\tHIGH_SR_BACKGROUND\tPCRPLUS_DEPLETED\tPESR_GT_OVERDISPERSION\tPOS2\tPROTEIN_CODING__COPY_GAIN\tPROTEIN_CODING__DUP_LOF\tPROTEIN_CODING__DUP_PARTIAL\tPROTEIN_CODING__INTERGENIC\tPROTEIN_CODING__INTRONIC\tPROTEIN_CODING__INV_SPAN\tPROTEIN_CODING__LOF\tPROTEIN_CODING__MSV_EXON_OVR\tPROTEIN_CODING__NEAREST_TSS\tPROTEIN_CODING__PROMOTER\tPROTEIN_CODING__UTR\tSOURCE\tSTRANDS\tSVLEN\tSVTYPE\tUNRESOLVED_TYPE\tUNSTABLE_AF_PCRPLUS\tVARIABLE_ACROSS_BATCHES\tAN\tAC\tAF\tN_BI_GENOS\tN_HOMREF\tN_HET\tN_HOMALT\tFREQ_HOMREF\tFREQ_HET\tFREQ_HOMALT\tMALE_AN\tMALE_AC\tMALE_AF\tMALE_N_BI_GENOS\tMALE_N_HOMREF\tMALE_N_HET\tMALE_N_HOMALT\tMALE_FREQ_HOMREF\tMALE_FREQ_HET\tMALE_FREQ_HOMALT\tMALE_N_HEMIREF\tMALE_N_HEMIALT\tMALE_FREQ_HEMIREF\tMALE_FREQ_HEMIALT\tPAR\tFEMALE_AN\tFEMALE_AC\tFEMALE_AF\tFEMALE_N_BI_GENOS\tFEMALE_N_HOMREF\tFEMALE_N_HET\tFEMALE_N_HOMALT\tFEMALE_FREQ_HOMREF\tFEMALE_FREQ_HET\tFEMALE_FREQ_HOMALT\tPOPMAX_AF\tAFR_AN\tAFR_AC\tAFR_AF\tAFR_N_BI_GENOS\tAFR_N_HOMREF\tAFR_N_HET\tAFR_N_HOMALT\tAFR_FREQ_HOMREF\tAFR_FREQ_HET\tAFR_FREQ_HOMALT\tAFR_MALE_AN\tAFR_MALE_AC\tAFR_MALE_AF\tAFR_MALE_N_BI_GENOS\tAFR_MALE_N_HOMREF\tAFR_MALE_N_HET\tAFR_MALE_N_HOMALT\tAFR_MALE_FREQ_HOMREF\tAFR_MALE_FREQ_HET\tAFR_MALE_FREQ_HOMALT\tAFR_MALE_N_HEMIREF\tAFR_MALE_N_HEMIALT\tAFR_MALE_FREQ_HEMIREF\tAFR_MALE_FREQ_HEMIALT\tAFR_FEMALE_AN\tAFR_FEMALE_AC\tAFR_FEMALE_AF\tAFR_FEMALE_N_BI_GENOS\tAFR_FEMALE_N_HOMREF\tAFR_FEMALE_N_HET\tAFR_FEMALE_N_HOMALT\tAFR_FEMALE_FREQ_HOMREF\tAFR_FEMALE_FREQ_HET\tAFR_FEMALE_FREQ_HOMALT\tAMR_AN\tAMR_AC\tAMR_AF\tAMR_N_BI_GENOS\tAMR_N_HOMREF\tAMR_N_HET\tAMR_N_HOMALT\tAMR_FREQ_HOMREF\tAMR_FREQ_HET\tAMR_FREQ_HOMALT\tAMR_MALE_AN\tAMR_MALE_AC\tAMR_MALE_AF\tAMR_MALE_N_BI_GENOS\tAMR_MALE_N_HOMREF\tAMR_MALE_N_HET\tAMR_MALE_N_HOMALT\tAMR_MALE_FREQ_HOMREF\tAMR_MALE_FREQ_HET\tAMR_MALE_FREQ_HOMALT\tAMR_MALE_N_HEMIREF\tAMR_MALE_N_HEMIALT\tAMR_MALE_FREQ_HEMIREF\tAMR_MALE_FREQ_HEMIALT\tAMR_FEMALE_AN\tAMR_FEMALE_AC\tAMR_FEMALE_AF\tAMR_FEMALE_N_BI_GENOS\tAMR_FEMALE_N_HOMREF\tAMR_FEMALE_N_HET\tAMR_FEMALE_N_HOMALT\tAMR_FEMALE_FREQ_HOMREF\tAMR_FEMALE_FREQ_HET\tAMR_FEMALE_FREQ_HOMALT\tEAS_AN\tEAS_AC\tEAS_AF\tEAS_N_BI_GENOS\tEAS_N_HOMREF\tEAS_N_HET\tEAS_N_HOMALT\tEAS_FREQ_HOMREF\tEAS_FREQ_HET\tEAS_FREQ_HOMALT\tEAS_MALE_AN\tEAS_MALE_AC\tEAS_MALE_AF\tEAS_MALE_N_BI_GENOS\tEAS_MALE_N_HOMREF\tEAS_MALE_N_HET\tEAS_MALE_N_HOMALT\tEAS_MALE_FREQ_HOMREF\tEAS_MALE_FREQ_HET\tEAS_MALE_FREQ_HOMALT\tEAS_MALE_N_HEMIREF\tEAS_MALE_N_HEMIALT\tEAS_MALE_FREQ_HEMIREF\tEAS_MALE_FREQ_HEMIALT\tEAS_FEMALE_AN\tEAS_FEMALE_AC\tEAS_FEMALE_AF\tEAS_FEMALE_N_BI_GENOS\tEAS_FEMALE_N_HOMREF\tEAS_FEMALE_N_HET\tEAS_FEMALE_N_HOMALT\tEAS_FEMALE_FREQ_HOMREF\tEAS_FEMALE_FREQ_HET\tEAS_FEMALE_FREQ_HOMALT\tEUR_AN\tEUR_AC\tEUR_AF\tEUR_N_BI_GENOS\tEUR_N_HOMREF\tEUR_N_HET\tEUR_N_HOMALT\tEUR_FREQ_HOMREF\tEUR_FREQ_HET\tEUR_FREQ_HOMALT\tEUR_MALE_AN\tEUR_MALE_AC\tEUR_MALE_AF\tEUR_MALE_N_BI_GENOS\tEUR_MALE_N_HOMREF\tEUR_MALE_N_HET\tEUR_MALE_N_HOMALT\tEUR_MALE_FREQ_HOMREF\tEUR_MALE_FREQ_HET\tEUR_MALE_FREQ_HOMALT\tEUR_MALE_N_HEMIREF\tEUR_MALE_N_HEMIALT\tEUR_MALE_FREQ_HEMIREF\tEUR_MALE_FREQ_HEMIALT\tEUR_FEMALE_AN\tEUR_FEMALE_AC\tEUR_FEMALE_AF\tEUR_FEMALE_N_BI_GENOS\tEUR_FEMALE_N_HOMREF\tEUR_FEMALE_N_HET\tEUR_FEMALE_N_HOMALT\tEUR_FEMALE_FREQ_HOMREF\tEUR_FEMALE_FREQ_HET\tEUR_FEMALE_FREQ_HOMALT\tOTH_AN\tOTH_AC\tOTH_AF\tOTH_N_BI_GENOS\tOTH_N_HOMREF\tOTH_N_HET\tOTH_N_HOMALT\tOTH_FREQ_HOMREF\tOTH_FREQ_HET\tOTH_FREQ_HOMALT\tOTH_MALE_AN\tOTH_MALE_AC\tOTH_MALE_AF\tOTH_MALE_N_BI_GENOS\tOTH_MALE_N_HOMREF\tOTH_MALE_N_HET\tOTH_MALE_N_HOMALT\tOTH_MALE_FREQ_HOMREF\tOTH_MALE_FREQ_HET\tOTH_MALE_FREQ_HOMALT\tOTH_MALE_N_HEMIREF\tOTH_MALE_N_HEMIALT\tOTH_MALE_FREQ_HEMIREF\tOTH_MALE_FREQ_HEMIALT\tOTH_FEMALE_AN\tOTH_FEMALE_AC\tOTH_FEMALE_AF\tOTH_FEMALE_N_BI_GENOS\tOTH_FEMALE_N_HOMREF\tOTH_FEMALE_N_HET\tOTH_FEMALE_N_HOMALT\tOTH_FEMALE_FREQ_HOMREF\tOTH_FEMALE_FREQ_HET\tOTH_FEMALE_FREQ_HOMALT\tFILTER\n" + "1\tInvalid-10641\t10642\tgnomAD-SV_v2.1_BND_1_1\tBND\tmanta\tFalse\t15\tNA\tNA\t10643\t10643\tPE,SR\tFalse\tFalse\tTrue\t10642\tNA\tNA\tNA\tFalse\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\t-1\tBND\tSINGLE_ENDER_--\tFalse\tFalse\t21366\t145\t0.006785999983549118\t10683\t10543\t135\t5\t0.9868950247764587\t0.012636899948120117\t0.00046803298755548894\t10866\t69\t0.00634999992325902\t5433\t5366\t65\t2\t0.987667977809906\t0.011963900178670883\t0.000368120992789045\tNA\tNA\tNA\tNA\tFalse\t10454\t76\t0.007269999943673611\t5227\t5154\t70\t3\t0.9860339760780334\t0.013392000459134579\t0.0005739430198445916\t0.015956999734044075\t9398\t72\t0.007660999894142151\t4699\t4629\t68\t2\t0.9851030111312866\t0.014471200294792652\t0.0004256220126990229\t5154\t33\t0.006403000093996525\t2577\t2544\t33\t0\t0.9871940016746521\t0.012805599719285965\t0.0\tNA\tNA\tNA\tNA\t4232\t39\t0.009216000325977802\t2116\t2079\t35\t2\t0.9825140237808228\t0.01654059998691082\t0.0009451800142414868\t1910\t7\t0.003664999967440963\t955\t949\t5\t1\t0.9937170147895813\t0.00523559981957078\t0.001047119963914156\t950\t4\t0.004211000166833401\t475\t472\t2\t1\t0.9936839938163757\t0.00421052984893322\t0.0021052600350230932\tNA\tNA\tNA\tNA\t952\t3\t0.0031510000117123127\t476\t473\t3\t0\t0.9936969876289368\t0.006302520167082548\t0.0\t2296\t31\t0.013501999899744987\t1148\t1117\t31\t0\t0.9729970097541809\t0.02700350061058998\t0.0\t1312\t13\t0.009909000247716904\t656\t643\t13\t0\t0.9801830053329468\t0.01981710083782673\t0.0\tNA\tNA\tNA\tNA\t976\t18\t0.018442999571561813\t488\t470\t18\t0\t0.9631149768829346\t0.03688519820570946\t0.0\t7574\t32\t0.004224999807775021\t3787\t3757\t28\t2\t0.9920780062675476\t0.007393720094114542\t0.0005281229969114065\t3374\t17\t0.005038999952375889\t1687\t1671\t15\t1\t0.9905160069465637\t0.008891520090401173\t0.000592768017668277\tNA\tNA\tNA\tNA\t4182\t15\t0.003587000072002411\t2091\t2077\t13\t1\t0.9933050274848938\t0.006217120215296745\t0.00047823999193497\t188\t3\t0.015956999734044075\t94\t91\t3\t0\t0.968084990978241\t0.03191490098834038\t0.0\t76\t2\t0.026316000148653984\t38\t36\t2\t0\t0.9473680257797241\t0.05263160169124603\t0.0\tNA\tNA\tNA\tNA\t112\t1\t0.008929000236093998\t56\t55\t1\t0\t0.982142984867096\t0.017857100814580917\t0.0\tUNRESOLVED\n"; using var reader = new StreamReader(GetStreamData(bedData)); using var gnomadSvParser = new GnomadSvBedParser(reader, ChromosomeUtilities.RefNameToChromosome); Assert.Throws(() => gnomadSvParser.GetItems().ToList()); } [Fact] public void TestInvalidEnd() { const string bedData = "#chrom\tstart\tend\tname\tsvtype\tALGORITHMS\tBOTHSIDES_SUPPORT\tCHR2\tCPX_INTERVALS\tCPX_TYPE\tEND2\tEND\tEVIDENCE\tHIGH_SR_BACKGROUND\tPCRPLUS_DEPLETED\tPESR_GT_OVERDISPERSION\tPOS2\tPROTEIN_CODING__COPY_GAIN\tPROTEIN_CODING__DUP_LOF\tPROTEIN_CODING__DUP_PARTIAL\tPROTEIN_CODING__INTERGENIC\tPROTEIN_CODING__INTRONIC\tPROTEIN_CODING__INV_SPAN\tPROTEIN_CODING__LOF\tPROTEIN_CODING__MSV_EXON_OVR\tPROTEIN_CODING__NEAREST_TSS\tPROTEIN_CODING__PROMOTER\tPROTEIN_CODING__UTR\tSOURCE\tSTRANDS\tSVLEN\tSVTYPE\tUNRESOLVED_TYPE\tUNSTABLE_AF_PCRPLUS\tVARIABLE_ACROSS_BATCHES\tAN\tAC\tAF\tN_BI_GENOS\tN_HOMREF\tN_HET\tN_HOMALT\tFREQ_HOMREF\tFREQ_HET\tFREQ_HOMALT\tMALE_AN\tMALE_AC\tMALE_AF\tMALE_N_BI_GENOS\tMALE_N_HOMREF\tMALE_N_HET\tMALE_N_HOMALT\tMALE_FREQ_HOMREF\tMALE_FREQ_HET\tMALE_FREQ_HOMALT\tMALE_N_HEMIREF\tMALE_N_HEMIALT\tMALE_FREQ_HEMIREF\tMALE_FREQ_HEMIALT\tPAR\tFEMALE_AN\tFEMALE_AC\tFEMALE_AF\tFEMALE_N_BI_GENOS\tFEMALE_N_HOMREF\tFEMALE_N_HET\tFEMALE_N_HOMALT\tFEMALE_FREQ_HOMREF\tFEMALE_FREQ_HET\tFEMALE_FREQ_HOMALT\tPOPMAX_AF\tAFR_AN\tAFR_AC\tAFR_AF\tAFR_N_BI_GENOS\tAFR_N_HOMREF\tAFR_N_HET\tAFR_N_HOMALT\tAFR_FREQ_HOMREF\tAFR_FREQ_HET\tAFR_FREQ_HOMALT\tAFR_MALE_AN\tAFR_MALE_AC\tAFR_MALE_AF\tAFR_MALE_N_BI_GENOS\tAFR_MALE_N_HOMREF\tAFR_MALE_N_HET\tAFR_MALE_N_HOMALT\tAFR_MALE_FREQ_HOMREF\tAFR_MALE_FREQ_HET\tAFR_MALE_FREQ_HOMALT\tAFR_MALE_N_HEMIREF\tAFR_MALE_N_HEMIALT\tAFR_MALE_FREQ_HEMIREF\tAFR_MALE_FREQ_HEMIALT\tAFR_FEMALE_AN\tAFR_FEMALE_AC\tAFR_FEMALE_AF\tAFR_FEMALE_N_BI_GENOS\tAFR_FEMALE_N_HOMREF\tAFR_FEMALE_N_HET\tAFR_FEMALE_N_HOMALT\tAFR_FEMALE_FREQ_HOMREF\tAFR_FEMALE_FREQ_HET\tAFR_FEMALE_FREQ_HOMALT\tAMR_AN\tAMR_AC\tAMR_AF\tAMR_N_BI_GENOS\tAMR_N_HOMREF\tAMR_N_HET\tAMR_N_HOMALT\tAMR_FREQ_HOMREF\tAMR_FREQ_HET\tAMR_FREQ_HOMALT\tAMR_MALE_AN\tAMR_MALE_AC\tAMR_MALE_AF\tAMR_MALE_N_BI_GENOS\tAMR_MALE_N_HOMREF\tAMR_MALE_N_HET\tAMR_MALE_N_HOMALT\tAMR_MALE_FREQ_HOMREF\tAMR_MALE_FREQ_HET\tAMR_MALE_FREQ_HOMALT\tAMR_MALE_N_HEMIREF\tAMR_MALE_N_HEMIALT\tAMR_MALE_FREQ_HEMIREF\tAMR_MALE_FREQ_HEMIALT\tAMR_FEMALE_AN\tAMR_FEMALE_AC\tAMR_FEMALE_AF\tAMR_FEMALE_N_BI_GENOS\tAMR_FEMALE_N_HOMREF\tAMR_FEMALE_N_HET\tAMR_FEMALE_N_HOMALT\tAMR_FEMALE_FREQ_HOMREF\tAMR_FEMALE_FREQ_HET\tAMR_FEMALE_FREQ_HOMALT\tEAS_AN\tEAS_AC\tEAS_AF\tEAS_N_BI_GENOS\tEAS_N_HOMREF\tEAS_N_HET\tEAS_N_HOMALT\tEAS_FREQ_HOMREF\tEAS_FREQ_HET\tEAS_FREQ_HOMALT\tEAS_MALE_AN\tEAS_MALE_AC\tEAS_MALE_AF\tEAS_MALE_N_BI_GENOS\tEAS_MALE_N_HOMREF\tEAS_MALE_N_HET\tEAS_MALE_N_HOMALT\tEAS_MALE_FREQ_HOMREF\tEAS_MALE_FREQ_HET\tEAS_MALE_FREQ_HOMALT\tEAS_MALE_N_HEMIREF\tEAS_MALE_N_HEMIALT\tEAS_MALE_FREQ_HEMIREF\tEAS_MALE_FREQ_HEMIALT\tEAS_FEMALE_AN\tEAS_FEMALE_AC\tEAS_FEMALE_AF\tEAS_FEMALE_N_BI_GENOS\tEAS_FEMALE_N_HOMREF\tEAS_FEMALE_N_HET\tEAS_FEMALE_N_HOMALT\tEAS_FEMALE_FREQ_HOMREF\tEAS_FEMALE_FREQ_HET\tEAS_FEMALE_FREQ_HOMALT\tEUR_AN\tEUR_AC\tEUR_AF\tEUR_N_BI_GENOS\tEUR_N_HOMREF\tEUR_N_HET\tEUR_N_HOMALT\tEUR_FREQ_HOMREF\tEUR_FREQ_HET\tEUR_FREQ_HOMALT\tEUR_MALE_AN\tEUR_MALE_AC\tEUR_MALE_AF\tEUR_MALE_N_BI_GENOS\tEUR_MALE_N_HOMREF\tEUR_MALE_N_HET\tEUR_MALE_N_HOMALT\tEUR_MALE_FREQ_HOMREF\tEUR_MALE_FREQ_HET\tEUR_MALE_FREQ_HOMALT\tEUR_MALE_N_HEMIREF\tEUR_MALE_N_HEMIALT\tEUR_MALE_FREQ_HEMIREF\tEUR_MALE_FREQ_HEMIALT\tEUR_FEMALE_AN\tEUR_FEMALE_AC\tEUR_FEMALE_AF\tEUR_FEMALE_N_BI_GENOS\tEUR_FEMALE_N_HOMREF\tEUR_FEMALE_N_HET\tEUR_FEMALE_N_HOMALT\tEUR_FEMALE_FREQ_HOMREF\tEUR_FEMALE_FREQ_HET\tEUR_FEMALE_FREQ_HOMALT\tOTH_AN\tOTH_AC\tOTH_AF\tOTH_N_BI_GENOS\tOTH_N_HOMREF\tOTH_N_HET\tOTH_N_HOMALT\tOTH_FREQ_HOMREF\tOTH_FREQ_HET\tOTH_FREQ_HOMALT\tOTH_MALE_AN\tOTH_MALE_AC\tOTH_MALE_AF\tOTH_MALE_N_BI_GENOS\tOTH_MALE_N_HOMREF\tOTH_MALE_N_HET\tOTH_MALE_N_HOMALT\tOTH_MALE_FREQ_HOMREF\tOTH_MALE_FREQ_HET\tOTH_MALE_FREQ_HOMALT\tOTH_MALE_N_HEMIREF\tOTH_MALE_N_HEMIALT\tOTH_MALE_FREQ_HEMIREF\tOTH_MALE_FREQ_HEMIALT\tOTH_FEMALE_AN\tOTH_FEMALE_AC\tOTH_FEMALE_AF\tOTH_FEMALE_N_BI_GENOS\tOTH_FEMALE_N_HOMREF\tOTH_FEMALE_N_HET\tOTH_FEMALE_N_HOMALT\tOTH_FEMALE_FREQ_HOMREF\tOTH_FEMALE_FREQ_HET\tOTH_FEMALE_FREQ_HOMALT\tFILTER\n" + "1\t20999\tInvalid-26000\tgnomAD-SV_v2.1_DEL_1_1\tDEL\tdepth\tFalse\tNA\tNA\tNA\tNA\t26000\tRD\tFalse\tFalse\tFalse\tNA\tNA\tNA\tNA\tTrue\tNA\tNA\tNA\tNA\tOR4F5\tNA\tNA\tNA\tNA\t5000\tDEL\tNA\tFalse\tFalse\t8586\t138\t0.01607299968600273\t4293\t4155\t138\t0\t0.9678549766540527\t0.03214539960026741\t0.0\t4302\t69\t0.01603900082409382\t2151\t2082\t69\t0\t0.9679219722747803\t0.0320780985057354\t0.0\tNA\tNA\tNA\tNA\tFalse\t4272\t68\t0.015917999669909477\t2136\t2068\t68\t0\t0.9681649804115295\t0.031835198402404785\t0.0\t0.07199999690055847\t3718\t27\t0.007261999882757664\t1859\t1832\t27\t0\t0.985476016998291\t0.014523900113999844\t0.0\t2016\t16\t0.007937000133097172\t1008\t992\t16\t0\t0.9841269850730896\t0.015873000025749207\t0.0\tNA\tNA\tNA\tNA\t1702\t11\t0.00646300008520484\t851\t840\t11\t0\t0.9870740175247192\t0.01292600017040968\t0.0\t684\t8\t0.011695999652147293\t342\t334\t8\t0\t0.9766079783439636\t0.02339180000126362\t0.0\t326\t2\t0.006134999915957451\t163\t161\t2\t0\t0.9877300262451172\t0.012269900180399418\t0.0\tNA\tNA\tNA\tNA\t358\t6\t0.016759999096393585\t179\t173\t6\t0\t0.966480016708374\t0.033519599586725235\t0.0\t750\t54\t0.07199999690055847\t375\t321\t54\t0\t0.8560000061988831\t0.14399999380111694\t0.0\t418\t28\t0.06698600202798843\t209\t181\t28\t0\t0.8660290241241455\t0.13397100567817688\t0.0\tNA\tNA\tNA\tNA\t328\t25\t0.07621999830007553\t164\t139\t25\t0\t0.8475610017776489\t0.15243899822235107\t0.0\t3346\t48\t0.014344999566674232\t1673\t1625\t48\t0\t0.9713090062141418\t0.028690999373793602\t0.0\t1498\t22\t0.014685999602079391\t749\t727\t22\t0\t0.9706270098686218\t0.029372500255703926\t0.0\tNA\tNA\tNA\tNA\t1840\t26\t0.01413000002503395\t920\t894\t26\t0\t0.9717389941215515\t0.02826089970767498\t0.0\t88\t1\t0.011363999918103218\t44\t43\t1\t0\t0.9772729873657227\t0.022727299481630325\t0.0\t44\t1\t0.0227269995957613\t22\t21\t1\t0\t0.9545450210571289\t0.04545449838042259\t0.0\tNA\tNA\tNA\tNA\t44\t0\t0.0\t22\t22\t0\t0\t1.0\t0.0\t0.0\tUNSTABLE_AF_PCRMINUS,LOW_CALL_RATE\n"; using var reader = new StreamReader(GetStreamData(bedData)); using var gnomadSvParser = new GnomadSvBedParser(reader, ChromosomeUtilities.RefNameToChromosome); Assert.Throws(() => gnomadSvParser.GetItems().ToList()); } [Fact] public void TestInvalidSvType() { const string bedData = "#chrom\tstart\tend\tname\tsvtype\tALGORITHMS\tBOTHSIDES_SUPPORT\tCHR2\tCPX_INTERVALS\tCPX_TYPE\tEND2\tEND\tEVIDENCE\tHIGH_SR_BACKGROUND\tPCRPLUS_DEPLETED\tPESR_GT_OVERDISPERSION\tPOS2\tPROTEIN_CODING__COPY_GAIN\tPROTEIN_CODING__DUP_LOF\tPROTEIN_CODING__DUP_PARTIAL\tPROTEIN_CODING__INTERGENIC\tPROTEIN_CODING__INTRONIC\tPROTEIN_CODING__INV_SPAN\tPROTEIN_CODING__LOF\tPROTEIN_CODING__MSV_EXON_OVR\tPROTEIN_CODING__NEAREST_TSS\tPROTEIN_CODING__PROMOTER\tPROTEIN_CODING__UTR\tSOURCE\tSTRANDS\tSVLEN\tSVTYPE\tUNRESOLVED_TYPE\tUNSTABLE_AF_PCRPLUS\tVARIABLE_ACROSS_BATCHES\tAN\tAC\tAF\tN_BI_GENOS\tN_HOMREF\tN_HET\tN_HOMALT\tFREQ_HOMREF\tFREQ_HET\tFREQ_HOMALT\tMALE_AN\tMALE_AC\tMALE_AF\tMALE_N_BI_GENOS\tMALE_N_HOMREF\tMALE_N_HET\tMALE_N_HOMALT\tMALE_FREQ_HOMREF\tMALE_FREQ_HET\tMALE_FREQ_HOMALT\tMALE_N_HEMIREF\tMALE_N_HEMIALT\tMALE_FREQ_HEMIREF\tMALE_FREQ_HEMIALT\tPAR\tFEMALE_AN\tFEMALE_AC\tFEMALE_AF\tFEMALE_N_BI_GENOS\tFEMALE_N_HOMREF\tFEMALE_N_HET\tFEMALE_N_HOMALT\tFEMALE_FREQ_HOMREF\tFEMALE_FREQ_HET\tFEMALE_FREQ_HOMALT\tPOPMAX_AF\tAFR_AN\tAFR_AC\tAFR_AF\tAFR_N_BI_GENOS\tAFR_N_HOMREF\tAFR_N_HET\tAFR_N_HOMALT\tAFR_FREQ_HOMREF\tAFR_FREQ_HET\tAFR_FREQ_HOMALT\tAFR_MALE_AN\tAFR_MALE_AC\tAFR_MALE_AF\tAFR_MALE_N_BI_GENOS\tAFR_MALE_N_HOMREF\tAFR_MALE_N_HET\tAFR_MALE_N_HOMALT\tAFR_MALE_FREQ_HOMREF\tAFR_MALE_FREQ_HET\tAFR_MALE_FREQ_HOMALT\tAFR_MALE_N_HEMIREF\tAFR_MALE_N_HEMIALT\tAFR_MALE_FREQ_HEMIREF\tAFR_MALE_FREQ_HEMIALT\tAFR_FEMALE_AN\tAFR_FEMALE_AC\tAFR_FEMALE_AF\tAFR_FEMALE_N_BI_GENOS\tAFR_FEMALE_N_HOMREF\tAFR_FEMALE_N_HET\tAFR_FEMALE_N_HOMALT\tAFR_FEMALE_FREQ_HOMREF\tAFR_FEMALE_FREQ_HET\tAFR_FEMALE_FREQ_HOMALT\tAMR_AN\tAMR_AC\tAMR_AF\tAMR_N_BI_GENOS\tAMR_N_HOMREF\tAMR_N_HET\tAMR_N_HOMALT\tAMR_FREQ_HOMREF\tAMR_FREQ_HET\tAMR_FREQ_HOMALT\tAMR_MALE_AN\tAMR_MALE_AC\tAMR_MALE_AF\tAMR_MALE_N_BI_GENOS\tAMR_MALE_N_HOMREF\tAMR_MALE_N_HET\tAMR_MALE_N_HOMALT\tAMR_MALE_FREQ_HOMREF\tAMR_MALE_FREQ_HET\tAMR_MALE_FREQ_HOMALT\tAMR_MALE_N_HEMIREF\tAMR_MALE_N_HEMIALT\tAMR_MALE_FREQ_HEMIREF\tAMR_MALE_FREQ_HEMIALT\tAMR_FEMALE_AN\tAMR_FEMALE_AC\tAMR_FEMALE_AF\tAMR_FEMALE_N_BI_GENOS\tAMR_FEMALE_N_HOMREF\tAMR_FEMALE_N_HET\tAMR_FEMALE_N_HOMALT\tAMR_FEMALE_FREQ_HOMREF\tAMR_FEMALE_FREQ_HET\tAMR_FEMALE_FREQ_HOMALT\tEAS_AN\tEAS_AC\tEAS_AF\tEAS_N_BI_GENOS\tEAS_N_HOMREF\tEAS_N_HET\tEAS_N_HOMALT\tEAS_FREQ_HOMREF\tEAS_FREQ_HET\tEAS_FREQ_HOMALT\tEAS_MALE_AN\tEAS_MALE_AC\tEAS_MALE_AF\tEAS_MALE_N_BI_GENOS\tEAS_MALE_N_HOMREF\tEAS_MALE_N_HET\tEAS_MALE_N_HOMALT\tEAS_MALE_FREQ_HOMREF\tEAS_MALE_FREQ_HET\tEAS_MALE_FREQ_HOMALT\tEAS_MALE_N_HEMIREF\tEAS_MALE_N_HEMIALT\tEAS_MALE_FREQ_HEMIREF\tEAS_MALE_FREQ_HEMIALT\tEAS_FEMALE_AN\tEAS_FEMALE_AC\tEAS_FEMALE_AF\tEAS_FEMALE_N_BI_GENOS\tEAS_FEMALE_N_HOMREF\tEAS_FEMALE_N_HET\tEAS_FEMALE_N_HOMALT\tEAS_FEMALE_FREQ_HOMREF\tEAS_FEMALE_FREQ_HET\tEAS_FEMALE_FREQ_HOMALT\tEUR_AN\tEUR_AC\tEUR_AF\tEUR_N_BI_GENOS\tEUR_N_HOMREF\tEUR_N_HET\tEUR_N_HOMALT\tEUR_FREQ_HOMREF\tEUR_FREQ_HET\tEUR_FREQ_HOMALT\tEUR_MALE_AN\tEUR_MALE_AC\tEUR_MALE_AF\tEUR_MALE_N_BI_GENOS\tEUR_MALE_N_HOMREF\tEUR_MALE_N_HET\tEUR_MALE_N_HOMALT\tEUR_MALE_FREQ_HOMREF\tEUR_MALE_FREQ_HET\tEUR_MALE_FREQ_HOMALT\tEUR_MALE_N_HEMIREF\tEUR_MALE_N_HEMIALT\tEUR_MALE_FREQ_HEMIREF\tEUR_MALE_FREQ_HEMIALT\tEUR_FEMALE_AN\tEUR_FEMALE_AC\tEUR_FEMALE_AF\tEUR_FEMALE_N_BI_GENOS\tEUR_FEMALE_N_HOMREF\tEUR_FEMALE_N_HET\tEUR_FEMALE_N_HOMALT\tEUR_FEMALE_FREQ_HOMREF\tEUR_FEMALE_FREQ_HET\tEUR_FEMALE_FREQ_HOMALT\tOTH_AN\tOTH_AC\tOTH_AF\tOTH_N_BI_GENOS\tOTH_N_HOMREF\tOTH_N_HET\tOTH_N_HOMALT\tOTH_FREQ_HOMREF\tOTH_FREQ_HET\tOTH_FREQ_HOMALT\tOTH_MALE_AN\tOTH_MALE_AC\tOTH_MALE_AF\tOTH_MALE_N_BI_GENOS\tOTH_MALE_N_HOMREF\tOTH_MALE_N_HET\tOTH_MALE_N_HOMALT\tOTH_MALE_FREQ_HOMREF\tOTH_MALE_FREQ_HET\tOTH_MALE_FREQ_HOMALT\tOTH_MALE_N_HEMIREF\tOTH_MALE_N_HEMIALT\tOTH_MALE_FREQ_HEMIREF\tOTH_MALE_FREQ_HEMIALT\tOTH_FEMALE_AN\tOTH_FEMALE_AC\tOTH_FEMALE_AF\tOTH_FEMALE_N_BI_GENOS\tOTH_FEMALE_N_HOMREF\tOTH_FEMALE_N_HET\tOTH_FEMALE_N_HOMALT\tOTH_FEMALE_FREQ_HOMREF\tOTH_FEMALE_FREQ_HET\tOTH_FEMALE_FREQ_HOMALT\tFILTER\n" + "1\t20999\t26000\tgnomAD-SV_v2.1_DEL_1_1\tINVALID-DEL\tdepth\tFalse\tNA\tNA\tNA\tNA\t26000\tRD\tFalse\tFalse\tFalse\tNA\tNA\tNA\tNA\tTrue\tNA\tNA\tNA\tNA\tOR4F5\tNA\tNA\tNA\tNA\t5000\tDEL\tNA\tFalse\tFalse\t8586\t138\t0.01607299968600273\t4293\t4155\t138\t0\t0.9678549766540527\t0.03214539960026741\t0.0\t4302\t69\t0.01603900082409382\t2151\t2082\t69\t0\t0.9679219722747803\t0.0320780985057354\t0.0\tNA\tNA\tNA\tNA\tFalse\t4272\t68\t0.015917999669909477\t2136\t2068\t68\t0\t0.9681649804115295\t0.031835198402404785\t0.0\t0.07199999690055847\t3718\t27\t0.007261999882757664\t1859\t1832\t27\t0\t0.985476016998291\t0.014523900113999844\t0.0\t2016\t16\t0.007937000133097172\t1008\t992\t16\t0\t0.9841269850730896\t0.015873000025749207\t0.0\tNA\tNA\tNA\tNA\t1702\t11\t0.00646300008520484\t851\t840\t11\t0\t0.9870740175247192\t0.01292600017040968\t0.0\t684\t8\t0.011695999652147293\t342\t334\t8\t0\t0.9766079783439636\t0.02339180000126362\t0.0\t326\t2\t0.006134999915957451\t163\t161\t2\t0\t0.9877300262451172\t0.012269900180399418\t0.0\tNA\tNA\tNA\tNA\t358\t6\t0.016759999096393585\t179\t173\t6\t0\t0.966480016708374\t0.033519599586725235\t0.0\t750\t54\t0.07199999690055847\t375\t321\t54\t0\t0.8560000061988831\t0.14399999380111694\t0.0\t418\t28\t0.06698600202798843\t209\t181\t28\t0\t0.8660290241241455\t0.13397100567817688\t0.0\tNA\tNA\tNA\tNA\t328\t25\t0.07621999830007553\t164\t139\t25\t0\t0.8475610017776489\t0.15243899822235107\t0.0\t3346\t48\t0.014344999566674232\t1673\t1625\t48\t0\t0.9713090062141418\t0.028690999373793602\t0.0\t1498\t22\t0.014685999602079391\t749\t727\t22\t0\t0.9706270098686218\t0.029372500255703926\t0.0\tNA\tNA\tNA\tNA\t1840\t26\t0.01413000002503395\t920\t894\t26\t0\t0.9717389941215515\t0.02826089970767498\t0.0\t88\t1\t0.011363999918103218\t44\t43\t1\t0\t0.9772729873657227\t0.022727299481630325\t0.0\t44\t1\t0.0227269995957613\t22\t21\t1\t0\t0.9545450210571289\t0.04545449838042259\t0.0\tNA\tNA\tNA\tNA\t44\t0\t0.0\t22\t22\t0\t0\t1.0\t0.0\t0.0\tUNSTABLE_AF_PCRMINUS,LOW_CALL_RATE\n"; using var reader = new StreamReader(GetStreamData(bedData)); using var gnomadSvParser = new GnomadSvBedParser(reader, ChromosomeUtilities.RefNameToChromosome); Assert.Throws(() => gnomadSvParser.GetItems().ToList()); } } ================================================ FILE: UnitTests/SAUtils/gnomAD/GnomadSvItemTests.cs ================================================ using SAUtils.DataStructures; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.gnomAD; public sealed class GnomadSvItemTests { [Fact] public void TestGnomadSvItem() { var gnomadSvItem = new GnomadSvItem(ChromosomeUtilities.Chr1, ""); Assert.Equal("", gnomadSvItem.InputLine); Assert.Equal("\"chromosome\":\"1\",\"begin\":0,\"end\":0,\"variantType\":\"unknown\"", gnomadSvItem.GetJsonString()); } } ================================================ FILE: UnitTests/SAUtils/gnomAD/GnomadSvTsvParserTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using SAUtils.DataStructures; using SAUtils.gnomAD; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.SAUtils.gnomAD; public sealed class GnomadSvTsvParserTests { private static Stream GetStreamData(string dataString) { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.Write(dataString); writer.Flush(); stream.Position = 0; return stream; } [Fact] public void TestGnomadSvTsvParser() { const string tsvData = "#variant_call_accession\tvariant_call_id\tvariant_call_type\texperiment_id\tsample_id\tsampleset_id\tassembly\tchrcontig\touter_start\tstart\tinner_start\tinner_stop\tstop\touter_stop\tinsertion_length\tvariant_region_acc\tvariant_region_id\tcopy_number\tdescription\tvalidation\tzygosity\torigin\tphenotype\thgvs_name\tplacement_method\tplacement_rank\tplacements_per_assembly\tremap_alignment\tremap_best_within_cluster\tremap_coverage\tremap_diff_chr\tremap_failure_code\tallele_count\tallele_frequency\tallele_number\n" + "nssv15777856\tgnomAD-SV_v2.1_CNV_10_564_alt_1\tcopy number variation\t1\t\t1\tGRCh38.p12\t10\t\t\t736806\t\t\t738184\t\t\tnsv4039284\t10__782746___784124______GRCh37.p13_copy_number_variation\t0\t\t\t\t\t\t\tRemapped\tBestAvailable\tSingle\tFirst Pass\t0\t1\t\t\tAC=21,AFR_AC=10,AMR_AC=9,EAS_AC=0,EUR_AC=2,OTH_AC=0\tAF=0.038889,AFR_AF=0.044643,AMR_AF=0.03913,EAS_AF=0,EUR_AF=0.023256,OTH_AF=0\tAN=540,AFR_AN=224,AMR_AN=230,EAS_AN=0,EUR_AN=86,OTH_AN=0\n" + "nssv15777857\tgnomAD-SV_v2.1_CNV_10_564_alt_10\talu insertion\t1\t\t1\tGRCh38.p12\t10\t\t\t736806\t\t\t738184\t\t\tnsv4039284\t10__782746___784124______GRCh37.p13_copy_number_variation\t9\t\t\t\t\t\t\tRemapped\tBestAvailable\tSingle\tFirst Pass\t0\t1\t\t\tAC=0,AFR_AC=0,AMR_AC=0,EAS_AC=0,EUR_AC=0,OTH_AC=0\tAF=0,AFR_AF=0,AMR_AF=0,EAS_AF=0,EUR_AF=0,OTH_AF=0\tAN=540,AFR_AN=224,AMR_AN=230,EAS_AN=0,EUR_AN=86,OTH_AN=0\n" + "nssv15777858\tgnomAD-SV_v2.1_CNV_10_564_alt_11\tdeletion\t1\t\t1\tGRCh38.p12\t10\t\t\t736806\t\t\t738184\t\t\tnsv4039284\t10__782746___784124______GRCh37.p13_copy_number_variation\t10\t\t\t\t\t\t\tRemapped\tBestAvailable\tSingle\tFirst Pass\t0\t1\t\t\tAC=0,AFR_AC=0,AMR_AC=0,EAS_AC=0,EUR_AC=0,OTH_AC=0\tAF=0,AFR_AF=0,AMR_AF=0,EAS_AF=0,EUR_AF=0,OTH_AF=0\tAN=540,AFR_AN=224,AMR_AN=230,EAS_AN=0,EUR_AN=86,OTH_AN=0\n" + "nssv15982321\tgnomAD-SV_v2.1_INS_11_75807\tinsertion\t1\t\t1\tGRCh38.p12\t11\t\t\t11946244\t\t\t11946244\t\t58\tnsv4549918\t11__11967791___11967792______GRCh37.p13_insertion\t\t\t\t\t\t\tNC_000011.10:g.11946244_11946245ins58\tRemapped\tBestAvailable\tSingle\tFirst Pass\t0\t1\t\t\tAC=1,AFR_AC=0,AMR_AC=1,EAS_AC=0,EUR_AC=0,OTH_AC=0\tAF=4.6e-05,AFR_AF=0,AMR_AF=0.000518,EAS_AF=0,EUR_AF=0,OTH_AF=0\tAN=21694,AFR_AN=9534,AMR_AN=1930,EAS_AN=2416,EUR_AN=7624,OTH_AN=190\n"; using var reader = new StreamReader(GetStreamData(tsvData)); using var gnomadSvParser = new GnomadSvTsvParser(reader, ChromosomeUtilities.RefNameToChromosome); List svItemList = gnomadSvParser.GetItems().ToList(); Assert.Equal(4, svItemList.Count); Assert.Equal( "\"chromosome\":\"10\",\"begin\":736807,\"end\":738184,\"variantId\":\"gnomAD-SV_v2.1_CNV_10_564_alt_1\",\"variantType\":\"copy_number_variation\",\"allAf\":0.038889,\"afrAf\":0.044643,\"amrAf\":0.03913,\"easAf\":0,\"eurAf\":0.023256,\"othAf\":0,\"allAc\":21,\"afrAc\":10,\"amrAc\":9,\"easAc\":0,\"eurAc\":2,\"othAc\":0,\"allAn\":540,\"afrAn\":224,\"amrAn\":230,\"easAn\":0,\"eurAn\":86,\"othAn\":0", svItemList[0].GetJsonString() ); Assert.Equal( "\"chromosome\":\"10\",\"begin\":736807,\"end\":738184,\"variantId\":\"gnomAD-SV_v2.1_CNV_10_564_alt_10\",\"variantType\":\"mobile_element_insertion\",\"allAf\":0,\"afrAf\":0,\"amrAf\":0,\"easAf\":0,\"eurAf\":0,\"othAf\":0,\"allAc\":0,\"afrAc\":0,\"amrAc\":0,\"easAc\":0,\"eurAc\":0,\"othAc\":0,\"allAn\":540,\"afrAn\":224,\"amrAn\":230,\"easAn\":0,\"eurAn\":86,\"othAn\":0", svItemList[1].GetJsonString() ); Assert.Equal( "\"chromosome\":\"10\",\"begin\":736807,\"end\":738184,\"variantId\":\"gnomAD-SV_v2.1_CNV_10_564_alt_11\",\"variantType\":\"deletion\",\"allAf\":0,\"afrAf\":0,\"amrAf\":0,\"easAf\":0,\"eurAf\":0,\"othAf\":0,\"allAc\":0,\"afrAc\":0,\"amrAc\":0,\"easAc\":0,\"eurAc\":0,\"othAc\":0,\"allAn\":540,\"afrAn\":224,\"amrAn\":230,\"easAn\":0,\"eurAn\":86,\"othAn\":0", svItemList[2].GetJsonString() ); Assert.Equal( "\"chromosome\":\"11\",\"begin\":11946245,\"end\":11946244,\"variantId\":\"gnomAD-SV_v2.1_INS_11_75807\",\"variantType\":\"insertion\",\"allAf\":0.000046,\"afrAf\":0,\"amrAf\":0.000518,\"easAf\":0,\"eurAf\":0,\"othAf\":0,\"allAc\":1,\"afrAc\":0,\"amrAc\":1,\"easAc\":0,\"eurAc\":0,\"othAc\":0,\"allAn\":21694,\"afrAn\":9534,\"amrAn\":1930,\"easAn\":2416,\"eurAn\":7624,\"othAn\":190", svItemList[3].GetJsonString() ); } [Fact] public void TestUnknownChromosome() { const string tsvData = "#variant_call_accession\tvariant_call_id\tvariant_call_type\texperiment_id\tsample_id\tsampleset_id\tassembly\tchrcontig\touter_start\tstart\tinner_start\tinner_stop\tstop\touter_stop\tinsertion_length\tvariant_region_acc\tvariant_region_id\tcopy_number\tdescription\tvalidation\tzygosity\torigin\tphenotype\thgvs_name\tplacement_method\tplacement_rank\tplacements_per_assembly\tremap_alignment\tremap_best_within_cluster\tremap_coverage\tremap_diff_chr\tremap_failure_code\tallele_count\tallele_frequency\tallele_number\n" + "nssv15777856\tgnomAD-SV_v2.1_CNV_10_564_alt_1\tcopy number variation\t1\t\t1\tGRCh38.p12\tINVALID-1\t\t\t736806\t\t\t738184\t\t\tnsv4039284\t10__782746___784124______GRCh37.p13_copy_number_variation\t0\t\t\t\t\t\t\tRemapped\tBestAvailable\tSingle\tFirst Pass\t0\t1\t\t\tAC=21,AFR_AC=10,AMR_AC=9,EAS_AC=0,EUR_AC=2,OTH_AC=0\tAF=0.038889,AFR_AF=0.044643,AMR_AF=0.03913,EAS_AF=0,EUR_AF=0.023256,OTH_AF=0\tAN=540,AFR_AN=224,AMR_AN=230,EAS_AN=0,EUR_AN=86,OTH_AN=0\n" + "nssv15777857\tgnomAD-SV_v2.1_CNV_10_564_alt_10\tduplication\t1\t\t1\tGRCh38.p12\t10\t\t\t736806\t\t\t738184\t\t\tnsv4039284\t10__782746___784124______GRCh37.p13_copy_number_variation\t9\t\t\t\t\t\t\tRemapped\tBestAvailable\tSingle\tFirst Pass\t0\t1\t\t\tAC=0,AFR_AC=0,AMR_AC=0,EAS_AC=0,EUR_AC=0,OTH_AC=0\tAF=0,AFR_AF=0,AMR_AF=0,EAS_AF=0,EUR_AF=0,OTH_AF=0\tAN=540,AFR_AN=224,AMR_AN=230,EAS_AN=0,EUR_AN=86,OTH_AN=0\n"; using var reader = new StreamReader(GetStreamData(tsvData)); using var gnomadSvParser = new GnomadSvTsvParser(reader, ChromosomeUtilities.RefNameToChromosome); List svItemList = gnomadSvParser.GetItems().ToList(); Assert.Single(svItemList); Assert.Equal( "\"chromosome\":\"10\",\"begin\":736807,\"end\":738184,\"variantId\":\"gnomAD-SV_v2.1_CNV_10_564_alt_10\",\"variantType\":\"duplication\",\"allAf\":0,\"afrAf\":0,\"amrAf\":0,\"easAf\":0,\"eurAf\":0,\"othAf\":0,\"allAc\":0,\"afrAc\":0,\"amrAc\":0,\"easAc\":0,\"eurAc\":0,\"othAc\":0,\"allAn\":540,\"afrAn\":224,\"amrAn\":230,\"easAn\":0,\"eurAn\":86,\"othAn\":0", svItemList[0].GetJsonString() ); } [Fact] public void TestInvalidStart() { const string tsvData = "#variant_call_accession\tvariant_call_id\tvariant_call_type\texperiment_id\tsample_id\tsampleset_id\tassembly\tchrcontig\touter_start\tstart\tinner_start\tinner_stop\tstop\touter_stop\tinsertion_length\tvariant_region_acc\tvariant_region_id\tcopy_number\tdescription\tvalidation\tzygosity\torigin\tphenotype\thgvs_name\tplacement_method\tplacement_rank\tplacements_per_assembly\tremap_alignment\tremap_best_within_cluster\tremap_coverage\tremap_diff_chr\tremap_failure_code\tallele_count\tallele_frequency\tallele_number\n" + "nssv15777856\tgnomAD-SV_v2.1_CNV_10_564_alt_1\tcopy number variation\t1\t\t1\tGRCh38.p12\t10\t\t\tInvalid-736806\t\t\t738184\t\t\tnsv4039284\t10__782746___784124______GRCh37.p13_copy_number_variation\t0\t\t\t\t\t\t\tRemapped\tBestAvailable\tSingle\tFirst Pass\t0\t1\t\t\tAC=21,AFR_AC=10,AMR_AC=9,EAS_AC=0,EUR_AC=2,OTH_AC=0\tAF=0.038889,AFR_AF=0.044643,AMR_AF=0.03913,EAS_AF=0,EUR_AF=0.023256,OTH_AF=0\tAN=540,AFR_AN=224,AMR_AN=230,EAS_AN=0,EUR_AN=86,OTH_AN=0\n"; using var reader = new StreamReader(GetStreamData(tsvData)); using var gnomadSvParser = new GnomadSvTsvParser(reader, ChromosomeUtilities.RefNameToChromosome); Assert.Throws(() => gnomadSvParser.GetItems().ToList()); } [Fact] public void TestInvalidEnd() { const string tsvData = "#variant_call_accession\tvariant_call_id\tvariant_call_type\texperiment_id\tsample_id\tsampleset_id\tassembly\tchrcontig\touter_start\tstart\tinner_start\tinner_stop\tstop\touter_stop\tinsertion_length\tvariant_region_acc\tvariant_region_id\tcopy_number\tdescription\tvalidation\tzygosity\torigin\tphenotype\thgvs_name\tplacement_method\tplacement_rank\tplacements_per_assembly\tremap_alignment\tremap_best_within_cluster\tremap_coverage\tremap_diff_chr\tremap_failure_code\tallele_count\tallele_frequency\tallele_number\n" + "nssv15777856\tgnomAD-SV_v2.1_CNV_10_564_alt_1\tcopy number variation\t1\t\t1\tGRCh38.p12\t10\t\t\t736806\t\t\tInvalid-738184\t\t\tnsv4039284\t10__782746___784124______GRCh37.p13_copy_number_variation\t0\t\t\t\t\t\t\tRemapped\tBestAvailable\tSingle\tFirst Pass\t0\t1\t\t\tAC=21,AFR_AC=10,AMR_AC=9,EAS_AC=0,EUR_AC=2,OTH_AC=0\tAF=0.038889,AFR_AF=0.044643,AMR_AF=0.03913,EAS_AF=0,EUR_AF=0.023256,OTH_AF=0\tAN=540,AFR_AN=224,AMR_AN=230,EAS_AN=0,EUR_AN=86,OTH_AN=0\n"; using var reader = new StreamReader(GetStreamData(tsvData)); using var gnomadSvParser = new GnomadSvTsvParser(reader, ChromosomeUtilities.RefNameToChromosome); Assert.Throws(() => gnomadSvParser.GetItems().ToList()); } [Fact] public void TestInvalidSvType() { const string tsvData = "#variant_call_accession\tvariant_call_id\tvariant_call_type\texperiment_id\tsample_id\tsampleset_id\tassembly\tchrcontig\touter_start\tstart\tinner_start\tinner_stop\tstop\touter_stop\tinsertion_length\tvariant_region_acc\tvariant_region_id\tcopy_number\tdescription\tvalidation\tzygosity\torigin\tphenotype\thgvs_name\tplacement_method\tplacement_rank\tplacements_per_assembly\tremap_alignment\tremap_best_within_cluster\tremap_coverage\tremap_diff_chr\tremap_failure_code\tallele_count\tallele_frequency\tallele_number\n" + "nssv15777856\tgnomAD-SV_v2.1_CNV_10_564_alt_1\tINVALID copy number variation\t1\t\t1\tGRCh38.p12\t10\t\t\t736806\t\t\t738184\t\t\tnsv4039284\t10__782746___784124______GRCh37.p13_copy_number_variation\t0\t\t\t\t\t\t\tRemapped\tBestAvailable\tSingle\tFirst Pass\t0\t1\t\t\tAC=21,AFR_AC=10,AMR_AC=9,EAS_AC=0,EUR_AC=2,OTH_AC=0\tAF=0.038889,AFR_AF=0.044643,AMR_AF=0.03913,EAS_AF=0,EUR_AF=0.023256,OTH_AF=0\tAN=540,AFR_AN=224,AMR_AN=230,EAS_AN=0,EUR_AN=86,OTH_AN=0\n"; using var reader = new StreamReader(GetStreamData(tsvData)); using var gnomadSvParser = new GnomadSvTsvParser(reader, ChromosomeUtilities.RefNameToChromosome); Assert.Throws(() => gnomadSvParser.GetItems().ToList()); } } ================================================ FILE: UnitTests/SAUtils/gnomAD/LcrParserTests.cs ================================================ using System.IO; using System.Linq; using System.Text; using Moq; using SAUtils.gnomAD; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Providers; using Xunit; namespace UnitTests.SAUtils.gnomAD { public class LcrParserTests { private Stream GetGRCh37Stream() { var stream = new MemoryStream(); using (var writer = new StreamWriter(stream, Encoding.Default, 512*1024, true)) { writer.WriteLine("1:1-10000"); writer.WriteLine("1:40637-40658"); writer.WriteLine("1:77172-77195"); } stream.Position = 0; return stream; } private ISequenceProvider GetGRCh37() { var seqProvider = new Mock(); seqProvider.Setup(x => x.Sequence.Substring(0, It.IsAny())). Returns(new string('n',500)+new string ('N',500)); seqProvider.Setup(x => x.Sequence.Substring(40637-1, It.IsAny())). Returns(new string('A',50) +new string ('C',50)); seqProvider.Setup(x => x.Sequence.Substring(77172 -1, It.IsAny())). Returns(new string('T',50) +new string ('G',50)); seqProvider.SetupGet(x => x.RefNameToChromosome).Returns( ChromosomeUtilities.RefNameToChromosome); return seqProvider.Object; } private Stream GetGRCh38Stream() { var stream = new MemoryStream(); using (var writer = new StreamWriter(stream, Encoding.Default, 512 *1024, true)) { writer.WriteLine("chr1\t9999\t10468"); writer.WriteLine("chr1\t30853\t30959"); writer.WriteLine("chr1\t47317\t47328"); } stream.Position = 0; return stream; } private ISequenceProvider GetGRCh38() { var seqProvider = new Mock(); seqProvider.Setup(x => x.Sequence.Substring(9999 -1, It.IsAny())). Returns(new string('G',50) +new string ('C',50)); seqProvider.Setup(x => x.Sequence.Substring(30853 -1, It.IsAny())). Returns(new string('A',50) +new string ('C',50)); seqProvider.Setup(x => x.Sequence.Substring(47317 -1, It.IsAny())). Returns(new string('T',50) +new string ('G',50)); seqProvider.SetupGet(x => x.RefNameToChromosome).Returns( ChromosomeUtilities.RefNameToChromosome); return seqProvider.Object; } [Fact] public void GetGRCh37Lcrs() { var parser = new LcrRegionParser(new StreamReader(GetGRCh37Stream()), GetGRCh37()); var items = parser.GetItems().ToList(); Assert.Equal(2, items.Count); } [Fact] public void GetGRCh38Lcrs() { var parser = new LcrRegionParser(new StreamReader(GetGRCh38Stream()), GetGRCh38()); var items = parser.GetItems().ToList(); Assert.Equal(3, items.Count); } } } ================================================ FILE: UnitTests/SingleAnnotationLambda/SingleConfigTests.cs ================================================ using System; using Cloud.Messages.Single; using ErrorHandling.Exceptions; using SingleAnnotationLambda; using Xunit; namespace UnitTests.SingleAnnotationLambda { public sealed class SingleConfigTests { [Fact] public void Validate_Success() { SingleConfig config = GetConfig(); Exception ex = Record.Exception(() => { config.Validate(); }); Assert.Null(ex); } [Fact] public void Validate_NullId_ThrowException() { SingleConfig config = GetConfig(); config.id = null; Assert.Throws(() => config.Validate()); } [Fact] public void Validate_NullGenomeAssembly_ThrowException() { SingleConfig config = GetConfig(); config.genomeAssembly = null; Assert.Throws(() => config.Validate()); } [Fact] public void Validate_NullVariant_ThrowException() { SingleConfig config = GetConfig(); config.variant = null; Assert.Throws(() => config.Validate()); } private static SingleConfig GetConfig() => new SingleConfig { id = "Test", genomeAssembly = "Assembly", variant = new SingleVariant { chromosome = "1", position = 100, refAllele = "A", altAlleles = new[] { "T", "C"} } }; } } ================================================ FILE: UnitTests/SingleAnnotationLambda/SingleVariantTests.cs ================================================ using System; using System.Linq; using Cloud.Messages.Single; using ErrorHandling.Exceptions; using Xunit; namespace UnitTests.SingleAnnotationLambda { public sealed class SingleVariantTests { [Fact] public void GetVcfFields_AsExpected() { var variant = new SingleVariant { chromosome = "1", position = 100, refAllele = "A", altAlleles = new[] { "C", "AC" }, filters = new[] { "LowGQX", "NoPassedVariantGTs" }, infoField = "SNVHPOL=2;MQ=34", formatField = "GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL:ME:DQ", sampleFields = new[] { "0|0:15:15:6:4:6,0:4,0:2,0:0:PASS:0,18,170:0:.", "0|1:13:0:7:6:6,1:3,0:3,1:0:LowGQX:15,0,147:.:.", "0|1:18:0:9:8:8,1:2,0:6,1:0:LowGQX:20,0,156:.:." }, sampleNames = new[] { "NA12878", "NA12891", "NA12892" } }; string[] vcfFields = variant.GetVcfFields(); Assert.Equal(12, vcfFields.Length); Assert.Equal("1", vcfFields[0]); Assert.Equal("100", vcfFields[1]); Assert.True(vcfFields.SequenceEqual(new[] { "1", "100", ".", "A", "C,AC", ".", "LowGQX;NoPassedVariantGTs", "SNVHPOL=2;MQ=34", "GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL:ME:DQ", "0|0:15:15:6:4:6,0:4,0:2,0:0:PASS:0,18,170:0:.", "0|1:13:0:7:6:6,1:3,0:3,1:0:LowGQX:15,0,147:.:.", "0|1:18:0:9:8:8,1:2,0:6,1:0:LowGQX:20,0,156:.:." })); } [Fact] public void Validate_Success() { SingleVariant variant = GetConfig(); Exception ex = Record.Exception(() => { variant.Validate(); }); Assert.Null(ex); } [Fact] public void Validate_NullChromosome_ThrowException() { SingleVariant variant = GetConfig(); variant.chromosome = null; Assert.Throws(() => variant.Validate()); } [Fact] public void Validate_NullPosition_ThrowException() { SingleVariant variant = GetConfig(); variant.position = null; Assert.Throws(() => variant.Validate()); } [Fact] public void Validate_NullReferenceAllele_ThrowException() { SingleVariant variant = GetConfig(); variant.refAllele = null; Assert.Throws(() => variant.Validate()); } [Fact] public void Validate_NullAlternateAlleles_ThrowException() { SingleVariant variant = GetConfig(); variant.altAlleles = null; Assert.Throws(() => variant.Validate()); } [Fact] public void Validate_ZeroAlternateAlleles_ThrowException() { SingleVariant variant = GetConfig(); variant.altAlleles = new string[0]; Assert.Throws(() => variant.Validate()); } [Fact] public void Validate_SampleNamesAndSampleFields_NoFormatField_ThrowException() { SingleVariant variant = GetConfig(); variant.sampleNames = new[] {"Bob"}; variant.sampleFields = new[] { "0/1" }; Assert.Throws(() => variant.Validate()); } [Fact] public void Validate_FormatField_NoSampleNamesAndSampleFields_ThrowException() { SingleVariant variant = GetConfig(); variant.formatField = "GT"; Assert.Throws(() => variant.Validate()); } private static SingleVariant GetConfig() => new SingleVariant { chromosome = "1", position = 100, refAllele = "A", altAlleles = new[] { "T", "C" } }; } } ================================================ FILE: UnitTests/Tabix/BgzfBlockVcfReaderTests.cs ================================================ using Compression.FileHandling; using IO; using Tabix; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Tabix { public sealed class BgzfBlockVcfReaderTests { private const long FileOffset = 61413; private readonly BgzfBlock _block = new BgzfBlock(); [Fact] public void FindVariantsInBlock_NoVariants_ReturnFalse() { using (var stream = FileUtilities.GetReadStream(Resources.TopPath("miniHEXA_minimal.vcf.gz"))) { bool observedResults = BgzfBlockVcfReader.FindVariantsInBlocks(stream, FileOffset, FileOffset, _block, ChromosomeUtilities.Chr15, 1, 71589359); Assert.False(observedResults); } } [Fact] public void FindVariantsInBlock_ReturnTrue() { using (var stream = FileUtilities.GetReadStream(Resources.TopPath("miniHEXA_minimal.vcf.gz"))) { bool observedResults = BgzfBlockVcfReader.FindVariantsInBlocks(stream, FileOffset, FileOffset, _block, ChromosomeUtilities.Chr15, 71589360, 71589361); Assert.False(observedResults); } } private const string MixedLineEndingsInput = "C\t39\t.\t.\tGT\t0/1\t.\t1/1\n1\t100\t.\tT\tC\t39\t.\t.\tGT\t0/1\t.\t1/1\r\n2\t55927\t.\tT\tC\t39\t.\t.\tGT\t0/1\t.\t1/1\n2\t55928\t.\tT\tC\t39\t.\t.\tGT\t0/1\t.\t1/1\r\n2\t55929\t.\tT\tC\t39\t.\t.\tGT\t0/1\t.\t1/1\n3\t200\t.\tT\tC\t39\t.\t.\tGT\t0/1\t.\t1/1\r\n1\t"; [Fact] public void GetVcfPositions_MixedLineEndings_PartialEntries_MultipleChromosomes_ReturnTrue() { bool observedResults = BgzfBlockVcfReader.HasVcfPositionsOnInterval(MixedLineEndingsInput, ChromosomeUtilities.Chr2, 55927, 55928); Assert.True(observedResults); } [Fact] public void GetVcfPositions_MixedLineEndings_PartialEntries_MultipleChromosomes_ReturnFalse() { bool observedResults = BgzfBlockVcfReader.HasVcfPositionsOnInterval(MixedLineEndingsInput, ChromosomeUtilities.Chr2, 55930, 55940); Assert.False(observedResults); } [Fact] public void GetVcfPositions_SkipCorruptPositions() { const string input = "2\t55927i\t.\tT\tC\t39\t.\t.\tGT\t0/1\t.\t1/1\n2\t55928\t.\tT\tC\t39\t.\t.\tGT\t0/1\t.\t1/1"; bool observedResults = BgzfBlockVcfReader.HasVcfPositionsOnInterval(input, ChromosomeUtilities.Chr2, 55927, 55927); Assert.False(observedResults); } } } ================================================ FILE: UnitTests/Tabix/BinUtilitiesTests.cs ================================================ using System.Collections.Generic; using System.Linq; using Tabix; using Xunit; namespace UnitTests.Tabix { public sealed class BinUtilitiesTests { [Fact] public void BottomBin_Nominal() { int observedResults = BinUtilities.BottomBin(12517); Assert.Equal(7836, observedResults); } [Fact] public void ConvertPositionToBin_Nominal() { int observedResults = BinUtilities.ConvertPositionToBin(26699126); Assert.Equal(6310, observedResults); } [Fact] public void OverlappingBinsWithVariants_EndBeforeBegin_ReturnEmptyList() { IEnumerable results = BinUtilities.OverlappingBinsWithVariants(20, 10, null); Assert.Empty(results); } [Fact] public void OverlappingBinsWithVariants_EndBeyondMaxRefLen_CorrectEnd() { const int expectedBinId = 6310; var idToChunks = new Dictionary { [expectedBinId] = new[] { new Interval(1, 1) } }; List results = BinUtilities.OverlappingBinsWithVariants(10, int.MaxValue, idToChunks).ToList(); Assert.Single(results); Assert.Equal(expectedBinId, results[0]); } } } ================================================ FILE: UnitTests/Tabix/ReaderTests.cs ================================================ using System.IO; using System.Text; using IO; using Tabix; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Tabix { public sealed class ReaderTests { [Fact] public void Read_Nominal() { using (var stream = FileUtilities.GetReadStream(Resources.TopPath("miniHEXA_minimal.vcf.gz.tbi"))) { Index index = Reader.GetTabixIndex(stream, ChromosomeUtilities.RefNameToChromosome); Assert.Equal(1, index.BeginIndex); Assert.Equal('#', index.CommentChar); Assert.Equal(-1, index.EndIndex); Assert.Equal(Constants.VcfFormat, index.Format); Assert.Equal(0, index.NumLinesToSkip); Assert.Equal(0, index.SequenceNameIndex); Assert.Single(index.ReferenceSequences); var refSeq = index.ReferenceSequences[0]; Assert.Equal("chr15", refSeq.Chromosome.UcscName); Assert.Equal(4675, refSeq.LinearFileOffsets.Length); Assert.Equal((ulong)4587, refSeq.LinearFileOffsets[4370]); Assert.Equal(306, refSeq.IdToChunks.Count); var chunks = refSeq.IdToChunks[9062]; Assert.NotNull(chunks); Assert.Single(chunks); var chunk = chunks[0]; Assert.Equal((ulong)61269, chunk.Begin); Assert.Equal((ulong)991626923, chunk.End); } } [Fact] public void Read_NotTabixFormat() { using (var ms = new MemoryStream()) { using (var writer = new BinaryWriter(ms, Encoding.UTF8, true)) { writer.Write("The quick brown fox jumped over the lazy dog."); } ms.Position = 0; using (var reader = new BinaryReader(ms)) { Assert.Throws(delegate { Reader.Read(reader, null); }); } } } } } ================================================ FILE: UnitTests/Tabix/SearchTests.cs ================================================ using IO; using Tabix; using Xunit; using UnitTests.TestUtilities; namespace UnitTests.Tabix { public sealed class SearchTests { private readonly Search _search; private const string ChromosomeName = "chr15"; public SearchTests() { Index index; using (var stream = FileUtilities.GetReadStream(Resources.TopPath("miniHEXA_minimal.vcf.gz.tbi"))) { index = Reader.GetTabixIndex(stream, ChromosomeUtilities.RefNameToChromosome); } var vcfStream = FileUtilities.GetReadStream(Resources.TopPath("miniHEXA_minimal.vcf.gz")); _search = new Search(index, vcfStream); } [Fact] public void HasVariants_IntervalBeforeReads_ReturnsFalse() { bool observedResult = _search.HasVariants(ChromosomeName, 1, 71589359); Assert.False(observedResult); } [Fact] public void HasVariants_IntervalOverlapsReads_HasVcfPositionsOnIntervalTrue_ReturnsTrue() { bool observedResult = _search.HasVariants(ChromosomeName, 1, 71589360); Assert.True(observedResult); } [Fact] public void HasVariants_IntervalOverlapsReads_ReturnsTrue() { bool observedResult = _search.HasVariants(ChromosomeName, 71589360, 76592131); Assert.True(observedResult); } [Fact] public void HasVariants_NoOverlap_ReturnsFalse() { bool observedResult = _search.HasVariants(ChromosomeName, 76591006, 76592130); Assert.False(observedResult); } [Fact] public void HasVariants_IntervalAfterReads_ReturnsFalse() { bool observedResult = _search.HasVariants(ChromosomeName, 76592132, 101991189); Assert.False(observedResult); } [Fact] public void HasVariants_NullRefSeq_ReturnsFalse() { bool observedResult = _search.HasVariants("chr18", 71589360, 76592131); Assert.False(observedResult); } } } ================================================ FILE: UnitTests/Tabix/SearchTestsLocalMother.cs ================================================ #if EXPANDED_TESTS using Genome; using System.Collections.Generic; using IO; using Tabix; using Xunit; namespace UnitTests.Tabix { public sealed class SearchTestsLocalMother { private readonly Search _search; private const string ChromosomeName = "chr2"; public SearchTestsLocalMother() { var chr2 = new Chromosome("chr2", "2", 1); var refNameToChromosome = new Dictionary { [chr2.EnsemblName] = chr2, [chr2.UcscName] = chr2 }; Index index; using (var stream = FileUtilities.GetReadStream(@"E:\Data\Nirvana\Data\Mother\Mother.vcf.gz.tbi")) { index = Reader.GetTabixIndex(stream, refNameToChromosome); } var vcfStream = FileUtilities.GetReadStream(@"E:\Data\Nirvana\Data\Mother\Mother.vcf.gz"); _search = new Search(index, vcfStream); } [Fact] public void HasVariants_IntervalBeforeReads_ReturnsFalse() { bool observedResult = _search.HasVariants(ChromosomeName, 1, 11319); Assert.False(observedResult); } [Fact] public void HasVariants_IntervalOverlapsReads_HasVcfPositionsOnIntervalTrue_ReturnsTrue() { bool observedResult = _search.HasVariants(ChromosomeName, 1, 11320); Assert.True(observedResult); } [Fact] public void HasVariants_IntervalOverlapsReads_ReturnsTrue() { bool observedResult = _search.HasVariants(ChromosomeName, 217826, 435772); Assert.True(observedResult); } [Fact] public void HasVariants_NoOverlap_ReturnsFalse() { bool observedResult = _search.HasVariants(ChromosomeName, 431200, 434667); Assert.False(observedResult); } [Fact] public void HasVariants_IntervalAfterReads_ReturnsTrue() { bool observedResult = _search.HasVariants(ChromosomeName, 243172390, 243199373); Assert.True(observedResult); } [Fact] public void HasVariants_IntervalAfterReads_ReturnsFalse() { bool observedResult = _search.HasVariants(ChromosomeName, 243172391, 243199373); Assert.False(observedResult); } } } #endif ================================================ FILE: UnitTests/Tabix/SearchTestsRemoteMother.cs ================================================ #if EXPANDED_TESTS using Genome; using System.Collections.Generic; using IO; using Tabix; using Xunit; namespace UnitTests.Tabix { public sealed class SearchTestsRemoteMother { private readonly Search _search; private const string ChromosomeName = "chr2"; public SearchTestsRemoteMother() { var chr2 = new Chromosome("chr2", "2", 1); var refNameToChromosome = new Dictionary { [chr2.EnsemblName] = chr2, [chr2.UcscName] = chr2 }; Index index; using (var stream = PersistentStreamUtils.GetReadStream("https://illumina-annotation.s3.amazonaws.com/Test/Mother.vcf.gz.tbi")) { index = Reader.GetTabixIndex(stream, refNameToChromosome); } var vcfStream = PersistentStreamUtils.GetReadStream("https://illumina-annotation.s3.amazonaws.com/Test/Mother.vcf.gz"); _search = new Search(index, vcfStream); } [Fact] public void HasVariants_IntervalBeforeReads_ReturnsFalse() { bool observedResult = _search.HasVariants(ChromosomeName, 1, 11319); Assert.False(observedResult); } [Fact] public void HasVariants_IntervalOverlapsReads_HasVcfPositionsOnIntervalTrue_ReturnsTrue() { bool observedResult = _search.HasVariants(ChromosomeName, 1, 11320); Assert.True(observedResult); } [Fact] public void HasVariants_IntervalOverlapsReads_ReturnsTrue() { bool observedResult = _search.HasVariants(ChromosomeName, 217826, 435772); Assert.True(observedResult); } [Fact] public void HasVariants_NoOverlap_ReturnsFalse() { bool observedResult = _search.HasVariants(ChromosomeName, 431200, 434667); Assert.False(observedResult); } [Fact] public void HasVariants_IntervalAfterReads_ReturnsTrue() { bool observedResult = _search.HasVariants(ChromosomeName, 243172390, 243199373); Assert.True(observedResult); } [Fact] public void HasVariants_IntervalAfterReads_ReturnsFalse() { bool observedResult = _search.HasVariants(ChromosomeName, 243172391, 243199373); Assert.False(observedResult); } } } #endif ================================================ FILE: UnitTests/Tabix/SearchUtilitiesTests.cs ================================================ using System.Collections.Generic; using Tabix; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.Tabix { public sealed class SearchUtilitiesTests { private readonly Dictionary _refNameToTabixIndex; public SearchUtilitiesTests() { _refNameToTabixIndex = new Dictionary { ["chr1"] = 0, ["1"] = 0, ["chr2"] = 1, ["2"] = 1, ["chr15"] = 14, ["15"] = 14 }; } [Fact] public void GetMinOffset_Nominal() { const ulong expectedResults = 3591443256775; var linearFileOffsets = new ulong[1630]; linearFileOffsets[1629] = expectedResults; var idToChunks = new Dictionary { [6310] = new[] { new Interval(1, 1) } }; var refSeq = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, linearFileOffsets); ulong observedResults = SearchUtilities.GetMinOffset(refSeq, 26699125); Assert.Equal(expectedResults, observedResults); } [Fact] public void GetMinOffset_MissingBin() { const ulong expectedResults = 3723191187417; var linearFileOffsets = new ulong[2196]; linearFileOffsets[2195] = expectedResults; var idToChunks = new Dictionary { [6876] = new[] { new Interval(1, 1) } }; var refSeq = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, linearFileOffsets); ulong observedResults = SearchUtilities.GetMinOffset(refSeq, 35979265); Assert.Equal(expectedResults, observedResults); } [Fact] public void GetMinOffset_MissingFirstBin() { const ulong expectedResults = 4351134646660; var linearFileOffsets = new ulong[5353]; linearFileOffsets[5352] = expectedResults; var idToChunks = new Dictionary { [1254] = new[] { new Interval(1, 1) } }; var refSeq = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, linearFileOffsets); ulong observedResults = SearchUtilities.GetMinOffset(refSeq, 87687168); Assert.Equal(expectedResults, observedResults); } [Fact] public void GetMaxOffset_Nominal() { const ulong expectedResults = 3591443312067; var idToChunks = new Dictionary { [6311] = new[] { new Interval(3591443312067, 3592132724129) } }; var refSeq = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, null); ulong observedResults = SearchUtilities.GetMaxOffset(refSeq, 26699126); Assert.Equal(expectedResults, observedResults); } [Fact] public void GetMaxOffset_MissingBin() { const ulong expectedResults = 3724057593420; var idToChunks = new Dictionary { [6878] = new[] { new Interval(3724057593420, 3724057615020) } }; var refSeq = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, null); ulong observedResults = SearchUtilities.GetMaxOffset(refSeq, 35962881); Assert.Equal(expectedResults, observedResults); } [Fact] public void GetMaxOffset_MissingFirstBin() { const ulong expectedResults = 3724908138137; var idToChunks = new Dictionary { [860] = new[] { new Interval(3724908138137, 3724908155075) } }; var refSeq = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, null); ulong observedResults = SearchUtilities.GetMaxOffset(refSeq, 36028417); Assert.Equal(expectedResults, observedResults); } [Fact] public void GetMaxOffset_MissingAllOverlappingBins_ReturnMaxOffset() { const ulong expectedResults = ulong.MaxValue; var idToChunks = new Dictionary(); var refSeq = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, null); ulong observedResults = SearchUtilities.GetMaxOffset(refSeq, 243171329); Assert.Equal(expectedResults, observedResults); } [Fact] public void GetMinOverlapOffset_SingleBin() { const long expectedResults = 3591443256857; const ulong minOffset = 3591443256775; const ulong maxOffset = 3591443312067; var chunks = new[] { new Interval(3591443256857, 3591443311984) }; long observedResults = SearchUtilities.GetMinOverlapOffset(chunks, minOffset, maxOffset); Assert.Equal(expectedResults, observedResults); } [Fact] public void GetMinOverlapOffset_SingleBin_NullChunks() { const ulong minOffset = 3591443256775; const ulong maxOffset = 3591443312067; long observedResults = SearchUtilities.GetMinOverlapOffset(null, minOffset, maxOffset); Assert.Equal(0, observedResults); } [Fact] public void GetOffset_Nominal() { var linearFileOffsets = new ulong[1630]; linearFileOffsets[1629] = 3591443256775; var idToChunks = GetIdToChunks(); var refSeqs = new ReferenceIndex[2]; refSeqs[1] = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, linearFileOffsets); var index = new Index(Constants.VcfFormat, 0, 0, 0, '#', 0, refSeqs, _refNameToTabixIndex); long observedResult = index.GetOffset("chr2", 26699126); Assert.Equal(3591443256857, observedResult); } [Fact] public void GetOffset_HandleDiff_TabixIndex_And_RefIndex() { var linearFileOffsets = new ulong[1630]; linearFileOffsets[1629] = 3591443256775; var idToChunks = GetIdToChunks(); // tabix index 10 = chr2 = ref index 1 var refSeqs = new ReferenceIndex[11]; refSeqs[10] = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, linearFileOffsets); var refNameToTabixIndex = new Dictionary { ["chr2"] = 10 }; var index = new Index(Constants.VcfFormat, 0, 0, 0, '#', 0, refSeqs, refNameToTabixIndex); long observedResult = index.GetOffset("chr2", 26699126); Assert.Equal(3591443256857, observedResult); } [Fact] public void GetOffset_UnknownChromosome_ReturnMinusOne() { var index = new Index(Constants.VcfFormat, 0, 0, 0, '#', 0, null, _refNameToTabixIndex); long observedResult = index.GetOffset("chrUn", 26699126); Assert.Equal(-1, observedResult); } [Fact] public void GetOffset_FixNegativeBeginCoordinate() { var linearFileOffsets = new ulong[1]; linearFileOffsets[0] = 3213608733669; var idToChunks = new Dictionary { [585] = new[] { new Interval(3213608740412, 3213608740487) }, [4681] = new[] { new Interval(3213608733669, 3213608740412) }, [4682] = new[] { new Interval(3213608740487, 3214303562687) } }; var refSeqs = new ReferenceIndex[2]; refSeqs[1] = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, linearFileOffsets); var index = new Index(Constants.VcfFormat, 0, 0, 0, '#', 0, refSeqs, _refNameToTabixIndex); long observedResult = index.GetOffset("chr2", 0); Assert.Equal(3213608733669, observedResult); } [Fact] public void GetOffset_NoOverlappingBins_UseLinearIndex() { const long expectedOffset = 11418; var linearFileOffsets = new ulong[7]; linearFileOffsets[6] = expectedOffset; var idToChunks = new Dictionary(); var refSeqs = new ReferenceIndex[2]; refSeqs[0] = new ReferenceIndex(ChromosomeUtilities.Chr1, idToChunks, linearFileOffsets); var index = new Index(Constants.VcfFormat, 0, 0, 0, '#', 0, refSeqs, _refNameToTabixIndex); long observedResult = index.GetOffset("chr1", 100_000); Assert.Equal(expectedOffset, observedResult); } [Fact] public void GetOffset_NoOverlappingBins_UseLinearIndex_WithTruncatedIndex_ReturnMinusOne() { var linearFileOffsets = new ulong[1]; linearFileOffsets[0] = 11418; var idToChunks = new Dictionary(); var refSeqs = new ReferenceIndex[2]; refSeqs[0] = new ReferenceIndex(ChromosomeUtilities.Chr1, idToChunks, linearFileOffsets); var index = new Index(Constants.VcfFormat, 0, 0, 0, '#', 0, refSeqs, _refNameToTabixIndex); long observedResult = index.GetOffset("chr1", 100_000); Assert.Equal(-1, observedResult); } [Fact] public void GetFirstNonZeroValue_WithoutZeros() { var offsets = new ulong[10]; for (var i = 0; i < offsets.Length; i++) offsets[i] = (ulong)i + 1; long observedResult = offsets.FirstNonZeroValue(); Assert.Equal(1, observedResult); } [Fact] public void GetFirstNonZeroValue_WithLeadingZeros() { var offsets = new ulong[10]; for (var i = 5; i < offsets.Length; i++) offsets[i] = (ulong)i + 1; long observedResult = offsets.FirstNonZeroValue(); Assert.Equal(6, observedResult); } [Fact] public void GetFirstNonZeroValue_AllZeros_ReturnMinusOne() { var offsets = new ulong[10]; long observedResult = offsets.FirstNonZeroValue(); Assert.Equal(-1, observedResult); } [Fact] public void GetTabixReferenceSequence_NullChromosome_ReturnNull() { var linearFileOffsets = new ulong[1630]; linearFileOffsets[1629] = 3591443256775; var idToChunks = GetIdToChunks(); var refSeqs = new ReferenceIndex[2]; refSeqs[1] = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, linearFileOffsets); var index = new Index(Constants.VcfFormat, 0, 0, 0, '#', 0, refSeqs, _refNameToTabixIndex); var refSeq = index.GetTabixReferenceSequence(null); Assert.Null(refSeq); } [Fact] public void GetTabixReferenceSequence_Nominal() { var linearFileOffsets = new ulong[1630]; linearFileOffsets[1629] = 3591443256775; var idToChunks = GetIdToChunks(); var refSeqs = new ReferenceIndex[2]; refSeqs[1] = new ReferenceIndex(ChromosomeUtilities.Chr2, idToChunks, linearFileOffsets); var index = new Index(Constants.VcfFormat, 0, 0, 0, '#', 0, refSeqs, _refNameToTabixIndex); var refSeq = index.GetTabixReferenceSequence("chr2"); Assert.Equal("chr2", refSeq.Chromosome.UcscName); } [Fact] public void AdjustBegin_Nominal() { int observedResult = SearchUtilities.AdjustBegin(5); Assert.Equal(4, observedResult); } [Fact] public void AdjustBegin_CorrectNegativeNumbers() { int observedResult = SearchUtilities.AdjustBegin(0); Assert.Equal(0, observedResult); } [Fact] public void GetMinMaxFileOffset_Nominal() { var intervals = new [] { new Interval(3, 3), new Interval(2, 2), new Interval(1, 5), new Interval(5, 10), new Interval(2, 6), new Interval(8, 9) }; (long observedMinOffset, long observedMaxOffset) = SearchUtilities.GetMinMaxVirtualFileOffset(intervals); Assert.Equal(1, observedMinOffset); Assert.Equal(10, observedMaxOffset); } private static Dictionary GetIdToChunks() { return new Dictionary { [0] = new[] { new Interval(4099908124223, 4099908124304), new Interval(4951477375210, 4951477375293), new Interval(5624484975997, 5624484976080) }, [1] = new[] { new Interval(3340253330084, 3340253330164), new Interval(3465184408915, 3465184408994), new Interval(3568724955460, 3568724955542), new Interval(3691147500084, 3691147500165), new Interval(3795841311087, 3795841311169), new Interval(3910417270243, 3910417270325), new Interval(4000555183327, 4000555183408) }, [12] = new[] { new Interval(3584204706120, 3584204706202), new Interval(3603789121700, 3603789121782), new Interval(3618810913033, 3618810913115), new Interval(3636616069222, 3636616069304), new Interval(3651735457673, 3651735457755), new Interval(3666758669972, 3666758670054), new Interval(3678665150304, 3678665150385) }, [98] = new[] { new Interval(3586357202663, 3586357202745), new Interval(3587723007951, 3587723008032), new Interval(3589980566127, 3589980566208), new Interval(3592834453845, 3592834453927), new Interval(3595721982714, 3595721982795), new Interval(3598606802778, 3598606802860), new Interval(3600879093088, 3600879093169) }, [788] = new[] { new Interval(3589980579562, 3589980605258), new Interval(3590735269728, 3590735292546), new Interval(3591443256775, 3591443312067), new Interval(3592132724129, 3592132724210) }, [6310] = new[] { new Interval(3591443256857, 3591443311984) }, [6311] = new[] { new Interval(3591443312067, 3592132724129) } }; } } } ================================================ FILE: UnitTests/Tabix/VirtualPositionTests.cs ================================================ using Tabix; using Xunit; namespace UnitTests.Tabix { public sealed class VirtualPositionTests { [Fact] public void VirtualPosition_LoopBack() { const long expectedVirtualPosition = 3591443256775; (long fileOffset, int blockOffset) = VirtualPosition.From(expectedVirtualPosition); long observedVirtualPosition = VirtualPosition.To(fileOffset, blockOffset); Assert.Equal(expectedVirtualPosition, observedVirtualPosition); } } } ================================================ FILE: UnitTests/TestDataStructures/SimpleSequence.cs ================================================ using System.Collections.Generic; using Genome; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; namespace UnitTests.TestDataStructures { public sealed class SimpleSequence : ISequence { private readonly string _sequence; private readonly int _zeroBasedStartOffset; public int Length => _zeroBasedStartOffset + _sequence.Length; public Band[] CytogeneticBands => null; public SimpleSequence(string s, int zeroBasedStartOffset = 0) { _zeroBasedStartOffset = zeroBasedStartOffset; _sequence = s; } public string Substring(int offset, int length) { if (offset - _zeroBasedStartOffset + length > _sequence.Length || offset < _zeroBasedStartOffset) return ""; return _sequence.Substring(offset - _zeroBasedStartOffset, length); } } public sealed class SimpleSequenceProvider : ISequenceProvider { public string Name { get; } public GenomeAssembly Assembly { get; } public IEnumerable DataSourceVersions { get; } public void Annotate(IAnnotatedPosition annotatedPosition) { throw new System.NotImplementedException(); } public void PreLoad(Chromosome chromosome, List positions) { throw new System.NotImplementedException(); } public ISequence Sequence { get; } public Dictionary RefNameToChromosome { get; } public Dictionary RefIndexToChromosome { get; } public void LoadChromosome(Chromosome chromosome) { } public SimpleSequenceProvider(GenomeAssembly assembly, ISequence sequence, Dictionary refNameToChromosome) { Assembly = assembly; Sequence = sequence; RefNameToChromosome = refNameToChromosome; } public void Dispose() { } } } ================================================ FILE: UnitTests/TestUtilities/AnnotationUtilities.cs ================================================ using System.Collections.Generic; using Genome; using Nirvana; using OptimizedCore; using VariantAnnotation; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using Vcf; using Vcf.VariantCreator; namespace UnitTests.TestUtilities { public static class AnnotationUtilities { internal static IAnnotatedPosition GetAnnotatedPosition(string cacheFilePrefix, List saPaths, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, string vcfLine) { var annotationFiles = new AnnotationFiles(); saPaths?.ForEach(x => annotationFiles.AddFiles(x)); var refMinorProvider = ProviderUtilities.GetRefMinorProvider(annotationFiles); var (annotator, sequenceProvider) = GetAnnotatorAndSequenceProvider(cacheFilePrefix, saPaths); var variantFactory = new VariantFactory(sequenceProvider.Sequence, new VariantId()); var position = ParseVcfLine(vcfLine, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory); var annotatedPosition = annotator.Annotate(position); return annotatedPosition; } internal static IPosition ParseVcfLine(string vcfLine, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, VariantFactory variantFactory, HashSet customInfoKeys=null) { var simplePosition = GetSimplePosition(vcfLine, sequenceProvider.RefNameToChromosome); return Position.ToPosition(simplePosition, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory, false, customInfoKeys); } internal static SimplePosition GetSimplePosition(string vcfLine, Dictionary refNameToChromosome) { string[] vcfFields = vcfLine.OptimizedSplit('\t'); var chromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, vcfFields[VcfCommon.ChromIndex]); int position = int.Parse(vcfFields[VcfCommon.PosIndex]); return SimplePosition.GetSimplePosition(chromosome, position, vcfFields, new NullVcfFilter()); } private static (Annotator Annotator, ISequenceProvider SequenceProvider) GetAnnotatorAndSequenceProvider(string cacheFilePrefix, List saPaths) { var annotationFiles = new AnnotationFiles(); saPaths?.ForEach(x => annotationFiles.AddFiles(x)); string sequenceFilePath = cacheFilePrefix + ".bases"; var sequenceProvider = ProviderUtilities.GetSequenceProvider(sequenceFilePath); var transcriptAnnotationProvider = ProviderUtilities.GetTranscriptAnnotationProvider(cacheFilePrefix, sequenceProvider, null); var saProvider = ProviderUtilities.GetNsaProvider(annotationFiles); var lcrProvider = ProviderUtilities.GetLcrProvider(annotationFiles); var conservationProvider = ProviderUtilities.GetConservationProvider(annotationFiles); var annotator = new Annotator(transcriptAnnotationProvider, sequenceProvider, saProvider, conservationProvider, lcrProvider, null, null, null); return (annotator,sequenceProvider); } } } ================================================ FILE: UnitTests/TestUtilities/ByteUtilities.cs ================================================ using System.Security.Cryptography; namespace UnitTests.TestUtilities { public static class ByteUtilities { public static byte[] GetRandomBytes(int numBytes) { var buffer = new byte[numBytes]; using (var csp = RandomNumberGenerator.Create()) csp.GetBytes(buffer); return buffer; } } } ================================================ FILE: UnitTests/TestUtilities/ChromosomeUtilities.cs ================================================ using System.Collections.Generic; using Genome; namespace UnitTests.TestUtilities { public static class ChromosomeUtilities { public static readonly Chromosome Chr1 = new Chromosome("chr1", "1", "", "", 1, 0); public static readonly Chromosome Chr2 = new Chromosome("chr2", "2", "", "", 1, 1); public static readonly Chromosome Chr3 = new Chromosome("chr3", "3", "", "", 1, 2); public static readonly Chromosome Chr4 = new Chromosome("chr4", "4", "", "", 1, 3); public static readonly Chromosome Chr5 = new Chromosome("chr5", "5", "", "", 1, 4); public static readonly Chromosome Chr6 = new Chromosome("chr6", "6", "", "", 1, 5); public static readonly Chromosome Chr7 = new Chromosome("chr7", "7", "", "", 1, 6); public static readonly Chromosome Chr8 = new Chromosome("chr8", "8", "", "", 1, 7); public static readonly Chromosome Chr9 = new Chromosome("chr9", "9", "", "", 1, 8); public static readonly Chromosome Chr10 = new Chromosome("chr10", "10", "", "", 1, 9); public static readonly Chromosome Chr11 = new Chromosome("chr11", "11", "", "", 1, 10); public static readonly Chromosome Chr12 = new Chromosome("chr12", "12", "", "", 1, 11); public static readonly Chromosome Chr13 = new Chromosome("chr13", "13", "", "", 1, 12); public static readonly Chromosome Chr14 = new Chromosome("chr14", "14", "", "", 1, 13); public static readonly Chromosome Chr15 = new Chromosome("chr15", "15", "", "", 1, 14); public static readonly Chromosome Chr16 = new Chromosome("chr16", "16", "", "", 1, 15); public static readonly Chromosome Chr17 = new Chromosome("chr17", "17", "", "", 1, 16); public static readonly Chromosome Chr18 = new Chromosome("chr18", "18", "", "", 1, 17); public static readonly Chromosome Chr19 = new Chromosome("chr19", "19", "", "", 1, 18); public static readonly Chromosome Chr20 = new Chromosome("chr20", "20", "", "", 1, 19); public static readonly Chromosome Chr21 = new Chromosome("chr21", "21", "", "", 1, 20); public static readonly Chromosome Chr22 = new Chromosome("chr22", "22", "", "", 1, 21); public static readonly Chromosome ChrX = new Chromosome("chrX", "X", "", "", 1, 22); public static readonly Chromosome ChrY = new Chromosome("chrY", "Y", "", "", 1, 23); public static readonly Chromosome ChrM = new Chromosome("chrM", "MT", "", "", 1, 24); public static readonly Chromosome Bob = new Chromosome("bob", "bob", "", "", 1, Chromosome.UnknownReferenceIndex); public static readonly Dictionary RefNameToChromosome = new Dictionary(); public static readonly Dictionary RefIndexToChromosome = new Dictionary(); static ChromosomeUtilities() { Chromosome[] chromosomes = { Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr8, Chr9, Chr10, Chr11, Chr12, Chr13, Chr14, Chr15, Chr16, Chr17, Chr18, Chr19, Chr20, Chr21, Chr22, ChrX, ChrY, ChrM }; foreach (var chromosome in chromosomes) AddChromosome(chromosome); } private static void AddChromosome(Chromosome chromosome) { RefIndexToChromosome[chromosome.Index] = chromosome; RefNameToChromosome[chromosome.EnsemblName] = chromosome; RefNameToChromosome[chromosome.UcscName] = chromosome; } } } ================================================ FILE: UnitTests/TestUtilities/RandomPath.cs ================================================ using System.IO; namespace UnitTests.TestUtilities { public static class RandomPath { public static string GetRandomPath() => Path.Combine(Path.GetTempPath(), Path.GetRandomFileName()); } } ================================================ FILE: UnitTests/TestUtilities/ResourceUtilities.cs ================================================ using System.IO; using IO; namespace UnitTests.TestUtilities { public static class ResourceUtilities { public static Stream GetReadStream(string path, bool checkMissingFile = true) { var missingFile = !File.Exists(path); if (!checkMissingFile && missingFile) return null; if (missingFile) { throw new FileNotFoundException($"ERROR: The unit test resource file ({path}) was not found."); } return FileUtilities.GetReadStream(path); } } } ================================================ FILE: UnitTests/TestUtilities/Resources.cs ================================================ using System; using System.IO; namespace UnitTests.TestUtilities { public static class Resources { // ReSharper disable once MemberCanBePrivate.Global public static readonly string Top; public static string TopPath(string path) => Path.Combine(Top, path); public static string EndToEnd37(string path) => Path.Combine(Top, "EndToEnd", "GRCh37", path); public static string InputFiles(string path) => Path.Combine(Top, "InputFiles", path); public static string ClinvarXmlFiles(string path) => Path.Combine(Top, "ClinVarXmlFiles", path); public static string VcvXmlFiles(string path) => Path.Combine(Top, "ClinVarXmlFiles","VCVs", path); public static string SaGRCh37(string path) => Path.Combine(Top, "SA", "GRCh37", path); public static string SaPath(string path) => Path.Combine(Top, "SA", path); public static string MockSaFiles => Path.Combine(Top, "SA", "MockSaFiles"); static Resources() { var solutionDir = GetParentDirectory(AppContext.BaseDirectory); Top = Path.Combine(solutionDir, "UnitTests", "Resources"); } private static string GetParentDirectory(string directory) { while (true) { directory = Path.GetDirectoryName(directory); if (directory == null) return string.Empty; var unitTestDir = Path.Combine(directory, "UnitTests"); if (Directory.Exists(unitTestDir)) break; } return directory; } } } ================================================ FILE: UnitTests/TestUtilities/TestDataGenerator.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestDataStructures; using VariantAnnotation.GenericScore; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace UnitTests.TestUtilities { public static class TestDataGenerator { public static void GenerateTestData( Dictionary>> testSetup, List saItems, Func scoreFunc, ISequenceProvider sequenceProvider ) { foreach ((Chromosome chromosome, List> chromosomeTests) in testSetup) { foreach (Dictionary chromosomeTest in chromosomeTests) { var startPosition = (int) chromosomeTest["startPosition"]; var endPosition = (int) chromosomeTest["endPosition"]; var expectedScores = new List(); for (int i = startPosition; i <= endPosition; i++) { double score = scoreFunc(i, endPosition); expectedScores.Add(score); string refAllele = sequenceProvider.Sequence.Substring(i - 1, 1); saItems.Add(new GenericScoreItem(chromosome, i, refAllele, "A", score)); } chromosomeTest["expectedScores"] = expectedScores; } } } public static void GenerateRandomScoreData( Dictionary>> testSetup, List saItems, ISequenceProvider sequenceProvider ) { var random = new Random(1); GenerateTestData(testSetup, saItems, (_, _) => Math.Round(random.NextDouble(), 8), sequenceProvider); } public static ISequenceProvider GetSequenceProvider() { var sequence = new SimpleSequence(new string('A', 1_000_000)); return new SimpleSequenceProvider(GenomeAssembly.GRCh37, sequence, ChromosomeUtilities.RefNameToChromosome); } public static ScoreReader GetScoreReaderWithRandomData( Dictionary>> testSetup ) { string[] nucleotides = {"A", "C", "G", "T"}; var writeStream = new MemoryStream(); var indexStream = new MemoryStream(); var saItems = new List(); var version = new DataSourceVersion("Test", "1", DateTime.Parse(DateTime.Now.ToString("yyyy-MM-dd")).Ticks, "No description"); var writerSettings = new WriterSettings( 10_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", null), new SaItemValidator(true, true) ); // Scoring function to fill random scores GenerateRandomScoreData(testSetup, saItems, GetSequenceProvider()); using var scoreFileWriter = new ScoreFileWriter( writerSettings, writeStream, indexStream, version, GetSequenceProvider(), SaCommon.SchemaVersion, leaveOpen: true ); // Write saItems to stream scoreFileWriter.Write(saItems); // Reset streams in preparation for reading them indexStream.Position = 0; writeStream.Position = 0; // Read the scores return ScoreReader.Read(writeStream, indexStream); } public static ( List saItems, WriterSettings writerSettings, MemoryStream indexStream, MemoryStream writeStream, DataSourceVersion version, Dictionary>> testSetup ) GetRandomSingleChromosomeData(Chromosome chromosome, int startPosition, int endPosition) { const int blockLength = 10_000; string[] nucleotides = {"A", "C", "G", "T"}; var testSetup = new Dictionary>> { { chromosome, new List> { new Dictionary { {"startPosition", startPosition}, {"endPosition", endPosition}, } } }, }; var writeStream = new MemoryStream(); var indexStream = new MemoryStream(); var saItems = new List(); var version = new DataSourceVersion("Test", "1", DateTime.Parse(DateTime.Now.ToString("yyyy-MM-dd")).Ticks, "No description"); GenerateRandomScoreData(testSetup, saItems, TestDataGenerator.GetSequenceProvider()); var writerSettings = new WriterSettings( blockLength, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); return (saItems, writerSettings, indexStream, writeStream, version, testSetup); } } } ================================================ FILE: UnitTests/UnitTests.csproj ================================================  net6.0 ..\bin\$(Configuration) true all runtime; build; native; contentfiles; analyzers ================================================ FILE: UnitTests/VariantAnnotation/Algorithms/SwapTests.cs ================================================ using VariantAnnotation.Algorithms; using Xunit; namespace UnitTests.VariantAnnotation.Algorithms { public sealed class SwapTests { [Fact] public void Swap_Int() { const int expectedA = 5; const int expectedB = 3; int observedA = expectedB; int observedB = expectedA; Swap.Int(ref observedA, ref observedB); Assert.Equal(expectedA, observedA); Assert.Equal(expectedB, observedB); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedPositionTests.cs ================================================ using System.Collections.Generic; using Moq; using OptimizedCore; using UnitTests.SAUtils.InputFileParsers; using UnitTests.TestUtilities; using VariantAnnotation; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Pools; using Variants; using Vcf; using Vcf.Info; using Vcf.Sample; using Vcf.VariantCreator; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class AnnotatedPositionTests { [Fact] public void GetJsonString_DifferentOriginalChromosomeName() { const string originalChromosomeName = "originalChr1"; IVariant[] variants = GetVariants(); ISample[] samples = GetSamples(); IAnnotatedVariant[] annotatedVariants = Annotator.GetAnnotatedVariants(variants); var position = GetPosition(originalChromosomeName, variants, samples); var annotatedPosition = AnnotatedPositionPool.Get(position, annotatedVariants); var sb = annotatedPosition.GetJsonStringBuilder(); var observedResult = sb.ToString(); StringBuilderPool.Return(sb); PositionPool.Return((Position)annotatedPosition.Position); AnnotatedPositionPool.Return(annotatedPosition); Assert.NotNull(observedResult); Assert.Contains($"\"chromosome\":\"{originalChromosomeName}\"", observedResult); } [Fact] public void GetJsonString_NullAnnotatedVariants() { const string originalChromosomeName = "originalChr1"; var position = GetPosition(originalChromosomeName, null, null); var annotatedPosition = AnnotatedPositionPool.Get(position, null); var sb= annotatedPosition.GetJsonStringBuilder(); AnnotatedPositionPool.Return(annotatedPosition); Assert.Null(sb); } //21 9411410 . C T 9.51 DRAGENSnpHardQUAL AC=2;AF=1.000;AN=2;DP=2;FS=0.000;MQ=100.00;QD=9.51;SOR=1.609 GT:AD:AF:DP:GQ:FT:F1R2:F2R1:PL:GP:PP ./.:.:.:0:0:.:.:. ./.:.:.:0:0:.:.:. 1/1:0,1:1.000:1:3:PASS:0,1:0,0:45,3,0:1.0415e+01,3.4301e+00,3.4199e+00:45,3,0 [Fact] public void GetJsonString_fisherStrand() { const string vcfLine = "21\t9411410\t.\tC\tT\t9.51\tDRAGENSnpHardQUAL\tAC=2;AF=1.000;AN=2;DP=2;FS=0.000;MQ=100.00;QD=9.51;SOR=1.609"; var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(9411410, "C", 'A', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, new VariantId()); var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory); IVariant[] variants = GetVariants(); IAnnotatedVariant[] annotatedVariants = Annotator.GetAnnotatedVariants(variants); var annotatedPosition = AnnotatedPositionPool.Get(position, annotatedVariants); var sb = annotatedPosition.GetJsonStringBuilder(); var observedResult = sb.ToString(); StringBuilderPool.Return(sb); AnnotatedPositionPool.Return(annotatedPosition); Assert.NotNull(observedResult); Assert.Contains("\"fisherStrandBias\":0", observedResult); } [Fact] public void GetJsonString_StrelkaSomatic() { const string vcfLine = "chr1 13813 . T G . LowQscore SOMATIC;QSS=33;TQSS=1;NT=ref;QSS_NT=16;TQSS_NT=1;SGT=TT->GT;DP=266;MQ=23.89;MQ0=59;ALTPOS=69;ALTMAP=37;ReadPosRankSum=1.22;SNVSB=5.92;PNOISE=0.00;PNOISE2=0.00;VQSR=1.93;FS=12.123"; var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(13813, "T", 'C', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, new VariantId()); var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory); IVariant[] variants = GetVariants(); IAnnotatedVariant[] annotatedVariants = Annotator.GetAnnotatedVariants(variants); var annotatedPosition = AnnotatedPositionPool.Get(position, annotatedVariants); var sb = annotatedPosition.GetJsonStringBuilder(); var observedResult = sb.ToString(); StringBuilderPool.Return(sb); AnnotatedPositionPool.Return(annotatedPosition); Assert.NotNull(observedResult); Assert.Contains("\"jointSomaticNormalQuality\":16", observedResult); Assert.Contains("\"recalibratedQuality\":1.93", observedResult); Assert.Contains("\"mappingQuality\":23.89", observedResult); Assert.Contains("\"fisherStrandBias\":12.123", observedResult); } [Fact] public void GetJsonString_custom_info() { const string vcfLine = "chr1 13813 . T G . LowQscore SOMATIC;QSS=33;TQSS=1;NT=ref;QSS_NT=16;TQSS_NT=1;SGT=TT->GT;DP=266;MQ=23.89;MQ0=59;ALTPOS=69;ALTMAP=37;ReadPosRankSum=1.22;SNVSB=5.92;PNOISE=0.00;PNOISE2=0.00;VQSR=1.93;FS=12.123"; var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(13813, "T", 'C', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, new VariantId()); var customInfoKeys = new HashSet() {"SGT","SOMATIC" }; var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory, customInfoKeys); IVariant[] variants = GetVariants(); IAnnotatedVariant[] annotatedVariants = Annotator.GetAnnotatedVariants(variants); var annotatedPosition = AnnotatedPositionPool.Get(position, annotatedVariants); var sb = annotatedPosition.GetJsonStringBuilder(); var observedResult = sb.ToString(); StringBuilderPool.Return(sb); AnnotatedPositionPool.Return(annotatedPosition); Assert.NotNull(observedResult); Assert.Contains("\"jointSomaticNormalQuality\":16", observedResult); Assert.Contains("\"recalibratedQuality\":1.93", observedResult); Assert.Contains("\"mappingQuality\":23.89", observedResult); Assert.Contains("\"fisherStrandBias\":12.123", observedResult); Assert.Contains("vcfInfo", observedResult); Assert.Contains("\"SGT\":\"TT->GT\"", observedResult); Assert.Contains("\"SOMATIC\":\"true\"", observedResult); } [Fact] public void GetJsonString_BreakEndEventId() { const string vcfLine = "1\t38432782\tMantaBND:2312:0:1:0:0:0:0\tG\tG]6:28863899]\t971\tPASS\tSVTYPE=BND;MATEID=MantaBND:2312:0:1:0:0:0:1;EVENT=MantaBND:2312:0:1:0:0:0:0;JUNCTION_QUAL=716;BND_DEPTH=52;MATE_BND_DEPTH=56"; var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(38432782, "G", 'C', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, new VariantId()); var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory); IVariant[] variants = GetVariants(); IAnnotatedVariant[] annotatedVariants = Annotator.GetAnnotatedVariants(variants); var annotatedPosition = AnnotatedPositionPool.Get(position, annotatedVariants); var sb = annotatedPosition.GetJsonStringBuilder(); var observedResult = sb.ToString(); StringBuilderPool.Return(sb); PositionPool.Return((Position)annotatedPosition.Position); AnnotatedPositionPool.Return(annotatedPosition); Assert.NotNull(observedResult); Assert.Contains("\"breakendEventId\":\"MantaBND:2312:0:1:0:0:0:0\"", observedResult); } [Fact] public void GetJsonString_LogOddsRatio() { const string vcfLine = "1\t38432782\tMantaBND:2312:0:1:0:0:0:0\tG\tG]6:28863899]\t971\tPASS\tSVTYPE=BND;LOD=3.1456;MATEID=MantaBND:2312:0:1:0:0:0:1;EVENT=MantaBND:2312:0:1:0:0:0:0;JUNCTION_QUAL=716;BND_DEPTH=52;MATE_BND_DEPTH=56"; var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(38432782, "G", 'C', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, new VariantId()); var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory); IVariant[] variants = GetVariants(); IAnnotatedVariant[] annotatedVariants = Annotator.GetAnnotatedVariants(variants); var annotatedPosition = AnnotatedPositionPool.Get(position, annotatedVariants); var sb = annotatedPosition.GetJsonStringBuilder(); var observedResult = sb.ToString(); StringBuilderPool.Return(sb); PositionPool.Return((Position)annotatedPosition.Position); AnnotatedPositionPool.Return(annotatedPosition); Assert.NotNull(observedResult); Assert.Contains("\"logOddsRatio\":3.146", observedResult); } private static ISample[] GetSamples() => new ISample[] { Sample.EmptySample }; private static IVariant[] GetVariants() { var variant = new Mock(); variant.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); variant.SetupGet(x => x.Type).Returns(VariantType.SNV); variant.SetupGet(x => x.Start).Returns(949523); variant.SetupGet(x => x.End).Returns(949523); variant.SetupGet(x => x.RefAllele).Returns("C"); variant.SetupGet(x => x.AltAllele).Returns("T"); variant.SetupGet(x => x.Behavior).Returns(AnnotationBehavior.SmallVariants); return new[] { variant.Object }; } private static IPosition GetPosition(string originalChromosomeName, IVariant[] variants, ISample[] samples) { var vcfFields = new string[8]; vcfFields[0] = originalChromosomeName; InfoData infoData = new InfoDataBuilder().Create(); return PositionPool.Get(ChromosomeUtilities.Chr1, 949523, 949523, "C", new[] {"T"}, null, null, variants, samples, infoData, vcfFields, new[] { false }, false); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/AnnotatedVariantTests.cs ================================================ using System.Collections.Generic; using Moq; using OptimizedCore; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.Pools; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class AnnotatedVariantTests { private const string OriginalChromosomeName = "BoB"; [Fact] public void GetJsonString_RefMinor_WithTranscripts() { IVariant variant = GetRefMinorVariant(); var annotatedVariant = AnnotatedVariantPool.Get(variant); AddRegulatoryRegion(annotatedVariant); AddTranscript(annotatedVariant); const string expectedResult = "{\"vid\":\"bob:100:G\",\"chromosome\":\"BoB\",\"begin\":100,\"end\":200,\"isReferenceMinorAllele\":true,\"refAllele\":\"A\",\"altAllele\":\"G\",\"variantType\":\"SNV\",\"linkedVids\":[\"bob:100:102:TAT\"],\"regulatoryRegions\":[{\"id\":\"7157\",\"type\":\"TF_binding_site\",\"consequence\":[\"regulatory_region_amplification\"]}],\"transcripts\":[]}"; var sb = annotatedVariant.GetJsonStringBuilder(OriginalChromosomeName); var observedResult = sb.ToString(); StringBuilderPool.Return(sb); VariantPool.Return((Variant)variant); AnnotatedVariantPool.Return(annotatedVariant); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetJsonString_RecomposedSnvAfterTrimming_IsRecomposedTrue() { IVariant variant = VariantPool.Get(ChromosomeUtilities.Bob, 100, 200, "A", "G", VariantType.SNV, "bob-100-A-G", false, false, true, new[] { "bob-100-A-G" }, AnnotationBehavior.SmallVariants, false); var annotatedVariant = AnnotatedVariantPool.Get(variant); const string expectedResult = "{\"vid\":\"bob-100-A-G\",\"chromosome\":\"BoB\",\"begin\":100,\"end\":200,\"refAllele\":\"A\",\"altAllele\":\"G\",\"variantType\":\"SNV\",\"isRecomposedVariant\":true,\"linkedVids\":[\"bob-100-A-G\"]}"; var sb = annotatedVariant.GetJsonStringBuilder(OriginalChromosomeName); var observedResult = sb.ToString(); StringBuilderPool.Return(sb); AnnotatedVariantPool.Return(annotatedVariant); Assert.Equal(expectedResult, observedResult); VariantPool.Return((Variant)variant); } private static void AddRegulatoryRegion(IAnnotatedVariant annotatedVariant) { var regulatoryRegion = new RegulatoryRegion(ChromosomeUtilities.Bob, 103, 104, CompactId.Convert("7157"), RegulatoryRegionType.TF_binding_site); var consequences = new List { ConsequenceTag.regulatory_region_amplification }; annotatedVariant.RegulatoryRegions.Add(new AnnotatedRegulatoryRegion(regulatoryRegion, consequences)); } private static void AddTranscript(IAnnotatedVariant annotatedVariant) { var annotatedTranscript = new Mock(); annotatedTranscript.SetupGet(x => x.Transcript.Id).Returns(CompactId.Convert("ENST00000540021")); annotatedTranscript.SetupGet(x => x.Transcript.Start).Returns(966300); annotatedTranscript.SetupGet(x => x.Transcript.End).Returns(966405); annotatedTranscript.SetupGet(x => x.AlternateCodons).Returns("cAt/cGt"); annotatedVariant.Transcripts.Add(annotatedTranscript.Object); } private static IVariant GetRefMinorVariant() { return VariantPool.Get(ChromosomeUtilities.Bob, 100, 200, "A", "G", VariantType.SNV, "bob:100:G", true, false, false, new[] { "bob:100:102:TAT" }, AnnotationBehavior.SmallVariants, false); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/ConsequenceTests.cs ================================================ using System.Collections.Generic; using Moq; using VariantAnnotation.AnnotatedPositions.Consequence; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Pools; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class ConsequenceTests { [Theory] [InlineData(false, ConsequenceTag.upstream_gene_variant)] [InlineData(true, ConsequenceTag.downstream_gene_variant)] public void DetermineFlankingVariantEffects(bool isDownStreamVariant, ConsequenceTag expectedConsequence) { List observedConsequences = Consequences.DetermineFlankingVariantEffects(isDownStreamVariant); Assert.Single(observedConsequences); Assert.Equal(expectedConsequence, observedConsequences[0]); } [Theory] [InlineData(VariantType.deletion, true, false, ConsequenceTag.transcript_ablation)] [InlineData(VariantType.copy_number_gain, false, true, ConsequenceTag.transcript_amplification)] public void DetermineSmallVariantEffects_Tier1(VariantType variantType, bool isAblation, bool isAmplification, ConsequenceTag expectedResult) { var featureEffect = new Mock(); featureEffect.Setup(x => x.Ablation()).Returns(isAblation); featureEffect.Setup(x => x.Amplification()).Returns(isAmplification); var variantEffect = new Mock(); // make sure these tier 2 effects don't show up featureEffect.Setup(x => x.Elongation()).Returns(true); variantEffect.Setup(x => x.IsMatureMirnaVariant()).Returns(true); var consequence = new Consequences(variantType, variantEffect.Object, featureEffect.Object); consequence.DetermineSmallVariantEffects(); List observedConsequences = consequence.GetConsequences(); Assert.Contains(expectedResult, observedConsequences); } [Fact] public void DetermineSmallVariantEffects_Tier2() { var featureEffect = new Mock(); featureEffect.Setup(x => x.Ablation()).Returns(false); featureEffect.Setup(x => x.Amplification()).Returns(false); var variantEffect = new Mock(); variantEffect.Setup(x => x.IsMatureMirnaVariant()).Returns(true); // make sure these tier 3 effects don't show up variantEffect.Setup(x => x.IsStartLost()).Returns(true); var consequence = new Consequences(VariantType.SNV, variantEffect.Object, featureEffect.Object); consequence.DetermineSmallVariantEffects(); List observedConsequences = consequence.GetConsequences(); Assert.Single(observedConsequences); Assert.Equal(ConsequenceTag.mature_miRNA_variant, observedConsequences[0]); } [Theory] [InlineData(VariantType.SNV, true)] [InlineData(VariantType.insertion, true)] [InlineData(VariantType.deletion, true)] [InlineData(VariantType.indel, true)] [InlineData(VariantType.MNV, true)] [InlineData(VariantType.duplication, false)] // no change [InlineData(VariantType.complex_structural_alteration, true)] [InlineData(VariantType.structural_alteration, true)] [InlineData(VariantType.tandem_duplication, false)] // no change [InlineData(VariantType.translocation_breakend, true)] [InlineData(VariantType.inversion, true)] [InlineData(VariantType.short_tandem_repeat_variation, true)] [InlineData(VariantType.copy_number_variation, false)] // no change [InlineData(VariantType.copy_number_loss, false)] // no change [InlineData(VariantType.copy_number_gain, false)] // no change [InlineData(VariantType.run_of_homozygosity, false)] // no change public void NeedsTranscriptVariant_NoConsequences_EvaluateByVariantType(VariantType variantType, bool expectedResult) { var consequences = new List(); bool observedResult = Consequences.NeedsTranscriptVariant(variantType, consequences); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(ConsequenceTag.transcript_ablation, true)] // parallel [InlineData(ConsequenceTag.transcript_amplification, false)] // parallel, no change public void NeedsTranscriptVariant_Tier1(ConsequenceTag consequence, bool expectedResult) { var consequences = new List {consequence}; bool observedResult = Consequences.NeedsTranscriptVariant(VariantType.unknown, consequences); Assert.Equal(expectedResult, observedResult); } [Fact] public void NeedsTranscriptVariant_Tier2_MatureMirnaVariant() { var consequences = new List {ConsequenceTag.mature_miRNA_variant}; bool observedResult = Consequences.NeedsTranscriptVariant(VariantType.unknown, consequences); Assert.False(observedResult); } [Theory] [InlineData(ConsequenceTag.splice_donor_variant, false)] [InlineData(ConsequenceTag.splice_acceptor_variant, false)] [InlineData(ConsequenceTag.stop_gained, false)] [InlineData(ConsequenceTag.frameshift_variant, false)] [InlineData(ConsequenceTag.stop_lost, false)] [InlineData(ConsequenceTag.start_lost, false)] [InlineData(ConsequenceTag.inframe_insertion, false)] [InlineData(ConsequenceTag.inframe_deletion, false)] [InlineData(ConsequenceTag.missense_variant, false)] [InlineData(ConsequenceTag.protein_altering_variant, false)] [InlineData(ConsequenceTag.splice_region_variant, false)] [InlineData(ConsequenceTag.incomplete_terminal_codon_variant, false)] [InlineData(ConsequenceTag.start_retained_variant, false)] [InlineData(ConsequenceTag.stop_retained_variant, false)] [InlineData(ConsequenceTag.synonymous_variant, false)] [InlineData(ConsequenceTag.coding_sequence_variant, false)] [InlineData(ConsequenceTag.five_prime_UTR_variant, false)] [InlineData(ConsequenceTag.three_prime_UTR_variant, false)] [InlineData(ConsequenceTag.non_coding_transcript_exon_variant, false)] [InlineData(ConsequenceTag.intron_variant, false)] [InlineData(ConsequenceTag.NMD_transcript_variant, false)] [InlineData(ConsequenceTag.non_coding_transcript_variant, false)] [InlineData(ConsequenceTag.feature_elongation, true)] // parallel [InlineData(ConsequenceTag.feature_truncation, true)] // parallel public void NeedsTranscriptVariant_Tier3(ConsequenceTag consequence, bool expectedResult) { var consequences = new List {consequence}; bool observedResult = Consequences.NeedsTranscriptVariant(VariantType.unknown, consequences); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(ConsequenceTag.feature_elongation, true)] // parallel [InlineData(ConsequenceTag.feature_truncation, true)] // parallel [InlineData(ConsequenceTag.five_prime_duplicated_transcript, false)] // child [InlineData(ConsequenceTag.three_prime_duplicated_transcript, false)] // child public void NeedsTranscriptVariant_Tier2_SV(ConsequenceTag consequence, bool expectedResult) { var consequences = new List {consequence}; bool observedResult = Consequences.NeedsTranscriptVariant(VariantType.unknown, consequences); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(ConsequenceTag.copy_number_increase, false)] // no change [InlineData(ConsequenceTag.copy_number_decrease, false)] // no change [InlineData(ConsequenceTag.copy_number_change, false)] // no change public void NeedsTranscriptVariant_CNV(ConsequenceTag consequence, bool expectedResult) { var consequences = new List {consequence}; bool observedResult = Consequences.NeedsTranscriptVariant(VariantType.unknown, consequences); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(ConsequenceTag.short_tandem_repeat_change, true)] // parallel [InlineData(ConsequenceTag.short_tandem_repeat_expansion, true)] // parallel [InlineData(ConsequenceTag.short_tandem_repeat_contraction, true)] // parallel public void NeedsTranscriptVariant_STR(ConsequenceTag consequence, bool expectedResult) { var consequences = new List {consequence}; bool observedResult = Consequences.NeedsTranscriptVariant(VariantType.unknown, consequences); Assert.Equal(expectedResult, observedResult); } [Fact] public void DetermineSmallVariantEffects_Tier3() { var cache = new VariantEffectCache(); cache.Add(ConsequenceTag.mature_miRNA_variant, false); cache.Add(ConsequenceTag.splice_donor_variant, true); cache.Add(ConsequenceTag.splice_acceptor_variant, true); cache.Add(ConsequenceTag.stop_gained, true); cache.Add(ConsequenceTag.frameshift_variant, true); cache.Add(ConsequenceTag.stop_lost, true); cache.Add(ConsequenceTag.start_lost, true); cache.Add(ConsequenceTag.inframe_insertion, true); cache.Add(ConsequenceTag.inframe_deletion, true); cache.Add(ConsequenceTag.missense_variant, true); cache.Add(ConsequenceTag.protein_altering_variant, true); cache.Add(ConsequenceTag.splice_region_variant, true); cache.Add(ConsequenceTag.incomplete_terminal_codon_variant, true); cache.Add(ConsequenceTag.stop_retained_variant, true); cache.Add(ConsequenceTag.synonymous_variant, true); cache.Add(ConsequenceTag.coding_sequence_variant, true); cache.Add(ConsequenceTag.five_prime_UTR_variant, true); cache.Add(ConsequenceTag.three_prime_UTR_variant, true); cache.Add(ConsequenceTag.non_coding_transcript_exon_variant, true); cache.Add(ConsequenceTag.intron_variant, true); cache.Add(ConsequenceTag.NMD_transcript_variant, true); cache.Add(ConsequenceTag.non_coding_transcript_variant, true); var simpleVariant = new Mock(); simpleVariant.SetupGet(x => x.RefAllele).Returns("G"); simpleVariant.SetupGet(x => x.AltAllele).Returns("C"); var positionalEffect = new TranscriptPositionalEffect {IsWithinIntron = true}; var variantEffect = new VariantEffect(positionalEffect, simpleVariant.Object, null, null, null, null, null, null, null, null, cache); var featureEffect = new Mock(); featureEffect.Setup(x => x.Ablation()).Returns(false); featureEffect.Setup(x => x.Amplification()).Returns(false); featureEffect.Setup(x => x.Truncation()).Returns(true); featureEffect.Setup(x => x.Elongation()).Returns(true); var consequence = new Consequences(VariantType.SNV, variantEffect, featureEffect.Object); consequence.DetermineSmallVariantEffects(); List observedConsequence = consequence.GetConsequences(); Assert.Equal(ConsequenceTag.splice_donor_variant, observedConsequence[0]); Assert.Equal(ConsequenceTag.splice_acceptor_variant, observedConsequence[1]); Assert.Equal(ConsequenceTag.stop_gained, observedConsequence[2]); Assert.Equal(ConsequenceTag.frameshift_variant, observedConsequence[3]); Assert.Equal(ConsequenceTag.stop_lost, observedConsequence[4]); Assert.Equal(ConsequenceTag.start_lost, observedConsequence[5]); Assert.Equal(ConsequenceTag.inframe_insertion, observedConsequence[6]); Assert.Equal(ConsequenceTag.inframe_deletion, observedConsequence[7]); Assert.Equal(ConsequenceTag.missense_variant, observedConsequence[8]); Assert.Equal(ConsequenceTag.protein_altering_variant, observedConsequence[9]); Assert.Equal(ConsequenceTag.splice_region_variant, observedConsequence[10]); Assert.Equal(ConsequenceTag.incomplete_terminal_codon_variant, observedConsequence[11]); Assert.Equal(ConsequenceTag.stop_retained_variant, observedConsequence[12]); Assert.Equal(ConsequenceTag.synonymous_variant, observedConsequence[13]); Assert.Equal(ConsequenceTag.coding_sequence_variant, observedConsequence[14]); Assert.Equal(ConsequenceTag.five_prime_UTR_variant, observedConsequence[15]); Assert.Equal(ConsequenceTag.three_prime_UTR_variant, observedConsequence[16]); Assert.Equal(ConsequenceTag.non_coding_transcript_exon_variant, observedConsequence[17]); Assert.Equal(ConsequenceTag.intron_variant, observedConsequence[18]); Assert.Equal(ConsequenceTag.NMD_transcript_variant, observedConsequence[19]); Assert.Equal(ConsequenceTag.non_coding_transcript_variant, observedConsequence[20]); } [Theory] [InlineData(true, true)] [InlineData(true, false)] [InlineData(false, true)] [InlineData(false, false)] public void DetermineRegulatoryVariantEffects(bool isAmplification, bool isAblation) { var featureEffect = new Mock(); featureEffect.Setup(x => x.Ablation()).Returns(isAblation); featureEffect.Setup(x => x.Amplification()).Returns(isAmplification); var consequence = new Consequences(VariantType.unknown, null, featureEffect.Object); consequence.DetermineRegulatoryVariantEffects(); List observedConsequences = consequence.GetConsequences(); Assert.Contains(ConsequenceTag.regulatory_region_variant, observedConsequences); if (isAblation) { Assert.Contains(ConsequenceTag.regulatory_region_ablation, observedConsequences); } else { Assert.DoesNotContain(ConsequenceTag.regulatory_region_ablation, observedConsequences); } if (isAmplification) { Assert.Contains(ConsequenceTag.regulatory_region_amplification, observedConsequences); } else { Assert.DoesNotContain(ConsequenceTag.regulatory_region_amplification, observedConsequences); } } public static IEnumerable SvTheoryParameters() { yield return new object[] {VariantType.copy_number_loss, true, false, false, false, false, false, new[] {ConsequenceTag.transcript_ablation, ConsequenceTag.copy_number_decrease}}; yield return new object[] {VariantType.copy_number_gain, false, true, false, false, false, false, new[] {ConsequenceTag.transcript_amplification, ConsequenceTag.copy_number_increase}}; yield return new object[] {VariantType.deletion, true, false, true, false, false, false, new[] {ConsequenceTag.transcript_ablation, ConsequenceTag.transcript_variant}}; yield return new object[] {VariantType.duplication, false, true, true, false, false, false, new[] {ConsequenceTag.transcript_amplification}}; yield return new object[] {VariantType.tandem_duplication, false, false, true, false, false, false, new[] {ConsequenceTag.feature_elongation, ConsequenceTag.transcript_variant}}; yield return new object[] {VariantType.copy_number_loss, false, false, false, true, false, false, new[] {ConsequenceTag.feature_truncation, ConsequenceTag.copy_number_decrease}}; yield return new object[] {VariantType.copy_number_gain, false, false, false, false, true, false, new[] {ConsequenceTag.five_prime_duplicated_transcript, ConsequenceTag.copy_number_increase}}; yield return new object[] {VariantType.duplication, false, false, false, false, false, true, new[] {ConsequenceTag.three_prime_duplicated_transcript}}; } [Theory] [MemberData(nameof(SvTheoryParameters))] public void DetermineStructuralVariantEffect(VariantType variantType, bool isAblation, bool isAmplification, bool isElongation, bool isTruncation, bool isFivePrimeDuplicatedTranscript, bool isThreePrimeDuplicatedTranscript, ConsequenceTag[] expectedResults) { IFeatureVariantEffects featureVariantEffects = GetFeatureVariantEffects(isAblation, isAmplification, isTruncation, isElongation, isFivePrimeDuplicatedTranscript, isThreePrimeDuplicatedTranscript); var variant = VariantPool.Get(null, 0, 0, null, null, variantType, null, false, false, false, null, AnnotationBehavior.StructuralVariants, true); var consequence = new Consequences(variantType, null, featureVariantEffects); consequence.DetermineStructuralVariantEffect(variant); ConsequenceTag[] observedResults = consequence.GetConsequences().ToArray(); Assert.Equal(expectedResults, observedResults); VariantPool.Return(variant); } private static IFeatureVariantEffects GetFeatureVariantEffects(bool isAblation, bool isAmplification, bool isTruncation, bool isElongation, bool isFivePrimeDuplicatedTranscript, bool isThreePrimeDuplicatedTranscript) { var featureEffectsMock = new Mock(); featureEffectsMock.Setup(x => x.Ablation()).Returns(isAblation); featureEffectsMock.Setup(x => x.Amplification()).Returns(isAmplification); featureEffectsMock.Setup(x => x.Elongation()).Returns(isElongation); featureEffectsMock.Setup(x => x.Truncation()).Returns(isTruncation); featureEffectsMock.Setup(x => x.FivePrimeDuplicatedTranscript()).Returns(isFivePrimeDuplicatedTranscript); featureEffectsMock.Setup(x => x.ThreePrimeDuplicatedTranscript()).Returns(isThreePrimeDuplicatedTranscript); return featureEffectsMock.Object; } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/HgvsCodingNomenclatureTests.cs ================================================ using Genome; using Moq; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class HgvsCodingNomenclatureTests { private readonly ITranscript _forwardTranscript; private readonly ITranscript _reverseTranscript; private readonly ITranscript _gapTranscript; public HgvsCodingNomenclatureTests() { _forwardTranscript = GetForwardTranscript(); _reverseTranscript = GetReverseTranscript(); _gapTranscript = GetGapTranscript(); } internal static ITranscript GetForwardTranscript() { // get info from ENST00000343938.4 var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 1260147, 1260482, 1, 336), new TranscriptRegion(TranscriptRegionType.Intron, 1, 1260483, 1262215, 336, 337), new TranscriptRegion(TranscriptRegionType.Exon, 2, 1262216, 1262412, 337, 533), new TranscriptRegion(TranscriptRegionType.Intron, 2, 1262413, 1262620, 533, 534), new TranscriptRegion(TranscriptRegionType.Exon, 3, 1262621, 1264277, 534, 2190) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(1262291, 1263143, 412, 1056, 645)); var transcript = new Mock(); transcript.SetupGet(x => x.Id).Returns(CompactId.Convert("ENST00000343938", 4)); transcript.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); transcript.SetupGet(x => x.Start).Returns(1260147); transcript.SetupGet(x => x.End).Returns(1264277); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); transcript.SetupGet(x => x.TranscriptRegions).Returns(regions); transcript.SetupGet(x => x.Translation).Returns(translation.Object); transcript.SetupGet(x => x.TotalExonLength).Returns(2190); return transcript.Object; } private static ITranscript GetForwardTranscriptWithoutUtr() { //ENST00000579622.1 chrX:70361035-70361156, non-coding, forward strand, no utr var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 70361035,70361156, 1, 122) }; var transcript = new Mock(); transcript.SetupGet(x => x.Id).Returns(CompactId.Convert("ENST00000579622", 1)); transcript.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.ChrX); transcript.SetupGet(x => x.Start).Returns(70361035); transcript.SetupGet(x => x.End).Returns(70361156); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); transcript.SetupGet(x => x.TranscriptRegions).Returns(regions); transcript.SetupGet(x => x.TotalExonLength).Returns(122); return transcript.Object; } internal static ITranscript GetReverseTranscript() { // get info from "ENST00000423372.3 var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 2, 134901, 135802, 1760, 2661), new TranscriptRegion(TranscriptRegionType.Intron, 1, 135803, 137620, 1759, 1760), new TranscriptRegion(TranscriptRegionType.Exon, 1, 137621, 139379, 1, 1759) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(138530, 139309, 71, 850, 780)); var transcript = new Mock(); transcript.SetupGet(x => x.Id).Returns(CompactId.Convert("ENST00000423372", 3)); transcript.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); transcript.SetupGet(x => x.Start).Returns(134901); transcript.SetupGet(x => x.End).Returns(139379); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(true); transcript.SetupGet(x => x.TranscriptRegions).Returns(regions); transcript.SetupGet(x => x.Translation).Returns(translation.Object); transcript.SetupGet(x => x.TotalExonLength).Returns(2661); return transcript.Object; } private static ITranscript GetGapTranscript() { //NM_000314.4 var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 89623195, 89623860, 1, 666), new TranscriptRegion(TranscriptRegionType.Gap, 1, 89623861, 89623861, 666, 667), new TranscriptRegion(TranscriptRegionType.Exon, 1, 89623862, 89624305, 667, 1110), new TranscriptRegion(TranscriptRegionType.Intron, 1, 89624306, 89653781, 1110, 1111), new TranscriptRegion(TranscriptRegionType.Exon, 2, 89653782, 89653866, 1111, 1195), new TranscriptRegion(TranscriptRegionType.Intron, 2, 89653867, 89685269, 1195, 1196), new TranscriptRegion(TranscriptRegionType.Exon, 3, 89685270, 89685314, 1196, 1240), new TranscriptRegion(TranscriptRegionType.Intron, 3, 89685315, 89690802, 1240, 1241), new TranscriptRegion(TranscriptRegionType.Exon, 4, 89690803, 89690846, 1241, 1284), new TranscriptRegion(TranscriptRegionType.Intron, 4, 89690847, 89692769, 1284, 1285), new TranscriptRegion(TranscriptRegionType.Exon, 5, 89692770, 89693008, 1285, 1523), new TranscriptRegion(TranscriptRegionType.Intron, 5, 89693009, 89711874, 1523, 1524), new TranscriptRegion(TranscriptRegionType.Exon, 6, 89711875, 89712016, 1524, 1665), new TranscriptRegion(TranscriptRegionType.Intron, 6, 89712017, 89717609, 1665, 1666), new TranscriptRegion(TranscriptRegionType.Exon, 7, 89717610, 89717776, 1666, 1832), new TranscriptRegion(TranscriptRegionType.Intron, 7, 89717777, 89720650, 1832, 1833), new TranscriptRegion(TranscriptRegionType.Exon, 8, 89720651, 89720875, 1833, 2057), new TranscriptRegion(TranscriptRegionType.Intron, 8, 89720876, 89725043, 2057, 2058), new TranscriptRegion(TranscriptRegionType.Exon, 9, 89725044, 89728532, 2058, 5546) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(89624227, 89725229, 1032, 2243, 1212)); var rnaEdits = new IRnaEdit[3]; rnaEdits[0] = new RnaEdit(667, 667, null); rnaEdits[1] = new RnaEdit(707, 707, "C"); rnaEdits[2] = new RnaEdit(5548, 5547, "AAAAAAAAAAAAAAAAAAAAAAAAAA"); var transcript = new Mock(); transcript.SetupGet(x => x.Id).Returns(CompactId.Convert("NM_000314", 4)); transcript.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr10); transcript.SetupGet(x => x.Start).Returns(89623195); transcript.SetupGet(x => x.End).Returns(89728532); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); transcript.SetupGet(x => x.TranscriptRegions).Returns(regions); transcript.SetupGet(x => x.Translation).Returns(translation.Object); transcript.SetupGet(x => x.TotalExonLength).Returns(5546); transcript.SetupGet(x => x.RnaEdits).Returns(rnaEdits); return transcript.Object; } [Theory] [InlineData(89623861, 89623861, "T", "", "T", VariantType.deletion, null)] [InlineData(89623861, 89623861, "T", "G", "T", VariantType.SNV, null)] [InlineData(89623901, 89623901, "G", "C", "C", VariantType.SNV, "NM_000314.4:c.-326=")] [InlineData(89623901, 89623901, "G", "T", "C", VariantType.SNV, "NM_000314.4:c.-326C>T")] [InlineData(89623861, 89623863, "TGG", "", "GG", VariantType.deletion, "NM_000314.4:c.-365_-364del")] [InlineData(89623859, 89623861, "GCT", "", "GC", VariantType.deletion, "NM_000314.4:c.-367_-366del")] [InlineData(89623860, 89623862, "CTG", "", "CG", VariantType.deletion, "NM_000314.4:c.-366_-365del")] [InlineData(89624304, 89624308, "CTGTA", "", "CT", VariantType.deletion, "NM_000314.4:c.78_79+3del")] [InlineData(89624308, 89624310, "ATC", "", "ATC", VariantType.deletion, "NM_000314.4:c.79+3_79+5del")] public void GetHgvscAnnotation_in_intron_gap_substitution(int variantStart, int variantEnd, string reference, string alt, string transcriptRef, VariantType variantType, string expectedHgvsc) { var (startIndex, _) = MappedPositionUtilities.FindRegion(_gapTranscript.TranscriptRegions, variantStart); var (endIndex, _) = MappedPositionUtilities.FindRegion(_gapTranscript.TranscriptRegions, variantEnd); var variant = new SimpleVariant(ChromosomeUtilities.Chr10, variantStart, variantEnd, reference, alt, variantType); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_gapTranscript, variant, null, startIndex, endIndex, transcriptRef, null); Assert.Equal(expectedHgvsc, observedHgvsc); } [Fact] public void GetHgvscAnnotation_substitution_in_3UTR() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1260247, 1260247, "A", "G", VariantType.SNV); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, null, 0, 0, null, null); Assert.Equal("ENST00000343938.4:c.-311A>G", observedHgvsc); } [Fact] public void GetHgvscAnnotation_substitution_in_intron_before_TSS() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1262210, 1262210, "C", "G", VariantType.SNV); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, null, 1, 1, null, null); Assert.Equal("ENST00000343938.4:c.-75-6C>G", observedHgvsc); } [Fact] public void GetHgvscAnnotation_insertion_in_coding_region() { var sequence = new Mock(); sequence.Setup(x => x.Substring(1262627, 1)).Returns("A"); var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1262629, 1262628, "", "G", VariantType.insertion); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, sequence.Object, 4, 4, null, null); Assert.Equal("ENST00000343938.4:c.130_131insG", observedHgvsc); } [Fact] public void GetHgvscAnnotation_insertion_after_coding_region() { var sequence = new Mock(); sequence.Setup(x => x.Substring(1262627, 1)).Returns("A"); var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1263159, 1263158, "", "G", VariantType.insertion); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, sequence.Object, 4, 4, null, null); Assert.Equal("ENST00000343938.4:c.*15_*16insG", observedHgvsc); } [Fact] public void GetHgvscAnnotation_duplication_in_coding_region() { var sequence = new Mock(); sequence.Setup(x => x.Substring(1262626, 2)).Returns("TA"); var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1262629, 1262628, "", "TA", VariantType.insertion); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, sequence.Object, 4, 4, null, null); Assert.Equal("ENST00000343938.4:c.129_130dup", observedHgvsc); } [Fact] public void ApplyDuplicationAdjustments_NonCoding_Reverse() { var regions = new ITranscriptRegion[3]; regions[0] = new TranscriptRegion(TranscriptRegionType.Exon, 2, 20976856, 20977050, 154, 348); regions[1] = new TranscriptRegion(TranscriptRegionType.Intron, 1, 20977051, 20977054, 153, 154); regions[2] = new TranscriptRegion(TranscriptRegionType.Exon, 1, 20977055, 20977207, 1, 153); var observedResults = regions.ShiftDuplication(20977006, "AACT", true); Assert.Equal("AACT", observedResults.RefAllele); Assert.Equal(20977009, observedResults.Start); Assert.Equal(20977006, observedResults.End); } [Fact] public void ApplyDuplicationAdjustments_Coding_Forward() { var regions = new ITranscriptRegion[41]; for (int i = 0; i < 22; i++) regions[i] = new TranscriptRegion(TranscriptRegionType.Exon, 0, 107000000, 107334926, 1, 1564); for (int i = 23; i < regions.Length; i++) regions[i] = new TranscriptRegion(TranscriptRegionType.Exon, 0, 107335162, 108000000, 1662, 1700); regions[21] = new TranscriptRegion(TranscriptRegionType.Intron, 11, 107334926, 107335065, 1565, 1566); regions[22] = new TranscriptRegion(TranscriptRegionType.Exon, 12, 107335066, 107335161, 1566, 1661); var observedResults = regions.ShiftDuplication(107335068, "AGTC", false); Assert.Equal("AGTC", observedResults.RefAllele); Assert.Equal(107335064, observedResults.Start); Assert.Equal(107335067, observedResults.End); } [Fact] public void GetHgvscAnnotation_Deletion_start_before_transcript() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1260144, 1260148, "ATGTC", "", VariantType.deletion); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, null, -1, 0, null, null); Assert.Null(observedHgvsc); } [Fact] public void GetHgvscAnnotation_Delin_start_from_Exon_end_in_intron() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1262410, 1262414, "ATGTC", "TG", VariantType.indel); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, null, 2, 3, null, null); Assert.Equal("ENST00000343938.4:c.120_122+2delinsTG", observedHgvsc); } [Fact] public void GetHgvscAnnotation_inversion_start_from_Exon_end_in_intron() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1262410, 1262414, "ATGTC", "GACAT", VariantType.MNV); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, null, 2, 3, null, null); Assert.Equal("ENST00000343938.4:c.120_122+2inv", observedHgvsc); } [Fact] public void GetHgvscAnnotation_Deletion_end_after_transcript() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1260143, 1260148, "ATGTC", "", VariantType.deletion); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, null, -1, 0, null, null); Assert.Null(observedHgvsc); } [Fact] public void GetHgvscAnnotation_Reference_no_hgvsc() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1260138, 1260138, "A", "A", VariantType.reference); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_forwardTranscript, variant, null, -1, -1, null, null); Assert.Null(observedHgvsc); } [Fact] public void GetHgvscAnnotation_substitution_in_intron_of_reverse_gene() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 136000, 136000, "A", "G", VariantType.SNV); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_reverseTranscript, variant, null, 1, 1, null, null); Assert.Equal("ENST00000423372.3:c.*910-198T>C", observedHgvsc); } [Fact] public void GetHgvscAnnotation_substitution_after_stopCodon_of_reverse_gene() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 138529, 138529, "A", "G", VariantType.SNV); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_reverseTranscript, variant, null, 2, -1, null, null); Assert.Equal("ENST00000423372.3:c.*1T>C", observedHgvsc); } [Fact] public void GetHgvscAnnotation_deletion_of_reverse_gene() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 135802, 137619, "ATCGTGGGTTGT", "", VariantType.deletion); var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(_reverseTranscript, variant, null, 0, 1, null, null); Assert.Equal("ENST00000423372.3:c.*909+2_*910del", observedHgvsc); } [Fact] public void GetHgvscAnnotation_insertion_at_last_position() { var sequence = new Mock(); sequence.Setup(x => x.Substring(70361157-12, 12)).Returns("TATATATATATA"); var variant = new SimpleVariant(ChromosomeUtilities.ChrX, 70361157, 70361156, "", "ACACCAGCAGCA", VariantType.insertion);//right shifted variant var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(GetForwardTranscriptWithoutUtr(), variant, sequence.Object, 0, 0, null, null); Assert.Equal("ENST00000579622.1:n.122_123insACACCAGCAGCA", observedHgvsc); } [Fact] public void GetHgvscAnnotation_duplication_at_last_position() { var sequence = new Mock(); sequence.Setup(x => x.Substring(70361156 - 4, 4)).Returns("ACAC"); var variant = new SimpleVariant(ChromosomeUtilities.ChrX, 70361157, 70361156, "", "ACAC", VariantType.insertion);//right shifted variant var observedHgvsc = HgvsCodingNomenclature.GetHgvscAnnotation(GetForwardTranscriptWithoutUtr(), variant, sequence.Object, 0, 0, null, null); Assert.Equal("ENST00000579622.1:n.119_122dup", observedHgvsc); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/HgvsProteinNomenclatureTests.cs ================================================ using Moq; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Pools; using VariantAnnotation.TranscriptAnnotation; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class HgvsProteinNomenclatureTests { public const string Enst00000343938GenomicSequence = "GAGGGCGGGGCGAGGGCGGGGCGGTGGGCGGGGACGGGGCCCGCACGGCGGCTACGGCCTAGGTGAGCGGCTCGGACTCGGCGGCCGCACCTGCCCAACCCAACCCGCACGGTCCGGAAGTCGCCGAGGGGCCGGGAGCGGGAGGGGACGTCGTCCTAGAGGGCCGGAGCGGGCGGGCGGCCGAGGACCCGGCTCCCGCGCAGGACGGAGCCGTGGCTCAGGTCGGCCCCTCCCCAACACCACCCCGGGCCTCCGCCCCTTCCTGGGCCTCTCGGTGGAGCAGGGACCCGAACCGGTGCCCATCCAGTCCGGTGCCATCTGAAGCCCCCTTCCCAGGTGAGACTCGTAGCGCTCGCTCGACAGGGTCTGGTCCCACCCACAAGGCCTGGGGCGCCGTGGGGCCCCGTCTCCTGCTGGCCCCCCAGCCTGCTGTCAGCCCCCGTGCTCTGTGCTCAGGCCGCCCTCGCGCCCGGCCCTGACCTTGGGCCGTTGGGCTGCCCTGGGAAAGGCCTGGAGGTGTCCTGGGTCACCTTCCTGGGCTGGCAAGCTGCCTGCCTCCTGCACAGCCACTGCCCTTCCTGTTGTTACCGAGCCACCAGCCACAGCTCTGAGAAGCTCCTGGCAGCTTCTGTTTGCCACTGGCTCGAATCTGGGCAGGAAGGCAAGGCCCGCAGAATATCTGGTGACCAAGAAGGAAACCCCAGAGCCTCAGAGACCATCTTCTCAGTGGACAAAATTAAGGCCCGAGGAGGGGAGGGGCGTGCTGGAAGTCTATGGGACTGCATCTTTCTGAGGCCCAGGAGCAGCCATCCCCCACACCTGAAGCCCGGTGAGCTCACATCTGGGGCCTCCGCCTGGTGCCAAGCATGCAACCCAACCTGTGGGGCCTGCAACGCCAGGCTTCAGCACCCTGCAGGCACCAGTGCTCCAGCAGCCTGGGCCACGGGCTGGGCAGGGCTTGCAGCCCATGATCCCTAGTGATGAAGGGCCCAGTCCTAGGGTGCTGAGCAACCTGCCCACCTGCTCCTGGCCAGGAGCTCTCACCACGGCTGGGTGCCCTTCCCCCTCCCCCACCGATGGAGTCCCTGCAGCCAGGGAGGCCAGGACAGGGCTCCCAGCACCAACCGGCCTAGGAACCCCCAGGCCCTCTTCCTGGTCGAGGTGGAATGCAGCTGACTCTCAGGTTCCCCAGAGCAGGTGCGGGCCCGTGGGGCACCCGGGGAGACAGGGCAAGGGTGCTTGGCAACACTCACACAAAGCATGGGTGCCTGGATGTCTGTGGATCTGTGGAGTGACTATGTGAATGCCAGCAGAATCCAAAGCAGGGCCTGGGCCACTCGTGGAAGGCTCCCTAGGGCTAGTACAAGAGCCTCGTGGCAATCTTCTGAGTGGTAAAACCCATCTGTGTGGGACATGGAGTTTCAGCAACAGGAGTGAAAACACGTGTCCATCCATCCAGCAAGTGCCAGCCCTACAGCCTCTTTTCTGCTTTTGGGGATGTAGCAGTGAGGAAGATGGGGCAGCCTGCCCGGCAGCATCCCCCCACCCCCGGCCCCACCTGTCTCTGCTTTCTGCTGTGTCTGTTTTCTTGTCTAGGACTTCAGAACTTCCTGTCTTTGTTGTCATCTGACCCCACCCCAGATGGCTGCTCGCACTCCCCATGCACCCAGATAGATGGCTAGGATGGTGCTTGGCTCTCGGCAGGGGCTTAGTATTTCTCCAGCTGGTAAAAGCAGATACAGCATCTAGAGAGAGAAACAAAAACAAGAAAGCACCAGCAGAGACACCTGCTGCAGACAGCGGGGCCTAGTGGTCTGATAAAGCCAGAGGGGGCCACTCTCGGGGTCAGGGACTGACACGGAGTCAGTGGCCTGATCCACAGGAGGGGCTGTGCCAAGGTCCCTGAATGCGCAATCCTGATGAAGGGTGGGTCAGGGTGGTGTGCCTGAGAGCCTGCGGCTTGGCTGGGAGCAGAGCCAGGCAGCTCCTGGGAGGAAGCTCCATGAGGGGCATGAGTGTTCAGTGAGCGGCAATGGGATCGCAGCTATTTTGTTCCCCTCCACACACAGAAAATGAGCCACAGAGCAAGCTGACCCCAGCGACACAGCCCCCCAGCCCTACTGTATTTCCGTTCCTATCAAAAAATGGATGACTCGGAGACAGGTTTCAATCTGAAAGTCGTCCTGGTCAGTTTCAAGCAGTGTCTCGATGAGAAGGAAGAGGTCTTGCTGGACCCCTACATTGCCAGCTGGAAGGGCCTGGTCAGGTGCGTGTGCCAGGGCTGCCTCCTGAGGTGGGCGCTCCCCTGGCCCGAGTCCCATATGTGGCATCTGCCTCCCGACTGCCTGTCCCCACCAGCTTTGCTGCCCGTTTCCAGATGGGTGTGAGCCCCCGCAGGCTGGGCAGCGTCCCCTGCACCCCAGGCGGGCTGCCCCAGGCCTGGGCGAGGACTCGAGCCCCGCTCCCTTCCACAGGTTTCTGAACAGCCTGGGCACCATCTTCTCATTCATCTCCAAGGACGTGGTCTCCAAGCTGCGGATCATGGAGCGCCTCAGGGGCGGCCCGCAGAGCGAGCACTACCGCAGCCTGCAGGCCATGGTGGCCCACGAGCTGAGCAACCGGCTGGTGGACCTGGAGCGCCGCTCCCACCACCCGGAGTCTGGCTGCCGGACGGTGCTGCGCCTGCACCGCGCCCTGCACTGGCTGCAGCTGTTCCTGGAGGGCCTGCGTACCAGCCCCGAGGACGCACGCACCTCCGCGCTCTGCGCCGACTCCTACAACGCCTCGCTGGCCGCCTACCACCCCTGGGTCGTGCGCCGCGCCGTCACCGTGGCCTTCTGCACGCTGCCCACACGCGAGGTCTTCCTGGAGGCCATGAACGTGGGGCCCCCGGAGCAGGCCGTGCAGATGCTAGGCGAGGCCCTCCCCTTCATCCAGCGTGTCTACAACGTCTCCCAGAAGCTCTACGCCGAGCACTCCCTGCTGGACCTGCCCTAGGGGCGGGAAGCCAGGGCCGCACCGGCTTTCCTGCTGCAGATCTGGGCTGCGGTGGCCAGGGCCGTGAGTCCCGTGGCAGAGCCTTCTGGGCGCTGCGGGAACAGGAGATCCTCTGTCGCCCCTGTGAGCTGAGCTGGTTAGGAACCACAGACTGTGACAGAGAAGGTGGCGACCAGCCCAGAAGAGGCCCACCCTCTCGGTCCGGAACAAGACGCCTCGGCCACGGCTCCCCCTCGGCCTATTACACGCGTGCGCAGCCAGGCCTCGCCAGGGTGCGGTGCAGAGCAGAGCAGGCAGGGGTGGGGGCCGGGCCTGCAAGAGCCCGAAAGGTCGCCACCCCCTAGCCTGTGGGGTGCATCTGCGAACCAGGGTGAAGTCACAGGTCCCGGGGTGTGGAGGCTCCATCCTTTCTCCTTTCTGCCAGCCGATGTGTCCTCATCTCAGGCCCGTGCCTGGGACCCCGTGTCTGCCCAGGTGGGCAGCCTTGAGCCCAGGGGACTCAGTGCCCTCCATGCCCTGGCTGGCAGAAACCCTCAACAGCAGTCTGGGCACTGTGGGGCTCTCCCCGCCTCTCCTGCCTTGTTTGCCCCTCAGCGTGCCAGGCAGACTGGGGGCAGGACAGCCGGAAGCTGAGACCAAGGCTCCTCACAGAAGGGCCCAGGAAGTCCCCGCCCTTGGGACAGCCTCCTCCGTAGCCCCTGCACGGCACCAGTTCCCCGAGGGACGCAGCAGGCCGCCTCCCGCAGCGGCCGTGGGTCTGCACAGCCCAGCCCAGCCCAAGGCCCCCAGGAGCTGGGACTCTGCTACACCCAGTGAAATGCTGTGTCCCTTCTCCCCCGTGCCCCTTGATGCCCCCTCCCCACAGTGCTCAGGAGACCCGTGGGGCACGGAACAGGAGGGTCTGGACCCTGTGGCCCAGCCAAAGGCTACCAGACAGCCACAACCAGCCCAGCCACCATCCAGTGCCTGGGGCCTGGCCACTGGCTCTTCACAGTGGACCCCAGCACCTCGGGGTGGCAGAGGGACGGCCCCCACGGCCCAGCAGACATGCGAGCTTCCAGAGTGCAATCTATGTGATGTCTTCCAACGTTAATAAATCACACAGCCTCCCAGGAGGGAGACGCTGGGGTGCAC"; public static ITranscript GetMockedTranscriptOnForwardStrand() { var mockedTranscript = new Mock(); //get info from ENST00000343938.4 const int start = 1260147; const int end = 1264277; var transcriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 1260147, 1260482, 1, 336), new TranscriptRegion(TranscriptRegionType.Intron, 1, 1260483, 1262215, 336, 337), new TranscriptRegion(TranscriptRegionType.Exon, 2, 1262216, 1262412, 337, 533), new TranscriptRegion(TranscriptRegionType.Intron, 2, 1262413, 1262620, 533,534), new TranscriptRegion(TranscriptRegionType.Exon, 3, 1262621, 1264277, 534, 2190) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(1262291, 1263143, 412, 1056, 645)); translation.SetupGet(x => x.ProteinId).Returns(CompactId.Convert("ENST00000343938", 4)); translation.SetupGet(x => x.PeptideSeq).Returns("MDDSETGFNLKVVLVSFKQCLDEKEEVLLDPYIASWKGLVRFLNSLGTIFSFISKDVVSKLRIMERLRGGPQSEHYRSLQAMVAHELSNRLVDLERRSHHPESGCRTVLRLHRALHWLQLFLEGLRTSPEDARTSALCADSYNASLAAYHPWVVRRAVTVAFCTLPTREVFLEAMNVGPPEQAVQMLGEALPFIQRVYNVSQKLYAEHSLLDLP"); var gene = new Mock(); gene.SetupGet(x => x.OnReverseStrand).Returns(false); gene.SetupGet(x => x.EnsemblId).Returns(CompactId.Convert("ENSG00000224051 ")); mockedTranscript.SetupGet(x => x.Id).Returns(CompactId.Convert("ENST00000343938", 4)); mockedTranscript.SetupGet(x => x.Source).Returns(Source.Ensembl); mockedTranscript.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); mockedTranscript.SetupGet(x => x.Start).Returns(start); mockedTranscript.SetupGet(x => x.End).Returns(end); mockedTranscript.SetupGet(x => x.Gene).Returns(gene.Object); mockedTranscript.SetupGet(x => x.TranscriptRegions).Returns(transcriptRegions); mockedTranscript.SetupGet(x => x.Translation).Returns(translation.Object); mockedTranscript.SetupGet(x => x.TotalExonLength).Returns(2190); return mockedTranscript.Object; } [Fact] public void GetHgvsProteinAnnotation_substitution() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1262295, 1262295, "A", "C", VariantType.SNV, "1:1262295:A>C", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var transcript = GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var hgvspNotation = annotatedTranscript.HgvsProtein; Assert.Equal("ENST00000343938.4:p.(Asp2Ala)", hgvspNotation); VariantPool.Return(variant); } [Fact] public void GetHgvsProteinAnnotation_insertion() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1262297, 1262296, "", "TTC", VariantType.insertion, "1:1262295:T>TTTC", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var transcript = GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var hgvspNotation = annotatedTranscript.HgvsProtein; Assert.Equal("ENST00000343938.4:p.(Asp2_Asp3insPhe)", hgvspNotation); VariantPool.Return(variant); } [Fact] public void GetHgvsProteinAnnotation_duplication_right_shifted() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1262297, 1262296, "", "GAC", VariantType.insertion, "1:1262295:T>GAC", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var transcript = GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var hgvspNotation = annotatedTranscript.HgvsProtein; Assert.Equal("ENST00000343938.4:p.(Asp3dup)", hgvspNotation); VariantPool.Return(variant); } [Fact] public void GetHgvsProteinAnnotation_deletion() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1262300, 1262302, "TCG", "", VariantType.deletion, "1:1262300:1262302", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var transcript = GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var hgvspNotation = annotatedTranscript.HgvsProtein; Assert.Equal("ENST00000343938.4:p.(Ser4del)", hgvspNotation); VariantPool.Return(variant); } [Fact] public void GetHgvsProteinAnnotation_delIns() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1262300, 1262305, "TCGGAG", "GAGACA", VariantType.indel, "1:1262300:1262305", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var transcript = GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var hgvspNotation = annotatedTranscript.HgvsProtein; Assert.Equal("ENST00000343938.4:p.(Ser4_Glu5delinsGluThr)", hgvspNotation); VariantPool.Return(variant); } [Fact] public void GetHgvsProteinAnnotation_no_change() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1262300, 1262302, "TCG", "AGT", VariantType.indel, "1:1262300:1262302", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var transcript = GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var hgvspNotation = annotatedTranscript.HgvsProtein; Assert.Equal("ENST00000343938.4:c.10_12delinsAGT(p.(Ser4=))", hgvspNotation); VariantPool.Return(variant); } [Fact] public void GetHgvsProteinAnnotation_frameshift() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1262300, 1262301, "TC", "", VariantType.deletion, "1:1262300:1262301", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var transcript = GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var hgvspNotation = annotatedTranscript.HgvsProtein; Assert.Equal("ENST00000343938.4:p.(Ser4GlyfsTer19)", hgvspNotation); VariantPool.Return(variant); } [Fact] public void GetHgvsProteinAnnotation_frameshift_stop_gain() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1262313, 1262312, "", "GA", VariantType.insertion, "1:1262333:1262332", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var transcript = GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var hgvspNotation = annotatedTranscript.HgvsProtein; Assert.Equal("ENST00000343938.4:p.(Phe8Ter)", hgvspNotation); VariantPool.Return(variant); } [Fact] public void GetHgvsProteinAnnotation_extension() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1263141, 1263143, "TAG", "", VariantType.deletion, "1:1263141:1263143", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(Enst00000343938GenomicSequence, 1260147 - 1); var transcript = GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var hgvspNotation = annotatedTranscript.HgvsProtein; Assert.Equal("ENST00000343938.4:p.(Ter215GlyextTer43)", hgvspNotation); VariantPool.Return(variant); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/HgvsUtilitiesTests.cs ================================================ using Genome; using Moq; using UnitTests.TestDataStructures; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class HgvsUtilitiesTests { [Fact] public void ShiftAndRotateAlleles_Rotated() { int observedStart = 98; string observedRefAminoAcids = "YYAKEV"; string observedAltAminoAcids = "Y"; HgvsUtilities.ShiftAndRotateAlleles(ref observedStart, ref observedRefAminoAcids, ref observedAltAminoAcids, "MHYCVLSAFLILHLVTVALSLSTCSTLDMDQFMRKRIEAIRGQILSKLKLTSPPEDYPEPEEVPPEVISIYNSTRDLLQEKASRRAAACERERSDEEYYAKEVYKIDMPPFFPSENAIPPTFYRPYFRIVRFDVSAMEKNASNLVKAEFRVFRLQNPKARVPEQRIELYQILKSKDLTSPTQRYIDSKVVKTRAEGEWLSFDVTDAVHEWLHHKDRNLGFKISLHCPCCTFVPSNNYIIPNKSEELEARFAGIDGTSTYTSGDQKTIKSTRKKNSGKTPHLLLMLLPSYRLESQQTNRRKKRALDAAYCFRNVQDNCCLRPLYIDFKRDLGWKWIHEPKGYNANFCAGACPYLWSSDTQHSRVLSLYNTINPEASASPCCVSQDLEPLTILYYIGKTPKIEQLSNMIVKSCKCS"); Assert.Equal(100, observedStart); Assert.Equal("AKEVY", observedRefAminoAcids); Assert.Equal("", observedAltAminoAcids); } [Fact] public void Rotate3Prime_Identity_Insertion() { const int expectedStart = 46; const string expectedRefAminoAcids = ""; const string expectedAltAminoAcids = "A"; var observedResult = HgvsUtilities.Rotate3Prime("", "A", 44, "MAAQVAPAAASSLGNPPPPPPSELKKAEQQQREEAGGEAAAAAAAERGEMKAAAGQESEGPAVGPPQPLGKELQDGAESNGGGGGGGAGSGGGPGAEPDLKNSNGNAGPRPALNNNLTEPPGGGGGGSSDGVGAPPHSAAAALPPPAYGFGQPYGRSPSAVAAAAAAVFHQQHGGQQSPGLAALQSGGGGGLEPYAGPQQNSHDHGFPNHQYNSYYPNRSAYPPPAPAYALSSPRGGTPGSGAAAAAGSKPPPSSSASASSSSSSFAQQRFGAMGGGGPSAAGGGTPQPTATPTLNQLLTSPSSARGYQGYPGGDYSGGPQDGGAGKGPADMASQCWGAAAAAAAAAAASGGAQQRSHHAPMSPGSSGGGGQPLARTPQPSSPMDQMGKMRPQPYGGTNPYSQQQGPPSGPQQGHGYPGQPYGSQTPQRYPMTMQGRAQSAMGGLSYTQQIPPYGQQGPSGYGQQGQTPYYNQQSPHPQQQQPPYSQQPPSQTPHAQPSYQQQPQSQPPQLQSSQPPYSQQPSQPPHQQSPAPYPSQQSTTQQHPQSQPPYSQPQAQSPYQQQQPQQPAPSTLSQQAAYPQPQSQQSQQTAYSQQRFPPPQELSQDSFGSQASSAPSMTSSKGGQEDMNLSLQSRPSSLPDLSGSIDDLPMGTEGALSPGVSTSGISSSQGEQSNPAQSPFSPHTSPHLPGIRGPSPSPVGSPASVAQSRSGPLSPAAVPGNQMPPRPPSGQSDSIMHPSMNQSSIAQDRGYMQRNPQMPQYSSPQPGSALSPRQPSGGQIHTGMGSYQQNSMGSYGPQGGQYGPQGGYPRQPNYNALPNANYPSAGMAGGINPMGAGGQMHGQPGIPPYGTLPPGRMSHASMGNRPYGPNMANMPPQVGSGMCPPPGGMNRKTQETAVAMHVAANSIQNRPPGYPNMNQGGMMGTGPPYGQGINSMAGMINPQGPPYSMGGTMANNSAGMAASPEMMGLGDVKLTPATKMNNKADGTPKTESKSKKSSSSTTTNEKITKLYELGGEPERKMWVDRYLAFTEEKAMGMTNLPAVGRKPLDLYRLYVSVKEIGGLTQVNKNKKWRELATNLNVGTSSSAASSLKKQYIQCLYAFECKIERGEDPPPDIFAAADSKKSQPKIQPPSPAGSGSMQGPQTPQSTSSSMAEGGDLKPPTPASTPHSQIPPLPGMSRSNSVGIQDAFNDGSDSTFQKRNSMTPNPGYQPSMNTSDMMGRMSYEPNKDPYGSMRKAPGSDPFMSSGQGPNGGMGDPYSRAAGPGLGNVAMGPRQHYPYGGPYDRVRTEPGIGPEGNMSTGAPQPNLMPSNPDSGMYSPSRYPPQQQQQQQQRHDSYGNQFSTQGTPSGSPFPSQQTTMYQQQQQNYKRPMDGTYGPPAKRHEGEMYSVPYSTGQGQPQQQQLPPAQPQPASQQQAAQPSPQQDVYNQYGNAYPATATAATERRPAGGPQNQFPFQFGRDRVSAPPGTNAQQNMPPQMMGGPIQASAEVAQQGTMWQGRNDMTYNYANRQSTGSAPQGPAYHGVNRTDEMLHTDQRANHEGSWPSHGTRQPPYGPSAPVPPMTRPPPSNYQPPPSMQNHIPQVSSPAPLPRPMENRTSPSKSPFLHSGMKMQKAGPPVPASHIAPAPVQPPMIRRDITFPPGSVEATQPVLKQRRRLTMKDIGTPEAWRVMMSLKSGLLAESTWALDTINILLYDDNSIMTFNLSQLPGLLELLVEYFRRCLIEIFGILKEYEVGDPGQRTLLDPGRFSKVSSPAPMEGGEEEEELLGPKLEEEEEEEVVENDEEIAFSGKDKPASENSEEKLISKFDKLPVKIVQKNDPFVVDCSDKLGRVQEFDSGLLHWRIGGGDTTEHIQTHFESKTELLPSRPHAPCPPAPRKHVTTAEGTPGTTDQEGPPPDGPPEKRITATMDDMLSTRSSTLTEDGAKSSEAIKESSKFPFGISPAQSHRNIKILEDEPHSKDETPLCTLLDWQDSLAKRCVCVSNTIRSLSFVPGNDFEMSKHPGLLLILGKLILLHHKHPERKQAPLTYEKEEEQDQGVSCNKVEWWWDCLEMLRENTLVTLANISGQLDLSPYPESICLPVLDGLLHWAVCPSAEAQDPFSTLGPNAVLSPQRLVLETLSKLSIQDNNVDLILATPPFSRLEKLYSTMVRFLSDRKNPVCREMAVVLLANLAQGDSLAARAIAVQKGSIGNLLGFLEDSLAATQFQQSQASLLHMQNPPFEPTSVDMMRRAARALLALAKVDENHSEFTLYESRLLDISVSPLMNSLVSQVICDVLFLIGQS"); Assert.Equal(expectedStart, observedResult.Start); Assert.Equal(expectedRefAminoAcids, observedResult.RefAminoAcids); Assert.Equal(expectedAltAminoAcids, observedResult.AltAminoAcids); } [Fact] public void Rotate3Prime_Identity_Deletion() { const int expectedStart = 530; const string expectedRefAminoAcids = "A"; const string expectedAltAminoAcids = ""; var observedResult = HgvsUtilities.Rotate3Prime("A", "", 529, "MEAAAGGRGCFQPHPGLQKTLEQFHLSSMSSLGGPAAFSARWAQEAYKKESAKEAGAAAVPAPVPAATEPPPVLHLPAIQPPPPVLPGPFFMPSDRSTERCETVLEGETISCFVVGGEKRLCLPQILNSVLRDFSLQQINAVCDELHIYCSRCTADQLEILKVMGILPFSAPSCGLITKTDAERLCNALLYGGAYPPPCKKELAASLALGLELSERSVRVYHECFGKCKGLLVPELYSSPSAACIQCLDCRLMYPPHKFVVHSHKALENRTCHWGFDSANWRAYILLSQDYTGKEEQARLGRCLDDVKEKFDYGNKYKRRVPRVSSEPPASIRPKTDDTSSQSPAPSEKDKPSSWLRTLAGSSNKSLGCVHPRQRLSAFRPWSPAVSASEKELSPHLPALIRDSFYSYKSFETAVAPNVALAPPAQQKVVSSPPCAAAVSRAPEPLATCTQPRKRKLTVDTPGAPETLAPVAAPEEDKDSEAEVEVESREEFTSSLSSLSSPSFTSSSSAKDLGSPGARALPSAVPDAAAPADAPSGLEAELEHLRQALEGGLDTKEAKEKFLHEVVKMRVKQEEKLSAALQAKRSLHQELEFLRVAKKEKLREATEAKRNLRKEIERLRAENEKKMKEANESRLRLKRELEQARQARVCDKGCEAGRLRAKYSAQIEDLQVKLQHAEADREQLRADLLREREAREHLEKVVKELQEQLWPRARPEAAGSEGAAELEP"); Assert.Equal(expectedStart, observedResult.Start); Assert.Equal(expectedRefAminoAcids, observedResult.RefAminoAcids); Assert.Equal(expectedAltAminoAcids, observedResult.AltAminoAcids); } [Fact] public void Rotate3Prime_Identity_WithNullAminoAcids() { const int expectedStart = 55; const string expectedRefAminoAcids = "Q"; const string expectedAltAminoAcids = "*"; var observedResult = HgvsUtilities.Rotate3Prime(expectedRefAminoAcids, expectedAltAminoAcids, expectedStart, "MGWDLTVKMLAGNEFQVSLSSSMSVSELKAQITQKIGVHAFQQRLAVHPSGVALQDRVPLASQGLGPGSTVLLVVDKCDEPLSILVRNNKGRSSTYEVRLTQTVAHLKQQVSGLEGVQDDLFWLTFEGKPLEDQLPLGEYGLKPLSTVFMNLRLRGGGTEPGGRS"); Assert.Equal(expectedStart, observedResult.Start); Assert.Equal(expectedRefAminoAcids, observedResult.RefAminoAcids); Assert.Equal(expectedAltAminoAcids, observedResult.AltAminoAcids); } [Fact] public void IsAminoAcidDuplicate_True() { var observedResult = HgvsUtilities.IsAminoAcidDuplicate(85, "P", "MEAAAGGRGCFQPHPGLQKTLEQFHLSSMSSLGGPAAFSARWAQEAYKKESAKEAGAAAVPAPVPAATEPPPVLHLPAIQPPPPVLPGPFFMPSDRSTERCETVLEGETISCFVVGGEKRLCLPQILNSVLRDFSLQQINAVCDELHIYCSRCTADQLEILKVMGILPFSAPSCGLITKTDAERLCNALLYGGAYPPPCKKELAASLALGLELSERSVRVYHECFGKCKGLLVPELYSSPSAACIQCLDCRLMYPPHKFVVHSHKALENRTCHWGFDSANWRAYILLSQDYTGKEEQARLGRCLDDVKEKFDYGNKYKRRVPRVSSEPPASIRPKTDDTSSQSPAPSEKDKPSSWLRTLAGSSNKSLGCVHPRQRLSAFRPWSPAVSASEKELSPHLPALIRDSFYSYKSFETAVAPNVALAPPAQQKVVSSPPCAAAVSRAPEPLATCTQPRKRKLTVDTPGAPETLAPVAAPEEDKDSEAEVEVESREEFTSSLSSLSSPSFTSSSSAKDLGSPGARALPSAVPDAAAPADAPSGLEAELEHLRQALEGGLDTKEAKEKFLHEVVKMRVKQEEKLSAALQAKRSLHQELEFLRVAKKEKLREATEAKRNLRKEIERLRAENEKKMKEANESRLRLKRELEQARQARVCDKGCEAGRLRAKYSAQIEDLQVKLQHAEADREQLRADLLREREAREHLEKVVKELQEQLWPRARPEAAGSEGAAELEP"); Assert.True(observedResult); } [Fact] public void IsAminoAcidDuplicate_False() { var observedResult = HgvsUtilities.IsAminoAcidDuplicate(307, "*RX", "MHYDGHVRFDLPPQGSVLARNVSTRSCPPRTSPAVDLEEEEEESSVDGKGDRKSTGLKLSKKKARRRHTDDPSKECFTLKFDLNVDIETEIVPAMKKKSLGEVLLPVFERKGIALGKVDIYLDQSNTPLSLTFEAYRFGGHYLRVKAPAKPGDEGKVEQGMKDSKSLSLPILRPAGTGPPALERVDAQSRRESLDILAPGRRRKNMSEFLGEASIPGQEPPTPSSCSLPSGSSGSTNTGDSWKNRAASRFSGFFSSGPSTSAFGREVDKMEQLEGKLHTYSLFGLPRLPRGLRFDHDSWEEEYDEDEDEDNACLRLEDSWRELIDGHEKLTRRQCHQQEAVWELLHTEASYIRKLRVIINLFLCCLLNLQESGLLCEVEAERLFSNIPEIAQLHRRLWASVMAPVLEKARRTRALLQPGDFLKGFKMFGSLFKPYIRYCMEEEGCMEYMRGLLRDNDLFRAYITWAEKHPQCQRLKLSDMLAKPHQRLTKYPLLLKSVLRKTEEPRAKEAVVAMIGSVERFIHHVNACMRQRQERQRLAAVVSRIDAYEVVESSSDEVDKLLKEFLHLDLTAPIPGASPEETRQLLLEGSLRMKEGKDSKMDVYCFLFTDLLLVTKAVKKAERTRVIRPPLLVDKIVCRELRDPGSFLLIYLNEFHSAVGAYTFQASGQALCRGWVDTIYNAQNQLQQLRAQEPPGSQQPLQSLEEEEDEQEEEEEEEEEEEEGEDSGTSAASSPTIMRKSSGSPDSQHCASDGSTETLAMVVVEPGDTLSSPEFDSGPFSSQSDETSLSTTASSATPTSELLPLGPVDGRSCSMDSAYGTLSPTSLQDFVAPGPMAELVPRAPESPRVPSPPPSPRLRRRTPVQLLSCPPHLLKSKSEASLLQLLAGAGTHGTPSAPSRSLSELCLAVPAPGIRTQGSPQEAGPSWDCRGAPSPGSGPGLVGCLAGEPAGSHRKRCGDLPSGASPRVQPEPPPGVSAQHRKLTLAQLYRIRTTLLLNSTLTASEV"); Assert.False(observedResult); } [Fact] public void IsAminoAcidDuplicate_False_WhenAminoAcidsNull() { var observedResult = HgvsUtilities.IsAminoAcidDuplicate(307, null, null); Assert.False(observedResult); } [Fact] public void IsAminoAcidDuplicate_False_StartEqualToAminoAcidLength() { var observedResult = HgvsUtilities.IsAminoAcidDuplicate(3, "ABC", "DEF"); Assert.False(observedResult); } [Fact] public void GetNumAminoAcidsUntilStopCodon_FirstAminoAcidIsStop() { const int expectedResult = -1; var observedResult = HgvsUtilities.GetNumAminoAcidsUntilStopCodon( "RHRNRNTQTETNTETQRHRNTQKHRNKHRDTETHRNTETNTETQKHTETQKQTQRHRNTQKHTDRNKHRNTETQKYRNTQKHRNKHRDTETQKHSDAETQQHKHRNTETHRNRNTETNTETQTHRHRETQKHTETLKHSGRCPGCRGSIA", "RHRNRNTQTETNTETQRHRNTQKHRNKHRDTETHRNTQKHRNKHRDTETHRNTETNTETQKHTETHRQKQTQKHRDTEIQKHTETQKQTQRHRDTETQRRRNTATQTQKHRNTQKQKHRNKHRDTDTQTQRNTETHRNTETQWAVSRLQRLHRC", 37, true); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetNumAminoAcidsUntilStopCodon_FoundExtraAminoAcids() { const int expectedResult = 38; var observedResult = HgvsUtilities.GetNumAminoAcidsUntilStopCodon( "MLAEPFNWHVEYAHSGDVLGPSGLPASPGAPGTCLHNPAGSNWGPG*EVLMAGTVPAVPG*SGEGSQF*LPWSCSDSPQAGSRAHGQGPGIPLLPQGHGTQSLCRAQGSVPAAEPVPPTEDGRGLSGPEQGHRGTAPARRPPGGWQDLLLLSATLRL*RHCPQHQQ*LPARPGRLAAGCP*ETLWP*PLLSPAAGHRTRDLFPGGGDVPGGPLLRQAGCGADPAFPRPEPQGFGRPLEPAPCGAGGDHHERDRGC*RPHQLL*GVRCHSRRPPEPSDGGPHPRGHGAAPQCQQCGGCAAAQASGLPGAAGPAEGQCRRGPVPVLQ*AGAQRAAEARQLPQPDADLRSRPSAH*QPSLGGRAFHPDVWQSLGRESGLRSDLVQEPGLLCAERKALGRGAEPVPAPAARLPHRPWRPGQPCRAGQQEPVQALPALQLEGNGGTTWAPPFRQPSVRLLRLQPCAGAGRPLRPLIPYLPWPEEFLHHHRELAGLLELLDPSAGEPGP*GPTPLPWRS*EWPSVGL*VQ*RPVVLFPAAAGAAGARARAGPNAQ*LPGPQGQVPREPAGLRLVRGADL*AG**HRGHRCASRAALWPVPPGTVGGLEPRGPVPAAGHGALWLPLGPHAPVAG*RALRPTLRPGVQLPGPAGPPAAARPDPLLQHPPHACAPAAAALRRGGPGRPDLCQGDLSPGGQQQLRPGAAGHGCRRAHSLPLPTVTHWPGWRAAGRADHEPLPATPPHEP*PASHQPRQEGGSPGHGQDEA*DHHAGEPGGP*AQEVAHLGCPAALRPAGVVHGLRRLPGMRAPVPLARFAPVLSFARVFPPFSAPPPAQRALALQNLLSHSQAPERAGQALSRCL*PAALCIGG*MQKQGRNGVCS*EASNSGQERSLKKRPPAVTHSYQPAQHGMAPKLRRSQEET*RGGLRLIREGFLGEVILELAPGEHSEHDW*TEGCRGAQGSTLPRAKQGHWGLS*DPEGVKPLLPQLPLLLEPLHI*PLALLFTASTCSRLPSLSPPSWLCSRNSRLLPVSLLFFRLHL*RMRADNRNTVAKTRLWKGFQKSFFFFN*KKYLQR*ALAMLPRLVSNSWAQAILPSQPPRVLGLQV*ATAPSPRNLSAVWSSISHLMTCSAWGGGVSFPQLPQGGPLPSAAPLSC*PSSRKHTGCR*SGHSRDPQFKRVISISGDSRMGVSALNSPSCFTRKDPVKSPTEVTAH*RGERWSIE*HWAIQAALLPPDRS*ASLAGGLPTAFSGARLAGDGAAARPSLPAPW*PRGFLSAGLSCYLSLHHELSA*DWGSKRVSSQ*A*VGDCDLEKPWASNTCFSEAPKEGSDILFKNTTKQNSQDMCSFVCSVSHNLRLGDGTLG*GRFFCLASPHLPLALWIRQI*TF*RILREGFLG*GSMAKSVSLWTVYTSRRWI*RNPGFHFQCQSETCSQAGALVHTTYSGHQQQPRPDRASLFFFFETESLSPRLEPSGEILAHYNLHLPGSGNSRA*ASRVAATTGAGQHACLIFVF**RQGFTMLPRLVSNS*AQAVYPPQPPKVLGLQA*ATAPSQNICFYTQRAPLVRTEPRCPEPGSRPPGAQHLSFYT*WAGSGEDRESWWKFHSWPRGGALAPHCRLLTAPIPAAPVPDFISLLSPRVPGPSTLPSVLQEPTPLQLQHQGERGLHMPKYPCRMKGRPALDVPFLNNSHCRRV*DVLF*LSPASDAPPICAEWVWECG*GSKCQRSTFQNQVPSANHVGKVQTWRCPCASAPTHPFSFSCVRKEKFSEPSRLVAFKLQTMICSKKRAFHHKSVHLFTTVFQAGFIKKFLTLE", "MLAEPFNWHPGMWNMLIVAMCLALLGCLQAQELQGHVSIILLGATGDLAKKYLWQGLFQLYLDEAGRGHSFSFHGAALTAPKQGQELMAKALESLSCPKDMAPSHCAEHKDQFLQLSQYRQLKTAEDYQALNKDIEAQLQHAGLREAGRIFYFSVPPFAYEDIARNINSSCRPGPGAWLRVVLEKPFGHDHFSAQQLATELGTFFQEEEMYRVDHYLGKQAVAQILPFRDQNRKALDGLWNRHHVERVEIIMKETVDAEGRTSFYEEYGVIRDVLQNHLTEVLTLVAMELPHNVSSAEAVLRHKLQVFQALRGLQRGSAVVGQYQSYSEQVRRELQKPDSFHSLTPTFAAVLVHIDNLRWEGVPFILMSGKALDERVGYARILFKNQACCVQSEKHWAAAQSQCLPRQLVFHIGHGDLGSPAVLVSRNLFRPSLPSSWKEMEGPPGLRLFGSPLSDYYAYSPVRERDAHSVLLSHIFHGRKNFFITTENLLASWNFWTPLLESLAHKAPRLYPGGAENGRLLDFEFSSGRLFFSQQQPEQLVPGPGPAPMPSDFQVLRAKYRESPLVSAWSEELISKLANDIEATAVRAVRRFGQFHLALSGGSSPVALFQQLATAHYGFPWAHTHLWLVDERCVPLSDPESNFQGLQAHLLQHVRIPYYNIHPMPVHLQQRLCAEEDQGAQIYAREISALVANSSFDLVLLGMGADGHTASLFPQSPTGLDGEQLVVLTTSPSQPHRRMSLSLPLINRAKKVAVLVMGRMKREITTLVSRVGHEPKKWPISGVLPHSGQLVWYMDYDAFLG", 9, true); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetChangesAfterFrameshift_AfterFrameshift() { var observedResult = HgvsUtilities.GetChangesAfterFrameshift(4, "MABCDEFGHIIIKL", "MABCEFGH*"); Assert.Equal(5, observedResult.Start); Assert.Equal('D', observedResult.RefAminoAcid); Assert.Equal('E', observedResult.AltAminoAcid); } [Fact] public void GetChangesAfterFrameshift_AtEndAfterFrameshift() { var observedResult = HgvsUtilities.GetChangesAfterFrameshift(4, "MABCDEFGHIIIKL", "MABCDEFGHIIIKLL*"); Assert.Equal(15, observedResult.Start); Assert.Equal('*', observedResult.RefAminoAcid); Assert.Equal('L', observedResult.AltAminoAcid); } [Fact] public void GetChangesAfterFrameshift_WhenStopRetained() { var observedResult = HgvsUtilities.GetChangesAfterFrameshift(4, "MABCDEFGHIIIKL", "MABCDEFGHIIIKL*"); Assert.Equal(15, observedResult.Start); Assert.Equal('*', observedResult.RefAminoAcid); Assert.Equal('*', observedResult.AltAminoAcid); } [Fact] public void GetChangesAfterFrameshift_FirstAminoAcidIsStop() { var observedResult = HgvsUtilities.GetChangesAfterFrameshift(4, "MABCDEFGHIIIKL", "MABCDEFGHIIIKL*"); Assert.Equal(15, observedResult.Start); Assert.Equal('*', observedResult.RefAminoAcid); Assert.Equal('*', observedResult.AltAminoAcid); } [Fact] public void GetAltPeptideSequence_Genomic() { var refSequence = GetGenomicRefSequence(); var transcript = GetGenomicTranscript(); const int cdsBegin = 112; const int cdsEnd = 121; const string transcriptAltAllele = ""; const string expectedResult = "RHRNRNTQTETNTETQRHRNTQKHRNKHRDTETHRNTETNTETQKHTETQKQTQRHRNTQKHTDRNKHRNTETQKYRNTQKHRNKHRDTETQKHSDAETQQHKHRNTETHRNRNTETNTETQTHRHRETQKHTETLKHSGRCPGCRGSIA"; var observedResult = HgvsUtilities.GetAltPeptideSequence(refSequence, cdsBegin, cdsEnd, transcriptAltAllele, transcript, false); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetCdnaPositionOffset_Intron_RltL_Reverse() { var transcript = HgvsCodingNomenclatureTests.GetReverseTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 137619, 1,true); Assert.NotNull(po); Assert.Equal(2, po.Offset); Assert.Equal(1759, po.Position); Assert.Equal("*909+2", po.Value); } [Fact] public void GetCdnaPositionOffset_Intron_ReqL_Reverse() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Intron, 10, 108901173, 108918171, 422, 423) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(108813927, 108941437, 129, 1613, 1485)); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(108810721); transcript.SetupGet(x => x.End).Returns(108918171); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(true); transcript.SetupGet(x => x.TranscriptRegions).Returns(regions); transcript.SetupGet(x => x.Translation).Returns(translation.Object); var po = HgvsUtilities.GetPositionOffset(transcript.Object, 108909672, 0, true); Assert.NotNull(po); Assert.Equal(8500, po.Offset); Assert.Equal(422, po.Position); Assert.Equal("294+8500", po.Value); } [Fact] public void GetCdnaPositionOffset_Intron_LltR_Reverse() { var transcript = HgvsCodingNomenclatureTests.GetReverseTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 136000, 1, true); Assert.NotNull(po); Assert.Equal(-198, po.Offset); Assert.Equal(1760, po.Position); Assert.Equal("*910-198", po.Value); } [Fact] public void GetCdnaPositionOffset_Intron_LeqR_Reverse() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 2, 134901, 135802, 1760, 2661), new TranscriptRegion(TranscriptRegionType.Intron, 1, 135803, 137619, 1759, 1760), new TranscriptRegion(TranscriptRegionType.Exon, 1, 137620, 139379, 1, 1759) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(138530, 139309, 71, 850, 780)); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(134901); transcript.SetupGet(x => x.End).Returns(139379); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(true); transcript.SetupGet(x => x.TranscriptRegions).Returns(regions); transcript.SetupGet(x => x.Translation).Returns(translation.Object); var po = HgvsUtilities.GetPositionOffset(transcript.Object, 136711, 1, true); Assert.NotNull(po); Assert.Equal(909, po.Offset); Assert.Equal(1759, po.Position); Assert.Equal("*909+909", po.Value); } [Fact] public void GetCdnaPositionOffset_Gap_LeftSide_Forward() { var transcript = GetForwardGapTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 1101, 1,false); Assert.NotNull(po); Assert.Equal(0, po.Offset); Assert.Equal(100, po.Position); Assert.Equal("50", po.Value); } [Fact] public void GetCdnaPositionOffset_Gap_RightSide_Forward() { var transcript = GetForwardGapTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 1102, 1,true); Assert.NotNull(po); Assert.Equal(0, po.Offset); Assert.Equal(101, po.Position); Assert.Equal("51", po.Value); } [Fact] public void GetCdnaPositionOffset_Gap_LeftSide_Reverse() { var transcript = GetReverseGapTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 1102, 1,true); Assert.NotNull(po); Assert.Equal(0, po.Offset); Assert.Equal(201, po.Position); Assert.Equal("151", po.Value); } [Fact] public void GetCdnaPositionOffset_Gap_RightSide_Reverse() { var transcript = GetReverseGapTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 1103, 1,false); Assert.NotNull(po); Assert.Equal(0, po.Offset); Assert.Equal(200, po.Position); Assert.Equal("150", po.Value); } [Fact] public void GetCdnaPositionOffset_Intron_RltL_Forward() { var transcript = HgvsCodingNomenclatureTests.GetForwardTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 1262210, 1,true); Assert.NotNull(po); Assert.Equal(-6, po.Offset); Assert.Equal(337, po.Position); Assert.Equal("-75-6", po.Value); } [Fact] public void GetCdnaPositionOffset_Intron_LltR_Forward() { var transcript = HgvsCodingNomenclatureTests.GetForwardTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 1260583, 1,true); Assert.NotNull(po); Assert.Equal(101, po.Offset); Assert.Equal(336, po.Position); Assert.Equal("-76+101", po.Value); } [Fact] public void GetCdnaPositionOffset_Intron_LeqR_Forward() { var transcript = HgvsCodingNomenclatureTests.GetForwardTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 1261349, 1,true); Assert.NotNull(po); Assert.Equal(867, po.Offset); Assert.Equal(336, po.Position); Assert.Equal("-76+867", po.Value); } [Fact] public void GetCdnaPositionOffset_Exon_Forward() { var transcript = HgvsCodingNomenclatureTests.GetForwardTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 1262627, 4,true); Assert.NotNull(po); Assert.Equal(0, po.Offset); Assert.Equal(540, po.Position); Assert.Equal("129", po.Value); } [Fact] public void GetCdnaPositionOffset_Exon_Reverse() { var transcript = HgvsCodingNomenclatureTests.GetReverseTranscript(); var po = HgvsUtilities.GetPositionOffset(transcript, 137721, 2,true); Assert.NotNull(po); Assert.Equal(0, po.Offset); Assert.Equal(1659, po.Position); Assert.Equal("*809", po.Value); } //temp skipping to run smoke tests //[Fact] //public void GetCdnaPositionOffset_RnaEdits() //{ // var transcript = GetRnaEditTranscript(); // var positionOffset = HgvsUtilities.GetCdnaPositionOffset(transcript, 51135987, 20); // Assert.NotNull(positionOffset); // Assert.False(positionOffset.HasStopCodonNotation); // Assert.Equal(0, positionOffset.Offset); // Assert.Equal(1343, positionOffset.Position); // Assert.Equal("1343", positionOffset.Value); //} [Fact] public void GetCdnaPositionOffset_Gap_Forward_ReturnNull() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Gap, 1, 134901, 135802, 1760, 2661) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(138530, 139309, 71, 850, 780)); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(134901); transcript.SetupGet(x => x.End).Returns(139379); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); transcript.SetupGet(x => x.TranscriptRegions).Returns(regions); transcript.SetupGet(x => x.Translation).Returns(translation.Object); var po = HgvsUtilities.GetPositionOffset(transcript.Object, 135001, 0,false); Assert.NotNull(po); Assert.Equal(0, po.Offset); Assert.Equal(1760, po.Position); Assert.Equal("*910", po.Value); } private static ISequence GetGenomicRefSequence() { return new SimpleSequence( "AGACACAGAAACAGAAACACACAGACAGAAACAAACACAGAGACACAGAGACACAGAAACACACAGAAACACAGAAACAAACACAGAGACACAGAAACACACAGAAACACACAGAAACACAGAAACAAACACAGAGACACAGAAACACACAGAAACACAGAAACAAACACAGAGACACAGAAACACACAGAAACACACAGACAGAAACAAACACAGAAACACAGAGACACAGAAATACAGAAACACACAGAAACACAGAAACAAACACAGAGACACAGAGACACAGAAACACAGCGACGCAGAAACACAGCAACACAAACACAGAAACACAGAAACACACAGAAACAGAAACACAGAAACAAACACAGAGACACAGACACACAGACACAGAGAAACACAGAAACACACAGAAACACTGAAACACAGTGGGCGGTGTCCAGGCTGCAGAGGCTCCATCGCTGT", 2258580); } private static ITranscript GetGenomicTranscript() { var transcriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 2258581, 2259042, 1, 462) }; var transcript = new Mock(); transcript.SetupGet(x => x.TranscriptRegions).Returns(transcriptRegions); transcript.SetupGet(x => x.StartExonPhase).Returns(0); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); transcript.SetupGet(x => x.Translation.CodingRegion.CdnaStart).Returns(1); return transcript.Object; } private static ITranscript GetReverseGapTranscript() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 1001, 1100, 201, 300), new TranscriptRegion(TranscriptRegionType.Gap, 1, 1101, 1103, 200, 201), new TranscriptRegion(TranscriptRegionType.Exon, 1, 1104, 1203, 101, 200), new TranscriptRegion(TranscriptRegionType.Intron, 1, 1204, 1303, 100, 101), new TranscriptRegion(TranscriptRegionType.Exon, 1, 1304, 1403, 1, 100) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(1051, 1353, 51, 250, 200)); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(1001); transcript.SetupGet(x => x.End).Returns(1403); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(true); transcript.SetupGet(x => x.TranscriptRegions).Returns(regions); transcript.SetupGet(x => x.Translation).Returns(translation.Object); return transcript.Object; } private static ITranscript GetForwardGapTranscript() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 1001, 1100, 1, 100), new TranscriptRegion(TranscriptRegionType.Gap, 1, 1101, 1103, 100, 101), new TranscriptRegion(TranscriptRegionType.Exon, 1, 1104, 1203, 101, 200), new TranscriptRegion(TranscriptRegionType.Intron, 1, 1204, 1303, 200, 201), new TranscriptRegion(TranscriptRegionType.Exon, 2, 1304, 1403, 201, 300) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(1051, 1353, 51, 250, 200)); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(1001); transcript.SetupGet(x => x.End).Returns(1403); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); transcript.SetupGet(x => x.TranscriptRegions).Returns(regions); transcript.SetupGet(x => x.Translation).Returns(translation.Object); return transcript.Object; } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/HgvscNotationTests.cs ================================================ using VariantAnnotation.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class HgvscNotationTests { // NM_004006.1:c.93G>T [Fact] public void ToString_substitution() { var startPosOff = new PositionOffset(93, 0, "93"); var endPosOff = new PositionOffset(93, 0, "93"); var hgvsc = new HgvscNotation("G", "T", "NM_004006.1", GenomicChange.Substitution, startPosOff, endPosOff, true); Assert.Equal("NM_004006.1:c.93G>T", hgvsc.ToString()); } // NM_012232.1:c.19del (one nucleotide) [Fact] public void ToString_deletion_one_base() { var startPosOff = new PositionOffset(19, 0, "19"); var endPosOff = new PositionOffset(19, 0, "19"); var hgvsc = new HgvscNotation("T", "", "NM_012232.1", GenomicChange.Deletion, startPosOff, endPosOff, true); Assert.Equal("NM_012232.1:c.19del", hgvsc.ToString()); } // NM_012232.1:c.19_21delTGC (multiple nucleotide) [Fact] public void ToString_deletion_multiple_base() { var startPosOff = new PositionOffset(19, 0, "19"); var endPosOff = new PositionOffset(21, 0, "21"); var hgvsc = new HgvscNotation("TGC", "", "NM_012232.1", GenomicChange.Deletion, startPosOff, endPosOff, true); Assert.Equal("NM_012232.1:c.19_21del", hgvsc.ToString()); } // NM_012232.1:c.7dupT (one base duplication) [Fact] public void ToString_one_base_duplication() { var startPosOff = new PositionOffset(7, 0, "7"); var endPosOff = new PositionOffset(7, 0, "7"); var hgvsc = new HgvscNotation("T", "T", "NM_012232.1", GenomicChange.Duplication, startPosOff, endPosOff, true); Assert.Equal("NM_012232.1:c.7dup", hgvsc.ToString()); } // NM_012232.1:c.6_8dupTGC (multi base duplication) [Fact] public void ToString_multi_base_duplication() { var startPosOff = new PositionOffset(6, 0, "6"); var endPosOff = new PositionOffset(8, 0, "8"); var hgvsc = new HgvscNotation("TGC", "TGC", "NM_012232.1", GenomicChange.Duplication, startPosOff, endPosOff, true); Assert.Equal("NM_012232.1:c.6_8dup", hgvsc.ToString()); } // NM_012232.1:c.5756_5757insAGG (multi base insertion) [Fact] public void ToString_insertion() { var startPosOff = new PositionOffset(5756, 0, "5756"); var endPosOff = new PositionOffset(5757, 0, "5757"); var hgvsc = new HgvscNotation("", "AGG", "NM_012232.1", GenomicChange.Insertion, startPosOff, endPosOff, true); Assert.Equal("NM_012232.1:c.5756_5757insAGG", hgvsc.ToString()); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/HgvsgNotationTests.cs ================================================ using Genome; using Intervals; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class HgvsgNotationTests { private static readonly ISequence SimpleSequence = new SimpleSequence("ATCGGTGCTGACGATACCTGACGTAAGTA"); private readonly IInterval _referenceInterval = new Interval(0, SimpleSequence.Length); private const string RefSeqAccession = "NC_012920.1"; [Theory] [InlineData(5, 5, "G", "T", VariantType.SNV, "NC_012920.1:m.5G>T")] [InlineData(5, 5, "G", "G", VariantType.SNV, "NC_012920.1:m.5=")] [InlineData(5, 7, "GTG", "", VariantType.deletion, "NC_012920.1:m.5_7del")] [InlineData(10, 12, "GAC", "", VariantType.deletion, "NC_012920.1:m.12_14del")] [InlineData(16, 15, "", "GATA", VariantType.insertion, "NC_012920.1:m.15_16insGATA")] [InlineData(19, 22, "TGAC", "GTCA", VariantType.MNV, "NC_012920.1:m.19_22inv")] [InlineData(10, 9, "", "GAC", VariantType.insertion, "NC_012920.1:m.12_14dup")] public void GetNotation_MT(int start, int end, string referenceAllele, string altAllele, VariantType type, string expectedHgvs) { var simpleVariant = new SimpleVariant(ChromosomeUtilities.ChrM, start, end, referenceAllele, altAllele, type); string actualHgvs = HgvsgNotation.GetNotation(RefSeqAccession, simpleVariant, SimpleSequence, _referenceInterval); Assert.Equal(expectedHgvs, actualHgvs); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/HgvspNotationTests.cs ================================================ using VariantAnnotation.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class HgvspNotationTests { [Fact] //hgvs example: LRG_199p1:p.Trp24Cys public void Missense_substitution() { Assert.Equal("LRG_199p1:p.(Trp24Cys)", HgvspNotation.GetSubstitutionNotation("LRG_199p1",24, "Trp", "Cys")); } [Fact] //hgvs example: LRG_199p1:p.Trp24Ter public void Nonsense_substitution() { Assert.Equal("LRG_199p1:p.(Trp24Ter)", HgvspNotation.GetSubstitutionNotation("LRG_199p1", 24,"Trp", "Ter")); } [Fact] //hgvs example: NP_003997.1:p.Cys188= public void Silent_substitution() { Assert.Equal("NP_003997.1:c.XXX(p.(Cys188=))", HgvspNotation.GetSilentNotation("NP_003997.1:c.XXX", 188, "Cys", false)); } [Fact] //hgvs example: LRG_199p1:p.(Met1?) public void StartLost_due_to_substitution() { Assert.Equal("LRG_199p1:p.(Met1?)", HgvspNotation.GetSubstitutionNotation("LRG_199p1", 1, "Met","Cys")); } [Fact] // hgvs example:NP_003997.1:p.(Ala3del) public void One_aminoAcid_deletion() { Assert.Equal("NP_003997.1:p.(Ala3del)", HgvspNotation.GetDeletionNotation("NP_003997.1", 3, 3, "Ala", false)); } [Fact] // hgvs example:NP_003997.1:p.(Ala3_Ser5del) public void Multiple_aminoAcid_deletion() { Assert.Equal("NP_003997.1:p.(Ala3_Ser5del)", HgvspNotation.GetDeletionNotation("NP_003997.1", 3, 5,"AlaLysSer", false)); } [Fact] //p.Trp26Ter public void Deletion_gained_stop() { Assert.Equal("NP_003997.1:p.(Trp26Ter)", HgvspNotation.GetDeletionNotation("NP_003997.1", 26, 27, "Trp", true)); } [Fact] public void Unknown_start_equals_end() { Assert.Equal("NP_003997.1:p.(Arg26Cys)", HgvspNotation.GetUnknownNotation("NP_003997.1", 26, 26, "Arg","Cys")); } [Fact] public void Unknown_start_not_equals_end() { Assert.Equal("NP_003997.1:p.(Arg26_Cys27)", HgvspNotation.GetUnknownNotation("NP_003997.1", 26, 27, "Arg", "Cys")); } [Fact] // hgvs example:NP_003997.1:p.(Ala3dup) public void One_aminoAcid_duplication() { Assert.Equal("NP_003997.1:p.(Ala3dup)", HgvspNotation.GetDuplicationNotation("NP_003997.1", 3, 3, "Ala")); } [Fact] // hgvs example:NP_003997.1:p.(Ala3_Ser5dup) public void Multiple_aminoAcid_duplication() { Assert.Equal("NP_003997.1:p.(Ala3_Ser5dup)", HgvspNotation.GetDuplicationNotation("NP_003997.1",3, 5, "AlaLysSer")); } [Fact] // hgvs example:NP_003997.1:p.(His4_Gln5insAla) public void One_aminoAcid_insertion() { Assert.Equal("NP_003997.1:p.(His4_Gln5insAla)", HgvspNotation.GetInsertionNotation("NP_003997.1", 4, 5, "Ala","MBCHQDE")); } [Fact] public void Insert_stop_codon() { Assert.Equal("NP_003997.1:p.(Gln5Ter)", HgvspNotation.GetInsertionNotation("NP_003997.1", 4, 5, "TerAla", "MBCHQDE")); } [Fact] public void Insert_past_stop() { Assert.Null(HgvspNotation.GetInsertionNotation("NP_003997.1", 8, 9, "TerAla", "MBCHQDE")); } [Fact] // hgvs example:NP_003997.1:p.(Lys2_Gly3insGlnSerLys) public void Multiple_aminoAcid_insertion() { Assert.Equal("NP_003997.1:p.(Lys2_Gly3insGlnSerLys)", HgvspNotation.GetInsertionNotation("NP_003997.1", 2, 3, "GlnSerLys", "MKGABC")); } [Fact] // hgvs example:NP_003997.1:p.(Lys2_Gly3insGlnSerLys) public void Insertion_at_end() { Assert.Equal("NP_003997.1:p.(Cys6_Ter7insGlnSerLys)", HgvspNotation.GetInsertionNotation("NP_003997.1", 6, 7, "GlnSerLys", "MKGABC*")); } [Fact] // hgvs example:NP_003997.1:p.(Cys28delinsTrpVal) public void Del_one_ins_two() { Assert.Equal("NP_003997.1:p.(Cys28delinsTrpVal)", HgvspNotation.GetDelInsNotation("NP_003997.1", 28, 28, "Cys", "TrpVal")); } [Fact] // hgvs example:NP_003997.1:p.(Cys28_Lys29delinsTrp) public void Del_two_ins_one() { Assert.Equal("NP_003997.1:p.(Cys28_Lys29delinsTrp)", HgvspNotation.GetDelInsNotation("NP_003997.1", 28, 29, "CysLys", "Trp")); } [Fact] // hgvs example:NP_003997.1:p.(Pro578_Lys579delinsLeuTer) public void Del_two_ins_stop() { Assert.Equal("NP_003997.1:p.(Pro578_Lys579delinsLeuTer)", HgvspNotation.GetDelInsNotation("NP_003997.1", 578, 579, "ProLys", "LeuTer")); } [Fact] //Pro578_Lys579 goes to TerLeu public void Delins_becomes_substitution_of_Ter() { Assert.Equal("NP_003997.1:p.(Pro578Ter)", HgvspNotation.GetDelInsNotation("NP_003997.1", 578, 579, "ProLys", "TerLeu")); } [Fact] // hgvs example:NP_003997.1:p.(Arg97ProfsTer23) public void Frameshift_with_known_countToStop() { Assert.Equal("NP_003997.1:p.(Arg97ProfsTer23)", HgvspNotation.GetFrameshiftNotation("NP_003997.1", 97, "Arg", "Pro",23)); } [Fact] // hgvs example:NP_003997.1:p.(Tyr4Ter) public void Frameshift_gains_immediate_stop() { Assert.Equal("NP_003997.1:p.(Tyr4Ter)", HgvspNotation.GetFrameshiftNotation("NP_003997.1", 4, "Tyr", "TerCysIle", -1)); } [Fact] // hgvs example:NP_003997.1:p.(Ile327ArgfsTer?) public void Frameshift_unknown_countToStop() { Assert.Equal("NP_003997.1:p.(Ile327ArgfsTer?)", HgvspNotation.GetFrameshiftNotation("NP_003997.1", 327, "Ile", "Arg", -1)); } [Fact] public void Frameshift_due_to_insertion() { Assert.Equal("NP_003997.1:p.(Cys3ArgfsTer40)", HgvspNotation.GetFrameshiftNotation("NP_003997.1", 3, "Cys", "Arg", 40)); } [Fact] //NP_001263627.1:p.(Met1?) public void Start_lost_start_equals_end() { Assert.Equal("NP_001263627.1:p.?", HgvspNotation.GetStartLostNotation("NP_001263627.1")); } [Fact] //NP_001263627.1:p.(Met1?) public void Start_lost_start_not_equals_end() { Assert.Equal("NP_001263627.1:p.?", HgvspNotation.GetStartLostNotation("NP_001263627.1")); } [Fact] // from varnom: p.Ter110Glnext*17 public void Stop_lost_with_countToEnd() { Assert.Equal("NP_001263627.1:p.(Ter110GlnextTer17)", HgvspNotation.GetExtensionNotation("NP_001263627.1",110, "Ter", "Gln", 17)); } //p.Ter327Argext*? [Fact] public void Stop_lost_without_countToEnd() { Assert.Equal("NP_001263627.1:p.(Ter327ArgextTer?)", HgvspNotation.GetExtensionNotation("NP_001263627.1", 327, "Ter", "Arg", -1)); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/ProteinChangeTests.cs ================================================ using VariantAnnotation.Interface.AnnotatedPositions; using Moq; using VariantAnnotation.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class ProteinChangeTests { [Fact] public void Substitution() { var variantEffect = new Mock(); variantEffect.Setup(x => x.IsFrameshiftVariant()).Returns(false); variantEffect.Setup(x => x.IsStopRetained()).Returns(false); variantEffect.Setup(x => x.IsStartLost()).Returns(false); variantEffect.Setup(x => x.IsStopLost()).Returns(false); var proteinChange = HgvsProteinNomenclature.GetProteinChange(5, "A", "B", "MACTAWR", variantEffect.Object); Assert.Equal(ProteinChange.Substitution, proteinChange); } [Fact] public void Single_base_deletion() { var variantEffect = new Mock(); variantEffect.Setup(x => x.IsFrameshiftVariant()).Returns(false); variantEffect.Setup(x => x.IsStopRetained()).Returns(false); variantEffect.Setup(x => x.IsStartLost()).Returns(false); variantEffect.Setup(x => x.IsStopLost()).Returns(false); var proteinChange = HgvsProteinNomenclature.GetProteinChange(5, "A", "", "MACTAWR", variantEffect.Object); Assert.Equal(ProteinChange.Deletion, proteinChange); } [Fact] public void Frameshift() { var variantEffect = new Mock(); variantEffect.Setup(x => x.IsFrameshiftVariant()).Returns(true); variantEffect.Setup(x => x.IsStopRetained()).Returns(false); variantEffect.Setup(x => x.IsStartLost()).Returns(false); variantEffect.Setup(x => x.IsStopLost()).Returns(false); var proteinChange = HgvsProteinNomenclature.GetProteinChange(5, "A", "C", "MACTAWR", variantEffect.Object); Assert.Equal(ProteinChange.Frameshift, proteinChange); } [Fact] public void Extension() { var variantEffect = new Mock(); variantEffect.Setup(x => x.IsFrameshiftVariant()).Returns(false); variantEffect.Setup(x => x.IsStopRetained()).Returns(false); variantEffect.Setup(x => x.IsStartLost()).Returns(false); variantEffect.Setup(x => x.IsStopLost()).Returns(true); var proteinChange = HgvsProteinNomenclature.GetProteinChange(5, "*", "C", "MACTAWR", variantEffect.Object); Assert.Equal(ProteinChange.Extension, proteinChange); } [Fact] public void Duplication() { var variantEffect = new Mock(); variantEffect.Setup(x => x.IsFrameshiftVariant()).Returns(false); variantEffect.Setup(x => x.IsStopRetained()).Returns(false); variantEffect.Setup(x => x.IsStartLost()).Returns(false); variantEffect.Setup(x => x.IsStopLost()).Returns(false); var proteinChange = HgvsProteinNomenclature.GetProteinChange(6, "", "A", "MACTAWR", variantEffect.Object); Assert.Equal(ProteinChange.Duplication, proteinChange); } [Fact] public void Insertion() { var variantEffect = new Mock(); variantEffect.Setup(x => x.IsFrameshiftVariant()).Returns(false); variantEffect.Setup(x => x.IsStopRetained()).Returns(false); variantEffect.Setup(x => x.IsStartLost()).Returns(false); variantEffect.Setup(x => x.IsStopLost()).Returns(false); var proteinChange = HgvsProteinNomenclature.GetProteinChange(4, "", "A", "MACTAWR", variantEffect.Object); Assert.Equal(ProteinChange.Insertion, proteinChange); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/RegulatoryRegionAnnotatorTests.cs ================================================ using System.Linq; using Moq; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions { public sealed class RegulatoryRegionAnnotatorTests { [Fact] public void Annotate_Promoter() { var variant = GetVariant(); var regulatoryRegion = GetRegulatoryRegion(); const ConsequenceTag expectedConsequence = ConsequenceTag.regulatory_region_variant; var annotatedRegulatoryRegion = RegulatoryRegionAnnotator.Annotate(variant, regulatoryRegion); var consequences = annotatedRegulatoryRegion.Consequences.ToList(); Assert.NotNull(annotatedRegulatoryRegion); Assert.Single(consequences); Assert.Equal(expectedConsequence, consequences[0]); } private static IRegulatoryRegion GetRegulatoryRegion() { return new RegulatoryRegion(ChromosomeUtilities.Chr1, 948000, 950401, CompactId.Convert("ENSR00001037666"), RegulatoryRegionType.promoter); } private static IVariant GetVariant() { var variant = new Mock(); variant.SetupGet(x => x.Type).Returns(VariantType.SNV); variant.SetupGet(x => x.Start).Returns(949523); variant.SetupGet(x => x.End).Returns(949523); variant.SetupGet(x => x.Behavior).Returns(AnnotationBehavior.SmallVariants); return variant.Object; } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/AminoAcidTests.cs ================================================ using System; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.TranscriptAnnotation; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class AminoAcidTests { private readonly AminoAcids _standardAminoAcids = new(false); private readonly AminoAcids _mitoAminoAcids = new(true); [Fact] public void AddUnknownAminoAcid_ExpectedResults() { const string aminoAcids = "MACGYIL"; Assert.Equal(aminoAcids + 'X', AminoAcids.AddUnknownAminoAcid(aminoAcids)); } [Fact] public void AddUnknownAminoAcid_SameIfStopCodon() { const string aminoAcids = "*"; Assert.Equal(aminoAcids, AminoAcids.AddUnknownAminoAcid(aminoAcids)); } [Fact] public void Translate_ExpectedResults() { SequenceChange aa = _standardAminoAcids.Translate("TTC", "CTC"); Assert.Equal("F", aa.Reference); Assert.Equal("L", aa.Alternate); } [Fact] public void Translate_NullOrEmptyInput_ReturnEmpty() { SequenceChange aa = _standardAminoAcids.Translate(null, null); Assert.Equal("", aa.Reference); Assert.Equal("", aa.Alternate); aa = _standardAminoAcids.Translate("", ""); Assert.Equal("", aa.Reference); Assert.Equal("", aa.Alternate); } [Fact] public void Translate_NsInInput_ReturnEmpty() { SequenceChange aa = _standardAminoAcids.Translate("ANA", "AAA"); Assert.Equal("", aa.Reference); Assert.Equal("", aa.Alternate); aa = _standardAminoAcids.Translate("AAA", "ANA"); Assert.Equal("", aa.Reference); Assert.Equal("", aa.Alternate); } [Fact] public void ConvertAminoAcidToAbbreviation_ThrowException() { Assert.Throws(delegate { AminoAcids.ConvertAminoAcidToAbbreviation('a'); }); } [Theory] [ClassData(typeof(StandardGeneticCodeData))] public void ConvertTripletToAminoAcid_StandardGeneticCode(char expectedResult, string[] triplets) { foreach (string triplet in triplets) { char actualResult = _standardAminoAcids.ConvertTripletToAminoAcid(triplet); Assert.Equal(expectedResult, actualResult); } } [Theory] [ClassData(typeof(VertebrateMitochondrialCodeData))] public void ConvertTripletToAminoAcid_VertebrateMitochondrialCode(char expectedResult, string[] triplets) { foreach (string triplet in triplets) { char actualResult = _mitoAminoAcids.ConvertTripletToAminoAcid(triplet); Assert.Equal(expectedResult, actualResult); } } [Fact] public void GetAbbreviations_ExpectedResults() { Assert.Equal("AspTyrCys", AminoAcids.GetAbbreviations("DYC")); } [Fact] public void GetAbbreviations_SingleAA_ExpectedResults() { Assert.Equal("Tyr", AminoAcids.GetAbbreviations("Y")); } [Fact] public void GetAbbreviations_NullOrEmptyInput_ReturnEmpty() { Assert.Equal("", AminoAcids.GetAbbreviations(null)); Assert.Equal("", AminoAcids.GetAbbreviations("")); } [Fact] public void TranslateBases_ExpectedResults() { const string expectedResult = "RAD"; string actualResult = _standardAminoAcids.TranslateBases("CGCGCAGAT", true); Assert.Equal(expectedResult, actualResult); } [Fact] public void TranslateBases_NullInput_ReturnNull() { Assert.Null(_standardAminoAcids.TranslateBases(null, true)); } private sealed class StandardGeneticCodeData : TheoryData { public StandardGeneticCodeData() { Add('A', new[] {"GCT", "GCC", "GCA", "GCG"}); Add('R', new[] {"CGT", "CGC", "CGA", "CGG", "AGA", "AGG"}); Add('N', new[] {"AAT", "AAC"}); Add('D', new[] {"GAT", "GAC"}); Add('C', new[] {"TGT", "TGC"}); Add('Q', new[] {"CAA", "CAG"}); Add('E', new[] {"GAA", "GAG"}); Add('G', new[] {"GGT", "GGC", "GGA", "GGG"}); Add('H', new[] {"CAT", "CAC"}); Add('I', new[] {"ATT", "ATC", "ATA"}); Add('L', new[] {"CTT", "CTC", "CTA", "CTG", "TTA", "TTG"}); Add('K', new[] {"AAA", "AAG"}); Add('M', new[] {"ATG"}); Add('F', new[] {"TTT", "TTC"}); Add('P', new[] {"CCT", "CCC", "CCA", "CCG"}); Add('S', new[] {"TCT", "TCC", "TCA", "TCG", "AGT", "AGC"}); Add('T', new[] {"ACT", "ACC", "ACA", "ACG"}); Add('W', new[] {"TGG"}); Add('Y', new[] {"TAT", "TAC"}); Add('V', new[] {"GTT", "GTC", "GTA", "GTG"}); Add('*', new[] {"TAA", "TGA", "TAG"}); } } private sealed class VertebrateMitochondrialCodeData : TheoryData { public VertebrateMitochondrialCodeData() { Add('A', new[] {"GCT", "GCC", "GCA", "GCG"}); Add('R', new[] {"CGT", "CGC", "CGA", "CGG"}); Add('N', new[] {"AAT", "AAC"}); Add('D', new[] {"GAT", "GAC"}); Add('C', new[] {"TGT", "TGC"}); Add('Q', new[] {"CAA", "CAG"}); Add('E', new[] {"GAA", "GAG"}); Add('G', new[] {"GGT", "GGC", "GGA", "GGG"}); Add('H', new[] {"CAT", "CAC"}); Add('I', new[] {"ATT", "ATC"}); Add('L', new[] {"CTT", "CTC", "CTA", "CTG", "TTA", "TTG"}); Add('K', new[] {"AAA", "AAG"}); Add('M', new[] {"ATG", "ATA"}); Add('F', new[] {"TTT", "TTC"}); Add('P', new[] {"CCT", "CCC", "CCA", "CCG"}); Add('S', new[] {"TCT", "TCC", "TCA", "TCG", "AGT", "AGC"}); Add('T', new[] {"ACT", "ACC", "ACA", "ACG"}); Add('W', new[] {"TGG", "TGA"}); Add('Y', new[] {"TAT", "TAC"}); Add('V', new[] {"GTT", "GTC", "GTA", "GTG"}); Add('*', new[] {"TAA", "TAG", "AGA", "AGG"}); } } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/AnnotatedTranscriptTests.cs ================================================ using OptimizedCore; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Pools; using VariantAnnotation.TranscriptAnnotation; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class AnnotatedTranscriptTests { [Fact] public void SerializeJson_NominalUsage() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 1263141, 1263143, "TAG", "", VariantType.deletion, "1:1263141:1263143", false, false, false, null, AnnotationBehavior.SmallVariants, false); var refSequence = new SimpleSequence(HgvsProteinNomenclatureTests.Enst00000343938GenomicSequence, 1260147 - 1); var transcript = HgvsProteinNomenclatureTests.GetMockedTranscriptOnForwardStrand(); var annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, refSequence, null, null, new AminoAcids(false)); var sb = StringBuilderPool.Get(); annotatedTranscript.SerializeJson(sb); var jsonString = StringBuilderPool.GetStringAndReturn(sb); Assert.Contains("ENST00000343938.4:p.(Ter215GlyextTer43)", jsonString); VariantPool.Return(variant); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/CdnaSequenceTests.cs ================================================ using UnitTests.TestDataStructures; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class CdnaSequenceTests { [Fact] public void RnaEdits_snv_forward_no_utr() { //NR_002754.2 var genomicSeq = new SimpleSequence("actctggtttctcttcaaatcgtataaatctttcgccttttactaaagatttccgtggagagaaacgagtgtgagtctgaaaccaattttttgaggccttgcgtttattagcagggctt", 11968210); var codingRegion = new CodingRegion(11968211, 11968329, 1, 119, 119); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 11968211, 11968329, 1, 119) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(107,107,"t") }; var codingSequence = new CdnaSequence(genomicSeq, codingRegion, regions, false, rnaEdits); Assert.Equal("actctggtttctcttcaaatcgtataaatctttcgccttttactaaagatttccgtggagagaaacgagtgtgagtctgaaaccaattttttgaggccttgcgttttttagcagggctt", codingSequence.GetCdnaSequence()); } [Fact] public void RnaEdits_snv_forward_with_utr() { //NM_001144032.2 chr1:148644011-148644795 var genomicSeq = new SimpleSequence("ACTATAAAGACAGTGAAAAGATCAGTGGTTATCTTTGCAGACGCCACCATCGCTGTGAGCCCTGTACTATCAGCCATGGTCAACTCCGTCGTCTTTTTTGAAATCACCAGGGATGGCAAGCCCTTGGGCCGCATCTCCATCAAACTGTTTGCAGACAAGATTCCAAAGACAGCAGAAAACTTTCGTGCTCTGAGCACTGGAGAGAAAGGATTTCGTTATAAGGGTTCCTGCTTTCACAGAATTATTCCAGGGTTTATGTGTCAGGGTGGTGACTTCACACGCCCTAATGGCACCGGTGACAAGTCCATCTATGGGGAGAAATTTGATGATGAGAACCTCATCCGAAAGCATACAGGTTCTGGCATCTTGTCCATGGCAAATGCTGGACCCAACACAAATGGTTCCCAGTTTTTCATCTGTGCTGCCAAGACTGAGTGGTTGGATGGCAAGCATGTGGCGTTTGGCAAGGTGAAAGAACGTGTGAATATTGTGGAAGCCACGGAGCACTTTGGGTACAGGAATAGCAAGACCAGCAAGAAGATCACCATTGCTGACTGTGGACAATTCTAATGAGTTTGACTTGTGTTTTATTTTCACCACCAGACCCATTCCTTCTGTAGCTCAGGAGAGCACCCCTCCACCACATTTGCTTGCAATATCCTAGAATCTTTGTGCTCTTGCTGCAGTTCCCTTTGGGTTCCATGTTTTCCTTGTTCCCTTCCATGCCTAGCTGGATGGCAGAGTTGAGTTAAGTTTATGATTATGAAATAAAAACTAAGTAACAA", 148644011 - 1); var codingRegion = new CodingRegion(148644086, 148644580, 76, 570, 495); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 148644011,148644795, 1, 785) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(420, 420, "C"), new RnaEdit(500, 500, "T"), new RnaEdit(737, 737, "T") }; var codingSequence = new CdnaSequence(genomicSeq, codingRegion, regions, false, rnaEdits); Assert.Equal("ATGGTCAACTCCGTCGTCTTTTTTGAAATCACCAGGGATGGCAAGCCCTTGGGCCGCATCTCCATCAAACTGTTTGCAGACAAGATTCCAAAGACAGCAGAAAACTTTCGTGCTCTGAGCACTGGAGAGAAAGGATTTCGTTATAAGGGTTCCTGCTTTCACAGAATTATTCCAGGGTTTATGTGTCAGGGTGGTGACTTCACACGCCCTAATGGCACCGGTGACAAGTCCATCTATGGGGAGAAATTTGATGATGAGAACCTCATCCGAAAGCATACAGGTTCTGGCATCTTGTCCATGGCAAATGCTGGACCCAACACAAATGGTTCCCAGTTTTTCATCTGCGCTGCCAAGACTGAGTGGTTGGATGGCAAGCATGTGGCGTTTGGCAAGGTGAAAGAACGTGTGAATATTGTGGAAGCCATGGAGCACTTTGGGTACAGGAATAGCAAGACCAGCAAGAAGATCACCATTGCTGACTGTGGACAATTCTAA", codingSequence.GetCdnaSequence().Substring(codingRegion.CdnaStart-1, codingRegion.Length)); } [Fact] public void RnaEdits_in_coding_sequence_reverse_insertion() { //NM_000682.6, chrom: chr2:96778623-96781984 var genomicSeq = new SimpleSequence("CTTATTACAAAATATCCTTTATTGATAAAATAGCTCAGAGTTTAAAAAAAAAAAAAACACCACCTGCATGTCGCAATAAGAGGTCACAGGCAAGAACACTGGGGGTCCCATGGGGCGCACACAAGACCGGCCAGCAGAGGGTCACAGTCAGTCCCTCTCCTGGCCCAGCTCCCCACCACATCCCAGGGCGATACTCTGGCCTCAACAACCCACTGAGGACCAAGCTGGGAAGCCTCCCACACCCCAGGAAGGACTCTTTTTGGTCCCCTCCATTCTCTCTACACCCAGAAAACTCCCTCGGTGCCCTTCCAAATCTAGCAGGTCCATCTGGCCCATTCCCCCGACACCTGCCAAGCTAAGATGCCTACTGGCCCAATGTTGAAGCCAGGCCCTCTCCAAGGGAAGGCCGATAAACCTCCTTTCCACACTTCCAACTGTTCTGGGTGCCAGGTTTTGGGGTGGGACTGAGAACCAGGAAGCAGGGGTCCTCAATGCACAGCCCCATCAGCATTGCGGGGAGCAGCGTGGCTGGGTCCGAGGCAGTCCACAAGCACCCACCTGGGGGGATCAGTTGTGGTTCACAAGGACTCATTTGGGGCTTGGAGACCTGGCCGGGCACTCCAGTGGGAGGCTCCCCTAGGGGCGCACCAGGCTCTGATGCCAGTACCCCACCTGGGGGCGCTGCCACCTGTCACAGGCTCTCATCTTAGACTGTTGCCGAGGTGTGGATATTTTGAGCTGTCTTGGGGAGACAATTTGCCTCCTTGATGACAAAAGACTTATCCCCCACTGGGGAGACCCAAGCCACTAAAAACCCTCTTGGTGTTGCCGGTGAAATGTCGAAACGTTGTCATGTAGCGTAATAACTCAGACCTTTGCAGCCAGAAGAACACATTCTCAAAGAGATCCTTTAACTTGAAATAGTGATTCTGTCTGCCACTCCCGGCTTCCAGTTCGGGGTAGGAATTCACACACCCCAGGGACAGAACAAAAGTCTACAGGAAGACAGGTGGTGGTAAACACAGAGGAAAGGGATTTTTATATCACCATATAATCACATTTTTGGTTCTCTAGTGTGTTCCCCCACAGAGCTCAAAGCTTTCTGCAAAGCCTTTCATCTCCCTGCAGCAAGTAGGCAGTGAGCTATTGTCGCCCCGATTTTTGCAGGGGGTGAATGCCAGTGATCGGGGATCTCCCGTCGAGGCAGAGACCAGGCCTCCAAGACCGCCCCAGCGAGGCATCCACGTGGCCACCCACCTACCGGAGGGGTGCTGGGTAAGGAAGCCGATCCATTGTTCTGGCTTTCAAAGGAACCACAGATCCGAAAACAGGCAAAGGGGGAAAGGAGGGCCCAGAGACGATGCCACCCCATAAGCCCCCATCCCAGCGCCTGCCAGGGACCGCGAGTGCCTAGCGTGGGTGATCAGTCTTCGTTTCTTCCTCCCCCTCAGCAGCAGGCCCCACTGGGAAAAGTGGAAGGCTGGCTCCGTGCTCTTTGTGGGTGGGGGGGAGATGAAAAAGAAACGAAAACACCACAAGCAAGTGACCTGCCAGGAACACAAGGTCCTCAAGAAAGGGAAGCCCAGACATTGGTCTGGAGAGCATGGGGCTCTGGGAAGAAAGTGCTCTCTCTTCTCCTGGTCTTGGCTATGTTCCAGAGGATTTGAACCACCTCCATCGGCCTGTGCTCAGGGAGAGGGTGGAGAAGGGGTCCCCCACAGCTAAGCCGGCAAGGGGAAGCTTCACTGGGACCCTTGCTAGCAGCCCCCCTGCCCACCCCTCCCAAGGGGTTCCTAAGATGAGGCCTACAGGATCTGGGCAGGGAGCAGAAAGCCCAGGGGAGGCAGCCACACACAGCAGGGCAAGAAGCAGGGTGACCCCGGCGCCACCGCACCAACCCCACAGGGGCAGCGCAGGCGGGCTCACCAGGCCGTCTGGGTCCACGGGCGGCACAGGATCCTCCGGAAGGCACGGCGGAAGTCCTGGTTGAAGATGGTGTAGATAACAGGGTTCAGTGAGCTGTTGCAGTAGCCGATCCAGAAGAAGAACTGGAAGAGGCCATGGGGCACCTTGCAGTGCTTCGGGCAGATGGCTCCCAGGCTGTAGCTGAAGAAGAAGGGGAACCAGCAGAGCACAAAAACGCCAATGACCACAGCCAGCACGAAGGTGAAGCGCTTCTCCCGGGTCAGCTGCGCCCGTCGACGCCACCACTGCCCACCTATAGCACCCACGCCCCTGCCCAGGAGCACCTGGCCACGTAGGGTGGCCAGCACCCGGGAGCCCTGTGGCTGCTGCAGCGGGGGGCTGCAAGCTGAGGCCGGAGACACTGGCACTGCCTGGGGTTCACACTCTTCCTCCTCCTCCTCCTCCTCTTCAGCTTCATCCTCTGGAGATGCCCCACAAACACCCTCCTTCTGGCCCTGGCCTGAGTTGGGAAGGGCAGCCCAACTGGGTGGCAAGGCCCGGGTCCCAGTATCTTCAGGGGTCTCCCCCTCCTCCTTCTCCCCAGTGGACTTCGAGTGTCCGTTGACCTCTCTGGCAGAAGCCACAGAGGCCAGGGCTGGCAGTTTGGCTGAGGCCAAAGCCCCACCATGGTCGGGTCGGGGCTGCTTGGACTCACCCTGCCCAGGCCCCCCCTTGGCCCTGGGACCTCTGCGGTTGCTGCGTTTGGCGATCAGGTAGATGCGCAGGTAGACAAGGATCATGATGAGGCAAGGAGCAAAGAAAGATCCGATGCTGGAGGCCAGGATGTACCAGGCCTCCTGGTTGAGCTTGCACTGGGGGCGCCCGCGCGGCTGGGGGCCCTGGTCGCCCTTGTAGATGAGGGGCGGCAGCGAGATGACGGCGGCGATGAGCCACACAGTGAGGATGATGCACTTGATGCGGCGCGGGGTGCGCTTGGAGTTGTACTCCAGCGCGCGGCTCACGGCCCAGTAGCGGTCCAGGCTGATGGCGCACAGGTGCACGATGGACGAGGTGCAGAAGAGCACGTCGAGCGCCAGGTACACCTCGCACCACGTGCGCCGGAAGTACCAGTAGCCCAGCAGCTCGTTGGCCAGCGAGAAAGGGATGATGAGCGTGGCCACCAGGATGTCGGCGGCGGCCAGCGACACCAGGAACAGGTTCTGAGGGGCGCGCAGCGAGCGGCTGGTCAACACAGCCAGGATGACCAGAGCGTTGCCGAAGATGGTAAAGAGAATGAGGAAGGTGATGGCCGCCGCTATGGCCGCTGTGGCCTGCACGGAGTAGGGGTCCTGGTGGTCCATGACGGGGCGGGAGGTGGGCAGAGGGAGCGCTGCCCGCCCAGTGCGCACCGTGGACGACAGCGCTGCCCGGCTCGGCTAGACAAGAGCGTCGCCCCT", 96778623 - 1); var codingRegion = new CodingRegion(96780545, 96781888, 97, 1449, 1344); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon,1, 96778623,96780986, 1008, 3371), new TranscriptRegion(TranscriptRegionType.Exon, 1, 96780987,96781984, 1, 998) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(999, 998, "AGAGGAGGA") }; const bool onReverseStrand = true; var codingSequence = new CdnaSequence(genomicSeq, codingRegion, regions, onReverseStrand, rnaEdits); var expectedCodingSeq = "ATGGACCACCAGGACCCCTACTCCGTGCAGGCCACAGCGGCCATAGCGGCGGCCATCACCTTCCTCATTCTCTTTACCATCTTCGGCAACGCTCTGGTCATCCTGGCTGTGTTGACCAGCCGCTCGCTGCGCGCCCCTCAGAACCTGTTCCTGGTGTCGCTGGCCGCCGCCGACATCCTGGTGGCCACGCTCATCATCCCTTTCTCGCTGGCCAACGAGCTGCTGGGCTACTGGTACTTCCGGCGCACGTGGTGCGAGGTGTACCTGGCGCTCGACGTGCTCTTCTGCACCTCGTCCATCGTGCACCTGTGCGCCATCAGCCTGGACCGCTACTGGGCCGTGAGCCGCGCGCTGGAGTACAACTCCAAGCGCACCCCGCGCCGCATCAAGTGCATCATCCTCACTGTGTGGCTCATCGCCGCCGTCATCTCGCTGCCGCCCCTCATCTACAAGGGCGACCAGGGCCCCCAGCCGCGCGGGCGCCCCCAGTGCAAGCTCAACCAGGAGGCCTGGTACATCCTGGCCTCCAGCATCGGATCTTTCTTTGCTCCTTGCCTCATCATGATCCTTGTCTACCTGCGCATCTACCTGATCGCCAAACGCAGCAACCGCAGAGGTCCCAGGGCCAAGGGGGGGCCTGGGCAGGGTGAGTCCAAGCAGCCCCGACCCGACCATGGTGGGGCTTTGGCCTCAGCCAAACTGCCAGCCCTGGCCTCTGTGGCTTCTGCCAGAGAGGTCAACGGACACTCGAAGTCCACTGGGGAGAAGGAGGAGGGGGAGACCCCTGAAGATACTGGGACCCGGGCCTTGCCACCCAGTTGGGCTGCCCTTCCCAACTCAGGCCAGGGCCAGAAGGAGGGTGTTTGTGGGGCATCTCCAGAGGATGAAGCTGAAGAGGAGGAAGAGGAGGAGGAGGAGGAGGAAGAGTGTGAACCCCAGGCAGTGCCAGTGTCTCCGGCCTCAGCTTGCAGCCCCCCGCTGCAGCAGCCACAGGGCTCCCGGGTGCTGGCCACCCTACGTGGCCAGGTGCTCCTGGGCAGGGGCGTGGGTGCTATAGGTGGGCAGTGGTGGCGTCGACGGGCGCAGCTGACCCGGGAGAAGCGCTTCACCTTCGTGCTGGCTGTGGTCATTGGCGTTTTTGTGCTCTGCTGGTTCCCCTTCTTCTTCAGCTACAGCCTGGGAGCCATCTGCCCGAAGCACTGCAAGGTGCCCCATGGCCTCTTCCAGTTCTTCTTCTGGATCGGCTACTGCAACAGCTCACTGAACCCTGTTATCTACACCATCTTCAACCAGGACTTCCGCCGTGCCTTCCGGAGGATCCTGTGCCGCCCGTGGACCCAGACGGCCTGGTGA"; var rnaEditLength = rnaEdits[0].Bases.Length; Assert.Equal(expectedCodingSeq, codingSequence.GetCdnaSequence().Substring(codingRegion.CdnaStart-1, codingRegion.Length+rnaEditLength)); } [Fact] public void With_rnaEdits_reverse_deletion_utr() { //NM_001317107.1 chr14:22138125-22139232 var genomicSeq = new SimpleSequence("ATATGGTATGTAACTTATTCTTTGCAAGGCGCTTCTTTAATTTGGAGCACCACGTATCCTAAGGACGTAGACATTTTCATTTTTCTTCTTTTCTCTCTTTTCTCCCCACTAACTTGTTTAAGGCACTCTTCATTTCTTCATTCCTAAGGGTATAGATAATGGGGTTCAGCAGGGGGGTGACTGCAGTGAAAAACACAGATACTGCCTTGTCCTCTGGGAGGCTGGTGGATGGGCGGGAATAGATGAAGATGCAGTGTCCCAGGAACAGTGTAACTACAGTGAGATGGGCTGCACAGGTGGACAGGGCCTTCCACTTGCCCTTGGAGATCTGCTGCCTCAGACTCACCAGGATGACTGCGTAGGACACCACCAGGACCACAAAACAGACCACGGAGATCAATCCACTGTTGGAGACAATGAGGATCTCAAGGACGTGGGTGTGTCAATGCAGGCCAGCTTGATCACCTGAGGTACATCACAGAAGAAGTTGTCAATCTCATCAGGACCACAGTAGGGCAGCTTGATGGTAAGGGAGGTGAGGGCTATGGAGTGGATGGTCCCTCCTGTCCAGAGGGCCACAGCCAGCAGCACACATACCTTCCAGTTCATCACTATCATGTACTGCAGGGGTTTACAGATGGCCACATACCGATCATAGGCCATGACGGTGAGGAGGAAGATCTCTGTGCAGGCAAAGAGGTGCAGGAAGAACATCTGGGTCACACAGGCATCAAAAGAGATGAGCTTTTCCTCTGACCACACGTCTCTCAGCATCTTGGGGACAGTGACAGTGGAGTGGCAGACATCAATAAAGGACAGGTTGCTGAGGAAGAAATACATGGGAGTATGGAGCCGGTGGTCATAGATAATAGTTATGACAATGAGAACATTCCCAATCAGTGTCAGGACATAAAAAATGAGGAACATGGAAAACATAGCTATCCGTGCCTTATGATTTACAGATAAACCTCTAAGCCGAAAATATGTCACTAAAGAAGTTTGATTGAGTAGGATGGCCTCTTCCATTCTCTTTGTTAGACAACCTGTAAAGAATTAGAAAAAAAGTCTAATATAACACAGTATCTGCATCAATCATTTGGTCATTTAA", 22138125 - 1); var codingRegion = new CodingRegion(22138201, 22139150, 83, 1030, 948); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 22138125,22138561, 670, 1106), new TranscriptRegion(TranscriptRegionType.Gap, 1, 22138562,22138563, 669, 670), new TranscriptRegion(TranscriptRegionType.Exon, 1, 22138564,22139232, 1, 669) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(905, 905, "T"), new RnaEdit(796, 796, "C"), new RnaEdit(679, 679, "A"), new RnaEdit(670, 671, "") }; var codingSequence = new CdnaSequence(genomicSeq, codingRegion, regions, true, rnaEdits); var expectedCodingSeq = "ATGGAAGAGGCCATCCTACTCAATCAAACTTCTTTAGTGACATATTTTCGGCTTAGAGGTTTATCTGTAAATCATAAGGCACGGATAGCTATGTTTTCCATGTTCCTCATTTTTTATGTCCTGACACTGATTGGGAATGTTCTCATTGTCATAACTATTATCTATGACCACCGGCTCCATACTCCCATGTATTTCTTCCTCAGCAACCTGTCCTTTATTGATGTCTGCCACTCCACTGTCACTGTCCCCAAGATGCTGAGAGACGTGTGGTCAGAGGAAAAGCTCATCTCTTTTGATGCCTGTGTGACCCAGATGTTCTTCCTGCACCTCTTTGCCTGCACAGAGATCTTCCTCCTCACCGTCATGGCCTATGATCGGTATGTGGCCATCTGTAAACCCCTGCAGTACATGATAGTGATGAACTGGAAGGTATGTGTGCTGCTGGCTGTGGCCCTCTGGACAGGAGGGACCATCCACTCCATAGCCCTCACCTCCCTTACCATCAAGCTGCCCTACTGTGGTCCTGATGAGATTGACAACTTCTTCTGTGATGTACCTCAGGTGATCAAGCTGGCCTGCATTGACACCCACGTCATTGAGATCCTCATTGTCTCCAACAGTGGATTGATCTCCGTGGTCTGTTTTGTGGTCCTGGTGGTGTCCTACGCAGTCATCCTGGTGAGTCTGAGGCAGCAGATCTCCAAGGGCAAGCGGAAGGCCCTGTCCACCTGTGCAGCCCATCTCACTGTAGTTACACTGTTCCTGGGACACTGCATCTTCATCTATTCCCGCCCATCCACCAGCCTCCCAGAGGACAAGGTAGTATCTGTGTTTTTCACTGCAGTCACCCCCCTGCTGAACCCCATTATCTATACCCTTAGGAATGAAGAAATGAAGAGTGCCTTAAACAAGTTAGTGGGGAGAAAAGAGAGAAAAGAAGAAAAATGA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCdnaSequence().Substring(codingRegion.CdnaStart-1, codingRegion.Length)); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/CodingSequenceTests.cs ================================================ using CacheUtils.TranscriptCache; using UnitTests.TestDataStructures; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class CodingSequenceTests { [Fact] public void Create() { // ENST00000374673.3 var sequence = new SimpleSequence( "GGGGTGTGTCTCCAGGGCCTTCCGCACTCAGCCAGGGAGAGCAAACAAACAGGCTTGGGGGACTGGGGAGGGGGGAAAGCGGAGGGGCAGGGTAGGGGCGGGGCAGGAGTGGAAGGCGGGGCAGGAGCAAGCGGCCTGGGCAGGGCAAGGGGGCCTCAGCTGGACCCTCGGATACTCACGGCAGTTGGCTTCATCAGTTCGGTCCTCACAGTCAAAGTCACCATCGCAGCGCCACAGCTTGAGGGCACAATGTCCATTCCCGCAGGGGAACTCGTTGGGCTCACAGGGTGGCGGGGGGCCTAGGAGACCGGGCAGGGGTCAGCAGCATCCTCCCGGGCCAGCTTCCTGCTCCCCGCACCCACCTGCACCCCTGCCGGTGCGCACCACAGTCTAGCTCATCGCTGCCGTCCTCGCAGTCCTCCTGTCCGTCGCAGAGGTAGTCTCTGGGGATGCAGTGCCCATTGCGGCATGCGGCCTCCTGGGGCCCACAGGGCAGGGGCCTGACGGAACCGGGAAGCAGGGGCTGAGGAGCGTGGGTGACTGGTGGCTGTCGCATGATGGTTGTCTCTGGCCGGGGCGGTAAAGATGTCGTCTCCACAAGGAGAGAGAATGTGGGGCTGATACCCAGGACTGGCTCCTCTGTGGATAGATTCCGCTTGGCATTTGGCAGAAGCAGATGGCTCCTCACCTGCTCCTTGTCCCCAACCCTCCCCAGGCCCACCCTGTACTCCCCAACACCACTCCCTGCCACCCCCTGCCTGGCTCTGTCATCACCCTTCCTATGCCCCCATCCTCTGCCTGCACCAAACCCTCATAGTCCTTGATGGGCTCCAAGACCCAGGTGTAGGACCCTGGCCCTCCCCTGGCACCCAAACCACTCGTGGCCCCGGACATCCCCTCACCACAATTGAGCTCATCAGACATGTCCCTGCAGTCGGGCCGCCGGTCACAGCGATACTCCAGGGCCACACACTCATTGTAGCTGTGGCAGGCAAACTCGGCCTCCGTGCAGGCTCTTGGGAACTGGGGCACTGCAGGTGGAAAGGAAGCAGACTGGAGTCAGAGGCGGCAGGAGGCAGGTGCGGGAAGCTGTAGGTGCTGTGTGGCTGGAGTGGGCTCCAGGGCCCTGTGTCAGGCAGCTCGGTTTCTGGCAGGCACAACGAGGGCAAGCAGCACACACTAGACACATCCACAGCACACGTGGGGCATGGGACATGCGGCAGTGGCCTCCCCCATCTCTAAAACAGACCCCACACACAGTTGACATGCCACACGCATGCAACCACCACACCACACACATGCAGGCCACAGCCTGGCCCAGTGAGGACAAAGAAGGAGGGGAGAAGGGAGTGCCCAGCTGTCTTGGGCTGTGCCCAGCCAGCCATCTTGCCCACACCCTTCTTTCCTCTCCATCCTTTAAAAAATTTTTTTCTCTCTTCTTTTTTATTTTTTTAGAGACAGGGTCTTGCTACGTTGCCCAAGCTGGTCTCGAACTCTTTGCCTCAAGCAATCCTCCCGTCTTGGCCTCCCAAAGTGCTGGGGTTACAGGCGTGAGCCCCTGCACCCGGCCTCCTCTCCAACCTTAACTTCTCTAGGAACCTGGCTGGGCCTCGGCCTGGCTTACACTCTCACCTGGTGTCACTGCGACCGCCACAGCGGCCGGCGGGGGTGGGGGGGTCTGTGCTGGAAAGGAAGATGTGATCAGTGGCTGTTCCACCTGGGAGCCGGGAGCTGAGGGCTGCAGGGCTGGGCCACATTCCACCATCCCTAGCCAGGAGGACTTATTGAAAAGTGAGAGAGGAGGGCTGGACCCCCAGCAGTCTTTAGACCTGGGCCTGATGATGCAGAAGAGCAAGCTTGATCTCTGGGTGCAATAATTAAGGGTTTTTGTTTGTTTGTCTTGTTTTAGAGGCAGGGTTTTGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCATGATCCTAGCTCACTGCAGCCTCAAACTCCTGGGCTCCGGTGATCCTC", 22213528); var codingRegion = new CodingRegion(22213728, 22215214, 1, 538, 538); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 4, 22213728, 22213827, 439, 538), new TranscriptRegion(TranscriptRegionType.Intron, 3, 22213828, 22213912, 438, 439), new TranscriptRegion(TranscriptRegionType.Exon, 3, 22213913, 22214167, 184, 438), new TranscriptRegion(TranscriptRegionType.Intron, 2, 22214166, 22214430, 183, 184), new TranscriptRegion(TranscriptRegionType.Exon, 2, 22214431, 22214559, 55, 183), new TranscriptRegion(TranscriptRegionType.Intron, 1, 22214560, 22215160, 54, 55), new TranscriptRegion(TranscriptRegionType.Exon, 1, 22215161, 22215214, 1, 54) }; const string expectedResults = "NCAGCACAGACCCCCCCACCCCCGCCGGCCGCTGTGGCGGTCGCAGTGACACCAGTGCCCCAGTTCCCAAGAGCCTGCACGGAGGCCGAGTTTGCCTGCCACAGCTACAATGAGTGTGTGGCCCTGGAGTATCGCTGTGACCGGCGGCCCGACTGCAGGGACATGTCTGATGAGCTCAATTGTGAGGAGCCAGTCCTGGGTATCAGCCCCACATTCTCTCTCCTTGTGGAGACGACATCTTTACCGCCCCGGCCAGAGACAACCATCATGCGACAGCCACCAGTCACCCACGCTCCTCAGCCCCTGCTTCCCGGTTCCGTCAGGCCCCTGCCCTGTGGGCCCCAGGAGGCCGCATGCCGCAATGGGCACTGCATCCCCAGAGACTACCTCTGCGACGGACAGGAGGACTGCGAGGACGGCAGCGATGAGCTAGACTGTGGCCCCCCGCCACCCTGTGAGCCCAACGAGTTCCCCTGCGGGAATGGACATTGTGCCCTCAAGCTGTGGCGCTGCGATGGTGACTTTGACTGTGAGGACCG"; var codingSequence = new CodingSequence(sequence, codingRegion, regions, true, 1, null); var observedResults = codingSequence.Substring(0, expectedResults.Length); Assert.Equal(expectedResults, observedResults); } [Fact] public void Length_ReturnTrueLength_WhenGapsArePresent() { const int expectedResults = 720; var sequence = new NSequence(); var codingRegion = new CodingRegion(10051, 12770, 51, 769, 720); const byte startExonPhase = 1; int naiveCodingRegionLen = codingRegion.CdnaEnd - codingRegion.CdnaStart + 1; var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 10001, 10299, 1, 299), new TranscriptRegion(TranscriptRegionType.Intron, 1, 10300, 12300, 229, 331), new TranscriptRegion(TranscriptRegionType.Exon, 2, 12301, 12970, 331, 1000) }; var codingSequence = new CodingSequence(sequence, codingRegion, regions, false, startExonPhase, null); var observedResults = codingSequence.Length; Assert.Equal(expectedResults, observedResults); Assert.NotEqual(expectedResults, naiveCodingRegionLen); } [Fact] public void RnaEdits_snv_forward_no_utr() { //NR_002754.2 var genomicSeq = new SimpleSequence( "actctggtttctcttcaaatcgtataaatctttcgccttttactaaagatttccgtggagagaaacgagtgtgagtctgaaaccaattttttgaggccttgcgtttattagcagggctt", 11968210); var codingRegion = new CodingRegion(11968211, 11968329, 1, 119, 119); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 11968211, 11968329, 1, 119) }; var rnaEdits = new IRnaEdit[] {new RnaEdit(107, 107, "t")}; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, false, startExonPhase, rnaEdits); Assert.Equal( "actctggtttctcttcaaatcgtataaatctttcgccttttactaaagatttccgtggagagaaacgagtgtgagtctgaaaccaattttttgaggccttgcgttttttagcagggctt", codingSequence.GetCodingSequence()); } [Fact] public void RnaEdits_snv_forward_with_utr() { //NM_001144032.2 chr1:148644011-148644795 var genomicSeq = new SimpleSequence( "ACTATAAAGACAGTGAAAAGATCAGTGGTTATCTTTGCAGACGCCACCATCGCTGTGAGCCCTGTACTATCAGCCATGGTCAACTCCGTCGTCTTTTTTGAAATCACCAGGGATGGCAAGCCCTTGGGCCGCATCTCCATCAAACTGTTTGCAGACAAGATTCCAAAGACAGCAGAAAACTTTCGTGCTCTGAGCACTGGAGAGAAAGGATTTCGTTATAAGGGTTCCTGCTTTCACAGAATTATTCCAGGGTTTATGTGTCAGGGTGGTGACTTCACACGCCCTAATGGCACCGGTGACAAGTCCATCTATGGGGAGAAATTTGATGATGAGAACCTCATCCGAAAGCATACAGGTTCTGGCATCTTGTCCATGGCAAATGCTGGACCCAACACAAATGGTTCCCAGTTTTTCATCTGTGCTGCCAAGACTGAGTGGTTGGATGGCAAGCATGTGGCGTTTGGCAAGGTGAAAGAACGTGTGAATATTGTGGAAGCCACGGAGCACTTTGGGTACAGGAATAGCAAGACCAGCAAGAAGATCACCATTGCTGACTGTGGACAATTCTAATGAGTTTGACTTGTGTTTTATTTTCACCACCAGACCCATTCCTTCTGTAGCTCAGGAGAGCACCCCTCCACCACATTTGCTTGCAATATCCTAGAATCTTTGTGCTCTTGCTGCAGTTCCCTTTGGGTTCCATGTTTTCCTTGTTCCCTTCCATGCCTAGCTGGATGGCAGAGTTGAGTTAAGTTTATGATTATGAAATAAAAACTAAGTAACAA", 148644011 - 1); var codingRegion = new CodingRegion(148644086, 148644580, 76, 570, 495); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 148644011, 148644795, 1, 785) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(420, 420, "C"), new RnaEdit(500, 500, "T"), new RnaEdit(737, 737, "T") }; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, false, startExonPhase, rnaEdits); Assert.Equal( "ATGGTCAACTCCGTCGTCTTTTTTGAAATCACCAGGGATGGCAAGCCCTTGGGCCGCATCTCCATCAAACTGTTTGCAGACAAGATTCCAAAGACAGCAGAAAACTTTCGTGCTCTGAGCACTGGAGAGAAAGGATTTCGTTATAAGGGTTCCTGCTTTCACAGAATTATTCCAGGGTTTATGTGTCAGGGTGGTGACTTCACACGCCCTAATGGCACCGGTGACAAGTCCATCTATGGGGAGAAATTTGATGATGAGAACCTCATCCGAAAGCATACAGGTTCTGGCATCTTGTCCATGGCAAATGCTGGACCCAACACAAATGGTTCCCAGTTTTTCATCTGCGCTGCCAAGACTGAGTGGTTGGATGGCAAGCATGTGGCGTTTGGCAAGGTGAAAGAACGTGTGAATATTGTGGAAGCCATGGAGCACTTTGGGTACAGGAATAGCAAGACCAGCAAGAAGATCACCATTGCTGACTGTGGACAATTCTAA", codingSequence.GetCodingSequence()); } [Fact] public void RnaEdits_snv_reverse_utr() { //NM_031947.3, chr5:140682196-140683630 var genomicSeq = new SimpleSequence( "TGCATGTACACACAAATGGCTTTATGCAAAGGCCCTGACAGAACGATATTTAGTTTTTCAGATTAGGTACATAGGGCCAACCAGCCCACCCTGTACATTCCAGCAAGTGCAAGAGCAGCAACTTTCCTATTTCAATACAATTATGGGCAGAAATTATATGATGTAAAATAGAGGCCCTTCCATAAAGTTAAGATTTAGGGTAGAAGAAGGGAAGATAAAACCAAAATTCCCATGAAGTCAAAATTAGACAGTGGTCTTGTACTCTGCTGAACCCTGTGATGAACTGTAGTCCTCAAACTCATGGACTCGGATCCAGGTTCACCAAGACACTTCAGTATGCTTCCAACTGTTTCATCATCATCTTCCTGCTGTATTCGTAGGCCACAAACAGTGCCCCATTGGCAGGGATTGCTCGAATCATAGTAGCTTTCAGTCCAGAATATAAGGCTACTATTCCTTCATTTCTCACAACACTTAAGAGGGTACCAATAAATCCTGCCTGTTTCCCATACATGGAAAGAACTTGAATTCTGGATTTAATACAATCCACTGGGAACACGACAAGCCACAGGCAAATTCCAGCAACTCCACCACTTAACATCAAATGGACAGGGCCTAGTTCATCTTTTGATCTCCCTGACGCAAAAAACGATCGGCTCAGTTCATAGCCACCAAAGAAAAAGAAATAACCCGGTACTTCTTGAAGTAGAGTACTCGAGAGTCCATGGTAGAAGCCCAAGGGGCCATCCTTTTTAAGGATACCCTTCACGACAGACCAAATTGTATTATGGCTTTTTGCTATCTTCCCTGACATCTCCATTTCATACATGGTCTGTAGCCGGCACTTCACAAGCTCAGTGGGGCAGAGAGCCAGTGCAGCAAATGCAGAGGCGAAGGACCCCGCGGCTGCAGTCTGGAGATCACTCAGCTTTGCCTGCTTGTCCATTCCAGCCACTTTCCTGACAAACTGCTGGCAGAACCCGTAGCACATGAAGAGGACCGAGTTTTCGGCGACGTAGGCCATAAGTGCCGGGCCGGTGCCCTTGTAGAAGCCCCGGAGACCCACTTGGGCGTATGTCTTCAGGAAGCAGTCGGTGAGGCCCTTGTACAGGTCAGGGAACGTCTGCATCTTCACTTTTATTGTGTCGAAGGGCTGCCCAGTCAGTACACACGCTGTCCCCCCTGCGGCCCCCGCTGTGAGGTCGATGGCGGCTTGGATGCCAGGACCGGACTTCATGTTCGCTCACTCGTCTGAGGGTCCCAGTGGAAGGCGACTAACTCCCCAGAGCGTGAGACCGGCTTTTCACGTCCAGCCGCAGCGAGCGCGGGGAATGGAGTTGGGGGTGGTGGGGTGGCTCTACCGCCTGTTCTGGGCTCTCACCCCAGTGCGGGGGAAGCCGCTCAACCCTACGCTCCGCCGCGGGCCGCCCCCTCC", 140682196 - 1); var codingRegion = new CodingRegion(140682527, 140683432, 199, 1104, 906); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 140682196, 140683630, 1, 1435) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(366, 366, "T") }; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, true, startExonPhase, rnaEdits); Assert.Equal( "ATGAAGTCCGGTCCTGGCATCCAAGCCGCCATCGACCTCACAGCGGGGGCCGCAGGGGGGACAGCGTGTGTACTGACTGGGCAGCCCTTCGACACAATAAAAGTGAAGATGCAGACGTTCCCTGACCTGTACAAGGGCCTCACCGACTGCTTCCTGAAGACATACGCTCAAGTGGGTCTCCGGGGCTTCTACAAGGGCACCGGCCCGGCACTTATGGCCTACGTCGCCGAAAACTCGGTCCTCTTCATGTGCTACGGGTTCTGCCAGCAGTTTGTCAGGAAAGTGGCTGGAATGGACAAGCAGGCAAAGCTGAGTGATCTCCAGACTGCAGCCGCGGGGTCCTTCGCCTCTGCATTTGCTGCACTGGCTCTCTGCCCCACTGAGCTTGTGAAGTGCCGGCTACAGACCATGTATGAAATGGAGATGTCAGGGAAGATAGCAAAAAGCCATAATACAATTTGGTCTGTCGTGAAGGGTATCCTTAAAAAGGATGGCCCCTTGGGCTTCTACCATGGACTCTCGAGTACTCTACTTCAAGAAGTACCGGGTTATTTCTTTTTCTTTGGTGGCTATGAACTGAGCCGATCGTTTTTTGCGTCAGGGAGATCAAAAGATGAACTAGGCCCTGTCCATTTGATGTTAAGTGGTGGAGTTGCTGGAATTTGCCTGTGGCTTGTCGTGTTCCCAGTGGATTGTATTAAATCCAGAATTCAAGTTCTTTCCATGTATGGGAAACAGGCAGGATTTATTGGTACCCTCTTAAGTGTTGTGAGAAATGAAGGAATAGTAGCCTTATATTCTGGACTGAAAGCTACTATGATTCGAGCAATCCCTGCCAATGGGGCACTGTTTGTGGCCTACGAATACAGCAGGAAGATGATGATGAAACAGTTGGAAGCATACTGA", codingSequence.GetCodingSequence()); } [Fact] public void RnaEdits_insertion_in_utr() { //NM_080431.4, chrom: chr1:2938046-2939467 var genomicSeq = new SimpleSequence( "TGGAAGAGGCCTCAGCAGGCCCAGGCCACCTGGAGGGAGAGCAGACCTGCGGCTGAGGATGCAGGGCTCCCGGGCACGGTGCTAGCCCTGCCTTGAGACACCCCGAGAGCTGTGGGAAGAGCTGTGGGATCCCCTATTGCATCACAAAGCGGCCCTGGAGGGCTGGTCTTTATTTTGATGAGGCTGAGAAGGGAAGGCTGCGGGCATGTTTAATCCGCACGCTTTAGACTCCCCGGCTGTGATTTTTGACAATGGCTCGGGGTTCTGCAAAGCGGGCCTGTCTGGGGAGTTTGGACCCCGGCACATGGTCAGCTCCATCGTGGGGCACCTGAAATTCCAGGCTCCCTCAGCAGAGGCCAACCAGAAGAAGTACTTTGTGGGGGAGGAGGCCCTGTACAAGCAGGAGGCCCTGCAGCTGCACTCCCCTTTCGAGCGTGGCCTGATCACAGGGTGGGATGACGTGGAGAGACTCTGGAAGCACCTCTTTGAGTGGGAGCTAGGCGTGAAACCCAGCGACCAGCCCCTGCTTGCAACGGAGCCCTCCCTGAACCCCAGGGAGAACCGTGAGAAGATGGCAGAAGTCATGTTCGAGAACTTCGGCGTGCCCGCTTTCTACCTGTCGGACCAGGCGGTGCTGGCTCTCTACGCCTCTGCCTGTGTCACGGGCCTGGTGGTGGACAGCGGGGATGCGGTCACCTGCACTGTCCCCATCTTTGAGGGTTACTCCCTGCCCCACGCAGTCACCAAGCTCCACGTGGCGGGCAGGGACATCACGGAGCTCCTCATGCAGCTGCTCCTGGCCAGCGGCCACACCTTCCCCTGCCAGCTGGACAAGGGTCTCGTGGACGACATCAAAAAGAAGCTGTGCTACGTGGCCTTGGAGCCCGAGAAGGAGCTTTCCCGGAGGCCGGAGGAGGTCCTGAGGGAGTACAAGCTGCCCGACGGGAACATCATCAGCCTCGGGGACCCGCTGCACCAGGCGCCCGAGGCCCTGTTCGTGCCCCAGCAGCTGGGCAGCCAGAGCCCCGGGCTCTCGAATATGGTCTCCAGCAGCATCACCAAGTGTGATACCGACATCCAGAAGATCCTCTTTGGGGAGATTGTGCTGTCGGGGGGCACTACCCTGTTCCACGGGCTGGATGACCGGCTTCTCAAGGAGCTGGAGCAGCTGGCCTCCAAGGACACCCCCATCAAGATCACGGCTCCCCCCGACCGGTGGTTCTCCACCTGGATTGGAGCCTCCATCGTCACCTCTCTGAGTAGCTTCAAGCAGATGTGGGTCACCGCCGCAGACTTCAAGGAGTTTGGGACCTCCGTGGTGCAGAGAAGATGCTTCTGAAGGCCGCTTCTCGTTGGGTACCGTGGGGGGTGAACCCTAGCCCCAGCTTTGGGAGGATGTTCAATAAAGGACCAATGCCGGAA", 2938046 - 1); var codingRegion = new CodingRegion(2938251, 2939384, 206, 1339, 1134); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 2938046, 2939467, 1, 1422) }; var rnaEdits = new IRnaEdit[] {new RnaEdit(1423, 1422, "AAAAAAAAAAAAAAA")}; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, false, startExonPhase, rnaEdits); var expectedCodingSeq = "ATGTTTAATCCGCACGCTTTAGACTCCCCGGCTGTGATTTTTGACAATGGCTCGGGGTTCTGCAAAGCGGGCCTGTCTGGGGAGTTTGGACCCCGGCACATGGTCAGCTCCATCGTGGGGCACCTGAAATTCCAGGCTCCCTCAGCAGAGGCCAACCAGAAGAAGTACTTTGTGGGGGAGGAGGCCCTGTACAAGCAGGAGGCCCTGCAGCTGCACTCCCCTTTCGAGCGTGGCCTGATCACAGGGTGGGATGACGTGGAGAGACTCTGGAAGCACCTCTTTGAGTGGGAGCTAGGCGTGAAACCCAGCGACCAGCCCCTGCTTGCAACGGAGCCCTCCCTGAACCCCAGGGAGAACCGTGAGAAGATGGCAGAAGTCATGTTCGAGAACTTCGGCGTGCCCGCTTTCTACCTGTCGGACCAGGCGGTGCTGGCTCTCTACGCCTCTGCCTGTGTCACGGGCCTGGTGGTGGACAGCGGGGATGCGGTCACCTGCACTGTCCCCATCTTTGAGGGTTACTCCCTGCCCCACGCAGTCACCAAGCTCCACGTGGCGGGCAGGGACATCACGGAGCTCCTCATGCAGCTGCTCCTGGCCAGCGGCCACACCTTCCCCTGCCAGCTGGACAAGGGTCTCGTGGACGACATCAAAAAGAAGCTGTGCTACGTGGCCTTGGAGCCCGAGAAGGAGCTTTCCCGGAGGCCGGAGGAGGTCCTGAGGGAGTACAAGCTGCCCGACGGGAACATCATCAGCCTCGGGGACCCGCTGCACCAGGCGCCCGAGGCCCTGTTCGTGCCCCAGCAGCTGGGCAGCCAGAGCCCCGGGCTCTCGAATATGGTCTCCAGCAGCATCACCAAGTGTGATACCGACATCCAGAAGATCCTCTTTGGGGAGATTGTGCTGTCGGGGGGCACTACCCTGTTCCACGGGCTGGATGACCGGCTTCTCAAGGAGCTGGAGCAGCTGGCCTCCAAGGACACCCCCATCAAGATCACGGCTCCCCCCGACCGGTGGTTCTCCACCTGGATTGGAGCCTCCATCGTCACCTCTCTGAGTAGCTTCAAGCAGATGTGGGTCACCGCCGCAGACTTCAAGGAGTTTGGGACCTCCGTGGTGCAGAGAAGATGCTTCTGA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void RnaEdits_insertion_reverse_in_utr() { //NM_001242659.1, chrom: chr1:1533388-1535476 var genomicSeq = new SimpleSequence( "TCTGTTGGTCTGAGAATGATGGACATTTAGACACTGGCGCCAGGTTTGCGCCTGACCGGCGCCACGCAGGGGTGGGCGGAGCAAAGACACACAGGTGGGCTACAGGTGTCACACGGCACCAGCCAGGGCCCGGGGTGGCTGGGGTGAGGATGGGTGTTTGGCCAGTGACCAGGAGTCAGGTCAAGTCCAGGTGGTCAGTGCCAGGGGCTCCAGGAGGGGAGGGCAGTGCCATAACCCTCCTGGTGTCCAGCGTCACCAGGCGGTCGTCACAGAAAGCAACCTCGGCCCGGGGCCCGGGTCTGCAGCAGGTGGGCAGGGTCAGCTTTTCTTCCATGGCGGGTGGCATTGTCTGGGCCGGATACTGGCTCTCGACCCCTGGGCATGCAAAGGCTAGGGGTGGTGCTGTCAGTCACACCGTTGCCACCAAGGTCCCCTGGGTCGGCTGAGGCTTGGGATCCAGGCAGCGGTGGAAGGTCGGGCTGCTCAAGGCCGGTATCTAAGCTTCTGCCCTGGGACCCAGTGGTGATGGCCGCCATCTGCCCCATTCCCACAGGGACCTAGTCAGAGGTCGCACACACAAAAGGGGTACCTGGCCCTGGAGAACCACCAGCTGCCCGGGGTCTGAGAACACTCACCCTGGCCGCTGGGCCAGGCCTGCCAGGCTCCCGGCTGGTCCAACACCCTAAACGGTACAGAGCGCTGCAGGCCCTCACCTCATGCTTCTGCAGCGCTTGAGGGTGAAGGTGTCTCCAAGGGGACGCTGGCCAGATGCATGGAGAGGCCGGCCAATCTTAGGGCCACAGACCCCCCCTGGACAGCAGAGGGTGTTGGCTCCTGCAGTGGGCCCGAGATACTAAGGCACGAAGCTAACCTAAGCCCAGTGGGGTGGGGCGGGGCAGGACAGGCTTGGAGAGCTGCGCCCCAGGCCTGCAAAGCACAGTGACCGAGCAATGGCGACGGTCTGTCTGGGACAATTCGGCACAGGATGGAGGTGCGGGGTGAGCCGGAGTGCCCATGGCTCTTGCTGGAAGGGGCTCCATGCCCTGGCCGCCTCTATAAAGGCCTGCGGAGAGCGGGGAGAGCCCTGGATGCGGCTGGCACAGCAGCGCAAGCCCAGGGGCCAATCCGGGGCCAGAGTCTGGGAGTCTGACGCCCGGCTGGAAAGGGCGTGTGATGATGCCAAAGTGCCGGAGCCGTCGCCGGCAGGTCCTCCTCCGCGGGGATCTTAAGGAGGCAGCAGGAATGAGGAGAGGAGAGCGGGCGGAGGACCTGGGAGCTCAGGCGCCCTCAGGCAGGTGGCGCAAAGATGGGCGGGCGGCCTCGCGCTTCAGGGGTGTCTGCGCAGGCCGGGGCGCGCGAGGGCCGGGCGCATGAGGTTCTCGGTGATGTAGGCCACCAGCAGGCAGATGACCACCAGCATGACGCAGATGGAGCCGCCCACCGCCGTCATGGCCACCACGATGTCCTGCATGCCGGCCGGCTCGGCGGTGAACTCCACGCACTCGGCCGGCTCGGGGGTCTCTGGCGCGGCGGCGGCGGGCCCAGCGCGCAGCGGCAGCGGCTGCAGGCACAGGCGGTAGAGGACGCTGTCGTGCACGTCGGGCAGCAGGTAGTCGCGGCAGGAGGCCCCGAGGAGCACGCGCTCGCACGGGAAGCGCGTGTAGGCGCCGCGCCACGAGCAGTTGAGCGCGAAGGCGCGCACGCGGCGCGCGGCGGCCGGGGCCAGGCGCCACTGCAGGAGGACGCTGCGGTTGCGCAGGACGCTGGCGCGCAGGGAGCGGCCGGCCGGGGCGTGCAGCACGCAGCCCGGAGCCTGGCAGCGGAAGCCGCGCGCGGGGCTGCGGAAGCACAGGCGCCCGCCGCCCGCCTCGGGGCCCTCGGGCAGCACCTTGTAGGGGCACCAGGGCGCGTCGGGGGTCGGCTCCCAGCCCGGCGGCGTCGGGGCGGCCGCGGCGCAGGGCGGCGGCGCGCAGGCGGCCAGCAGCAGCAGCAGCGGCGGGGCGCGCATCCTGCGGCGGGGCCACGGGGCGCGGCGCTGGGTCACGCGGGCCGCGCCGCCGCCGTCCCCGCTGCCCGCTCCCCGCGATCC", 1533387); var codingRegion = new CodingRegion(1534715, 1535395, 82, 762, 681); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 1533388, 1535476, 1, 2089) }; var rnaEdits = new IRnaEdit[] {new RnaEdit(2090, 2089, "AAAAAAAAAAAAAAA")}; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, true, startExonPhase, rnaEdits); var expectedCodingSeq = "ATGCGCGCCCCGCCGCTGCTGCTGCTGCTGGCCGCCTGCGCGCCGCCGCCCTGCGCCGCGGCCGCCCCGACGCCGCCGGGCTGGGAGCCGACCCCCGACGCGCCCTGGTGCCCCTACAAGGTGCTGCCCGAGGGCCCCGAGGCGGGCGGCGGGCGCCTGTGCTTCCGCAGCCCCGCGCGCGGCTTCCGCTGCCAGGCTCCGGGCTGCGTGCTGCACGCCCCGGCCGGCCGCTCCCTGCGCGCCAGCGTCCTGCGCAACCGCAGCGTCCTCCTGCAGTGGCGCCTGGCCCCGGCCGCCGCGCGCCGCGTGCGCGCCTTCGCGCTCAACTGCTCGTGGCGCGGCGCCTACACGCGCTTCCCGTGCGAGCGCGTGCTCCTCGGGGCCTCCTGCCGCGACTACCTGCTGCCCGACGTGCACGACAGCGTCCTCTACCGCCTGTGCCTGCAGCCGCTGCCGCTGCGCGCTGGGCCCGCCGCCGCCGCGCCAGAGACCCCCGAGCCGGCCGAGTGCGTGGAGTTCACCGCCGAGCCGGCCGGCATGCAGGACATCGTGGTGGCCATGACGGCGGTGGGCGGCTCCATCTGCGTCATGCTGGTGGTCATCTGCCTGCTGGTGGCCTACATCACCGAGAACCTCATGCGCCCGGCCCTCGCGCGCCCCGGCCTGCGCAGACACCCCTGA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void GetCodingSequence_InsertionGeneModel_InsertionRnaEdit() { // NM_019119.4, chr5:140566701-140571111 var genomicSeq = new SimpleSequence( "AGAATGCTACGGAAGTCCTTGACAAAAAGGAAACACTGAGACAGATGGGCTGAGAAGAAGAGCTGTCGAGTCCCTGATTGGGAAAGGAAAAATTAAAAACCCTAGATCTCTGGTACACATAAGTCTGGGTTTGCGATTGCTATTTGTGCTGGGGCAGTGTGATTGAGACTGACATTGAGGAAAGAAGCAGCTATGAAGACCAGGGGGTTCAGCTTTCCAAGACAAAGGCAAGTCCTGTTTCTTTTTCTTTTCTGGGGAGTGTCCTTGGCAGGTTCTGGGTTTGGACGTTATTCGGTGACTGAGGAAACAGAGAAAGGATCCTTTGTGGTCAATCTGGCAAAGGATCTGGGACTAGCAGAGGGGGAGCTGGCTGCAAGGGGAACCAGGGTGGTTTCCGATGATAACAAACAATACCTGCTCCTGGATTCACATACCGGGAATTTGCTCACAAATGAGAAACTGGACCGAGAGAAGCTGTGTGGCCCTAAAGAGCCCTGTATGCTGTATTTCCAAATTTTAATGGATGATCCCTTTCAGATTTACCGGGCTGAGCTGAGAGTCAGGGATATAAATGATCACTCGCCAGTGTTTCGGCACAAAGAGATGGTCTTAAAAATATCAGAAAATACAGCTGAAGGGACAGCATTTAGACTAGAAAGAGCACAGGATCCAGATGAAGGTCATAACAGTATCCAAAACTACACGATCAGCTCCAACTCTTTTTTCCATATTAAAATTAGTGGCAGTGATGAAGGCATGATATATCCAGAGCTAGTGTTGGACAAAGCACTGGATCGGGAGGAGCAGGAAGAGCTCAGCTTAACCCTCACAGCGCTGGATGGTGGGTCTCCATCCAGGTCTGGGACCTCCACTATACGCATTGTGGTCTTGGATGTCAATGACAATGCCCCACAGTTTGCCCAGGCTCTGTATGAGACCCAGGCTCCAGAAAACAGTCCAGTAGGGTCCCTTATTGTTAAAGTGTCTGCAGGAGATGCAGACTCAGGAGTCAATGCAGAAGTATCCTATTCATTTTTTGATGCTTCTGAAGATATTTTAACAACGTTTCAAATCAATCCTTTTTCTGGGGAAATCTTTCTCAGAGAATTGCTTGATTATGAGTTAGTAAATTCTTACAAAATAAATATACAGGCAATGGACGGCGGAGGCCTTTCTGCAAGATGTACAGTTTTGATAAAAGTATTAGATTCCAATGACAATCCTCCTGAACTGATCATATCATCACTTTCCAACTCTGTTGCTGAAAACTCTCCTGGGATAGTATTGGCTGTTTTTAAGATTAAAGACAGAGACTCCGGAGAAAATGGAAAGACATTTGCTATGTTCAAGATAATCTGCCTTTTTTTCTGAAACCGTCTGTTGACAATTTTTACATCCTAATGACTGAAGGTGCACTGGACAGAGAGAGCAAAGCTGAGTACAACATCACCATCACCGTCACTGACTTGGGGACACCCAGGCTGAAAACCGAGCACAGCATAACCCTGCAGGTCTCCGACGTCAATGACAACGCCCCCGCCTTCACCCAAACCTCCTACACCCTGTTCGTCCGGGAGAACAACAGCCCCGCCCTGCACATCGGCAGTGTCAGCGCCACAGACAGAGACTCAGGCACCAACGCCCAGGTCACCTACTCGCTGCTGCCGCCCCAGGACCCACACCTGCCCCTCGCCTCCCTGGTCTCCATCAACGCGGACAATGGCCACCTGTTTGCCCTCAGGTCGCTGGACTACGAGGCCCTGCAGGCTTTCGACTTCCGCGTGGGCGCCTCAGACCGCGGCTCCCCGGCTTTGAGCAGCGAGGCGCTGGTGCGCGTACTGGTGCTGGACGCCAACGACAACTCGCCCTTCGTGCTGTACCCGCTGCAGAACGGCTCCGCGCCCTGCACCGAGCTGGTGCCCCGGGCGGCCGAGCCGGGCTACCTGGTGACCAAGGTGGTGGCGGTGGACGGCGACTCGGGCCAGAACGCCTGGCTGTCGTACCAGCTGCTCAAGGCCACGGAGCCCGGGCTGTTCGGTGTGTGGGCGCACAATGGGGAGGTGCGCACCGCCAGGCTGCTGAGCGAGCGCGACGCAGCCAAGCACAGGCTGGTGGTGCTTGTCAAGGACAATGGCGAGCCTCCTCGCTCGGCCACCGCCACGCTGCACGTGCTCCTGGTGGACGGCTTCTCCCAGCCCTACCTGCCTCTCCCGGAGGCGGCCCCGGCCCAGGCCCAGGCCGACTTGCTCACCGTCTACCTGGTGGTGGCGTTGGCCTCGGTGTCTTCGCTCTTCCTCCTCTCGGTGCTCCTGTTCGTGGCGGTGCGGCTGTGCAGGAGGAGCAGGGCGGCCTCGGTGGGTCGCTGCTCGGTGCCCGAGGGTCCTTTTCCAGGGCATCTGGTGGACGTGAGCGGCACCGGGACCCTGTTCCAGAGCTACCAGTACGAGGTGTGTCTGACTGGAGGTTCAGAGACCGGCGAGTTCAAGTTCTTGAAGCCGATTACCCCCCACCTCCCGCCCCATAGGGGTGGGAAAGAAATAGAGGAAAATTCTACTCTCCCCAATAGCTTTGGATTTAATTATTGAAAGGAACCCACTTAATAAAGACATTTACTTCTTTAATATATTCTTGTTGGCTAACTAAATTGTGTATGCCCACCACAAAGAAGGTACTATTTTTTGTTTGATTCATCTTCAACTTTGCGTATTATGCTTAACTTCACAAGTTAACTTTTTCTTATTTTGTATCCTGATGAGGCATTTCTTACTAGAATCCCATAAGTGAAATATAATATTTTTCAAAGTTGATATCATTTAAAAATTTTTGGTCGTTTTAAATGTCTTTATTGACTTTAAATTCATTGCCTCTACATTATTCATTAGTTCTTCTTTTCCTAAAACTTTTTACTTGTTAAAATAGTCTGCTGCATGTAATATGTGCTTTTACTATTTGATATTTCTTCTATTTTTCTTTTGAAACCGGTGTTCTTATTGGTTTGCCATCCTTGTTCATTACAACTGTTTTTTGTTTGTTTGTTTGTTTTTTGGTTTGTTTGTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCAGCTCACTGCAACCTCCGCCTCCCAGGTTCAAGCGATTCTCCTGCCTCAGCCTCCAGAGTATCTGGGACTACAGTTGCATGTCACCACGTTCGGCTAATTTTTGTATTTTCAGTAGAGACGGGTTTCATCATGGTGGCCAGGATGGTCTATCTCTTGACCTCGTGATCCACCCCACTCAGCCTCCCAAATTGCTGGGATTTACAGGCATGAGCCACCGCACCCAGCCTACAATAATTTTCTTAAACTTTACCTTTTATTTTAAAGTTCTAGTTTCCCGGCATTGATAGTTCCCTATTTGAAATATAATGTTTCTCTTGTAAGTGATATGATAAATAAACCCCTAATTAGCCTTAGAAGAAAAACCACTGCAAGATATTAAGCGTGTGTAAATGGGCTTTAGTCTGGAAACCAAAAAAAAAAAAAAAATTTAGTCATTCTATAGGATCATGTGAAAATATTTAATTTGCTCCTTTTAATTCTGTATAAACAAATCAGAGGTTCCTGAGGTTCCTGTTAAATTTTTAATGGCTAATAGCCCAGTGCCATCCAGTTGAAAAAACAACAGCAATCACAAAGTAGAGGTTTATATTGTGCGGCTTTTATATTCAGCTATTAGAGTGTTATTGGTAGTGTCTAGCCTTTTCCTCCACGACATTCCTTGACTTAATCCATTTGGGCCTATTATAGACAAAATAGAGCTTCTTTCTAGATATAAGGTCTTTGAGGCAGGGCTCAGTGGCTCATTCCTGTAATCCCAGCACTTTGGGAGGCCAAGGCGGGCAGATCACCTTAGGTCACGAGTTTGAGACCAGCCTGACCAACGTTAAGTAACCCCGTCTTTACTAAAAATACAAAATTAGCCAGGCATGGTGGCACATGCTTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCAGGAGGTGGAAGTTGCTTTGAGCCGAGATTGCACCATTGTACTCCAGCCTGGGCAATAAGAGCAAAACTCCATCAAAATAAAATAAAATAAAATATAAAATAACTTAAAAAGAACTTTGAATAAAATTCTATGAAAAAAGACACTAGAATGCTGTTCTTAATTTTAATAGTGTTAAGATAGGTGTTAGTGTGGTCTGTTCTTTACCTCCCTTTATTTGGTGCAGAGAAGTTAGATCCTGCTAAATTTCAATTAAGAGGGGACCTTAAAATAAGGATCAATCTCTTATTTAACCCTGTAAGTTACTTTAAAGCTAATACAAGAAAAACAAAGACAAGTGAAAGTAAGGAAACAGAAATTGC", 140566701 - 1); var codingRegion = new CodingRegion(140566893, 140569285, 193, 2586, 2394); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 140566701, 140568035, 1, 1335), new TranscriptRegion(TranscriptRegionType.Exon, 1, 140568036, 140571111, 1337, 4412) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(908, 908, "T"), new RnaEdit(1336, 1335, "A"), new RnaEdit(2096, 2096, "G") }; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, false, startExonPhase, rnaEdits); const string expectedCodingSeq = "ATGAAGACCAGGGGGTTCAGCTTTCCAAGACAAAGGCAAGTCCTGTTTCTTTTTCTTTTCTGGGGAGTGTCCTTGGCAGGTTCTGGGTTTGGACGTTATTCGGTGACTGAGGAAACAGAGAAAGGATCCTTTGTGGTCAATCTGGCAAAGGATCTGGGACTAGCAGAGGGGGAGCTGGCTGCAAGGGGAACCAGGGTGGTTTCCGATGATAACAAACAATACCTGCTCCTGGATTCACATACCGGGAATTTGCTCACAAATGAGAAACTGGACCGAGAGAAGCTGTGTGGCCCTAAAGAGCCCTGTATGCTGTATTTCCAAATTTTAATGGATGATCCCTTTCAGATTTACCGGGCTGAGCTGAGAGTCAGGGATATAAATGATCACTCGCCAGTGTTTCGGCACAAAGAGATGGTCTTAAAAATATCAGAAAATACAGCTGAAGGGACAGCATTTAGACTAGAAAGAGCACAGGATCCAGATGAAGGTCATAACAGTATCCAAAACTACACGATCAGCTCCAACTCTTTTTTCCATATTAAAATTAGTGGCAGTGATGAAGGCATGATATATCCAGAGCTAGTGTTGGACAAAGCACTGGATCGGGAGGAGCAGGAAGAGCTCAGCTTAACCCTCACAGCGCTGGATGGTGGGTCTCCATCCAGGTCTGGGACCTCCACTATACGCATTGTGGTCTTGGATGTCAATGACAATGTCCCACAGTTTGCCCAGGCTCTGTATGAGACCCAGGCTCCAGAAAACAGTCCAGTAGGGTCCCTTATTGTTAAAGTGTCTGCAGGAGATGCAGACTCAGGAGTCAATGCAGAAGTATCCTATTCATTTTTTGATGCTTCTGAAGATATTTTAACAACGTTTCAAATCAATCCTTTTTCTGGGGAAATCTTTCTCAGAGAATTGCTTGATTATGAGTTAGTAAATTCTTACAAAATAAATATACAGGCAATGGACGGCGGAGGCCTTTCTGCAAGATGTACAGTTTTGATAAAAGTATTAGATTCCAATGACAATCCTCCTGAACTGATCATATCATCACTTTCCAACTCTGTTGCTGAAAACTCTCCTGGGATAGTATTGGCTGTTTTTAAGATTAAAGACAGAGACTCCGGAGAAAATGGAAAGACAATTTGCTATGTTCAAGATAATCTGCCTTTTTTTCTGAAACCGTCTGTTGACAATTTTTACATCCTAATGACTGAAGGTGCACTGGACAGAGAGAGCAAAGCTGAGTACAACATCACCATCACCGTCACTGACTTGGGGACACCCAGGCTGAAAACCGAGCACAGCATAACCCTGCAGGTCTCCGACGTCAATGACAACGCCCCCGCCTTCACCCAAACCTCCTACACCCTGTTCGTCCGGGAGAACAACAGCCCCGCCCTGCACATCGGCAGTGTCAGCGCCACAGACAGAGACTCAGGCACCAACGCCCAGGTCACCTACTCGCTGCTGCCGCCCCAGGACCCACACCTGCCCCTCGCCTCCCTGGTCTCCATCAACGCGGACAATGGCCACCTGTTTGCCCTCAGGTCGCTGGACTACGAGGCCCTGCAGGCTTTCGACTTCCGCGTGGGCGCCTCAGACCGCGGCTCCCCGGCTTTGAGCAGCGAGGCGCTGGTGCGCGTACTGGTGCTGGACGCCAACGACAACTCGCCCTTCGTGCTGTACCCGCTGCAGAACGGCTCCGCGCCCTGCACCGAGCTGGTGCCCCGGGCGGCCGAGCCGGGCTACCTGGTGACCAAGGTGGTGGCGGTGGACGGCGACTCGGGCCAGAACGCCTGGCTGTCGTACCAGCTGCTCAAGGCCACGGAGCCCGGGCTGTTCGGTGTGTGGGCGCACAATGGGGAGGTGCGCACCGCCAGGCTGCTGAGCGAGCGCGACGCGGCCAAGCACAGGCTGGTGGTGCTTGTCAAGGACAATGGCGAGCCTCCTCGCTCGGCCACCGCCACGCTGCACGTGCTCCTGGTGGACGGCTTCTCCCAGCCCTACCTGCCTCTCCCGGAGGCGGCCCCGGCCCAGGCCCAGGCCGACTTGCTCACCGTCTACCTGGTGGTGGCGTTGGCCTCGGTGTCTTCGCTCTTCCTCCTCTCGGTGCTCCTGTTCGTGGCGGTGCGGCTGTGCAGGAGGAGCAGGGCGGCCTCGGTGGGTCGCTGCTCGGTGCCCGAGGGTCCTTTTCCAGGGCATCTGGTGGACGTGAGCGGCACCGGGACCCTGTTCCAGAGCTACCAGTACGAGGTGTGTCTGACTGGAGGTTCAGAGACCGGCGAGTTCAAGTTCTTGAAGCCGATTACCCCCCACCTCCCGCCCCATAGGGGTGGGAAAGAAATAGAGGAAAATTCTACTCTCCCCAATAGCTTTGGATTTAATTATTGA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void RnaEdits_in_coding_sequence_reverse_insertion() { //NM_000682.6, chrom: chr2:96778623-96781984 var genomicSeq = new SimpleSequence( "CTTATTACAAAATATCCTTTATTGATAAAATAGCTCAGAGTTTAAAAAAAAAAAAAACACCACCTGCATGTCGCAATAAGAGGTCACAGGCAAGAACACTGGGGGTCCCATGGGGCGCACACAAGACCGGCCAGCAGAGGGTCACAGTCAGTCCCTCTCCTGGCCCAGCTCCCCACCACATCCCAGGGCGATACTCTGGCCTCAACAACCCACTGAGGACCAAGCTGGGAAGCCTCCCACACCCCAGGAAGGACTCTTTTTGGTCCCCTCCATTCTCTCTACACCCAGAAAACTCCCTCGGTGCCCTTCCAAATCTAGCAGGTCCATCTGGCCCATTCCCCCGACACCTGCCAAGCTAAGATGCCTACTGGCCCAATGTTGAAGCCAGGCCCTCTCCAAGGGAAGGCCGATAAACCTCCTTTCCACACTTCCAACTGTTCTGGGTGCCAGGTTTTGGGGTGGGACTGAGAACCAGGAAGCAGGGGTCCTCAATGCACAGCCCCATCAGCATTGCGGGGAGCAGCGTGGCTGGGTCCGAGGCAGTCCACAAGCACCCACCTGGGGGGATCAGTTGTGGTTCACAAGGACTCATTTGGGGCTTGGAGACCTGGCCGGGCACTCCAGTGGGAGGCTCCCCTAGGGGCGCACCAGGCTCTGATGCCAGTACCCCACCTGGGGGCGCTGCCACCTGTCACAGGCTCTCATCTTAGACTGTTGCCGAGGTGTGGATATTTTGAGCTGTCTTGGGGAGACAATTTGCCTCCTTGATGACAAAAGACTTATCCCCCACTGGGGAGACCCAAGCCACTAAAAACCCTCTTGGTGTTGCCGGTGAAATGTCGAAACGTTGTCATGTAGCGTAATAACTCAGACCTTTGCAGCCAGAAGAACACATTCTCAAAGAGATCCTTTAACTTGAAATAGTGATTCTGTCTGCCACTCCCGGCTTCCAGTTCGGGGTAGGAATTCACACACCCCAGGGACAGAACAAAAGTCTACAGGAAGACAGGTGGTGGTAAACACAGAGGAAAGGGATTTTTATATCACCATATAATCACATTTTTGGTTCTCTAGTGTGTTCCCCCACAGAGCTCAAAGCTTTCTGCAAAGCCTTTCATCTCCCTGCAGCAAGTAGGCAGTGAGCTATTGTCGCCCCGATTTTTGCAGGGGGTGAATGCCAGTGATCGGGGATCTCCCGTCGAGGCAGAGACCAGGCCTCCAAGACCGCCCCAGCGAGGCATCCACGTGGCCACCCACCTACCGGAGGGGTGCTGGGTAAGGAAGCCGATCCATTGTTCTGGCTTTCAAAGGAACCACAGATCCGAAAACAGGCAAAGGGGGAAAGGAGGGCCCAGAGACGATGCCACCCCATAAGCCCCCATCCCAGCGCCTGCCAGGGACCGCGAGTGCCTAGCGTGGGTGATCAGTCTTCGTTTCTTCCTCCCCCTCAGCAGCAGGCCCCACTGGGAAAAGTGGAAGGCTGGCTCCGTGCTCTTTGTGGGTGGGGGGGAGATGAAAAAGAAACGAAAACACCACAAGCAAGTGACCTGCCAGGAACACAAGGTCCTCAAGAAAGGGAAGCCCAGACATTGGTCTGGAGAGCATGGGGCTCTGGGAAGAAAGTGCTCTCTCTTCTCCTGGTCTTGGCTATGTTCCAGAGGATTTGAACCACCTCCATCGGCCTGTGCTCAGGGAGAGGGTGGAGAAGGGGTCCCCCACAGCTAAGCCGGCAAGGGGAAGCTTCACTGGGACCCTTGCTAGCAGCCCCCCTGCCCACCCCTCCCAAGGGGTTCCTAAGATGAGGCCTACAGGATCTGGGCAGGGAGCAGAAAGCCCAGGGGAGGCAGCCACACACAGCAGGGCAAGAAGCAGGGTGACCCCGGCGCCACCGCACCAACCCCACAGGGGCAGCGCAGGCGGGCTCACCAGGCCGTCTGGGTCCACGGGCGGCACAGGATCCTCCGGAAGGCACGGCGGAAGTCCTGGTTGAAGATGGTGTAGATAACAGGGTTCAGTGAGCTGTTGCAGTAGCCGATCCAGAAGAAGAACTGGAAGAGGCCATGGGGCACCTTGCAGTGCTTCGGGCAGATGGCTCCCAGGCTGTAGCTGAAGAAGAAGGGGAACCAGCAGAGCACAAAAACGCCAATGACCACAGCCAGCACGAAGGTGAAGCGCTTCTCCCGGGTCAGCTGCGCCCGTCGACGCCACCACTGCCCACCTATAGCACCCACGCCCCTGCCCAGGAGCACCTGGCCACGTAGGGTGGCCAGCACCCGGGAGCCCTGTGGCTGCTGCAGCGGGGGGCTGCAAGCTGAGGCCGGAGACACTGGCACTGCCTGGGGTTCACACTCTTCCTCCTCCTCCTCCTCCTCTTCAGCTTCATCCTCTGGAGATGCCCCACAAACACCCTCCTTCTGGCCCTGGCCTGAGTTGGGAAGGGCAGCCCAACTGGGTGGCAAGGCCCGGGTCCCAGTATCTTCAGGGGTCTCCCCCTCCTCCTTCTCCCCAGTGGACTTCGAGTGTCCGTTGACCTCTCTGGCAGAAGCCACAGAGGCCAGGGCTGGCAGTTTGGCTGAGGCCAAAGCCCCACCATGGTCGGGTCGGGGCTGCTTGGACTCACCCTGCCCAGGCCCCCCCTTGGCCCTGGGACCTCTGCGGTTGCTGCGTTTGGCGATCAGGTAGATGCGCAGGTAGACAAGGATCATGATGAGGCAAGGAGCAAAGAAAGATCCGATGCTGGAGGCCAGGATGTACCAGGCCTCCTGGTTGAGCTTGCACTGGGGGCGCCCGCGCGGCTGGGGGCCCTGGTCGCCCTTGTAGATGAGGGGCGGCAGCGAGATGACGGCGGCGATGAGCCACACAGTGAGGATGATGCACTTGATGCGGCGCGGGGTGCGCTTGGAGTTGTACTCCAGCGCGCGGCTCACGGCCCAGTAGCGGTCCAGGCTGATGGCGCACAGGTGCACGATGGACGAGGTGCAGAAGAGCACGTCGAGCGCCAGGTACACCTCGCACCACGTGCGCCGGAAGTACCAGTAGCCCAGCAGCTCGTTGGCCAGCGAGAAAGGGATGATGAGCGTGGCCACCAGGATGTCGGCGGCGGCCAGCGACACCAGGAACAGGTTCTGAGGGGCGCGCAGCGAGCGGCTGGTCAACACAGCCAGGATGACCAGAGCGTTGCCGAAGATGGTAAAGAGAATGAGGAAGGTGATGGCCGCCGCTATGGCCGCTGTGGCCTGCACGGAGTAGGGGTCCTGGTGGTCCATGACGGGGCGGGAGGTGGGCAGAGGGAGCGCTGCCCGCCCAGTGCGCACCGTGGACGACAGCGCTGCCCGGCTCGGCTAGACAAGAGCGTCGCCCCT", 96778623 - 1); var codingRegion = new CodingRegion(96780545, 96781888, 97, 1449, 1353); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 96778623, 96780986, 1008, 3371), new TranscriptRegion(TranscriptRegionType.Exon, 1, 96780987, 96781984, 1, 998) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(999, 998, "AGAGGAGGA") }; const byte startExonPhase = 0; const bool onReverseStrand = true; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, onReverseStrand, startExonPhase, rnaEdits); var expectedCodingSeq = "ATGGACCACCAGGACCCCTACTCCGTGCAGGCCACAGCGGCCATAGCGGCGGCCATCACCTTCCTCATTCTCTTTACCATCTTCGGCAACGCTCTGGTCATCCTGGCTGTGTTGACCAGCCGCTCGCTGCGCGCCCCTCAGAACCTGTTCCTGGTGTCGCTGGCCGCCGCCGACATCCTGGTGGCCACGCTCATCATCCCTTTCTCGCTGGCCAACGAGCTGCTGGGCTACTGGTACTTCCGGCGCACGTGGTGCGAGGTGTACCTGGCGCTCGACGTGCTCTTCTGCACCTCGTCCATCGTGCACCTGTGCGCCATCAGCCTGGACCGCTACTGGGCCGTGAGCCGCGCGCTGGAGTACAACTCCAAGCGCACCCCGCGCCGCATCAAGTGCATCATCCTCACTGTGTGGCTCATCGCCGCCGTCATCTCGCTGCCGCCCCTCATCTACAAGGGCGACCAGGGCCCCCAGCCGCGCGGGCGCCCCCAGTGCAAGCTCAACCAGGAGGCCTGGTACATCCTGGCCTCCAGCATCGGATCTTTCTTTGCTCCTTGCCTCATCATGATCCTTGTCTACCTGCGCATCTACCTGATCGCCAAACGCAGCAACCGCAGAGGTCCCAGGGCCAAGGGGGGGCCTGGGCAGGGTGAGTCCAAGCAGCCCCGACCCGACCATGGTGGGGCTTTGGCCTCAGCCAAACTGCCAGCCCTGGCCTCTGTGGCTTCTGCCAGAGAGGTCAACGGACACTCGAAGTCCACTGGGGAGAAGGAGGAGGGGGAGACCCCTGAAGATACTGGGACCCGGGCCTTGCCACCCAGTTGGGCTGCCCTTCCCAACTCAGGCCAGGGCCAGAAGGAGGGTGTTTGTGGGGCATCTCCAGAGGATGAAGCTGAAGAGGAGGAAGAGGAGGAGGAGGAGGAGGAAGAGTGTGAACCCCAGGCAGTGCCAGTGTCTCCGGCCTCAGCTTGCAGCCCCCCGCTGCAGCAGCCACAGGGCTCCCGGGTGCTGGCCACCCTACGTGGCCAGGTGCTCCTGGGCAGGGGCGTGGGTGCTATAGGTGGGCAGTGGTGGCGTCGACGGGCGCAGCTGACCCGGGAGAAGCGCTTCACCTTCGTGCTGGCTGTGGTCATTGGCGTTTTTGTGCTCTGCTGGTTCCCCTTCTTCTTCAGCTACAGCCTGGGAGCCATCTGCCCGAAGCACTGCAAGGTGCCCCATGGCCTCTTCCAGTTCTTCTTCTGGATCGGCTACTGCAACAGCTCACTGAACCCTGTTATCTACACCATCTTCAACCAGGACTTCCGCCGTGCCTTCCGGAGGATCCTGTGCCGCCCGTGGACCCAGACGGCCTGGTGA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void With_rnaEdits_snv_mnv() { //NM_001242659.1 var genomicSeq = new SimpleSequence( "ACTATAAAGACAGTAAAAAGATCAGTGGTTATCTTTGCAGACGCCACCATCXCTGTGAGCCCTGTACTATCAGCCATGGTCAACTCCGTCGTCTTTTTTGACATCACCGTCGACGGCAAGCCCTTGGGCCGCATCTCCATCAAACTGTTTGCAGACAAGATTCXAAAGACAGCXGAAAACTTTCGTGCTCTGAGCACTGGAGAGAAAGGATTTCGTTATAAGGGTTCCTGCTTTCACAGAATTATTCCAGGGTTTATGTGTCAGGGTGGTGACTTCACACGCCXTAATGGCACXGGTGACAAGTCCATCTATGGGGAGAAATTTGATGATGAGAACCTCATCCGAAAGCATACAGGTTCTGGCATCTTGTCCATGGCAAATGCTGGACCCAACACAAATGGTTCCCAGTTTTTCATCTGXXCTGCCAAGACTGAGTGGTTGGATGGCAAGCATGTGGCCTTTGGCAAGGTGAAAGAACGTGTGAATATTGTGGAAGCCATGGAGCACTTTGGGTACAGGAATAGCAAGACCAGCAAGAAGATCACCATTGCTGACTGTGGACAATTCTAATGAGTTTGACTTGTGTTTTATTTTCACCACCAGACCCATTCCTTCTGTAGCTCAGGAGAGCACCCCTCCACCACATTTGCTTGCAATATCCTAGAATCTXXGTGCTCTTGCTGCAGTTCCCTTTGGGTTCCATGTTTTCCTTGTTCCCTTCCATGCCTAGCTGGATTGCAGAGTTGAGTTAAGTTTATGATTATGAAATAAAAACTAAGTAACAA", 149553002); var codingRegion = new CodingRegion(149553003, 149553787, 1, 785, 785); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 149553003, 149553787, 1, 785) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(52, 52, "G"), new RnaEdit(164, 164, "C"), new RnaEdit(174, 174, "A"), new RnaEdit(284, 284, "C"), new RnaEdit(294, 294, "C"), new RnaEdit(420, 421, "CA"), new RnaEdit(670, 671, "CT") }; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, false, startExonPhase, rnaEdits); var expectedCodingSeq = "ACTATAAAGACAGTAAAAAGATCAGTGGTTATCTTTGCAGACGCCACCATCGCTGTGAGCCCTGTACTATCAGCCATGGTCAACTCCGTCGTCTTTTTTGACATCACCGTCGACGGCAAGCCCTTGGGCCGCATCTCCATCAAACTGTTTGCAGACAAGATTCCAAAGACAGCAGAAAACTTTCGTGCTCTGAGCACTGGAGAGAAAGGATTTCGTTATAAGGGTTCCTGCTTTCACAGAATTATTCCAGGGTTTATGTGTCAGGGTGGTGACTTCACACGCCCTAATGGCACCGGTGACAAGTCCATCTATGGGGAGAAATTTGATGATGAGAACCTCATCCGAAAGCATACAGGTTCTGGCATCTTGTCCATGGCAAATGCTGGACCCAACACAAATGGTTCCCAGTTTTTCATCTGCACTGCCAAGACTGAGTGGTTGGATGGCAAGCATGTGGCCTTTGGCAAGGTGAAAGAACGTGTGAATATTGTGGAAGCCATGGAGCACTTTGGGTACAGGAATAGCAAGACCAGCAAGAAGATCACCATTGCTGACTGTGGACAATTCTAATGAGTTTGACTTGTGTTTTATTTTCACCACCAGACCCATTCCTTCTGTAGCTCAGGAGAGCACCCCTCCACCACATTTGCTTGCAATATCCTAGAATCTCTGTGCTCTTGCTGCAGTTCCCTTTGGGTTCCATGTTTTCCTTGTTCCCTTCCATGCCTAGCTGGATTGCAGAGTTGAGTTAAGTTTATGATTATGAAATAAAAACTAAGTAACAA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void With_rnaEdits_deletion() { //NM_033089.6, chrom: chr20:278204-280965 var genomicSeq = new SimpleSequence( "GGAGGATGCTGGGAAGGAGGTAAAATGGCCACCGGCGGCGGCGCGGAGGAAGAGAGGAAACGGGGGCGGCCGCAGCTTCTGCCCCCCGCGCGGCCCGCGGCCCGGGGCGAGGAGGCCGACGGCGGCCGCGAGAAGATGGGCTGGGCCCAGGTGGTGAAGAATCTAGCCGAGAAGAAGGGCGAATTCCGCGAGCCGCGGCCGCCGCGGCGGGAGGAGGAAAGCGGCGGCGGTGGAGGGAGCGCCGGGCTCGGCGGCCCCGCGGGCCTGGCGGCGCCGGACCTCGGCGACTTCCCACCGGCTGGCCGCGGGGATCCGAAGGGCCGTCGGAGAGATCCGGCCGGCGAGGCGGTGGACCCCCGCAAAAAGAAGGGCGCTGCGGAGGCGGGCAGGAGGAAGAAGGCCGAGGCGGCGGCGGCCGCCATGGCGACCCCGGCCAGGCCCGGCGAGGCCGAGGACGCGGCCGAGCGGCCCCTCCAGGATGAGCCGGCGGCGGCGGCGGCAGGCCCGGGCAAGGGTCGCTTCCTCGTCCGCATCTGTTTCCAGGGAGACGAGGGCGCCTGCCCGACCCGGGACTTCGTGGTAGGAGCGCTTATCCTGCGCTCCATCGGCATGGACCCGAGCGACATCTACGCGGTCATCCAGATCCCGGGCAGCCGCGAATTCGACGTGAGCTTCCGCTCAGCGGAGAAGCTGGCCCTGTTCCTACGCGTCTACGAGGAGAAGCGGGAGCAGGAGGACTGCTGGGAGAACTTTGTGGTGCTGGGGCGGAGCAAGTCCAGCTTGAAGACGCTCTTCATCCTCTTCCGGAACGAGACGGTGGACGTGGAGGACATTGTGACTTGGCTCAAGCGCCACTGCGACGTGCTGGCCGTGCCGGTGAAAGTGACCGACAGGTTTGGGATCTGGACCGGGGAGTACAAATGCGAGATCGAGCTGCGCCAGGGGGAGGGCGGGGTCAGGCACTTGCCAGGGGCCTTCTTCCTGGGGGCCGAGAGGGGCTACAGCTGGTACAAGGGGCAGCCCAAGACATGCTTTAAATGTGGTTCCCGGACCCACATGAGCGGCAGCTGCACGCAGGACAGGTGCTTCAGGTGCGGGGAGGAGGGGCACCTGAGCCCTTACTGCCGGAAGGGCATCGTGTGCAACCTCTGTGGCAAGCGAGGACACGCCTTTGCCCAGTGTCCCAAAGCAGTGCACAATTCCGTGGCAGCTCAGCTAACCGGCGTGGCCGGGCACTAAACACCCGCCTGCCTGCCAGGGTGAACACACAGCCAGCTTATCCCTCTTAAGTGCCAAAACTTTTTTTTAAACCATTTTTTATCGTTTTTGAAGGAGATCTTTTTAAAACCTACAAGAGACATCTCTCTATGCCTTCTTAAACCGAGTTTACTCCATTTCAGCCTGTTCTGAATTGGTGACTCTGTCACCAATAACGACTGCGGAGAACTGTAGCGTGCAGATGTGTTGCCCCTCCCTTTTAAAATTTTATTTTCGTTTTTCTATTGGGTATTTGTTTTGTTTCTTGTACTTTTTCTCTCTCTCCTTGCCCCCCTCCCGCCCTCCCCGCCCCATACCTTTTCTTCCCCTGGATTTTCACCCTTTGGGCTGCCTTGCTCATCTTTATGCCCCAGCACTAGGTACGGGGCCCAACACGTGGTAGGCACTCCATCAGTGTTTGCTGAATTGAAAACATTGTTGACTGTGGCTTCTATCAGAGTGTCTACCTTTTGCAGCTCTTCCCCTCCCTCATTTAATTTGCTGCTTTTAATCTACGTGGTCTGAGAATTTGTGAAACCAGTGTTGTTAGAAGTGTATATAATCTGAATCAATAAGCTCTGAATGGTGGCCAAGGGCCTCTCTTATGGCACAAAGATGCATGGACTTCATGACAGCTCTTTTGGTGGCTCAGAAGCCATTTTTTATAGAATCATGGAATCTAGAATATTCCTGCTGGAAAGAACCTGAGAGTTGGTTTGGACCAATTCCCTGGTTTTCCAGCAGATGAAACAGGCCCAAAGAGGTTAAATGACTGGGTGAAAATCACATAGCTGTCTGGTGCCAGAGCCAGCCTATAGTAGAGTCCCCTGACCCCAAGCCCGGTGCTCATTCCACTACCTCTCACACTTCACAACAATTTCCTCAACACTTGAGGGCCCAGAAAGTCTGATCTCTCCAGAATGATCAGCCCAGAGGAATGCTGAGAAATCACCTGGAGGAGGGAGCAGAAAGAGAAGGTTTTTAAGGAGGGGCTTCTGAATACTTGGGAGATACGGAACGGACCAAGGACCACACTCCAGGGTGCATTCGTTGCTCCCTGGGGCACCACTTCTGGATTACAGTGTGCCAGGTCCTTTGGAGGCCCTACCCCTTCCCCATTCATTGCCACCAGTGAGAAATGGGGGTGCCCCTGTGTAAAGAAACCTACCAAAGGTTTACATTTGCACCTTAGCCTCAATAGCTACGAACCCTAGAGAAGCAGCTAGCTGGAGCTCATGTGCAACTCCTGATTCTCAGGAGAAAGATGGATTTTAACCCAAAATTATGAGTGAGCTGTTAACTCTAAAATGTACTTGGGAGATAGGCCAAGCGAGAGGTCATGGGCCAACTAAGTGTTATCCAGTAGAAAAGACAGTACACTGCTTTTCTTTTAGTGTTTGCTTTTCCTTTGCTATATGTTTTGCTATTTCCTTGTGGCTTAGAATGTAAAATTGATTGTTAAAAGTTTTGTTCTGAATAAATATTTATCTTTTGTATTGCTAAAA", 278204 - 1); var codingRegion = new CodingRegion(278228, 279442, 25, 1236, 1212); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 278204, 278687, 1, 484), new TranscriptRegion(TranscriptRegionType.Gap, 1, 278688, 278690, 484, 485), new TranscriptRegion(TranscriptRegionType.Exon, 1, 278691, 280965, 485, 2759) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(485, 487, ""), new RnaEdit(2763, 2762, "AAAAAAAAAAAAAA") }; const byte startExonPhase = 0; const bool onReverseStrand = false; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, onReverseStrand, startExonPhase, rnaEdits); //The coding sequence from refseq does not have the deletion from rna edit. That was manually inserted. var expectedCodingSeq = "ATGGCCACCGGCGGCGGCGCGGAGGAAGAGAGGAAACGGGGGCGGCCGCAGCTTCTGCCCCCCGCGCGGCCCGCGGCCCGGGGCGAGGAGGCCGACGGCGGCCGCGAGAAGATGGGCTGGGCCCAGGTGGTGAAGAATCTAGCCGAGAAGAAGGGCGAATTCCGCGAGCCGCGGCCGCCGCGGCGGGAGGAGGAAAGCGGCGGCGGTGGAGGGAGCGCCGGGCTCGGCGGCCCCGCGGGCCTGGCGGCGCCGGACCTCGGCGACTTCCCACCGGCTGGCCGCGGGGATCCGAAGGGCCGTCGGAGAGATCCGGCCGGCGAGGCGGTGGACCCCCGCAAAAAGAAGGGCGCTGCGGAGGCGGGCAGGAGGAAGAAGGCCGAGGCGGCGGCGGCCGCCATGGCGACCCCGGCCAGGCCCGGCGAGGCCGAGGACGCGGCCGAGCGGCCCCTCCAGGATGAGCCGGCGGCGGCGGCAGGCCCGGGCAAGGGTCGCTTCCTCGTCCGCATCTGTTTCCAGGGAGACGAGGGCGCCTGCCCGACCCGGGACTTCGTGGTAGGAGCGCTTATCCTGCGCTCCATCGGCATGGACCCGAGCGACATCTACGCGGTCATCCAGATCCCGGGCAGCCGCGAATTCGACGTGAGCTTCCGCTCAGCGGAGAAGCTGGCCCTGTTCCTACGCGTCTACGAGGAGAAGCGGGAGCAGGAGGACTGCTGGGAGAACTTTGTGGTGCTGGGGCGGAGCAAGTCCAGCTTGAAGACGCTCTTCATCCTCTTCCGGAACGAGACGGTGGACGTGGAGGACATTGTGACTTGGCTCAAGCGCCACTGCGACGTGCTGGCCGTGCCGGTGAAAGTGACCGACAGGTTTGGGATCTGGACCGGGGAGTACAAATGCGAGATCGAGCTGCGCCAGGGGGAGGGCGGGGTCAGGCACTTGCCAGGGGCCTTCTTCCTGGGGGCCGAGAGGGGCTACAGCTGGTACAAGGGGCAGCCCAAGACATGCTTTAAATGTGGTTCCCGGACCCACATGAGCGGCAGCTGCACGCAGGACAGGTGCTTCAGGTGCGGGGAGGAGGGGCACCTGAGCCCTTACTGCCGGAAGGGCATCGTGTGCAACCTCTGTGGCAAGCGAGGACACGCCTTTGCCCAGTGTCCCAAAGCAGTGCACAATTCCGTGGCAGCTCAGCTAACCGGCGTGGCCGGGCACTAA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void With_rnaEdits_reverse_deletion_utr() { //NM_001317107.1 chr14:22138125-22139232 var genomicSeq = new SimpleSequence( "ATATGGTATGTAACTTATTCTTTGCAAGGCGCTTCTTTAATTTGGAGCACCACGTATCCTAAGGACGTAGACATTTTCATTTTTCTTCTTTTCTCTCTTTTCTCCCCACTAACTTGTTTAAGGCACTCTTCATTTCTTCATTCCTAAGGGTATAGATAATGGGGTTCAGCAGGGGGGTGACTGCAGTGAAAAACACAGATACTGCCTTGTCCTCTGGGAGGCTGGTGGATGGGCGGGAATAGATGAAGATGCAGTGTCCCAGGAACAGTGTAACTACAGTGAGATGGGCTGCACAGGTGGACAGGGCCTTCCACTTGCCCTTGGAGATCTGCTGCCTCAGACTCACCAGGATGACTGCGTAGGACACCACCAGGACCACAAAACAGACCACGGAGATCAATCCACTGTTGGAGACAATGAGGATCTCAAGGACGTGGGTGTGTCAATGCAGGCCAGCTTGATCACCTGAGGTACATCACAGAAGAAGTTGTCAATCTCATCAGGACCACAGTAGGGCAGCTTGATGGTAAGGGAGGTGAGGGCTATGGAGTGGATGGTCCCTCCTGTCCAGAGGGCCACAGCCAGCAGCACACATACCTTCCAGTTCATCACTATCATGTACTGCAGGGGTTTACAGATGGCCACATACCGATCATAGGCCATGACGGTGAGGAGGAAGATCTCTGTGCAGGCAAAGAGGTGCAGGAAGAACATCTGGGTCACACAGGCATCAAAAGAGATGAGCTTTTCCTCTGACCACACGTCTCTCAGCATCTTGGGGACAGTGACAGTGGAGTGGCAGACATCAATAAAGGACAGGTTGCTGAGGAAGAAATACATGGGAGTATGGAGCCGGTGGTCATAGATAATAGTTATGACAATGAGAACATTCCCAATCAGTGTCAGGACATAAAAAATGAGGAACATGGAAAACATAGCTATCCGTGCCTTATGATTTACAGATAAACCTCTAAGCCGAAAATATGTCACTAAAGAAGTTTGATTGAGTAGGATGGCCTCTTCCATTCTCTTTGTTAGACAACCTGTAAAGAATTAGAAAAAAAGTCTAATATAACACAGTATCTGCATCAATCATTTGGTCATTTAA", 22138125 - 1); var codingRegion = new CodingRegion(22138201, 22139150, 83, 1030, 948); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 22138125, 22138561, 670, 1106), new TranscriptRegion(TranscriptRegionType.Gap, 1, 22138562, 22138563, 669, 670), new TranscriptRegion(TranscriptRegionType.Exon, 1, 22138564, 22139232, 1, 669) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(905, 905, "T"), new RnaEdit(796, 796, "C"), new RnaEdit(679, 679, "A"), new RnaEdit(670, 671, "") }; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, true, startExonPhase, rnaEdits); var expectedCodingSeq = "ATGGAAGAGGCCATCCTACTCAATCAAACTTCTTTAGTGACATATTTTCGGCTTAGAGGTTTATCTGTAAATCATAAGGCACGGATAGCTATGTTTTCCATGTTCCTCATTTTTTATGTCCTGACACTGATTGGGAATGTTCTCATTGTCATAACTATTATCTATGACCACCGGCTCCATACTCCCATGTATTTCTTCCTCAGCAACCTGTCCTTTATTGATGTCTGCCACTCCACTGTCACTGTCCCCAAGATGCTGAGAGACGTGTGGTCAGAGGAAAAGCTCATCTCTTTTGATGCCTGTGTGACCCAGATGTTCTTCCTGCACCTCTTTGCCTGCACAGAGATCTTCCTCCTCACCGTCATGGCCTATGATCGGTATGTGGCCATCTGTAAACCCCTGCAGTACATGATAGTGATGAACTGGAAGGTATGTGTGCTGCTGGCTGTGGCCCTCTGGACAGGAGGGACCATCCACTCCATAGCCCTCACCTCCCTTACCATCAAGCTGCCCTACTGTGGTCCTGATGAGATTGACAACTTCTTCTGTGATGTACCTCAGGTGATCAAGCTGGCCTGCATTGACACCCACGTCATTGAGATCCTCATTGTCTCCAACAGTGGATTGATCTCCGTGGTCTGTTTTGTGGTCCTGGTGGTGTCCTACGCAGTCATCCTGGTGAGTCTGAGGCAGCAGATCTCCAAGGGCAAGCGGAAGGCCCTGTCCACCTGTGCAGCCCATCTCACTGTAGTTACACTGTTCCTGGGACACTGCATCTTCATCTATTCCCGCCCATCCACCAGCCTCCCAGAGGACAAGGTAGTATCTGTGTTTTTCACTGCAGTCACCCCCCTGCTGAACCCCATTATCTATACCCTTAGGAATGAAGAAATGAAGAGTGCCTTAAACAAGTTAGTGGGGAGAAAAGAGAGAAAAGAAGAAAAATGA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void RnaEdits_deletion_reverse_utr() { //NM_001123068.1 chrom: chr1:147954635-147955377 var genomicSeq = new SimpleSequence( "TTGTTACTTAGTTTTTATTTCATAATCATAAACTTAACTCAACTCTGCAATCCAGCTAGGCATGGAAGGGAACAAGGAAAACATGGAACCCAAAGGGAACTGCAGCAAGAGCACAAAGATTCTAGGATATTGCAAGCAAATGTGGTGGAGGGGTGCTCTCCTGAGCTACAGAAGGAATGGGTCTGGTGGTGAAAATAAAACACAAGTCAAACTCATTAGAATTGTCCACAGTCAGCAATGGTGATCTTCTTGCTGGTCTTGCTATTCCTGTACCCAAAGTGCTCCATGGCTTCCACAATATTCACACGTTCTTTCACCTTGCCAAAGGCCACATGCTTGCCATCCAACCACTCAGTCTTGGCAGCACAGATGAAAAACTGGGAACCATTTGTGTTGGGTCCAGCATTTGCCATGGACAAGATGCCAGAACCTGTATGCTTTCGGATGAGGTTCTCATCATCAAATTTCTCCCCATAGATGGACTTGTCACCAGTGCCATTATGGCGTGTGAAGTCACCACCCTGACACATAAACCCTGGAATAATTCTGTGAAAGCAGGAACCCTTATAACGAAATCCTTTCTCTCCAGTGCTCAGAGCACGAAAGTTTTCCGCTGTCTTTAGAATCTTGTCTGCAAACAGTTTGATGGAGATGCGGCCCAAGGGCTTGCCGTCGACGGTGATGTCAAAAAAGACGACGGAGTTGACCATGGCTGATAGTACAGGGCTCACAGTGATGGTGGC", 147954635 - 1); //coding region between 34..528 var codingRegion = new CodingRegion(147954850, 147955344, 34, 528, 495); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 147954635, 147954669, 704, 738), new TranscriptRegion(TranscriptRegionType.Gap, 1, 147954670, 147954674, 703, 704), new TranscriptRegion(TranscriptRegionType.Exon, 1, 147954675, 147955377, 1, 703) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(704, 708, null), new RnaEdit(378, 379, "CA"), new RnaEdit(252, 252, "C"), new RnaEdit(242, 242, "C"), new RnaEdit(239, 239, "A"), new RnaEdit(132, 132, "A"), new RnaEdit(122, 122, "C"), new RnaEdit(104, 104, "A"), new RnaEdit(49, 49, "A"), new RnaEdit(10, 10, "G"), new RnaEdit(4, 4, "G") }; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, true, startExonPhase, rnaEdits); var expectedCodingSeq = "ATGGTCAACTCCGTCATCTTTTTTGACATCACCGTCGACGGCAAGCCCTTGGGCCGCATCTCCATCAAACAGTTTGCAGACAAGATTCCAAAGACAGCAGAAAACTTTCGTGCTCTGAGCACTGGAGAGAAAGGATTTCGTTATAAGGGTTCCTGCTTTCACAGAATTATTCCAGGGTTTATGTGTCAGGGTGGTGACTTCACACACCCTAATGGCACCGGTGACAAGTCCATCTATGGGGAGAAATTTGATGATGAGAACCTCATCCGAAAGCATACAGGTTCTGGCATCTTGTCCATGGCAAATGCTGGACCCAACACAAATGGTTCCCAGTTTTTCATCTGCACTGCCAAGACTGAGTGGTTGGATGGCAAGCATGTGGCCTTTGGCAAGGTGAAAGAACGTGTGAATATTGTGGAAGCCATGGAGCACTTTGGGTACAGGAATAGCAAGACCAGCAAGAAGATCACCATTGCTGACTGTGGACAATTCTAA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void RnaEdits_big_insertions_reverse() { //Transcript id:NM_032508.3, chrom: chrX:148678216-148713568 var genomicSeq = new SimpleSequence( "TAAAATGAGGAACCGGTTTATTGAACAGCTTAAGGAGAGCAAAAATAGTGGCTTTAGCTACATTTTTTACACACTGAGCAGGAAAGTCTAAACCATCCCGTTCCCCTGTACCCCAAAGAGAACAGGGCTTGCTGGAGGCCAGTGCCAAGGGCGGAGTCGTGCTCGCAGCAGACTTGAATTAACCCCATGTAGGCCGGCGAGCAGTTGCCCGCGTGAAAACACCACCCTCTTCTCCTGGCTGAGAAGATCAAAGCTCTTTTTTTACCCTCTTTTCAGCAAAGGACCTATTTGTTTTCAGGCAGGAGGATGTTAAACTTGCAGCCTCTGACACACGGTGGAACCTGCAGTGCTTGGAGAAACGGCACGCACACGTGAAAACATCATGCCTACTCCAAAGCCTTCTTGTTGCTGGCAGGAGGGAAGCTTGAGACTTTCCCACGCATAGTCGTGACCCGCGTGGCCGTTTCTGCTCTCAGCAACATTCTCTAGTGTTCCGGCTTCAAGCAGCGCTTGTCAGGTTTGAAGCTAGCCACTATTCTGAGAACGTCAGAAAAGCATGGACCATCTCTTGCTTGGTGTTGCCGTTCTGGCAGTAGCAGCTACTACGTACCTGCACGAGTTCCAGGGCAGAAGTGGCAATGTCCCATGAAGGCGTGGCACCCCACGGGGGGGGGGGGAGTGTGCCACGGGCGTCCACTTCTGCAGCAGAAGGCATGTGCCTACAGCACAAGCTTGTAAAAAAATACTTGAACAGAATATGCTGTACAGAACTAGGGGTTAACACCGCATATGAAGATGCTAAAACATTTGTATAAATACTCTGTATACAAGCATGGAGTCACTCCCGTAGAAAGGGCTCATCCGTGAGGCTATGAAAAACTGCTGTCAGCATGCCCAAAGAGAAACTACTTCCACAGTAGGAACAGAAAAAAGGACTGTGCTGTGTCTAAACACGTGGTGCATCAGAGACATAGTTACAGTTCCTACTGACTGCCCCAGCCACGACCTGGGAGTGCTGAGGACCTGGGAGTGCTCAGCGAGCTGCAGGAGGTCAGCCCTGTGGAGAAATACATTTCTAAACAATACTTTTGATTGGGATTTCAGCACCGTATAGACAGATGTTCCTTCTGGGGGCCTGGCAAGCAGCCATCTCCCAGTGGGTCTGACGGGGAAGAGGGGTACCTGGAGCCCCTCCCAGACAGACGGTAATCCCACCCCTGTTCTCACACTCTTCCTGGCATCCGCATCTGCTGGCACACACCCCCGTCACCTGCCACTTCCGCGTCCCGTCGTGGTGAGTGGCTGATAGGCGCTGGATGCAAACAAGGCATGAGATGGACGTACCTGGAGACCCAGCTCCAGTACTGGTTCTGGTCTGCGGGGTGAACGAGGGGGCAGAGGAAGGCGGAGAGAGTGCGTCCCAGTCCACTTAAGCTCTGTCCCCGGAAGTGGCATCTAATCTGGCATTTCGATATTTAATTTGGGAGGTGGGAGCACATACTTCCCAGGGCTCTGGGTAATGACCACCCTGGCCTTCTTTCGAAACATGGGTGCGATTTTAGGGGGCTCCGGAACTGGGGTCTCTTCGGTTTCTTCATTATCTTCGTGATGGAGATCATAGGAAATGTTTCCATATTCTCGTAGAAATGGGAAGATTTCAAGCAGAAACTGACAGAAATCTTTGCGGATACCAAACCACCCTGAAAAATAAGAATTTTTTATTTCACACACGAGGCTCAACTGACCTTCCTGTTAACTTTCTTTCCGTAACAAGAAGTTTCACTCCTACAATGTCATAACATACTTTATCCAGACTCCTGAGTCACAAAGCCTGAACAGGGCTTGAGTACCCAAAATGGGGAAGAAGTGCAAATGCTAGCTCTGTGGTGCTTGGAGTGGGGTTCCCGGACCGGCAGGGACAGCGTCCACGGGGCCTAGTTAGGGATGCCATTCTCGGGCCCCAGCCCAGACCTCCAGAAACTGAGTCGGGCTAGGGTGGGCTCCAGCGGTCCCCTTTTCCTGGCCCTTTTGGGATTCTGCTGGATGCCCAAGTTTGAGAACTACTGCTCCAGTGAGTCTCAAAATATCTGTGGTGCGCAGACTACGGTGTCTTCCGCTAATCTTCTCCAGCCAGGATAAACTCATGGATGACAGTGCCACCCAAGAACAAGATTTCTGTCACCCTCTGGAATCCGTGAGGGCGGTAGTCATGCACGGGTCCTGGCCAGGAGGGGGCCTGAACTCATGGAGCCACCTTAAAGCCACTTTCCCAGTCCCACTACTCCTCTCTGTAGGCTACTGGAGTGTCAGCTCGGTGCAAGCCCTCCCTGCTCCCGGGTGCGGGGTAGGGGGCAGAGGCACAAACAGCAAGCACAGCCCGGGCTGCTGGGCTGCAGTGAGGCCCTGCCCCCAAACCCACTGGCTTTCCGAAGGGCAATGCTCTGGGCTTCCGTGCCATGGAGCCCACAGCCTTGCCAGGAAGGCACCCTCTGCAGAGATCGTTTTGGAAGTGTCTGCCTCAGCAAGCAGGTGGAGGGGAATAGAGTGTTAGCAAGGCAAGACAGGCAAGACTCGGGTGATGGCAGCAAGGATATGGGGGAGGCAGAGAGGCCAACAGGGACCTAGGATGAATCCCAGGTTTGGGTGGGAGATGTGGATTTTCCATCAAACCCTCCCGGGCCTGGGAAGAATCTGTCTTGATCCCCATTTTGCAGAGGAGGGAACGGGATCTCTGAGAGGTTGCCTGCCGTGTCTGGTTCTACCTCAAATGGCAGCGTGCACTGCGAGAAAAGTCCCGGTGCAGGCCAGCAGAACACCAGAGTTACGGCATGCCCTTCCCTTAGAAGGTCCCAGAATTTCCTCAGCCCTCACTTTCCCACACAAGCTTCTAAATTGGGGCCCTCGGGGACTCATCCCTTCCTAGACTTCTATCCGCCCCCCCCCCCACTCCCTGGTCCCCCCCCAGACACACACCAAGGACTTCTGAAATGCTGAGTACATACAGTGGTTTCCTCCCTTCTGTCCAAATGTGGTTGCCATCAGCGTGATCAACGAGAGCCAAAGGGGGACAAAGATCGGGATGCTGGAGAAGGCGTTGTGGCCATCCAGTTTGTGAACCAGCAGAATCTAAAGAAAGAGACATAGTCCCGGTTGATGCCAGCACCGAAAATGGGCAGAGGCGGAAGCCAGACTTCATTAGGCAGTTCCTCCCCACCACCCCACCCCCGCGTGAGCTCCCACAAGAGGGAACATCAGCACCGCCAGAAAAAGGCAGGAAACCACCTATCCCTGGGGAAAGCTCGAAATGAGCTTTTATGTCCCTCTTCAGAGCTCGGCAATAGCCTATCCACTTGAAAAGTTCCCAGTGCCAGCAGTTTTATGGCAAACTCCTCCGGGTGTTTGTTCTAAGGAGTCAACAGCTCCCATTCTAGAATTCTCCACGTGACTCCAATACACAAATCTGACATCCCACTCTGCTTTCCCCAGAGTGGAAACTGGAGCCATACAGAGGCACCATGGCTAAAAAGGTGCACTCTTCTCCCTGCCAGCCCCACGTGCTGCCCCCAAGAGAAAGGAAGGATGCTCTCCTTTCACCGAAGCTCCCTCTCGGAGATGGCTGTGTTCTCTCCCCTCTCCTGGAGTGGGCTCACTGTGAGCTCGAGGGACAGAGGCTGCCTTTCTAGGGGTGCAGAATCCTGTCAGGGGAAGCGCAAGCTTCAGGGGCTGAAGAGGCTTCCCGTGGAACGCTTACCTCAAATGTAAGAAGGGGCACGACGATGGTCATCCAGCTCAGGGCCATGGTTATGTGTGTCCTGCGCTGCTCTGCAATCACATCCATAGAGCGCAAGAACAAGACGGACCACACAATGTAGTAGAGGACCACCAGGCACAGAAAGGACATGAGAATCCACAGCGGGACACACACAACCTGGGGGTGGGTGAGAGAACAGCAAGAGAAGTCTCTTTAGAGCTTCCAACCTGGCCTCTGATGGAAGGCATCTTTAGCACCTTGCTGTGTCTGTCCAGTTAAGGCGGTCCTTCCCGTGAGCCGAATAAGGACCGTTCCATCTCCCAGGACTGCTGGGAGCATCGCTCAGGACAGAAAAGGTATGGTATGTTCACTATGGGGCCTGCTGCCACCAGGGGACACACACGCTCAGTGAGTCATCAGTCCCTCTTCCTTTGGGTGACAGACAGCCCTGCACCTGGCTCCGCAGCCTCTACTCTTCCAGAGGCCCACTCTCCCACACTCTCTCAGGCTCCTCTAGGTTCTGCTGCCATCACAGCTTCCCGGGAAATGGGACACAACTGTCACCCTGTGCACACACACAAGATCTCACCCCAACAGACTCTCTTCACAGGCAACATTCCCACAACCTGCTGGGGGTACTTTGGCAACACAAATGGGAATGGGCTCCCCAGAAAGTCTGGCTGCCTGGGCTCCTAAGGATCCCTAACCTCACCCCTACCAAGTTAGTGAACTTGGCGGGTTGATGCTGGATACAGGTTGATGCTGGATACGTAGCGCTGCCGGGTCCCCGCCTCCACGGCAAGGGCGCATTCCCAGTATGTCCCTGTCGTACCAGGTAGACCTTGTCTCATCCACACACAAGCCCAGAGGACGAGTTCCGGGGGCGCCACTTGGCCAGGCTCCCCTGTGACACGTCTTCGCCCTCCTGCCCTGCCTCCTGGGACGACACTCCTCCGTTCTCCCTTTTTATTAATTATCTATCATACAGTAGGAAAAGTGACCGTCTTCCTTTGGTGTGAGTTCCCTGAGTCTTCACACAAGTAGATTCGCACAGCCGTTGGCAGGATGCAGAAGAGGTCTGTCACCCTGCAAAACTCTCCGTGCTGTCCCTTCACTATCACACCGTCCCCACCATTAGCCCCGGCAAACACTGATCTGTTCTCTGTCACTGTACTTTTGTCTCTGCTGGAACTTTATGTAGATGGCATCGCGAGACAAGTAACCTGTTGAGACTGGCTTCCCGCCATCCACATAATGTCTCTAATGAGATTCATCCAAGTTGTTCCCTCCCTGTATCCACAGGTCGTTCCCTCTCAGTTCTGAGTGGTATTCCATTGTATGGATGCCTACAGTTTATCTGACTGTCCGCTGAGGGTGGTTTGTGAAAACCAAACAAGGCCGCTATCCAAAATGCAAACAAGACTGCTACAAACACTGGTGTGTGGGTTTCTACAAGGCTGCGCGCTTTCACTTCTCTGGGGGTAAATCTTATACCCAGGAGTGGGGCAGCCAAGTCCCACGGGAAGTGCGCTTTTAACTGCATCAGAGATGGCCAAACCATTTTCTACAGTGCCCGTACCACCTGCCTTCCCGCCAGTAACACTGGAGTGTCCCAGTTCCTCTGCATCCTCCCAGGCACGTGGCCTCGTCAGTGTTGCTGAGTTTCACCATTCTGAAGCCATGTGTTTCGGGCCCTCATCCTGGTTGTAGTTTGTCTTCCCTAACCTGTAATGGCGTTGAGCATCTTTTCCTGTGCTTTTTCGCCATGTGTATATCCCCTTCGCAAATTGTCAACTCTTTTGCCAATTTTTAGGTGTTTCTTTTTGCAGTTTTGAGTTTTTAAGAGTTCTCTGTATGTTCTGGGTGCAAGTCAGTGTTTTGATGTGTGCTTTGCAAATATTTTCTCCCAGTCTGTGGCCTGTCTTCATTTCATTTTAATTTTGAGGAAGTCCAAATTTATGATTTCTCTCTCGTATGGACCATATTAACAGTGCCATTTCTAAGTACTCTCTGCCTAATTGCAAACCCCAAAGATTCTGTCCTATGTTATTTCCTAACAGATCTATAGTTTTACATTTTCTTTTAGATCTATGATTTGAGTTGGCATACGAATTTTACTTCTCCTGACCAGTTCTTGATTGTAGGTTTCTCTGCAGAGTCTATGCACAGCCTTTCTTCCCCGTTCCCCATTCTGTGATGAGATTCTCCTTTTTACTGAAGTTCCCTTCATGGGTGGAATGTTAGATCTCAATAGGCTTCCTTGTTTTCTCTTGCTCACTATGGGAAACATGTACTCAGACTGCTTCATCAACTGGGATCTACAGAGGATGAAGGGCAGAAAAAATCTTTCTTCTCATTTGTGGAAAGCTCTCAAAATTACTAATCTTTTTTTTCCCTGACAGTATCTTATACGAAAAAATTTGGTCTTGTTTAGATATGTTTCCTTCACGTCACAACAGAAACAGTTTTGAACCCAATGACCATTCTCCAGATACAGCACTGTGAAGTTGTAGGATGAGTGACTGAATATTTTTTATTATGAATGTTTTATCAAAGACTTTGGGCAGAAGTGATTATTCCCATCTTTAAATATGGAGTATACTTAGGTTCCCTTCATTTCTTCTCTGCTCCCAATTCTTTACTATACTTTTCACTTTTTAAGGGTAGCTATATTTAATACATAAAATATATTGTATGCAAAATTATACATCAAACAACAGAGAAAATAAAACCGAACAAAAACACTAGCATGACCTTACCTCCCAGTGGCAATGGAACTCTTCTGCCTTTAGTCTCCATCTTTTTTCCATGCATTTAATAGTGGAATCTATACTGTGTTCTCTAATTTCTACCTTGCCACTTATCTTTCTATCTCTGCATCCATCTACCCATTTATTCACAGTTAATTTCAACCAAATGCCCAGTAACTGAAGTCACAGTTTAGAAGCATGACATAGATGCCACCACCAGCAAGAGTGTAAATGGGTATGGCTTTTTTTTTTTTTCTTTTGAGACAGGGTCTTGCTACATTGCACAGGTTGGTCTCGAACTCCTGAGCTCAAGTGATCAGCTCACCTCGGCCTCCCAAAGTGCTGGGACTGTAAGTGTAAGCCACTGCGCCTGGCCAGGTATGGCTTTTGGAAAAGCAAGTTGGCAGTGCAGTATACGTATATAGGAATCTCAAATAGTTCCCAACCTCTAGCTCAGTAACACTATTTCTGGACTATTTCCTAAGAAAACAACCAAAAAACAAAAGGCAAAAATTTTAATGCATAAACATATATACTGCAGTATGATTTAAAATCATTCAACACTGGCAACAATGGAAATACCATATTTTAGAAATAAGAGGATGGTCTGATACATGCACTTGAAGAATATTTTGTATCAATTAAACTTTAGAAGTCATGTTTATAAAGACCTTTTATTAACATGATAAAATGTTTATGATACAACATTAAAACAAAAAAATCAGGATACAAAATGGTGCACACAGTTATATCCAAACTGTTTGTATAAAACACAGACATAAAAACACTAACCATGTTATCTCCCCATGGTGGGATTATGGGTGACTATTAGGCTATTACTTCTGCTTGTCCGTCTTTTCCAAGCTTTGTACAGTGAATATGAATTACTTTTATAATAAAAAAGAAGTTTATTTAAGGATTTTAAAAGTTACATACAAGCCAGGGCCAGTGGATGATCTTGTCCAGTCTTAAGGCAATGAATATAAACTGGAGAATGTTGACAGAACACAGGATTTCTAACTAAAAATGAAGAGAAGAATCAGTTAAACAAAGTATAATTTGCATTTAATACTGCAGTAATTTGGTTAACACACTAAAAGACAATACACATTATAATACAGTGTAACTTGTATAATATTATATTGCACTGGAAACTCCTGATTTTCGGTACCAGAGGGGCACAGCAGTATTATGGCAAGGGGAAATGGGGCTCAGCCTGCTGCCTTGCCCCTCTGCTGCCTTGCCCCTCTGCTGCCTTGCCCTTTCTGGCCTGAGGCCTGAGCACAGATGAGAACCTTGTTTCCAGGTTCACTGGGAACACAGGCTAGCTGCAATAGACCACTAAGCTTCCTTCATGTCCCTACCAACAATACTGTCTACTCGAAACCATCCTTGTGCATCCTTCTTCTCATACCTCGGTTTTGCCCTTGGACCCCACCCCTTCCTTTCAGGCTCCTTGAGGACCTTGTCACATCAAGCCATCAACTGTAGATTCCGTACTGTATCTTTAACCCCACCTATTCCATTGATCCCTCCTAGCAGCACAGAAATGACTCCTTCTCATTAAAAACAAAAGCCCCGAAACCAGATAAACCCACCCCATCCTTTGATCCAAGACCCCTCTTAGCCTCAGGTTTCTCTCTGCGAGACAGTAATGAATATGCAAGCCCCTACTATCTCATCTACCATTCCCAGCTTCCTTGTGATCTGGCCTCCACCCCCATGACCACCAGAAGACAGCTTTCATCAAAGTCACCGATAACCGGGAGGCAGCAAAAACACTTCAACTGGCATCCTACTGATACTGTGGATGACTCTTTCCTTTTTGCTCCCCTCCTCCTGGCCATCTAGGATCCCACCATACAGTCTTGGAGCCCTCTGGAGTGTTCTGGTAATTCCTTCTGTCTCCTTTTCCCCCTCCTTAAATGGTGATGCTCCGCAGAGCCACAAACTCAGTCTTCTCTCCCGACTAGGAAGACAGACACACAAACACACATATTTTCACTCACCCGCTGCACCCTGGACTGGGGGAGAAGAAATTTCAACCAGACCCTTGGTCTGCATTATTACCTCCTTCATGGTTAGTTCTCACATCTGTCTCCAACTTAGCCTTCCTGCTGCATAGCAGACCCAGCTGTGCATAGGCCACTCGGCTCTCCTAAAGGCACCTCAAATGAAGCCAGTCCCAAATAGAGATCATTATCGCCTGCACTTAGAACCTTGTATCTCCTCTTTCTATAGCCTCCCATGTCACTTTCTATGGCCTCCCATGTCACTTTCTATGGCCTCCCATGTCACTTTCTATGGCCTCCCATGTCACTTTCTATGGCCTCCCATGTCACTTTCTATAGCCTCCCATGTCACTTTCTATAGCCTCCCATGTCACTTTCTATGGCCTCCCATGTCACTATGGCCTCCCATGTCACTTTCTATGGCCTCCCTTGTCACTTTCTATGGCCTCCCATGTCACTTTCTATGGCCTCCCATGTCACTTTCTATGGCCTCCCATGTCACTTTCTATAGCCTCCCATGTCACTTTCTATAGCCTCCCATGTCACTTGGAGGCACAACAGTGGTCTCATTTCTCAAGCCATGAGCCCGGGAATCACCCTGCATTCTTTTTCCCACACTGTCACGTTCAGTCACCACATCCTGTCCACTGTAAAGTCCAGATTTCTCCTGAATCCCGTGCCCACTTCCTATCCTGATGATGACTGCCTAGTGAGGCCTTCTTCGGCACCATCCGCCTACCACTTCAGCAACCCCTTACTAACCTCCCGCCTCCAGCCCAAACGCCCTGCAGGCTGCTCCTCACTCTGGTAAAGTACATTCTTGCCCACAAAATTGAAATCTGGGACCAGGCCCAGAGCTTCCCAAAAGTTCTCAGTACATAGGTACATGGGGAATTTAGTAATTCCTCCATAGACCTCTAGGCCAAAAGAAATCCCTAATGTGGCTGGGAATGCCAACAGTTGCACTGATTTAAGTAATTACGTCTAACTTAGTAAGTATTTCTAAGTAGCCACCAGAAAAAATAATTCATATAAATCTAAAGAAAAATGTTAATATTATTCTTAAATAACCAAAACTAATTCCCAGTGGGATGCGTGTGCCTGTCAGGTAGCTCACCATTTCCCACGCCTTGGAATCAGACAAGGTGCTCCCACTCGTTACCTGTTCTTCACCCGGATTTTCACACAGCATTAGCCTTTTTTGTTTTCACAGCAACTGCTGAAAACCCAGCTTCTTAAAGATACGACGTCACTGAAAGGAATGCAGTGTGGCCTAAAAATAAACCTGTAAACTATTTCAAGCTAGTAGTTTGTATGGTGTCCCAACAAATGTCAGGTATTACTGTTTTCCTCAAAATGTCCACTATCCCCTGGTGCCCTGTGAGTGCACTGGGGTGCCTGGGGCACTTCAGCACATGCTCTGTGGGCTGAGGACGTGGCCCCTATTTGCCCCAATCCCTCTGTAGAGGCTCCCTCTCTTCTCAACTCCCACAGTTCCTTCCTTTCCACCTTGTGCTCCCGTCGGATCTGAATCGCCACAGTCCACTCAGCTGATGGAGTGTTTCCTGCCTCTAGGCTTCAATGTGTCCCAAAAATGCCATCCCTTCCCTCCCAACACAGGGCTTCCTGGCAGACCCTCAACTCCTCCTTCCCATCTCTGTATGAGCCTACTCCAGACCACCCCCTCACCAACATAGGTACTGTTCTTGCATCACAGGAGGAGGGAGCTCAGCTCCTGGTATGTTGTTTCTTCCAAGGGCAGGAATGCCTAGAGTGTGAATATATGAAAGACTTACTTTCTGTAATTCAGGCCCAATGCAGTCCTAGTCCTTGTATAGTTGTCCCTCAGTATCTGTGGGGGATTGGTTCCAGGACCCCCTGTGGATACCAAAATCCATGGATGCTCAAGTCCCTGATATAAACTGGGAATTGTAGGGAGGGTGAAAGTGGACCATCAGATACTCTCCCTCCACGGCTCACTCCTACCTACCACTATACCCATGCTGTGGCCTTTCCACCAGCATGAAAATCAGGGAATAGCTCCCCTTGTTCAAGGCCAAACCCTTCCTGGTGCTCTAGGTCCCATTCTCCCTGAGTCCCACAGGGCCTTGCTCCATCACCGTCCCTCTGTCTACATACCTTCCCCTCACGCCCATACCTGGGGTCTAGGCTTATATGCCTGACCCACCCTACAGCTGCTATGTTTACTTCCTAAGCCAACTGCAGTCTTCTCCCTTCACTCTTCATCCACACGGCTCAAAACCAGGCATCTACAGCCTCTAGTTCTCCCCTCAACCCATCAGTATTCAGCTTTGGGCCCTCAGCTTCTATGCAGTTATGTAGTTATGTGCCCGGGCTTTAGAGTCGGGCTGACTCAAACGGAATCCTGGTCCTGCCCCTTCACATGTGGCCATGAACAAATGACTTATCTTCTCTGGACCTACCTCACAGAGTTAGTTAGTAAGAAAACTACCACATGTAATGTGCCTCGCACAATGCCTGGCACACAGTAAGTGCTCAATAAACGTTATCTGCAATTACTTTCATTACTATTATTACTAGTCCTGGTATTTTATTCATCTGCATATCCTCTATGCTTAGGGAAAAAGGGCTTGGCATCTAGTAAATACTTGATAAATGTTTATTGAATGAATAAACAAACACAGGGGCACATCAGGATAAGCTAACCAGACAGCAGGGGAGGTGCTAAATCATGGGGTCTGAGGTGGGGAGATGGTCAGTTTTGAGTGTCAACTTGGCTGGGCTATAGTACCCAGTTATTTAATCAAACACTAAGCTTGCTGTTGCAGTGAAGGTACTGACTTTGAATAAAGGAGACTACCCTCCATAGCATGGGTGGGCCTCATGCAATCAGGTGAAGGCCTTAAAAGCAAAAACTGTGGTTTCCTGGAGAGGAAAAAATTCTGTGCCAGGACTGCAGTGTCAACTCCTCCTAGGTCTCCAGCCTGTTCACCTGCCCTGGAGATTTCAGACTTGCCAGCCCCACAGTAATGTGAGCCAGTTCCTTAACTCTCTTTATACATATATCTGTATCTAACCTATCAGTTCTGTTTCTGTCTGATTGATACAGGATGTGAAGCTGGGAGAAGGCTGATGTCCTGGGTGAAAAGCTAGTGTTCTAAGTGAAGAAAGAAAAATTGTTATTTCCATCATCTTTTGTTGCCTTGTTGTCTCATGATGTAGAGTTGGTAATGATCAAGCTCTTCCTAACAAAGGGTAAGAAATTGACATCTGAATAACTGAGCAAAATATTTTACTTTTGAAAACATTCTTTTTTTTTTTTTTTAAGACGGAGTTTCATTCTGTCACCCAGGCTGGAGTGCAATGGTTCAATCTTGGCTCACTGCAACCTCCGCCTCCTGGGTTCAAGAGATTCTCCCGCCTCAGCCTCCGGAGAAGCTGGGATTACTGGCGCATGCCACCACACCCGGCTAATTTTTGTATTATTAGTAGAGACGGGGTTTCACCATGTTGGCCAGGCTGGTCTCAAACTCCTGACCTCAAGTGATCCACCTGCCTTGACCTCCCAAAGTGGAAAACATTCTTAAATATATGAAATCTCACCTCTAGTGACCTGTCATGTCGAAAGCCCCAAACGCAAGCTGCAACAGACACCGGGGAAACAAAGAACAGCGGCATGAAGACCAGGAGCCAGAAATGGCTTCCTCTCTCGATTCTGTCACAGACCAGAACTTCAAACATCAACAAGAGCAAGTGGATGCCCACTGCAATCAACATGGCTTTAAACTCCACACACGTTTCTCCTTCTGCTCTAAAAAAGGGAGAGAAGAAGAAAACACCCTCAGTTCAGAATCTCCACTATAAGCAAGCAGTTCAGGGCAAATACCAACTTATATTTATACTTTGAATTTTACTTGAAAATTTGACAAAAGCAAAGGGAAATCAGGTAGAAAGCTAACTTAAACCTAAGCTTTGGTAGGCAATCTCTGAAACATCGAAGAACTACTACATAATACAAAATGAACATTACAACCAAACCAGAATTTAATGTTTTAACCGTATAAGGATATTCTCAAAAGTAATAGCCAGTTCTTATTTCCCTGACAATGTACATAAACACTTCTGTTCACATCTTTAAATTCAACAACAAGAGTTACTTCCAAGATTATTCAAGCTGATTTGCTTCTGCTGCTAAAACCAGGCAAATACCCCTTAAGTCTCATGATCCTCATTTTTCAAGAAACCATGTAAACCACCCTTCACACAATATTATAAAAATAACTCTAGTTCTATGAACAAGTGCCAGTTATATTTCAAGATAGTAGTAACTATTGTTAGGTGCTATTTTTAAATGCAAATAAAACGTATAAATGATTTTCATTTTCCTTTCTATTCCATTAAGATAAATTAACACCTGCAGATGAAAGAGAAAGAAGAACAAGAGTTAAAACTGTTCTCAAACAAAATCAGTTTAATTAGCTAAGTATCATGCACAATAACCTTAACAGATCTACAATTGAGCAATGGTAAGGCCGCTAATCAGGAAAAGGCTCTATAATGCATCTGAAAGGCCTACAATGTTTATTCAAAATACAGATGAACATTTATGATATACATGTATTGTGGGTGACAAATACACCGGAAGTTAGATCATAGAGAAAATGCTATCAGAGGTTATTCCTGTAGGACCCAACCATGTTCCACTGGTTAATGTTAACATGAGAATGACCACGCCTGTACATTCCTTACATTCAACCCCACATACACAATTCCTTTCCTTGCTCAAAACATCTTAAATAAGACCAACAAAGAGAAGTTTGAATATATTCTAAATATCAATTAGTAGAACCTAAATGTTTATTTAACTTTGCATTCTTTGAGAAGCAATTAATATTAGATATCTGAAAATATCTCATAAAAAAATAAAACACATACAGCCACACAGGTCATCAATCTTTCAAAAAAAATCTAAGAACTCTGAAACAGCTATACATGAATGTCCCTCACCTGCCAAGGCTCTCTGTGTAACTGTTTTACAGTTCTTAGACATGTATGTGATATGTAATTTACACAATCTGAATCATTTTCATATTTAGTAAACAAAAATTTAAAAAGTTGATGTAGTGGCCGGACGTGGTGGCTCACACCTGTAATCCCAGCACTTCAGGAGGCCAAGGCGGGTGGATCACCTGATGTCAGGAGTTCAAAACTAGCCTGGCATGGTGAAACCCTATCTCTACTAAAAATACAAAAAATTAGCTGGGCATGGTGGCAGGTGCCTGTAATCCCAGCTACTCAGGAGGCCGAGGCAGGAGAATCACTTGAACCCGGGATGTGGAGGCTGCAGTGGGCCGAGATCACACCATTGCACTCCAGCCTGGGCAACAAGAGCGAAACTCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAGGTGCGGTAGTTTCAACTTTACACTTTTCCCACATGAGCAGCTGCCTTCTGGGAATTCCTGTACTCCTCATTTTCCCAGTGGAGGTTCATAATAGCCTCCCAGTCTTAAGTCCCCCTTTTTCCCTTTATGTAGTTACAGTCTCTGTGGCAGAGAAGGGAAAGCCTCTCGCAGGTCCCAGCAACAAGCAGGTTGCATGCTGGCGTGAGCCAACTCCCAGGAATTTGGGCCAGCGGAGGCATCCAAGAGCAGGGAGGGCAAGAGTTGGAAACGGTAAAGGGTACCCCTCTCCCCTGCCCCCAAAGGCTCTGCTTTCCTTCTCGGCATCCAATCTTTGACTTTCCTCATTCCCCAGCTGCTGTCTCAGGGACTCATGGTCTCCTGGTCAAGCCACCTCCCCTACACTGCTAATAGTCCTAAAGCTCTGGGACTAGGAGGGTGGGACAAGGGGAGCCCCAGTTCCAAAACTGTATTGGAGAAAGATCTTTCATGACCAAACATAATATGAGTGTCTTTTCCTAAAAACAGGGTGGTTTCATGCTGCTTAGTCTAGTATGGCATACCAGTTCTGTATTTTGGGTGCATTTTGGATTATATAGGCATCTGTGAGCTGGCTGGCAACTTACCCCCAAATGGCACTGCTTTTGTAAGAAAATACATACCAAAGACTAAATTTTCAAAAACACAGAAGAGATCTGTTAGCTTATACTATAGTTCTAAGACCCCAGATAGGTAGAAAATAAAATGGTCCTTACTTCATCAAAAGTGAGAAAAGTCAAGATATTGCTCCCTCATGCTAGAGACCAATGGGTTGTATAAAGCAGTATTACCGATATTGAGGATTTCGTGCCCAGACTCCAGTTCCAACTGAGGCTCCAACAATGACCATTAACTTCCACAGCCATATTGGAGCAAAGACAGCCCAGTAACTCCACTGTATGATGCCATCCAAACGAAGGGCCAGCAGCACAGAGAACAGCAGCAGACAGGCATAGATGAGGAATTTACTAGGAGAAAAGTAAAACGATTAAGAAGGATTCACTTTTACAAATATGTGATACTGAAATGGGGAGTAATAAGAGCCACATTTGTCAGCATGTAAAAGGAGTCACTAACTCAATAATCATTTATTGAAAAGGTCTATGGGGCAGACATAATGTGTTGGGAATAAAAGAAACATAAAGAAGACTAAAATAAGTCTCCTGCTTTCCAAGGCTTCCTCATAGAAGGAAGACCACACAGAAACATATAATACAGCACAATGTTTGTGATGAGAGCTTGGAAGAGGAAATACAGACTGTGTCTGAGGAGGCACTCAGAAGCAGAGATGTGGTGACCCTAGAGCTGGTTCCAGAGGGCAAGTAGGAAGCTGCCAGGCAGGAAAGCAAGTGAGATGAGGAAGTATTCCAGGCAGAAGGAACTAGCTATACCAAGACACAGAGACTGGAAAAGGCTGACATGCTCTGAAAATGGTCAAGTTCTATCACTAACTGATTCTATTTCTAAAAAGGCAGCCATCTGTCATATTCATATGGCATGAACATTTTAGTGTATGTATTATACTTTCCATGAATGAATAAATTACACATACACACATGCTCACGTCTCATATAAAAGGGAATTGCTACAGAGGATGTCCTTGAAATAATTAGAAATTATACTCTTGAGGACCTCTATTTCCAGCCTTGACTTAATAATAGGAATATAATTTACCTTCCCGCCTAAATAAGAAGCTTGATACAGTCTACAAAAGAACAGTTTTCAGACATTGACAACAGGCAGTGGAGAACAGGTGAGAAGGAGGAAATAAAGGAGGTAAACCCTACTATTGCCCCAGTTTGCAGATCAGAGGCAGTTTCCAGGCTGCAGCAAGAAAACAGTTAAAACTCAACCCTGTCAATAACTATATTAAATATAAATGGCCAACTGAAAGACAAAGATGATCAGATTGGATAAGTAAGCAAGACAACTATATGCTGTCTGTAAGAATCCCACTTTATCTATCTATAAAGACACAGATAGATTAAAAGCAAAAGGAAAGAAAAAGTTATACCAAATAAACACTAACCAAAAGAAAGCTGGAATGACTATATTAATATCTGTTTAGTCTTCCATTGCTGCTGTAACAAATTACCACAAACTTAGCAGCTTAAAACAACGTAAATTTATTATCTCACAGTTCTATATGACAAAAGGCGAGACGGGCTTGGTTGGTTTCTCTACTCAGAATCTCACAAGGCTGAAATAAAGATGTCTGTTGGTGGAATGCTTATCAGGGGACTCTGGCAGAATCTACTTCTAAGCTCATTCAGGTTGTTGGCAGAATCCAGTTTCTTGTGGTTGTAGGACTGAGGTATGTGTCTCTTTGCTTGCTGTCACGCAGCAGCTGATCTTGCGATAGTAGGGGCCTCTCCTGGGTCCTTGTAAATAGGCCCCTACATCTCAAAGCCAGTAACAAGCTATAGCATATTCAATCTTTCTCATGCTTGGGATGTTTTCTCACTACTTCTGCCATATCGCTTCTGCTTCCACTGAGAGAAAGTTCTCCGCTTTTAAGAGTTCATGTGATGAAACTGAGTCCACCTGGTTAAGCCAGGCTACTCTCCCTATTTTAAGGTCCATAACTGTGGTTGGTAGGCAGAATTCTAAAGAAGTTTCCCAGGATTCCTGTCCCCTGATTATTCAATCAAACACTAATCTGAGTAATACTGTGAAGGGACTTTGCAGATGGAATTAAGGTTACTAATCAGCTAACTTTACAATAGGAAGATTATACTGGATTATCCAGGTGTGCCCAGTGTAATCCCATAAGCCCTTAAGAAAGCAGAAGAGTAAGTCAGAGAAATGTGGTGGAAGAGAGATGAGGCAGAAGTCAGAGAGATTCCAGACTTGAGAAGGATTCAGCCTGTTACTTCTGGCTTTGAACATGGAGGTAAGGAACCATGAGCCAAGGAATGCAGGCAGGCTTCAGAAGCTGAGAATAACTCGCAGCTGACAGCCAGCAAGGTAAATGGGACCTCAGCCCTACAACCCCAAGGAACTAAATTCTGACAATAGCCCAAATGTGCTTGAAAGCAGATTAATCCCTGGAGGCTCCAGAAAGGAATAGAGCCCTCCTGACACTTTGATTTTGACCCTGTGAAACTAGGCAGAAGACCCATCTGAGTTGTGCTGTACCCGGACTTCTGACCTAAAGAACTGAGAGTAATTTGTCATGGTAACAGCAGAAACGAATGCTAATAAATATAGCTTCAAAGTCCCCTTTGCCATGTAAAATAATAACATATTCACAGGTTTCAGGGCTTAGGGCCTGGGTATCTGTGTATTTGTGGGGGTGGGGCATTCTGCCTACCACAACATAAGACACAGTATATTTTTGAACAAGGACTATTTCCAGGGACAAATAGAGGTAGTTCATAATGATAAAGGGGTCAATTTGTCATATGCCTAATAACAAAGTTTCACAATACATGTAGAAAGTACTGATCAATCTAAAAGGAGAAATAAAAAAATCAAACTGTTATAAATGGAAATTAACATTCCTTTCTTAGTAACTAATAGAACACATAAACAGAAAATTACTAAGGATATATATGATTGTAGCAACACTATCAACCAACTTGACCTAATTAATATTAATGAGTTCCTCCCAACAAAAGCAAAATACAGATTCCTTTCAAACACACACGGAACATTCACCAAGATAGATTGAATTCTGGGCCATAAAACAAAGCTCAACAAATTTAAAAGGACTGAAATCATACAAAGTAAACAAGCACAATGGAGTCAAACTAGAAATCAACAATAGAAAAATATCTGGAAAATTCTCAAAATACTTGAAAATTAAATGCCACACTGCGAAATAATCCATAGGTCAAAGACTATGAAGAAAATTGAAAAATATTTTGGACTAAAGGCAAAAACACAATATACCAAAATTTGTGAGATACACTAAAGCAGTACTTAAGGGAAATTTTAGCATCAAATACTTACATTAGAAAAGATATCAAGTCAATAATCTAAGATTCTATCTTATGAAACTAGAAAGAACATGGAGGTAAGGAACCTCCATGTAAGAAACGGAAGGAAATTTTTAAAAAGTAAATGGAAAGAAGAAAATGATAAATGTAAGCACATTAATCAATAAAATACAGTAAAAAGGGATTAGAGAAAAAAATCAATGAAACTAAAAGCAGTTTCTTTGAGAAAGTAAGAAAATTGGTAAATCTACAGCCAGAATAATCAGTAAAAAAGAGTAGGCTCAAATTACTAATATCAAGAATGAAAACAGGGATATCACTACAAATCCTAATAATACTAAAGGGATAAGTAGGGGATATTATAAACAACTTTATGCCAGGAAATTTCTTGAAACAAAGACATGGAAACTGCAATTCTAGTTAAAACCTTTTTCAGAAAGAAAATGTCAGGCCATGGCAAAATCTACCAAACATTTAATGAGGAAATACCACCAATTCTTCACAAACTCTTGCAAAAAAGATGAGGGGGGAACATTTCCCAATTTATTTTATGAAGCCAGCATTACCCTGATAACAAAACTGACAAAGAAAGAAAACTACAGACCAATATCTCTCATGAACAGAGATGCAAAAATCCTGAAAAGATTTTAGCCAACTGAATTCAGCAATATATAAGGATACTACATTACGACTAGGTGTGGTTTAGCTAGGATTGCAAGATTGCAATCTTGGATTAACATTTGAAAATCAGTTAGCAACCTTCGTGGTATTAGCAGGCTGAAAAAGAAAAATCATATGATCATCTCAATAGTTGTGTAAAAAGCATTTGACGTAATTTGCTACCCATTCAAGTTAAAAGCGTTAAACAAATTAGGAAAAGAAGAGCATCTATGAAAAACCTGCAGTTATAATGCTTAATGGTGAGAGACTCAATACTTTCCCCTTAAGACAGGAAACACAGCAAGTATGTCCACTTCAACACATCTATTCAACATTGTACTAAAGGTCCTAGCCACAACAATAAGATGATAAAAAGAAATTAAAAGAATATAGTTTTAAAAGAAAGAAGTAAACCTGTCATTATCTATAGGCAACATGATATTCCCTGTAGGAAATCCTAAGTAATCTAAAAAAAAGCTAGTAGATCTAGTAAGTGAATTTAACAAAATTGAAGGACACAAGGTAAATTTACAAAAATTATTGCATTTCTATATAGTAGCTATCACAAATTGGAAACAGAAATTTAAAAATATGTATCATTTACTGCAACTTTCCTATAAATTTATAACTATTCAAAAATTAAAAGTTTATGAAAAAATCCACCATTTACAATAGCATCACAAATATAAAACACTTAGGGATAAATGTAACAACATATATGCAAAACCTATATACTGAAACCTACAAAACACTGGTAGGACTGTGGGGAAGCTAATAAACAACAGAAATTTATTCCTCACAGTTCCAGAGGCTGGAAGTTGGAGATCAGGATGCCAGCATGGCTGGGTTCTGGCGAGGGCTGCCTTCTGGGTGGCAGATGGCAGACTTCTCAATACCCTTCACATGGTGAAAAGAGAGTGAGTTAGCTCTCTGGTCTCTTTTTATAAGGGCACTAGTTCCAACCATGAGGGTTCCACCCTCATAACCTAATCACCTCCCAAATGCCCCACCTCCAAATACCATCACACTGGGGACTAGAGTCAACTTGTGATTTTTAAAGGGACACATTCAGTTCATAACTGCCAAGAAAAATTAAAGATCTAAATAAATGGAGACATATACTGTGTTCATAGAACACTCAATACTGTTAAGATTTACATCCTCTGTGTATTAGTTTCCTATGGGTGCTGTAACAAACTACCACAAAATTGGTGACTTAAAATGGCACACATTTATTATCCTGAGGCCAGAGGTCCAAAAAGGGTTTCACTGGGCTAAAACCAAGATGTCAGCAGGACCTACTCCTTCTGGAGACTCTATGGGAGAATCTATTACTTGTTTTTTTCCAGTTTCCAGAGCTGTATTCCCTTGGCTCATGGCCCCTTTCTCTATCATCAAAACCAGCTGCATAAAATCTTCAAATCTCTGTCTCTGTTTCCACCACACTGCCTTCTCCTCTTAATATTATCTCCCTCTTTTAAGGGAACCTGTGATTGATTGCATTTATTGCCCCACTTGGATAACCCCATCATCTCAAGATCTTTAACAGGTTCCATGGAAGTGGGTATCTTTGTGGGTCATGATTTAGCCTACCACACACACCAAATTAACCAAATTATTTTAAAGAATCAATGCAATCCCAGCGAAAATTGGGAGCCAGCAGGTTCTTCTGTAGAAATTGACATACTGACTTTAAAAGTCATATGACAGTGCAAAGGACCTAGAATGGGCAAAACAATTCTGAAAAAGAACAAAGTTGGACAATTTATACTACATGCTTTCAAAGCTTACTATAAAGCTACAGTAATAAACACAATGTAGTACTGGCATAAGAATAGATATATAGAACGTACTAGAGAGTTGGAAAACAGACCCATACATATAGAGTCAGTTAATTTTCCACAAAGGTGCCAAAACAACTCAATGGAGAAATAATTATGTTTCAATAAATGGTGGTACAACTGGATAAATGAGGAAAGTACACCTTGACTCTTAACCTTATAACATATACAAAAATTTATGTGACATGGATCATAGACCTAAATGTCATACACAAAACTATAAAATGTCCAAAACACATATGAAAAAAAAATACTTGCAACCTACGGTTAGGCAAACATTTCTTAGATGGGATATGAAATTGGACTTCTCCAAAATGAAAAACTCTTACTCTCCAAAAGATACCTATCATTAACAAAATGACAGCCAAGCCACAAACTGGGAGACCATACTCTGAAAACACGTATCTGACAAAGTAACCTGTATCCAGTATATAAAAAGGACTTTTACAACTAAAAAAATAAGTGAACAACCCGATTTAAAATGGCCAAAAAATTCAGAGACATCCCATCAAAGAAGGCATACAAAAAGCAAATAAGCACACAAAAAATACTCAGTATTTTCAGATGTTAGGACAATGCAAACTAAAATGACAATGAGATAAGATTACACACACACTAGATCTCTTATATATGCTGGTGGGAATGCAAAATAGTACAGCCACTTTAGAAAACATTTTGGAAGTTTCTTATACAGTTAAGTGTATACTTATTCTATGGCCCAGCAATCCCACACCTAGGTATTTTACTCAAGAAAAAGGAAAACGTATGTCCACACAAAGGCCTGTATTCAAATATTCCAAGAAGCTTTATTCATAATTGCCACTGGTAACAACTCACACATCCATAAACTGGTGAATGGTTAACTGAATTTTGATATATCCGTTCAATGGAATACTAATCAGCAGTAAAAAGTAACAAACTATGGATGACAACAACAACATTGATGAATCTAAAATGCACTATAAGTAAAATAATTCAGATGGAAAACACAAGATAGTGTACAATGCTACTTATTTGACATTCTGGAAAGGGCAAAACTCTAGACACAGAAAAACAGATCAGTGCAGTTACAGATAAGGAACAGAAAGCTCAGATGTTAAATAACTTGTCTAAGAGCTCAGAGTTGGGAGTCAAACCATTTGACTCCAAAATCCTTCTAATTGAGTATTATACTATACCGGCTTTACACTGATAAATGAACATGTTTAGAAAAGCATCTGTATAAATATCTGCTGAGCAGCTAACAGGTCTCCACTCGGGAAGATTAAGGATAGTCTTTATACTCTTTTGCTTATGATATCTGAGTTTTCTGTAAGGTACGTGTACTGTTTTTGTACTATGAAAAATAACATAAGAAACTTCGATTGGAAAAAAGGGAGGAGGGTTGTAAGAAGACTGGACCTACAAAATTAAACTTGGAAGCAAGCCTCCCAGAGAGAAAACTACGCTGGGAAGATCATGAGATTCAGAGGGGTTGCTAAAAAAATATACATAGCATCATTGTGTCATGGTATTCTTACCCATAAAGTTAGGGCAATAATATGTACCTCATAAGGTGGCTACGAATATAAGATAATGAGTATAACAAGTGCTTTGTATAAAGCACTAAATCAGTGGCTCTCAAAGTTTTTAGTATCAAAAATTTTGACGTTTAACTCTTAAAAAGAACTCCTTTTCAGTTCTAACACTTACTGAGGATCCCAAAGAGCTCTGGTTGATATGGGTTATCCCTATTGGTATTTAATATTTACGATACAGTTGTTATTCAAAAAATATCTACTCTTCGAAGGTAATAATAAACACATTACATACTAACATACATAACATTATTTAAAAACATTTCCAAAACCAACACAAATTTAGTGAGAGTAGCGACACTGTTTCTACAGTTTAGCAAAATTTTTAATGTATGATATCATAGAAGACACCTAGATTCTAATATCTATATTTACATTCATTCTGTTGCAATACATTGTTTTGGTTCAAGTATACAAAGAAAATCTGGCCTCACAGAGATCTGTAGTTAGAAAAGAACTATTTTAATAGCTCTTTCAGATAATGGTAGACGTTCTTCAACAGGACACTAAAACTCAACAAGTGGTAGTTTCTTAAAGGTTAACTATGATGTGGAATCTGAAATCGTATCAATGACCTTTTTGTACTCTGCTACACTGAAATCCACTGGTCTCTCTTATACTTTCAATGAATCTTTTATCCCAGCATTATTGTATAATGCAATGTAGGTCGACTGGAAAATATTGGTTTACTGAGTTATAAAGATCTTTCAAATGTTGACACATTTCATATACAATAGCAAAAAAAATCACATTACTTACTATCTTCACCAACTTCATCAGAAATATTTTAATTACTGGGAGGTTGTCAAGCTCATGGTAGCAGATACAACTTTTCAAAAATTCTGGTTTTCCCATGAAAGATCAAATTTTACCATTTACAAAAAATACCATGAGTTGTTTTCCTTGAAGTTTTGCTTCCTGCACTTTCTTTTTTTGTGTGTGTGATAGAAGCATCTTTTATTATAGTATTTTTGTCTTTTTTTTTCTTTTTTTTATTATTATTATACTTTAAGATTTAGGGTACATGTGCACAATGTGCAGGTTAGTTACATATGTATACATGTGCCATGCTGGTGTGCTGCACCCATTAACTCGTCACTTAGCATTAGGTATATCTCCTAATGCTATCCCTCCCCCCAACCCCCACCCCACAACAGTCCCCAGAGTGTGATGTTCCCCTTCCTGTGTCCATGTGTTCTCATTGTTCAATTCCCATCTATGAGTGAGAACATGTGGTGTTTGGTTTTTTGTCCTTGCAATAGTTTACTGAGAATGATGATTTCCAATTTCACCCATGTCCCTACAAAGGACATGAACTCATCATTTTTTATGGCTGTATAGTATTCCATGGTGTATATGTGCCACATTTTCTTGATCCAGTCTATCATTGTTGGACATTTGGGTTGGTTCCAAATCTTTGCTATTGTGAATAGTGCCAGTTAGAATGTCAATCATTAAAAAGTCAGGAAACAACAGGTGCTGGAGAGGATGTGGAGAAATAGGAACACTTTTACACTGTTGGTGGGACTGTAAACTAGTTCAACCGTTGTGGAAGTCAGTGTGGCGATTCCTCAGGGATCTAGAACTAGAAATACCATTTGACCCAGCCATCCCATTACTGGGTATATACCCAAAGGACTATAAATCATGCTGCTATAAAGACACATGCACACGTATGTTTATTGTGGCTTCCTGCATTTTCAAGAAAATGTCTGCCAAACACCATAACACAAATAACCACAGTCTTGTCTGTCAGTTGTCCTTCCAAATAAAAATGATACTCCATGAAAACAGCAGCTTGTAACTCAGGCACACACGTTTTTCCTTGAGTCAACCATTGCACTTCAGCATACAACAGGCCTTTATGCATTCTTCCCATTTCATCACATGGAATATTAAACTAGATGTGCATTCAGGGGTCAAGATTAAATGAGATTAATATTTTTCTGCTTTATCAAGGACATTCGTAAGTGAAGCTGGCATTTTTTTTTTAACTGCAAGTCCAGCACGTGGTGGTGAAGAATCCAGTGACAGAAGACTAGTGCAGCTGGTGCCACTGTCCTGATTTGTGCTCCAGCACCAGCTGTTTGATGCACTACTTTTGTACCACCAGTGCCAATGTCGACCAAGGCAAAGAATGTCTTAGTATTATAATTTTGACTTTGCAGATACCTGGAAAGGGTCTCAAGGCCCACTCCCAAGGTCTGTGGGAAACAGTCTGATAACCAGCGTACTCAATACACATTAGCTAATATTATTAATACTCGAAAACAAAAACAGGCTTTTATAAGCTGCTACAAAGAAAGAACATTTAAAGAAACCAAGATAAAAATGACTTCATCCCTTAGACTTTGGACAAGAAGAGAGCCTGTGTCAAGGTGATTCACAAGTTAATAGAAATGGAGTTATCACCACAATTACAGTACACAATTAGTAGAGGCAGAAGTTTTCTCTCTTAAAGCAGAGGGAAATATTCCAAAATCTAAAAAAATCAGAACCAATTTGTCAACTAAAGCCTGTGGCTACAAATATAATTACCAGAAACAATGGACTTGAAAACAGACTATTTAAAAAAGAAATTAGTGGATTCAAACATTAACAAATGCTAAGATAATGACGACACAGGATTCATATGTAAATTAGATAACATGTACAGGTAATTTTTATCCTAAAGGATTTTCAGACTATAAATAAAAAGTAAATTGGTGGGGGGGGTGGTAAATGCTGTAAGCTAACTTCATTATCCTCTGTAAAACAGTTTCCTTATTTAATAAAAAAAGAGAGTAGTGGAACTAAAGAAAAGAGAAGTACCAAAATGTTTTAAATGTTGGAAAAGGGATTATTTTATGTGGTTTTTGACATAATTTAAAGGAAACTTAAAATTTTATGTCTATTACATGAAAAATAGAAAACAAGATAAACTTACAAAAGGACCCACAAAATATAATTCATGATTCAAATACTAGGGTTAGAAAAATATACATGAGCTGAATGTTCTCTTATTAATTTCCTATTGCTGCCATTCAAAACTGCCACAAACCTAGTGGTTTAAAAAGCATAAACCTAGTGGTTTAAAAGGCATAACCCTTACAATTCTGGAGGTCAGAGTCCAATATGGGTCTCGCTGGGCTAAACTTGAGGTGTCAGCAGGACTATGCTACTGAAGGCTCTAGGAGAGAATGTTTCTTTGCCTTTTCTAGCTCTAGAGGCTGCCACATTCCTTAGCTCATGGCCCCTTCCTCCATCTTCAAAGTCAACAATGGAGAATGCAGTTCTTCTCATACTGAATCACTCTGACCTCCTTTTCTGTCTCCCTCTTCTATATTTAAGGGCCCTGTAATTACATTGGGCTCAGCCAGAGAATCCAGAATAATCTATTTTAAGGTCAGCTGATGAGCAAACTTAATTCCATCTGCTACCTTAATTCCCCTTTGCCATGTAACAGAACATATTCCCTGGTTCCACGGATTAGGATGTGGACATACTTGGGGGCGTCATTATTCTGCCTACCACAGCTCTGTAAAAAGAAAGGTTGCTTCCTAGATTAAGAAAGTAAACCTGTTACATTCAAGAGTTACGCTTGGAACCAAAGTCAAAAGTCGAACCAACTATCAGAAATCAGATGACAGAGGTTTAGCACGCCTGGATGAAGACAAGAGCAACAATTTACTATCTTTAAAATTAGATAAATTAGATTTCAAAGCCGAAAGTATTAAAATGCTAAAAGGACTAATAAGGAAAACCTAAATAACAAAAACCTAACATGACCATGGAATATGGAAATAATTTTACAATTTTTCTTTTTTAAGGATACACAGAAATATTTTAATTGTGGGCTTCCTCATGCTACTCTTAAATCATGACAGATAAAACAGACAAAGTTCCTAAGGAAAATACGGAAATGAACACAAGAGTAGATTTAAAAAACAGCAAGGTTATATGAAGAGAATTAAGTAGCAGAACGAGAAAATTTAAAAAACAGCAAGCTTATATGAAGATAATTACGTAGCAGAACAAGAAAACATACTTGCTTTACATATATATAGTTTTTCCAGGGAGAAAAAAAGGATCATGGGGCAGCAATACACAATAAACACACATCTAATTTACAATTGTTAAAAATATTTGAGTGCCTGAAATGTGTGAGATACATAAAAAGGAGCAACAATGCGTTCGCTCAAGATGCTCAGACTAATAGGAACAGAGGGAGATGATGTATCAACAAGTAACTACATATGGTAAACTAGGGGACTTGGAAGATTAATTTCCCTGAGAGAGATGGGAAGAGAAGTGGAATCTGGGCAACAAGTTGGGAATAAGGAATTCCAGACAAAGGGAATAAGATGTTTTCCAATTTTTTAAAAATTATTTTTATTTATTATGGATACAAAACAGTTGTGCCTATTTATGGGGTACACGTGATACTGTGATGCAAGCATACAATGTGTAATGATCAAATCAGAGTAACTGGGATATCCATCACCTCAAGCACTGATCACTTACTTGTGTTAGGCACATTTCAATTCCACTCTTTTAGTTATTTGCAAATATACATTATTAACTATATTTGCCCTCTTGTGCTACCAAACACTAGATTTATTATTTCTATCTAACTGTATTATTGTACCCATTGCTCATCCCCTCTTTATCCTCCCCTCTCCAATTCCAGCAGACTGGAACACACAGAATACTTTCTATAAATCGTGGTGGGTAGGGTGCTGCAGAGACAGGAGAAAGGGTACTGGGGGGAATAAGCCAGAAGTTGGCTGTGGTAACATGGTAGGGGAATGTGAATATCAAGGTAAAGTCCTTGGAATTTATTAATAGGTAATATGGGGCCACTGCAGATTTTCTCAACAAGAGTGACAGAAGATACATATGAGTGTGTAAGGTAGATTGTAATCAGTAGAGACCAGAGGTTGACATTTTATACAATTCATAAAAACACTGACAAAAAATGGATCATTTTCAATACTGATTTGTTAAGCAGTTTAGTAGAGTTACTAAAATACAATTGAAACATAAAAATACCTGGGCAGCTGCCAAAACAAAACTCAAGAGAAAATGTGTTCATTTAAAATGTTTAAGTAAAAGTAGAAAACAAAGAAAAAAAAAAGAGGCAAAAGAAAACTAGTAAACTGAGTTTCTAAGAAATTTGGGGAAAAAGCCAAAACATGAAAATAATAAAACTAAAAGTAGAAATATAGATATAAAAATGAAACTGATCAGTAATCCCAACAGCTAGCTTTTTAAAAAAATTAAACTACAAAGTTGATCAAGTAAAACAAAGTAATACAATCAGTAAGTTCAAAAGATGTAATTCATACACACTCCTAGTTGGGGAAAGAAAGAATAGTATCTTAACTATATCAATATATTTGCAAGCAAAGTCTAAAAGGTGACTGCATAGCCAAAAGGAAATATCAAGCCTGATTTGCAAAAGAATACACAATAATATGCAATAATACAGTGATAAGAAAATGTATTTACAAAAATAACTTATGGTCATAGCCTAAACTACTTTGAGTTACTTTTCAAGAAACATTCAGACCAGATTCCAAGTATAAAAATAGACTGCTTAATTCTCCACAAACCTGGGAGAAATGGGAGGTTCTGGTTTAACATTAATCACTACTTCTTAAAATTCACTTTTCCAGTTACACCTTTAGAATGGATTTATTCCACTCTCATTTGAATCTGTAGTGTTAGTATACGATGAATCACTAAGTAGTGCTACCTGGGCCAAAGCTAGTATCCTCTCTGAAATTTACATGGAGCTTTCTGCTCAGGCTCAAATTCCCTCTCTCCACTTTGCAGGGTTGGGAGAATAGGGAGGGAGAAAAGGAAGAAGGGAGGGAGAGAGAGAATATACTTAGCAAAGGAATGATGACTCTGAATGTAAAAGTTCTAGTATCTGTTTTCTGCATATCATAAGAATGCAGCTGGACTCTTTTTAGACCTATCAGTTTTTTTCCAGTGGAAGCTGCTGGCTTCACTGGAGCACATACACGCGCATTAACACATGCCTGCATGCACGTACACACGCACAAGCACATACACACACACAAACTTTGGATGGCACATCCCAGTCTAAAGCTTGACAGAATGGCTCCAAATGACAACCTGACATACTCACAAACTATCAAGGGAACTGCCCCTCTTACACCTTATTAACCAAATTTAAAGTCTTTTAGTGGAGACTTTAGCTGGTTTTATTAAGATAAAAAATGTTTAAGAGCATTTAACCAGTAGATACAAAAAAGCACAAAAGCCAAATAAAAGCAGAGCCCAGGCCAGTAGGAAGATCTGCATTTAAGTTCCTCAACTGCCATTCATCAGCGGTGTGGTCTTGGGTGAGTTATTTTCCCCCAGGTCTGTCTCCTCATCTGATTCCTCATATACTGATTCATTAAGACAAGGGTATGTGAAAGCACTGTGTAAATGATACATTTTACCCATTCTAGCTTTAGCAGTATTATGAAAGACAAAAGTTCTGCCACATTGTAGGTAATAATCTCATTTAGCAATCATTGTTGTCACTATTAGGTTGGAGCTGACAAAGTATGAGTCTCCACTTATAAAGCATCTATCTCCAGAGTTCGAGGCTTTTGTCTTAAATTCCAGTCATCTTGTACAGGCATGACATATCAGGCAAAATGATTAGAAAACTCAAGCTCCATATTAAAAAGCTTAACTTCTGGAGTCCTAGGAAACTATCTAAAACTCCTTCAATCCACGGTCTCTCCTGTACAACAAAGACTTCCCAGTGGTAGATGTCTATTTGTACCCAACATCCACTCATTCAGCACGTAATTCAGCACCTCCAAATCCTGGCCCTCAAAGAACACAGCTAAGAACAATGTGTGTGTAATTATCAAGTAATAGGAATGATACTTTTAAAACTGGAAATTATACATTCAAATGAGATTTCTCTCCTTTAACCAGTCCCCTTGGGAGGCAATGCAGTAATTCCAATGGTACTTCATTACTCAAATCATCTTTGAAGCTTTCTTCTTGGAAGCACCTTGAGAACCTGCAGTCTGATCTTCTGACTATCCCAAATGGTGCTAAATTTTCACTGAGGGTGGATTCAAATTTTGGAAATGGCAAACAGTCAGTCAGAGCCAAGGTTAGTGAATAAGATGTGTGATCAAACTAGGTGGAACTATTTTGGTTGAAAATGATAGATGATCATAAAGCAATGAGATGGATCTTCTTATGTGATATGTAAACTGACTTTAAAGGGAATTCCAGATGAGTAACAAGGAGTATGAACAATGGAATAGGTGTATACATTCTCTTTCCCCAAGTAACCTCTTTGAATAACAACACTCATTTGGATGTATAAGCGCCACTAACAGGTTTGTTTTGTTTAATCACTTACAATTTGTAAATAGAGGGTTCCTTAGCATACTGGATCTACAATGTCATGGAGGAAAGGAGTTGTCACTTAACTAGAACTATGGCATGGCACGCAGGAAAGAGCACAGGCTTCATGAGACTGACGTGGATTCGAATGTTGACTTAGATTCTTCCTGCCTATGCAACTTCAGGGAAGTCACAAGCTGTCAAGACTTTAAAATAAGGCTGATACCTAATTTTGCAGGGTTGTTGTAAAGATCATGGATAATGGATGCAAAGCCCCTACATTGGGCCTGGCATGTGGTGGGTACTCAAGAAGTCACAGCTATTATTAACAGCAACATCAGTAAAATCAAGACCATTTTCTGACTGTAGGTGGCTTGAGAGGACAGAACAACAGATAAGCAGTCACTTGAGTGGTAAGTAGTTCATTGACAGTACTGACAGTACACTCGTGCATTAGATACTGCTTTCTTAAAAACAATAACCTGTAAAAACTATTTGCAGTAACTATGTATTTTTAATTCATACGACACATATCATCTGCAGTGCTCATACCACATGGACCTCTTCACTACCTGAACATACCAGGCACTATCATGACCCCATGCCTTTGTGTACTTTGTCCCTTCTGAGATACTGAAAGGGGCCAGCCCCTCCACACCTGTGGGTATTTCTCGTCAGGTGGGACGAGACTGAGAAAAGAAATAAGACACAGAAACAAAGTATAGAGAAAGAACAGTGGGCCCAGGGGACCGGCACTCAGCATACGGAGGACCCGCACCAGCGCTAGCCTCTGAGTTACCTCAGTATTTACTGATCATTATTTTTACTATCTTAGCGAGGGGAGTGTAGCAGGGCAACAGGTGGGGAGAAGGTCAGCAGGGAAACGTGAGCAAAGGAATCTGTATCATGAATAAGTTCAAGGAAAGGTACTGTGCCTGGATGTGCACGCAGGCTAGATTTATGTTTCTCTTTACCCAAACAACTCAGTGTAGCAAAGAGTAACAGAGCAGTATTGCTGCCAGCATACTTCGCCTCCAGCCACAGGGTGGTTTTCTCCTATCTCAGAATAGAACGAATGGGAATGGTCAGCTTTACACAGAGACATTCCATTCCCAGGGATGAGCAGGAGACAGAAGCCTTCCTCTTATCTCAACTGCAAAGAGGCCCCCCTCTTTCACTACTCCTCCTCAGCACAGACACTTTACGGGTGTCGGGCTGGGGGGTGGTAAGGTCTTTCCTTTCCCACAAGGCCATATCTCAGGCTGTCTCAGTGGGGGGAAACCTTGGACAATACCCAGGCTTTCTTGGGCAGATGTCCCTGCGGCCTTCCGCAGTGCACTGTGTCCCTGGTTAATCGAGAATGGAGAATGGCGATGACTTTTACCAAGCATACTGCCTGCAAACATATTGTTAACAAGGTACATCCTGCACAGCCCTAAATCCATTAAACCTTGATTCATTACAGCACAGGTTTCTGTGAGCACAGGGTTGGGACTAAAGTTACAGGTTAACAGCATCTCAAAGCAGAAACAATTTTTCTTAGTACAGATCAAAATGGAGTTTCTTATGTCTTCCTTTTCTACATAGACACAGTAACAATCTGATCTCTCTCTCTCTTCCTCACAAGGTACCCTTCCTACTCTCTGGTCTGATGACAACCCTACAATTCTAGCTCAAAAGTCTGGCTCAAAAGACTAGCTCAATTCAAAATCTAGCTCAAAAGTCATTTCCACTATAGACTCTTCCCTGCCTGCTCGAGACAGAATTAGTAGCTCTGCCATTTGTACTTCTAAGACATTTGGTTTCTTTCACTGTAATGCCTCATACTCTATTGAATTCTTATTTCCCTTTCATTTCTGCTTCTTCCACTAGGCTTACGCATTGAGAGGAGGACTATATTCCTTTATTTTTGTACTTAGAGCACCTAGTACATCACTTGGCACGAGATAGGAATCTAGATGTTTGAGGAATCAATGTTATAATATACTTTGCAAATAACTGTTAGAGGCCAGGTAGCTGACTTTAGTGAACTATATTACCAGCAATCGCATTCCTTTTTCCTGCTTCAAACAAGAGACAAGATAACTTATATGGACTCCAGAATGTCTTCTGAAGATGGAGCATACAGAAGTCTCCAAACACTACTGATCTCATCAGTACCCCATGCCCACAAGTTGGAACCCCCAGAGCAACACTTCTCAGCCTTTGTTCCAGTCTAGCACACCTGAGGGATACACCACACTCCCATCAGGAGCTCACAACAGATGACTAAGAAGGCCCAGCAGAGTTTTCACAGCCTTCCAGTGCCACATGGCTACCAGGGTGGAAGGGCCTTCTCTGAACCAGCAACTCCCCCAAAGTAGCAGAAAATTTCAAGGCTCCATGGTGAGTGCCGGGAGCCCACCTAATGCTACCTACTACCAGATCCCTACCACCTACAATGCCTCAGAATCAGTTTGCCTACTCTAAGAACAGGGCTCTTTCACTGTTAGGAAGCCTCAGGCATTCAAGGAAGGACTTGTTTGTGGGTCCGGTACATCTGTTTCGGCATGTCTTAACCTGCATAACTAAAAAGCAATTATGAAAAGAAGTTCATGATCAACCAGTTCAGTGCCAGATGGAAATAGGTAGAGTGGGTCAGCTGGCAGCCCTGTACATCTGAGTGTTGACACTTGTGAATCATTCTCTGTCACTATAGTTTCCAGAGCCTTGCCAACGCAGCAGTGGGTTCAGACTTACCAAGATCCAAAAGGCTGGAAAGGGGATCTGGAGCCATGACCCATGCCTTGGGACCCACCAAACCATGACAGTGACTATTTCTATGACAAAGACATGCTCTTCAAAGTAAATATAGCCATATTCTGCCTTAGTGCCATGTTCTCTCCCATCTCTCCAAAGCAATGGCTGCACTCTGAAGGTGGGAAGAGTGTGAGAAGAAAGAGAACCAGGCCACAGACCACCAGGAAAGCAGTGATTGAAAGCAGCAGCTCTAGATTTATCTTATAGGCTCTGGAGAGTTATTCAGTTATTTACTCCACAAATATTTATTGAGCCAGGTACTATTCCAAGTATGCAGGCTACGATACTGACACAGTCACGGCCTTGACCCCATGGAGCTTAAAGTTTAGTGGGGGAGGCAGGTATTAATCAAATAAGCATCCAGACAGACACAAGGGTACAAATATGCACTATGATGGAGGAGTAGTAAATACTACAGGAACTGTAGAAACAGGGAGCAGTAAATCTGGTCAGGGAAGTCAGGAATGCTTCCCTTAGCAAGTGCCAAGTGAGGATAAATAGGAGTTCACTAGGTCTGGAGGAGAGGAAAAGAGAGGGAAACATTCCTGGTAAAGAGAACAGTGTGTGACAAGGTCCTGGGGTAGGAGGGAGATGTTGAATTGGTTCAAAAATAGAGGCAGGGAGCAAAGCAGTAGCATCAAGGTTTGAGTCCCCAGCATAAGGTAACTTCCTGTGTACACCCCTCAGTTTTCTTTCTCCTATAGAGAAGGCCTTCCTCAGCATGGAAGGAGACTAAAGCACACTGTCAAAAACTAGGAGTGCTCAGGAGAGGTGAAACACTACCAGCTAAAAAAACAAATCTGATTTACCAATAATACCTGGCCAATGTCCTCTTGAAATATCTGTAAAATTAAGGAACTCACTACCTCTGGCAACAATGAATCAGGAAAATGGCACAGTATAAAAGACCTTTAAACTTTTCAGGTAGAAATACATTTTAATGCAGGTAGAATTAAGAGATTGATGAATATGTTGAGAATTACTATAAATCTGCTTAGATACCAGGTATTTCACTGTAATTTCATATACTAATTTTTTTGGGGGGAAAAGGGCTAAAGGAAAAAACGGTTAAAGGTAGTATCAGTGCAGCACTATTTTGCTTTGGCAACTCTGTTAGCCTGTTTGATCACTTAAGGCAGTTTCTTGAGGCTTCTGAATTCTCTGTCACCTTGCAAACAAGTCACTGTAGCTTTCTATTTGTAAGGCACTTTGTAATCAATCCTTAATTGGCCTCCCTACCCCACTGAGAGATTATACTCATTCTACAGATAAAACAATAGATAAGTGAGTTGCCCAAGGGCACAAAACAATTCAGTTTTTACTTAACTTTCTTATCACCATCTCCAAATAGAAGACATACAGGGCTTGTCCAGCCACAGCACCAACCCTACTACCCTGCAGACCAATCTTAACCCCCTGATGCAGCAGAGGAAGGGACTGCAGTTAGTTCTGTTACCTCCTCTATGCCTTCGAAGCCAACAATGATAATGTCTGCACCGTGTTAGAGACACTGGCACTCCTAACCAGAACATGAACTCAAAGTGGAATAAGAGCACATACAAAGTCACAATGACTCACAAGTTAAAATGTCACCATCCCTAGGGAGACTGCACCCTCAACATACAAATACTCCCCTAAAACTTCCTTTCATCCTCAACAAGCAGCCAAGGCATAGGTCCCCAGAGTTGGCCCCCCCGCAGTACAGCACTGCACAGTACTGTAAGGCATCTTAAGTGTACAAGCCCTTCTTCACCCCATGCTGAGACCGCCAGCATCTAAAGCACCTAGCATAGGTTACAGCACACAAATATTTACTGAGCACGTCTTCCCAACTGGCAAATAAGGGAAACGGATACTCCTTTCGGAGAGTAGTGCATTTAAGAGATCAGTTAGGCCAACTCCTCATTTTACAGAAGGGCCAAAGAAGGGCAGAGAATCACCCAGTTACATATGGCCTCGTCTCCAATCCAAGTCTTTGGGCTCAGTTTAGTTCCACAAACATTTTATATGTGCCTACTATGTGCCAGATACGGTGGGTGGGAGGCGCTGGGGACACTGAGATGAGCAGGAGATGGTCTCAGCTTCAGGGAGCTCACAGCCTGGGGTAAAGAACAGTTACAAAACAGTCGGGCAAGTGCAAGGACAGGCGCTTGAGTGCTTAGCCAGTGTTTTCTCCACACTGTCCACTGGGTGTTCCCCATCTGGAAAAGAAGGACAGCAATATCTACTTCCTGCAAATAAACTTGTAGTGAAGACTGATGAGAAAATGAGTATGGAAAATGATCAGGCTCACCCAGAAATGCATTATATAAGGTATTGCCATACCCCGACACTGACTCCAAGGATTCAACAGTGTTTCAGGTATCCGTTCAACAAAGAAGCAAACAAGAGAGGGATGGGCATAAACCCAAGGAAAACCCCAAGGACTGCGAGGAACTGGAGGAAACAGCTGGGGTGGGGGTAGGGGTGTTGGGTCGAGTGAGACGGCCCTGACGCGGAGAATGGAGGGCCCGCAGCGGCGCAGAAGAGGATGGAACCGAGACGAAGAAGTTGGGACACCAATGAGGGACAGCAAGCAGAAAAGAATGGGGTTCCCTTGGGGCAGGACGGGGCTCGCGGCCGGGCCCTTCCGGCCGTGGCCGGGCAGGGGCTGAAAGCACCGGGCACGGGAGGAGGAAGCGGGCGGGCGCCGAGGCCGACTGTTTTGCCTGGGGACCGCTTGCACCCGCAGGGAGGCTCGGGCAGGCGCCCGGGTCCTCGGGCTGCAGCATCTCGCCCGCCGTGCCTCCCCGGAGCCGAACACCAGCCCGCGCCCGAGCCCGCAGCGCGGACTCCCGGGGGCGCCAACGACGCCGCCTCACCTCGGGTTGAAGTCCTGGAAGAGGCCCCTCAGGTTCATGGCGGAGAACTTCACCGCGGCGTCCTCCTCCTCCTCCCCCGCACCCCGTGCTGCACAGCCTGCGCCTTACAGCGGGTTCATGGCGCCAGCGCCAGCCGCGTCCACGCTGCTGCTCCCGCTACTGCTGCCGTCCCCGCTGCCGTCGCCGTCGCCGTCGCCGCCGCCGCCGCCGCCCGGAGAAACCTGAGCCACCGCCCCCTGCCCCTCCTTCCGGGCTTCCGTACGAGGGCCGCGCATGCGTCCGGAGCCCCGCCCAGAGCGCTCCTCGCTGGGAGGTCCCCATCCTTGTGTCCGCACGCGACCGG", 148678216 - 1); var codingRegion = new CodingRegion(148679671, 148713263, 333, 1385, 1053); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 7, 148678216, 148679915, 1141, 2840), new TranscriptRegion(TranscriptRegionType.Intron, 6, 148679916, 148681217, 1140, 1141), new TranscriptRegion(TranscriptRegionType.Exon, 6, 148681218, 148681341, 1017, 1140), new TranscriptRegion(TranscriptRegionType.Intron, 5, 148681342, 148681966, 1016, 1017), new TranscriptRegion(TranscriptRegionType.Exon, 5, 148681967, 148682143, 840, 1016), new TranscriptRegion(TranscriptRegionType.Intron, 4, 148682144, 148685652, 839, 840), new TranscriptRegion(TranscriptRegionType.Exon, 4, 148685653, 148685736, 756, 839), new TranscriptRegion(TranscriptRegionType.Intron, 3, 148685737, 148690313, 755, 756), new TranscriptRegion(TranscriptRegionType.Exon, 3, 148690314, 148690521, 548, 755), new TranscriptRegion(TranscriptRegionType.Intron, 2, 148690522, 148692969, 547, 548), new TranscriptRegion(TranscriptRegionType.Exon, 2, 148692970, 148693146, 371, 547), new TranscriptRegion(TranscriptRegionType.Intron, 1, 148693147, 148713225, 370, 371), new TranscriptRegion(TranscriptRegionType.Exon, 1, 148713226, 148713418, 178, 370), new TranscriptRegion(TranscriptRegionType.Exon, 1, 148713419, 148713568, 1, 150) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(2814, 2813, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), new RnaEdit(2227, 2227, "C"), new RnaEdit(1032, 1032, "T"), new RnaEdit(917, 917, "G"), new RnaEdit(151, 150, "GCGGCGGCGGCGGCGGCGGCGGCGGCG") }; const byte startExonPhase = 0; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, true, startExonPhase, rnaEdits); const string expectedCodingSeq = "ATGAACCTGAGGGGCCTCTTCCAGGACTTCAACCCGAGTAAATTCCTCATCTATGCCTGTCTGCTGCTGTTCTCTGTGCTGCTGGCCCTTCGTTTGGATGGCATCATACAGTGGAGTTACTGGGCTGTCTTTGCTCCAATATGGCTGTGGAAGTTAATGGTCATTGTTGGAGCCTCAGTTGGAACTGGAGTCTGGGCACGAAATCCTCAATATCGAGCAGAAGGAGAAACGTGTGTGGAGTTTAAAGCCATGTTGATTGCAGTGGGCATCCACTTGCTCTTGTTGATGTTTGAAGTTCTGGTCTGTGACAGAATCGAGAGAGGAAGCCATTTCTGGCTCCTGGTCTTCATGCCGCTGTTCTTTGTTTCCCCGGTGTCTGTTGCAGCTTGCGTTTGGGGCTTTCGACATGACAGGTCACTAGAGTTAGAAATCCTGTGTTCTGTCAACATTCTCCAGTTTATATTCATTGCCTTAAGACTGGACAAGATCATCCACTGGCCCTGGCTTGTTGTGTGTGTCCCGCTGTGGATTCTCATGTCCTTTCTGTGCCTGGTGGTCCTCTACTACATTGTGTGGTCCGTCTTGTTCTTGCGCTCTATGGATGTGATTGCGGAGCAGCGCAGGACACACATAACCATGGCCCTGAGCTGGATGACCATCGTCGTGCCCCTTCTTACATTTGAGATTCTGCTGGTTCACAAACTGGATGGCCACAACGCCTTCTCCTGCATCCCGATCTTTGTCCCCCTTTGGCTCTCGTTGATCACGCTGATGGCAACCACATTTGGACAGAAGGGAGGAAACCACTGGTGGTTTGGTATCCGCAAAGATTTCTGTCAGTTTCTGCTTGAAATCTTCCCATTTCTACGAGAATATGGAAACATTTCCTATGATCTCCATCACGAAGATAATGAAGAAACCGAAGAGACCCCAGTTCCGGAGCCCCCTAAAATCGCACCCATGTTTCGAAAGAAGGCCAGGGTGGTCATTACCCAGAGCCCTGGGAAGTATGTGCTCCCACCTCCCAAATTAAATATCGAAATGCCAGATTAG"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } //NM_017940.4 [Fact] public void RnaEdits_big_test_reverse() { //Transcript id:NM_017940.4, chrom: chr1: 16888922 - 16940100 var genomicSeq = new SimpleSequence( "TGATAGGCAAAAGCTTTTAATTGTATAGATTAAAATAACTTTGGACAAAAATTAAAACTCAGGCAGAGAATGTTTTTTTTTTCAACAACACACACTAGCAAAAACAAAGGCACAGTAAACATTGAGGCAGAAAGTTTCCAGCGTAGAGATATGAATATAATAATAGACACAGGCAGGGATGATTAATAAATGATAAAATGTTTACAGGATGATCATTGGAATACAGGACATTTCTAATTTTGAAAACCACCCTCCCAAATACTTCATTATAAGTAAGGTGTCTCTAAAAGGGACAGATCTCCTAGACCCCTCCTTAACCAAGTAACCAGTCCTGATATCATGATAATGCTGATGGACAAACTAGACCTTCTCTGCCCGCAGATGGGCTAAGGTTGGAAACTCACAGCATTGTCTCTGCAGTGTTCCCGGCAAAACGTTTAGGCTGAATTTAATCATGAAGACATTTTCAGACAACTTCAGAATGTAGATCATTGAGCCAGAGAGCTGACCTGTCCTCTATAAACAAGTCCATGTCACCACCATCAATGACAACAACAAAAAGATGAGGAAATATTTGGGGTTCAAAATAACTAAAGAAATGCAGCTATATTATCTTTTTACTTTTTTTGAACCCAAAATATCTCTTCTCCTTTTTGTTGTGTGATTTGTGGTGATATGGACTATGTGAAGGAGACAGGTCAGTTGTCCTGCTCAGTGTTCTACATTCTGCAGTTGTCTGGTAATTACCTCCTATGAAACTCAGGCTAAGCGTTTTCTGCAAGAACATGGCGTTGTTCATATTCTGCACCGGCAGAGTCCTGGGTGACATGCTGTCTCCTGCCAGCGGCTCCTGACTCCTGTTCTCTACAGGATGGAATCGAGAGGAGCAGGGCTAAGGCCTCCCAATGCTGTTTGTCCATCTAGCTGTGGTCTTCCTAAGTACTGACACCAATTGGAGGCTGAAGGACTGTGGCTTCTCTAACCAAAGGAGCCTAGCGGGTTAACAATTGTCAAGAGCAGTTGGTGGTTCTGAAATACAATCCTCAGCCAAGGATCCCTCCTGTGTTAAAGATGGATCAGCTAAAACAATTCAACACTGAAGATACAAAGAATGAGGTTAGGTTCATTGAAACCAGGGTAACACCTTTGGATGAGCTAAACACAAAGATGACACTGACCTTGAGCAGGTATAGAAGCTCAGAGACATGACTGCAAAATGAAATCCCTGAGGAACTTTGTAGCTACCCAGAGATAAGTGGTTCAAATTAAAATGTCTGACTGATCACTCCCGGCATGTGCTGCACAGTTATGTGAACGTGTCACACCTAACTTGGGTCCATTGTCTTCAGACTGAGCACAGGGTGCCACTGGCATGGTCTGAGAATAGGAATAGAGCCATGCCCACTGACCCATCCTATGTCTGGGCTTCCAAATGGAACTATAGTTTCATTCAAATCTTCACGTGCCTATAGGTCCTGCCTGCAGGAATGACATCTCTCGGCTTAGTAAGGGCTGCTTACTGTGGGAATATGACTCCCATCTGGAAGACCAGGTGGAGACTTGTTCCCATCAAAGTAAGAAACCTATTGTCCACGTCAAGGGCGAAGCTGATGTGCTGTTCCTCAAATGAGTAAAACACACTTCTGTAGTGCTGGAATGAGTCAGGTAGTTCAAAGTACATTGACGGAGTCGAATAACATCTATCCAGTGAGTCCTGCAAGACTTCAGGCTCTTCCACTTCCATCAGCATGCCGCTGAGCCTGGAAAAGCAGACAAAACTAAAGAAGCAGCCAGGGAAAATCAGACACCACAGAGCCCCACTAGATTTCAGAAGTAACGTAAGGAAGTGGTAAGAAAAGAAAAGGATAGATCCATTAGATCCATTAATGAGGTAAAAAAAAAAAATTATTGCCTTTATGTTGGGATAGAAAAGGGCCAGGTAGAAAACAATGAAAGAGAAAGACAGAGAGACAGAGACAGAGACAGAGACAGAGAGAAAGTGAGCTAGTGAATTGGCCAGGTGACATACTGGTAAGGGAGTAAAAGGACACTCTGAGTTAGTGCCCTCATGACACACAGCACACTGCGATCATGAAAAGAGTGAGCTCAATAGTTTTCCATAAAATATGCTCAAAATTCGATGCAGTGGCCATGAGAGTACAGCTTTTGAAGTATGGTCATCCTATGGTACGTTAGTAAATGATAAGGGGAGGAAGAAATGGAAACCTAAACATCTACTGCAATGAAAACCAACAGCAATGACAGTAGGAGTAATTCAGCCTTCGTTGAAAACATGAAATCAAACACACTCTGGTTTCCCTCAATCTGTTGCCTCCAGGTGTTAACACAGAATTAAGCATCCACAATTGCTGAAAGTTACCTGGGGCATGGTGGGTTTTGATCTTCTTCCCCTTCTTTTCTTCCCCTTCTCCTTCTTTTCTTCGTTGATCTTCTTCCCCTTCTTTTCTTCCCCTTCCCCTTCTTTTCAATTTCTGCAATAAATTCAGACATGGACAGACACATTAAGCTGATTCCCCTACACACATAACAATCCACTGTCTAACCCTCACACAGGGACCTCAGGCTCCTCAGCATAAGAATAGGAGACTGTGAGAGATATATTTCAGGAGGCCTGAAGGCTGGTCATGATAGAAATTCCTCGGTTTTTCTCCCAGAAACTGTGGGTAAAATGTCCCTATTCTAGTAGATCGTTATCCCAATATCATTTGTCCCGAGTTTGTGCAAACAGTTATGCCATATTTTTCCAATCAATTTAAAGCAAATACCCTCAAATGATTTCTAGGAGAAAAACTGCAATATTTAGCCCTGTCTCATCAAATACTCAGATTGTTCATGGTTGTGAGGACTTTAGACACTGAAATTAGAGTGAAAAAGGAAATCTACAAACCCTTGAGTCAAAATCATAGTTCTCTGAATTTGTCACATCTGCCCAGGTCCAATGTCATGAGAGTAGAATCAGAGTGCCACAGGCATGGCCTGAGACTAGGAAGAGAGCCATGCTCACTGACCCATCCCATGTCTGGGCTTCCAGTTAGAACTAGAGTTTCATTCAACCTACATGTGCCTATAGGTCCTCACTGCAGCAATGACATCTCTCAGCTCAGTAATGGCCACTTGGAGCAGGAATATGATCTTTATATGGAAGACTCAGTGGATCCTTATCACCTTCATAGAAAGGTACTCACCTCCCACGTCAAGAGAAAAGCCAACATGTTTTTCCTCCAATGCATAAAAGGAACTTCCATAGGGCAGGCAGGAGTCAGGCTGTTCAAGACAACTGGAAGGAGTTGAATAACATCTATCCAGTGAGTCCTGCAAGACTTCAGGCTCTACTGCCTCCAGCAGCTCCCTGCTGAGCCTGGAAAAGTAGGAAAAAGTAAAGAATAAGCCAGGGGGAATCAGAAACCACACAGCCCCAGCTACATTTCATGGCTAACATAAGGAACTGTTTAAACAGAAAAAGGACAGATCCATTAATGAGGTAATGAATTATTGCCTTTATGTTGGGATAGACCAGGGCCAGGTAGAAAAGAATGAAAGAGAAAGACAGGGAGAGGGAGAGGGAGAGAGAGACAGAGGAGAAAGTGAGCTCAGCGAATTGGCCGGGTGACACACTGACGAAGGGGTCAAAGGACACTCTGAGTTAGTGCCCTCGGGACACACAGAGAACAGTGATCATGAAAAGAGTGGGCTCAATAATTTTCCATAAACTTGCTTAAGATTCCATGCAGTTGCCATACAGCCTTTGAGGTATGGTCAACCTACAGTAAGTTAGTAAATGATAAGGGGAGGAAGAAATGGAAACCTAAACATCTACTGCAAGGAAAACCAACAGCAATGTCAGTAGGAGTAATTCAACCTTCGTTGAAAACATGAAATTGAACATACTCTTGTTTTCCCTGGACCTGGCATCTCCAGGTGTCAACACAGAATTAAGCATCCATAATTGCTCAAAGTTACCTGGGGCATGATGGGTCTTGGTCTTCTTCCACTTCTTGGTACTTTTCAATTTCTGCAATAAGTTCAGACATGGACAGACATATTAAGCTGGTTCTCCTACACACATAACAATCCACTGTCTAATCCTCACGCAGGGACTTCAGGCTCCTCAGCATGAGAATAGGACACTGTGAGAGATCTTCTTCAGGAGGCCTGAAGGCTGATCATGATAGAGATTCCTGGGTTTTTGTCCCAGAAACTGTGGGTAAAATTCCCTATTCTGGTAGATCGTTATCCCAAGATCATTTGTCCCAAGTTTGTGCAAATGGTTATGCCATATTTTTCCAATCGATTTAAAGCAAATGCCCCCAAATGGTTGCTGGGAGAAAAACTGCAATATTCAGCCCTGTCTCATCAAATACTCAGATTCTTCATGGTAGCGAGGATTTTAGATGCTGAAATTAGAGTGAAGGATGAAATCTACAAGATCTACAAAATTGAGACAAAATCAGAGTTGTGTGAATTTGTCACATCTGCCCAGATCCAACATCTTGAGAGTGGGATTAGGGTGCCACAGGCATGGCCTGAGACTAGGAAGAGAGCCCTGCTCACTGACCCATCCCTTGCCTGGGCTTCCAAGTGGAACTAGAGTTTCATTCAACCTACATGTGCCTATAGGTCCTCCCTGTGGCAATGACATCTCTCAGCTCAGTAAGGGCCATTTGCAGTAGGAATATGACCCTAACCAGAAGACTCAGTGGATCCTTATCACCTTCATAGAAAGGTACTCACCATCCATGTCAAGAGCCCAGCCAACACGCTGTTGCTCCAATATGTAAAAGGCACTTCTGTAGGGCTGGCATGAGTCAGTCAGTTCAAGATAACCTGAAGGAGTTGAATAACATCTATCCAGTGAGTCCTGCAAGACTTCAGGCCCTTTCTCATCCAGCAGCTCCCTGCTGAGCCTGGAACAGTGGGAAAAAGTAAAGAATAAGCCAGGGGGAATCAGAAACCACACAGCCCCAGCTAGATTTCATGGCTAACATAAGGAAGAGTTTGAAAAGAAAAAGGACAGATCCATTAATGAGGTAACAAATTATTGCCTTTATATTGGGATAGACTAGGGCCAGGTAGAAAAGGATGAAAGAGAAAGACACACACACACACACACACACACACACACACACACACACACACACAGAGTGAGCTCAGTGAATTGGCCAGGTGACACACTGATGAGGGAGTCAACGGTCATTCTCTATTTGTGCTCTCAGGACACACAGTGAACAGTGATCATGAAAAGCATGGCCTCAATAATTTTGCATAAAATGTGCTCAAGTTTCCCTGCAGCCACCATGAGAATACAGCTTTTGAGGTATGGTCAACCTTCACTAGGTTAGTAAATGATAAGGGTAGGAAGAAATGGAAACCTAAACATTTACTCTAATGAGAACCAAAAAGCAATGTAGTAGGCATAATTTAGACTTGTCTGACAAGACAAAATCATTATTTTCAGCATGTACTGTTTTCCCTGGACTTGGCATCTCCAGGTGTCAACATCAAATTAACTGTCCACAATTTCTCAGACTCACCTGGGACCTGTTGCCTCTTGGTCCTCCTTTTTCACTTGATCCCACCGATGTCCTGCAAATAAATTCAGATGGGGCCTCTTACATTAAGCAGTTCTTCCTTGCACACAGAAACATTCCTCTGTCCAATCCTAACACAGGTACATCAGTCTGGTCAGTGTGAGAACAGGAGACTTTGAGAGAAATATTCCAGCAGGCCTGAGGTCAAGTCTTGAGAAAACTGGCTTGGGTTCTTTCATGAGCCTTGGGCAAAATTACCCTGTTTTGGAATGTTATCTTCCCTATGTGCTCTGTCCTAGGTTTGTGTACACAAATGAGCAACTTTTTCCCCAATAAATTGTAGGCAAATAGTTCTAACACCTCATAGGAGAGATACTTCAATATTAAGCTTTCTCTCATCAAATACCCAGAATTTGATAGTTTATGAGATTGTGGACACAGAGATTTGATGAAGGGGTGCAATGTACCAGCTCTTGAGTCAAAATGAAACTTGGTTCTACACAGAAGCATCAGCTATTATGGCTTTTGTGGGTGAAAAGTCAGCCATTTATCTAGAAAACATACCAGGAACATGACGGACAGATGAGCTAAAGCAAGCGAACTTAGAAGACACAGAAAATGGGAATAAATTCAGTGAAACCTGGGCCACATCTTTCACTGAGAGGTAGACAAGGGTGACACTTGCCTTGGGCAGGTAAAGAACCACACAGACATGCTTTGGGAACAAAACTCATAAGGAATTTTGTAGCTGGCAAGAGACATTTAATTCAGATGAGCTGATCTGACAGACAACTCCTGGTCATGTGCTGCATAGTTTGGTGTGAGCTTGCCACACCTGCCTTGAGTTCAATGTCGTGACAGTCAGTCCAGGTTGGCACGGGCATGGCCTGAGACTAGGAAGAGAGCAAAGCTCACTCACCCACCCCATGCCTGTGCTTCAGACTCGACTCCAGAGTGATTGAAATCTACATTGATATATAGGTTCAGCCCACAGTGATGGCAAATCTCAGCCCAACAAGGGGCACAAGGCCCAAAGATTATGGGGTCTACCTGGGCCATGAACTGGAGCTTTATCACCTTCACAATGGAGTACTCACCGCCTATGTCAACAGCCATGCAGACTTGCTGTTCCTCTAATGAGTGAAATGTGCCGCTGTAAGACTTGTACGAGGCCAACATTTCAGGAGGAATTGAGAGAGTCGAATAACCTTCATCCCAGGACTCCTGGGGGACTTCCTCCTCTTCAGACTCCTGCAGATTCCTGATGAGCCAGGCAGGACAGGGATGATAGAAGATTTAACCAACAGACATTAGACAACAAAACCTCCCAGATGATCTGATGGGAGACAGAATGGAGTGGTCACAGAAACCAAAGGCATTTTTCCTTCAAGAGAAATAAAACTAGCCTTCTAAATACAGGGTGGAGGGTGACTGCTCTGGGGACAGAGCAAAAATGGGCAGCATGTGCTCAGTACATTTGCCACAGATGAGCCAACTCAGGGCACCCAGACTCTCCCTGTAAACTACCATCATGACTTGCAGCACAGAGAACTGACACAGGGCTTCAACTACTTTGCATAAATTGGGTTGAATTTTACATGCAGCATTCAAGTGAAGAGAGTTCTTGACACAGTGCAGACACAGATCTTGTGTATTAAGGGCCCCATTTTCCCAATATTTTGATATAATATATTTACCTTTTCAATTTCTTTTCTTGCAAAAATACTAGCCAACATACTACCAACAGATAGGAAGAAAGCATATATACATCTCTCCCTGGATTTAAACACATGGGAGAGAATAGGCAACACCAAGAAATCCCTGTTTGAGGGTCTGGAGTGGACTTCCAGCAAACTCCAACAGACCTGAAGCTGAGGGACCTGATTGTTAGAAGGAAAACTAACACACAGAAAGGAATAGCATCAACATCAACAAAAAAGACATCCATCCCAAAACCCCATCTGTAGGTCGCCATCATCAAAGACCAAGGGTAGATAAAACCACAAAGGTGGGGAGAAACCAGAGCACAAAAGCTGAAAATTCCAAAAACCTGACATCCCTTCTCCTCCAAAGGATCACAGCTCCTCGCCAGCAATGGAACAAAGCAGGATGGAGAATGACTTTGATGAGCTGACAGAAGTAGGCTTCAGAAAGTCGGTAATAACAAACTTCTCTGAGCTAAAGGAGGATGTGCGAACTCATCGCAAGGAAGCTAAAAACCTTGAAAAAAGATTAGACGAATGGCCAACCAGAATGAACAGTGTAGAGAAGACCTTAAATGACCTGATGGAGCTGAAAACCATGGCACGAGAACTACGTGATGCATGCACAAGCTTCAGTAGTCAATTCGATCAAGTGCAAGAAACGGTATCAGTGATTCAAGATCAAATTAGTGAAATGAAGCGAGAAGAGAAGTTTAGAGAAAAAAGAGTAAAAAGAAATGAACAAGCCTCCAATAAATATGGGACTATGTGGAAAGACCAAATCTACGTTTGATTGGTGCACTGAAAGTGACGGGGAGAATGGAACCAAGCTGGGAAACATTCTTCAGGATATTATCCAGGAGGACTTCCCCAACCTAGCAAGGAAGGCCAACATTCAAATTCAGGAAACACAGAGAACACCATAAAGATACTCCTCGAGAAGAGCAACCCCAAAACACATAATTGTCAGATTCACCAAGGTTGAAATGAAGGAAAAAATGCTAAGTGCAGCCAGAGAGAAAGGTCGGATTACCCACAAAGGGAAGCCCATCAGACTAGCAGCAGATCTCTTGGCACAAACCCTACAAGCCAGAAGAGAGTGGGAGCAATATTCAACATTCTTTTTTTTTTCCATATGTATAGTTTTCCTTTATTATTTTTTGTGTGTATGTATATATATGTATATATATTTTTCAATACTTTAAGTCTTAGGGTACATGTGCACAACGTGCAGGTTAGTTACATATGTATACATGTCCACATTGGTGTGCTTCACCCATTAACTCATCATTTAACATTAGGTATATCTCCTAATGCTACCCCTCCTCCCTCCCCCCACCCTACAACAGGCCCCAGTGTGTGATGTTCCCCTTCCTGTGTCCATGTGTTCTCATTGTTCAATTCCCACCTGTGAGTAAGAACATGCGGTATTTCGTTTTTTGTCCTTGCGATAGTTTGCTGAGAATGATGGTTTCCAGCTTCATCCATGCCCCTACAAAGGACATGAACTCATCATTTTTTATAGCTGCATAGTATTCCATGTTGTATATGTGCCACATTTTCTTAATCCAGTCTATCATTGCTGGATATTTGGCTTGGTTCCAAGTCTTTGCTATTGTGAATAGTGCCACAATAAACATATGTGTGCATGTGTCTTTACAACAGCATGATTTATAATCCTTTGGGTATACACCCAGTAATGGGATGGCTGGGTCAAATGGTATTTCTAGTTCTAGATCCCTGAGGAATTGCCACACTGTCTTCCACAATCGTTGAACTAGTTTACACTCCCACCAACAGTGTAAAAGTGTTCCTATTTCTCCACATCCTCTCCAGCATCTTCAACATTCTTAAAGAAAAGAATTTTCAACCCAGAATTTCATATCCAGCCAAACAAAGCTTCATAAGTGAAGGAGAAATAAATCCTTTACAGAGAAGCAAATGCTGAGAGATTTTGTCACCACCAGGCCTGCCTTACAAGAGCTCCTAAAGGAAGCACTAAACATGGAAAGGAACAACCGGTACCAGCCACTGCAAAAACATGCCAAACTGTAAAGACCATTGACGCTAGGAAGAAACTGCATCAACTAACGGGCGAAATAACCAGCTAACATCATAACGACAGGCTCAAATTCACACATAACAATATTAACCTTAAATGTAAATGGGCTAAATGCCCCAGTTAAAAAACACAGAATGGCAAATTGGACAAAGAGTCAAGACCCATCAGTGTGCTGTACTCAGGAAACCCATCTCACATGCAGAGACACACATAGGCTCAAAATAAAGGGATGGAGGAAGATCTACCAAGCAAATGGAAAGCAAAAAAATGCAGGGGTTGCAATCCTAGTCTCTGATAAAACAGACTTTAAACCAACAAAGATCAAAAGAGACAAAGAAGGCCACTACATAATGGTAAAGGGATCAATTCAACAAGAAGAGTTAACTATCCTAAATATATATGCACCCTATACGGGAGCACCCAGATTCATAAAGCAAGTCCTGAGAGACCTACAAAGAGATTTAGACTCCACACAATCATAATGGGAGACTTTAACACCCCACTGTCAATATTAGACAGATCAATGAGACAGAAGCTTTACAAGGATATCCAGGACTTGAACTCAGCTCTCCACCAAGCAGACCTAAAAGACATCTACAGAACTCTCCACCCCAAATCAACAGAATATACATTCTTCTCAGCACCACATCACACTTATTCCAAAATTGACCACATAGTTGGAGGTAAAGCACTCGTCAGCAAATGTAAAAGAATGGAAACCACAACAAACTGTCAGACCACAGTGCAATCAAATTAGAACTCAGGATTAAGAAACTCACTCAAAACCGCACAACTACATGGAAACTGAACAACCTGCTCCTGAATGACTACTGGGAAAATAACAAAATGAAGGCAGAAATAAAGATGTTCTTTGAAACCAATGAGAACAAAGACACAACATACCAGAATCTCTGGGACACATTTAAAGCAATGTGTAGAGGGAAAATTATAGCACTAAATGCCCACAAGAGAAAGCAGAAAAGATCTAAAATTGACACCCTAACATCACAATTAAAATAACTAGAGAAGCAAAGCAAACAAATTCAAAAGCTAGCAGAAGACAAGAAGTAACTAAGATCAGAGCAGAACTAAAGGAGATAGACACACAAAAAACCCTTCAAAAAATCAATGAATCCAGGGCTGGTTTTTTGAAAAGATCAACAAGAAAACCCTGTTTGGCTAGTTCACCTGGCTCATCTGATGGCAAGTTCCTATCTTGAGAGGACTATGAAATTAAAACCAATACAAGTGCCACAAATAACATACAACATTGTAAATCAGCACAATTTGTAGCTGGGTGAATGGAAGAAATAGTTCTATTCATCACTTCCTCATTTTCCCTAAATCTACAATCTCCAGATGTCACTACTGAATTAACAGCCAACAATTCCACAACATTACCTGGGAGACACTGGCCCTTTTTCTTCCTCTTCCTCATCATCACTTTCATTTTCTGTAAATAAATTCAGAGAAGCAGGTCACATTAAGCAATTCATACTTCACATATGACCAAATCACTGTCCAGTCATAGCACAAGGACATAACTATTCTCAGTGCAAGAATAAGGATTCTGACAGGAATATTCTAGGGTGCCCTAGATTAACTTTGGTGAGAATTAGATGACCCTGCTTTCCAGACCCACAGGCCAAAATCTCCCTCTACGTGTAGACCATAATGCCATATTCCCTGCCTGAGTCAAAGTTAAACAAAATTTTTTCCCCAAAAAAATCTCCAAAAATTGGTCCATTTTCTAAGAGTGTTGCTGCAATACGGACTTATATCACCAGATAACATGGACATTAAATGTTTAGAGGCATCTATACATGAAACACACATGATAGATAAATTTGAACAACTCTTGCTTTAAAAAGAATCTGTGATTTGGGAGGCCAAGACAGGTGAATCATTTGAGGTCATGAGTTCAGGACTACCCTGGCCAATATGGGGAAACCCTGTCTCTACTAAAAATACAAAAATTAGCCAGATGTGATGTTGTGCACCTGTGGTCCCAGCAACTCAGGAGGCTGAGGCAGGAGAATCACTTGAATCTGGGAGGCAGAGGTTGCACCAAGCCAAGATGGTGCAACTGCACTCTAGCCTGGGTGACAGAGCAAGACTCCATCGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATCCACGATGCTACAAAGAAACATTGGATCAGCCATTGCATTGACAGGGTGGAGAACCAGGGTCCAGCCTTGCTTTATGGAAATATATCAGCAAAGTAAAGAAGAAAAGTTTCCGTCCTGATTTCAGGGTGACTGTGCAGCTAAGCAAGCTGACTTAAAGGAGATCCGGATGAAAGCTGAGAGCAGTGAAGCCTGGGGAACAATATTTCCAAATACAAAGGCAAGGCTGCCAGCTTCCTGAAACAGGCATAGAAACTCCATGGACATTGTTCAGGGACAGATGACTTAATCACAGATGACAAGAGATACTGAATCGAAGCTAGGAGGCCTGACAGATACTGCCTGTGCACCTCCTGCACTCAGGTGACTATGAGATTGTCACACTTGCCTGGGGTCGAGTAACTTGATACTGGGGACTGGCAGACAAAGGCATGACATTAGCTGAGAAGGACAAAAAAACTCCCTGATATCTGTTTAGAAACCCATCATAGTTTTTTATTCAAATGAATTTGTGTTTATAGAGCCTGTCTTCAGAGTTTATCTTCCTCAGCCTAGAGAGAGGTATGAGACACAAGGAAAACAGAGGCTACCTGGGATAATGTGTACAGCATCCTCCCATTCAACATGAGAGGATGAGCCAATGAGAGTTGAGTCGACTTTGTCTTCCTCAAATGTGATTTTGGTTTTCCTATGTGGCTGGTTGGAGTCATAAGGGCCATGGCTATTTGAACAAGTGATGGCACATTCCTCCAGTGAGTCCTCAGGGACTTCCTTTTCTTCAGCCTTCGGCATCTCCCTGATGAGCCAGGTGGGACAGAGATGACAGAAGATTAAACACAGAGGGATTGGACCCCAGGGAGTCCTAGCTGGTTTTGACAGGCGGCATTAAGAGAGTGGTCCCAGAAAGCAAAATGGAGGTTCCCTTTAAGGGGGAACATGCAATCCTGTTCTCTCTGCAACAGAGCATGGCTGCCATGGGAACCAGAGAGGAAGAGAGCAGCTGGTGTTCATTGCAGTGGACAGATAGGAGCTGAGGAGGATGAAGACTCAGCTATCCCTGTATGGTGCAGACATGACACTCGGCACACATAGAGAAACATGACAGCTGCCGCACCCTGTGTCTAAGCTGGGTTATATTTCACATACTGTGGCCAAGCAAATGCGGGTTTTTGGCCCATCATAGATGCCAGAGAGGGTGTACCTCCTAGATATTCTTCATATGTTACCATCCATTACTTGTTCCTGAGTATTCAGTGTTACCTGGGGGCAGACGATTTCTGCACTTTCTCAGCCACCTCAACTTGAACATCTTCATCGTCATCGTTGTCATTTTCTGTAAATACAGAAGTGTTCGTTCAGATATTTCCCACTTCACAGTCTGCAAGCACAGTCAGCCCAATGTGCAACAGAGACATGAACATCTAGGCATGGGTCACCGTTCAACTGAAAACTCTCATGTTTTATCTTTAACAGAATGCCCTGGCATGGTTTCCTGATCCATCAGGCAATGCATTTCTGATCTGGAGGGCCACCATCAAGATGTGGCCAAATATTGAAAAGACCTTTTGCTTCCCATATCACTGGAGGCTTGTGCAGCCTCTCTCTGGACTTTGGCAGCTGTCGCCCCCATCCTGCCAGATCTGATTCCCAGGCACAGGCTTGGTGTCCTGTCACAGTTTGCATTTCAAACCTAATTCTTTCTCTTAGAAGCAGACAAACTTATCCCACAGTCCTCTATGCATCAGAAGATTTCAAGCCTCCAAGTGGCTTCTGCTGTGTTATTCAGGGACATTCTATCCATGGGGAGTGCTCCAGTCTGAAGCACTTCCTACCACGAAACGCCACCACATAAAGTGCCTTCTCCAACATCACACGGCGAGGGGCTTCATCTCATTTTGGAAAGCAGTTTTAAGTGTTCCCACATTTGAATGCTTCAGACCCTTGCAAGAGACAATTTGCCATGGAGAGAGAGAAACTCAGGAAAGACAAGTCATTCAGTCACTGACAGTTACTAAGAACATTGCCGAAAAGACACCCTGGGAACCTTCATTCTTAGTCCAGAGCTCTTTTCACTCTAACAAGCCTGCTCCTATCGCAGCCTCCTTCCTGTCCTTTAAAACTAGATAGATGCTGCCTCTTACTCCAAAGACAACCTTCCATCAAGGGAGGAGGGACAATTGCAATACTGTGACCTCCAACCCCATGGGTTTCCCAACTCCGTTCTTACCCAGGAAGTCCTGGTCATGTCATGGCCACATAAGCTTAGTGGCAAAAAACACCATTGATACAACTGTCATTGTGAAAGTATGGAGGTCTGGAGTCTCTCATAAGCCTGGGGTTTTGGGTCATCAGGGCCTATGGCCACCTTACCTGGGCTGAGCTTTTGGACAAGGTGCTGTGCCAGTCTACACCCCTCAGCCAGCTGTTCTTGGAGGTCCTGCCCCTGGGACTTGTCTGGCTCATCCGGAGTGAGGAGGGCCTGGAGATGCTGATTCAATGAGCAGGAGGCATCTCTCCCTTCCCGTAACTTCTCCCTTAACTGGGTCAGCTCTCGTTCCTGAGAGTGAACCAGGACTTTATATTGCCTAAGGTGAGACGGTAGAGAAAATTTAAGAGTGGAAAGGGTTGAGTGATCCGCTCAAATATTGCAACAGAGATTTCTGAGACAATGTCCTCAAGGAGACCTCCAAGCAGAAGGTCAGCACATGTTGGAAGGAATGTCTGTGGCTAAGAGAAAGAATAGAAAATGGTTTACAGGTTTCCTCTGTATCAGAGAGGGCTCCTGCAAGATCCTCGATGATGTTCCATTCATCTTTCCCTTCTGTAAACAAAAGTAGGTGTCTTCCTAATTCCATTTCAAAAAGACATCCTTTCAGTCCCTCACTCTGGCCATGGACATTTCCATGTGAAAATACACATAGTGCATCTTGCGGCCACTAGATACAAAGCCATGTACAGAAATGAGGCCAGGTGCAGATGGGGCGAATTGAAAAGATGAAAGAAGAAAAGAATGACAGGGTCGAGAAGGCAACATTGATTGAGTGAAAGAATGAGAAGACGCAGTCAGTCAGAAGGTGATTCTCACTAAGGGTAAGTGGGGTGGTGATGGCACACCATTTTGAGTATACTGAATGCTGCTGTGTGGTTCACACTCCTTTGGTTAATTTTGTGTTATGTAAATTTCACATCAACAATTACTTGTTTGAAAAAGAGAAAACAAGGCTCTGAGAAACAACTGCAACCCATAAATTTTTATTATCCTTCTTCTCTGCTTGATAAATACTTGTGTGTTGCGAGCCTGCCATGGCAATTCCTGCCCTTCCCCTGGCCCAGCTTAGTTCTTAAGTCTCCCCACTGAGCTGCTGTACTTCAGAGATTTACACACCTGCCCCCCTGCCTGCCCCCATGGGGTCCCCTCACCTGAGCTCCTCAGCTTGCTTGAGCTGCTCTGCAAGCTTCTCCTCCTTGAACTGTCGCTCATTCCTCAGCATAGATTTTATGAGGTCTTTGCACTCTTCATTTTCTGAGAAAAGACAGACACGCCTGCCTCAGTGGAAGGCTGGACATGCTGCTGTGGTCATTGCCTACAGGGCAGGAGCCAGGTCCATCCCAAGGACAAAACTCTCCCCAGTACCAGGGTCTAGACAGGGATTTCCACATCTTTACTCTTCAGTCTCCTGACTTTCTGGCATCTGATCCTCCAAAATTTAGAGATGAAGAGAACCTCAATGGCACATCAAGGAAGTTGACAAGATGATTCAACCACAACGAAGTGGAGTCAGAATTCACAGCCCCTGAGGTCTGACTCTGAATGCAGGGCCACTTTCCCAAGACTTGCAGCCTCTCCTCTAAAACACTGCACTGGGGCATGAAGTAGTGATTTCTTGTACAGTCGGGAAGGCCCCTAGGACTATGGGACTGATGGTTTCCCTTTTACTGGGAATTTCAAGGACAAGTATGCAAAAGATTTTAAAAATCTTTGATTTTTAAATCATATCTTCAGTTATGATTTTAAGAATCATATCTGAAGCATAAAGTGTGACACATAACACCATAAGGCCATGAAGGAAATATGCCCAAATGTTAATAAAGTTTGTGTTAATTTAGAAACAGCAGAATGAAGAACTAATAGATAGTGTTTACTGTGTGCTAATAAATGTTCTAGGAGATTGACAAGAAATAGCTCATGTAATTCACTGCAGCAATTTACAGAGGTAGGTATTATTGTAGTACCCTCTGAACAGGTGAGGAAACAGGGACAGAAAAGACAAGCAACTTGGATGGAGCCCAGGAGACAGGCCCACGGTCTCTGCTCTGTACACTGCACTGCTATCTCCACACATTCTCGGGTGCGATCTTTCTTCCTCTTTAGGAACAAGACTCTGTGCCCCAGGAAGCAGGACTTCACTCTCACCAAGCTACATTCTGCTTCTTATTCTTATTTTTATTTATCATTATTAGTATTATTTTTTTAACAGTCTTGCCCTGTCGCCCAGGCTGGAGTGCAATGGCAAAATCTTGGCTCACTGCAACCTCAGCCTCCTGGGTTCAAAGGATTCTCCTGCCTCAGCCTCCTGAGCAGGGGTGATTACAGTCACCTGCCACCATGCCCATCTACTTTTTGTATTTTTAGTGGAGATGGGGTTTCTCCATGTTTCCCAGGCTGGTCTCAAACTCCTGACCTCGTGCTCTACCCGCCTCAGCCTCCCAAAGGGCTGGGATTACAGGAGTGAGCCACCATGCACAGCCCCTACTCCCTGCTCTTGATGCTGTCACTTATAGATAGCACAGGTTCTATTAGGAGCAGACTCCTCTTGAAGCCCCTCAGAGCAGGTACTGGCTACTATCACCAAGTTTCCCTCAGAGTCACTAGAACAGAGCTTTGCATATTGGGCCTCAACAGAAACTTGAACTGAATAAAAGTTCACTAGTCTCAGACATTTAGAACAACAGACTAGATGTTATTTGTCTGCAGGATCTTACATGGTACAGAGAGGATTCTTGAAAACATGATTGAGCCTCTTGGAGAAAACAGGTCATTCTGTGCCTGTGTCAGAAATCAATAAATGGCAGTTTAACTCTAGTCCCACCCCCACCTGATTGCAAACATGGAAAGTTGCTAAATATTTTGGGACCTCTGTCTTCCAACTTTAACAAAATGTTAAAATACCCATTTCTGTTTTCCTAGAAGTATGGGGAGGATGACATTATTTTAGATGGAGAGAGCACTTAGTTTCTCAGAGAGAAGACAGGACTTCGTTCATCACTTTCGTGATGGTGAGCCTATAGATCTTACTGTATTTGTTCTGCTGGTTGGCCAGGAAGCAGGCCAGTTGAGTTACAAAACATTTCTCTTTGAGGTTTCTGAACTGCTGTTTCTTCTCTGCCAGCTGGGGATGCAATTTCTCGTTGATTTCTAGAATGTTCATCTCTGCCTTCTCGCTGGACAAAGGGCCGGCTGATACCACCATGCTGACGTTTGTGGCAGAAGAGGTGGGGCCAGGGACTGGGGAGAAGAAAGGCAAACACATGATGGGTTAAAAACTGGTGAAATCAAATAGGTTTAATCACACTGAGGGATGTCAGCGGCAGCCTTGTCTACTTATTTGAAGATGATGTTTCCCTGGTTTCACTCTTGTCATCTCCAGTCTTGATCTCCTTTAAGTCAACTTATCTTAGCTATGCAGTCACCTTGAAACCAGGACATAAACACTTCTACACTTTTCTTGCTTATAAGTTTCTATAAAGCAAGGCTTGGCCCTGAGATTTTTACCCCATGAGTGGCCAATGTTTCTGTGTAGCACAAAAGGTTTCATTTTGCCTTTTTAATTTTTTTCTTTTTTGGTTTTTTGTTTTTTGTTTGAGACGGAGTCTCACTCTGTCACGCAGGCTGCAGTGCAGAGGCACAATCTCAGCTCACTGCCACCTCTGCCTCCCGGGTTCAAGCGATTCTCATCCCTCAGCCTGCCAAACATCTGGGATTACAAGCGCCAAGTAACATGCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCGCCATCTTGGACAGGCTGGTTTCGAACTCCTGACCTCAGGTGTTCCGCCCACCTTGGCCTCCCAAAGTGCTGGGATTAAGATGTGAGCCAGCACCCCCGGTCAGAGACTTTTTTTTTTTTTTTTGAGATGGAGTCTCGCTCTGTCTCCCAGGCTGGAGTGCAGTGGCACAATCTAGGCTCACTGCAAACTCCGGTTCCTGGGTTCATGCCATTCTCCTGCCACAGCCTCCCGAGTAGCTGGGACTACAGGTGCCCAACACCGTGCCCAGCTAATTTTTTTTTTTTGTATTTTTAGTAACGACGGGGTTTCACCGTGTTAGCCAGGATGGTCTCGATCTCCTGACCTCGTGATCCACCCGCCCCAGCCTCCGAAAGTTCTGGGATTACATGTGTGAGCCACCGCGCCCGGCCGAGACTTCTTATTAATAGCTAAGACAAGCCAATGAAAAGGAGAGAGAGTCTAGCCTGAGAGGAGTGAACCAGGGTGGGAGGATCGTCTCAGCCGATCCTCCCACCTAAGTCTCCTGAGCAGTTGGGACTAGAGGCACGCAGCACCATGCCTGCCTAATTTTTTGTATTCTTTGTAAAGATGGGTTTCACCATATTGTCCAGGCTGGTCTTCAACTCCTGAACTCAAGTCATCCTCCCACTTGGGCCTTCCAAAGTGCTGTGATTATATGTGTGAGTCACAGAACCTAGCTCCATCCTAGTTTCTGACTAAAACAATATGTGCGTATACAGCCTGTCCTCAGAATTGATCTTCCATAGCCTAGACAGAGGTATGAGACACAAGGAAAATAGAGGCTACCTGGGAGAATGTTTACAGCATCCTGACATTCATCATGAGAGGATTCTCTGTCTACAACCAGAGCTGAGTTGACTTTGTCTTCCTCAAAGGTGATGTTGATGTTCTTGTGAGGCTGGTTGGAGTCACAAGGGCCGTGGCTATTTGAACAAGTGATGGCACATTCCTCCAGTGAGTCCTCAGGGACTTTGCTTTCTTCAGCCTTCTGCACCTCCCTGATGAGCCAGGTGGGACAGAGATGACAGAAGATTAAACACAGAGGGATTGGACCCCAGGGAGTCCTAGCTGGTTTTGACAGGCGGCATTAAGACAGTGGTCCCAGAAAGCAAAATGGAGGTTCCCTTTAAGGGGGAACAGGCAATCCTCTTCTCTCTGCAACAGAGCATGGCTGCCATGGGAGCCAGAGAGGAAGAGAGCAGCTGGTGTTCAGTGCACTGGACAGATAGGAGCTGAGGAGGATGAAGACTCAGCTATCCCTGTATGGTACAGACATGACACTTGGCACACATAGAGAAACACGACAGCTGCCACACCCTGTGTCTAAGCTGGGTTGAATTTCACATACTGTGGCCAAGCGAATGCGGGCTTTTGGCCCATCATAGATGCCAGAGAGGGTGTGCCTCCTAGACATTTTCATATGTTACCACCCATTACTTGCTCCTGAGTATTCAGTGTTACCTGGGGGCAGATGATTCCAGTACTTTCTCAGCCTCCTCAACTTGAACATCTTCATCCTCATCTTCGTCATTTTCTGTAAATACAAAATGTTCGTTCAGATATTTCCCACTTCCCATTCTCCAAGCACAGTCAGCCCAATGTGCACAGAGACATGAACATCTATGTGTGGTTCAGCATTGTACTGAAAACTGTCATGTTTTATCTTTCACAAAATGCCCTGGCATGGTTTCCTGGTCCATCGGGCAATGCATTTCTGATCTGGAGGGCCACCATCAAGATGTGGCCAAATATTGAAAAGACCTTTTGCTTCCCATATCACTGGAGGCTTGTGCAGCCTCTCTCTGGACTTTGGCAGCTGTCTCCCCCATCCTGCCACAGATCTGATTCCCAGGAACAGGTTTGGTGTCCTGTCACAGTTCGCATTTCAAACCTCATTCTTTCTCTTAGGAGAGGACAAACTTGTCCCACAGTCCTCTATGTGTCATGAGACTGCACAGGCCCTCCATGTGGCTTCTGCTGTGTTATTCAGGGACATTCTATCCATGGGGAGTGCTCCAGTCTGAAGCACTTCCTACCACCAAATGCCCCCACATCAAGTGCCTTCTCCAACACCAAACGGAGAGGGGCTGCATCTCATTTTAAAAAGCATTCGTAAGTGTTCCCATATTTGGATGCTTCAGACCCTTGCAAGAGACAATTTGTTTGCCTTTGCAGATGGAGAGAGAGAAACTCTGGAAAGATAAATCACTCACTCACCGACAGTTACTAAGAACATTGCCAAAAAGACAGCCTGGGAACCTTCATTCTTAGCCCAGAGCTCTTTTCACTCCAACAAGCGCCCTCCCATCACAGCCTCCTTCCTGTCCTTTAAAACTAGACAGATGCTGCCTCTTGCTCCAAAGACCACCTTCCATCAAGGAAGGAGGGACACTTGCAATACTGTGACCTCCAACCCCATGGGTTTCCCATCTCTGTTCTTACCCAGGAAGTCCTGGTCATGTCATGGCCACATATGTGTAGCAGAAAATAACCCCACTGATACAACTGTCATTGTGAAAGTATGGAGGTCTGGAGCCTCTCATAAGCCTGGGGTTTTGGGTCATCAGGGCCTATGGCCACCTTACCTGGGCTGAGCTTCTGGAAAAGTTGCTGTGCCAGTCTACACCCCTCAGCCAGCTGTTCTTGGAGGTCCTGCCCCTGGGACTTGTCTGGCTTATCCGGAGTGAGGAGGGCCTGGAGATGCTGATTCAATGAGCGGGAGGCATCTCTCCCTTCCCGTAACTTCTCCCTTAACTGGGTCAGCTCTCGTTCCTGAGAGTGAACCAGGACTTTATATTGCCTAAGGTGAGACGGTAGAGAAAATTTAAGAGTGGAAAGGGTTGAGTGATCCGTTCAAATATTGCAACAGAGATTTCTGAGACAATGTCCTCAAGGAGACCTCCAAGCAGAAGGTCAGCACATGTTGAAAGGAATGACTGTGGCCAAGAGAAATAATAGAAAATGGTTTACAGGCTTCCTCTGTATCAGAGAGGGCTCCTGCAAGATCCTCGATGATGTTCCATTCATCTTTCCCTTCTGTAAACAAAAGTAGGTGTCTTCCTAATTCCGTTTCAAAAAGACATCCTTTCAGTTCCTCACTCTGGCCATGGACATTTCCATGTGAAAATACACATAGTGCAACTTGCAGCCACTAGATACAAAGCCATGTACAGAAATGAGGCCAGGTGCAGATGGGGCGAATTGAAAAGACGAAAGAAGAAAAGAATGACAGGGTCAAGAAGGCAACATTGATTGAGTGAAAGAATGAGAAGACGCAGTCAGTCAGAAGGTGGTTCTCACTAAGGGTAAGTGGGGTGGTGATGGCACACCATTTTGAGTATACTGAGTGCTGCTGTGTGGTTCACACTCCTTTGGTTAATTTTGTGTTATGTAAATTTCACATCAACAATTACTTGTTTGAAAAAGAGAAAACAAGGCTCTAAGAAACAACTGCAACCCATAAATTTTTATTATCCTTCTTCTCTGCTTGATAAATACTTGTGTGTTGCGAGCCTGCCATGGCAATTCCTGCCCTTCCCCTGGCCCAGCTTAGCTCTTACGTCTCCCCACCGAGCTGCTGTACTTCAGAGATTTACACAGCTGCTCCCCCACCTGCCCCCATGGGGTCCCCTCACCTGAGCTCCTCAGCTTGCTTGAGCTGCTCTGCAAGCTTCTCCTCCTTGAACTGTCGCTCATTCCTCAGCATAGATTTTATGAGGTCTTTGCACTCTTCATATTCTGAGAAAAGACAGACACGCCTGCCTCAGTGGAAGGCTGGACATGCTGCTGTGGTCATTGCCTACAGGGCAGGAGCCAGGTCCATCCCAAGGACAAAACTCTCCCCAGTACCAGGGTCTAGACAGGGATTTCCACATCTTTACTCTTCAGTCTCCTGACTTTCTGGCATCTGATCCTCCAAAATTTAGAGATGAAGAAAGGGAACCTCAAGGGCACATCAAGGAAGTTGACAAGATGATTCAACCACAACGAAGTGGAGTCAGAATTCACAGCCCCTGAGGTCTGACTCTGAATGCAGGGCCACTTTCCCAAGACTTGCAGCCTCTCCTCTAAAACACTGCACTGGGGCATGAAGTAGTGATTTCTTGTACAGTCGGGAAGGCCCCTAGGACTATGGGACTGACGGTTTCCCTTTTACTGGGAATTTCAAAGACAAGTATGCGAAAGATTTTAAAAATCTTTGATTTTTAAATCATATCTTCAGTTATGATTTTAAGAATCATATCTGAAGCATAAAGTGTGACACATAACACCATAAGGTCATGAAGGAAATATGCCCAAATGCTAATAAAGTTTGTGTTAATTTAGAAACAGCAGAATGAAGAACTAATAGATAGTGTTTACTGTGTGCCAATAAATGTTCTAGGAGATTGACAAGAAATAGCTCATGTAATTCACTGCAGCAATTTACAGAGGTAGGTATTATTGTAGTACCCTCTGAACAGGTGAGGAAACTGAGGGACAGACAAGACAAGCAACTTGGATGGAGCCCAGGAGACAGGCCCACGGTCTCTGCTCTGTACACTGCACTGCTACCTCCACACATTCTCAGGTGCGATCTTTCTTCCTCTTTAGGAACAAGACTCTGTGCCCCAGGAAGCAGGACTTCACTCTCACCAAGCTACACTCTGCTTCTTATTCTTATTTTTATTTATCATTATTATTATTATTATTATTATTATTATTTTTACCAGTCTTGCCCTGTCACCCAGAGTGGAGTGCAATGGCAAAATCTTGGCTCACTGCAACCTCAGCCTCCTGGGTTCAAAGGATTCTCCTGCCTCAGCCTCCTGAGCAGGGGTGATTACAGTCACCTGCCACCATGCCCATCTACTTTTTGTATTTTTAGTGGAGATGGGGTTTCTCCATGTTGCCCAGGCTGGTCTCAAACTCCTGACCTTGTGATCTGCCCGCCTCAGCCTCCCAAAGGGCTGGGATTACAGGAGTGAGCCACCATGCACAGCCCCTACTCCCTGCTCTTGATGCTGTCACTTATAGATAGCACAGGTTCTATTAGGAGCAGACTCCTCTTGAAGCCCCTCAGAGCGGGTACTGGCTACTATCACCAAGTTTCCCTCAGAGTCACTAGAACAGAGCTGTGCCTGTTGGGCCTCAACAGAAACTTGAACTGAATAAAAGTTCACTAGTCTCAGACATTTAGAACAACAGACTAGATGTTATTTGTCTGCAGGATCTTACATGGTACAGAGAGGATTCTTGGAAACATGATTGAGCCTCTTGGAGAAAACAGGTCATTCTGTGCCTGTGTCAGAAATCAATAAATGGCAGTTTAACTCTAGTCCCACCCCCACCTGATTGCAAACATGGAAAGTTGCTAAATACTTTGGTACCTCTCTCTTCCAACTTTAACAAAATGTTAAAATACCCATTTCTGTTTTCCTAGAAGTATGGGGAGGATGACATTATTTTAGATGGAGAGAGCACTTAGTTTCTCAGAGAGAAGACAGGACTTCGTTCATCACTTTCGTGATGGTGAGCCTATAGATCTTACTGTATTTGTTCTGCTGGTTGGCCAGGAAGCAGGCCAGTTGAGTTACAAAACATTTCTCTTTGAGGTTTCTGAACTGCTGTTTCTTCTCTGCCAGCTGGGGATGCAATTTCTCGTTGATTTCTAGAATGTTCATCTCTGCCTTCTCGCTGGACAAAGGGCCGGCTGATACCACCATGCTGACGTTTGTGGCAGAAGAGGTGGGGCCAGGGACTGGGGAGAAGAAAGGCAAACACATGATGGGTTAAAAACTGGTGAAATCAAATAGGTTTAATCACACTGAGGGATGTCAGTGGCAGCCTTGTCTACTTATTTGAAAATGTTGTTTCCCTGGTTTCACTCTTGTCATCTCCAGTCTTGATCTCCTTTAAGTCAACTTGTCTTAGCTATGCAGTCACCTTGAAACCAGGACATAAACACTTCTACACTTTTCTTGCTTATAAGTTTCTATAAAGCAAGGCTGGGCCCTGAGATTTTTACCCCATGAGTGGCCAATGTTTCTGTGTAGCACAAAAGATTGCATTTTCCTTTTTCGATATTTTTCTCTTTTGGTTTTTTGTTTTTTGTTTGAGACGGAGTCTCACTCTGTCACGCAGGCTGCAGTGCAGTGGCGCAATCTCAGCTCACTGCCACCTCTGCCTCCCGGGTTCAAGTGATTCTCATCCCTCAGCCTGCCAAACATCTGGGATTACAAGCGCCAAGTAACATGCCAGCTAATTTTTGCATTTTTAGTAGAGATTGGGTTTCGCCATCTTGGACAGGCTGGTTTCGAACTCCTGACCTCAGGTGTTCCGCCCACCTTGGCCTCCCAAAGTGCTGGGATTAAGATGTGAGCCAGCACCCCCGGTCAGAGACTTTTTTTTTTTTTTTTTTTGAGATGGAGTCTCGCTCTGTCTCCCAGGCTGGAGTGCAGTGGCACAATCTAGGCTCACTGCAAGCTCCGGTTCCTGGGTTCATGCCATTCTCCTGCCACAGCCTCCCGAGTAGCTGGGACTACAGGCGCCCAACACCGTGCCCAGCTAATTTTTTTTTTTTGTATTTTTAGTAACGACGGGGTTTCACCGTGTTAGCCAGGATGGTCTCGATCTCCTGACCTCGTGATCCACCCGCCCCGGCCTCCCAAAGTGCTGGGATTACATGTGTGAGCCACCGCGCCCGGCCGAGACTTCTTATTAATAGCTAAGACAAGCCAATGAAAAGGAGAGAGAGTCTAGCCTGAGAGGAGTGAACCAGGGTGGGAGGATCGTCTCAGCCGATCCTCCCACCTAAGTCTCCTGAGCAGTTGGGACTAGAGGCACGCAGCACCATGCCTGCCTAATTTTTTGTATTCTTTGTAAAGATGGGTTTCACCATATTGTCCAGGCTGGTCTTCAACTCCTGAACTCAAGTCATCCTCCCACTTGGGCCTTCCAAAGTGCTGTGATTATATGTGTGAGTCACAGAACCTAGCTCCATCCTAGTTTCTGACTAAAAGAATAACAATATGTGTATATACAGCCTGTCCTCAGAATTGATCTTCCATAGCCTAGACAGAGGTATGAGACACAAGGAAAATAGAGGCTACCTGGGAGAATGTTTACAGCATCCTGACATTCATCATGAGAGGATTCTCTGTCTACAACCAGAGTTGAGTTGACTTTGTCTTCCTCAAAGGTGATGTTGATGTTCTTGTGAGGCTGGTTGGAGTCACAAGGGCTGTGGCTATTTGAACAAGTGATGGCACATTCCTCCAGTGAGTCCTCAGGGACTTTGCTTTCTTCAGCCTTCTGCACCTCCCTGATGAGCCAGGTGGGACAGAGATGACAGAAGATTAAACACAGAGGGATTGGACCCCAGGGAGTCCTAGCTGGTTTTGACAGGCGGCATTAAGACAGTGGTCCCAGAAAGCAAAATGGAGGTTCCCTTTAAGGGGGAACAGGCAATCCTCTTCTCTCTGCAACAGAGCATGGCTGCCATGGGAGCCAGAGAGGAAGAGAGCAGCTGGTGTTCAGTGCACTGGACAGATAGGAGCTGAGGAGGATGAAGACTCAGCTATCCCTGTATGGTACAGACATGACACTTGGCACACATAGAGAAACACGACAGCTGCCGCACCCTGTGTCTAAGCTGGGTTGAATTTCACATACTGTGGCCAAGCGAATGCGGGCTTTTGGCCCATCATAGATGCCAGAGAGGGTGTGCCTCCTACACATTTTCATATGTTACCACCCATTACTTGCTCCCGAGTATTCAGTGTTACCTGGGGGCAGATGATTCCAGTACTTTCTCAGCCTCCTCAACTTGAACATCTTCATCCTCATCTTCGTCATTTTCTGTAAATACAAAATGTTCGTTCAGATATTTCCCACTTCACATTCTGCAAGCACAGTCAGCCCAATGTGCACAGAGACATGAACATCTATGTATGGTTCAGCACTGTACTGAAAACTGTCATATTTTATCTTTCACAAAATGCCCTGGCATGGTTTCCTGGTCCATCGGGCAATGCATTTCTGATCTGGAGGGCCACCATCAAGATGTGGTCAAATATTGAAAAGACCTTTTGCTTCCCATATCACTGGAGGCTTGTGCAGCCTCTCTCTGGACTTTGGCAGCTGTCGCCCCCATCCTGCCACATATCTGATTCCCAGGAACAGGCTTGGTGTCCTGTCACAGTTCGCATTTCAAACCTCATTCTTTCTCTTAGGAGAGGACAAACTTGTCCCACAGTCCTCTATGCATCATGAGACTGCACAGGCCCTCCATGTGGCTTCTGCTGTGTTATTCAGGGACATTCTATCCACGGGGAGTGCTCCAGTCTGAAGCACTTCCTACCACCAAATGCCCCTACACCAAGTGCCTTCTCCAACACCAAACGGAGAGGGGCTTCATCTCATTTTAAAAAGCATTCGTAAGTGTTCCCATATTTGGATGCTTCAGACCCTTGCAAGAGACAATTTGTTTGCCTTTGCAGATGGAGAGAGAGAAACTCTGGAAAGATAAATCACTCACTCACCGACAGTTACTAAGAACATTGTCAAAAAGACAGCCTGGGAACCTTCATTCTTAGCCCAGAGCTCTTTTCACTCCAACAAGCGCCCTCCCATCACAGCCTCCTTCCTGTCCTTTAAAACTAGACAGATGCTGCCTCTTGCTCCAAAGACCACCTTCCATCAAGGAAGGAGGGACACTTGCAATACTGTGACCTCCAACCCCATGGGTTTCCCATCTCTGTTCTTACCCAGGAAGTCCTGGTCATGTCATGGCCACATATGTATAGCAGAAAAAAACCCCACTGATACAACTGTCATTGTGAAAGTATGGAGGTCTGGAGCCTCTCATAAGCCTGGGGTTTTGGGTCATCAGGGCCTATGGCCACCTTACCTGGGCTGAGCTTCTGGAAAAGTTGCTGTGCCAGTCTACACCCCTCAGCCAGCTGTTCTTGGAGGTCCTGCCCCTGGGACTTGTCTGGCTTATCCGGAGTGAGGAGGGCCTGGAGATGCTGATTCAATGAGCGGGAGGCATCTCTCCCTTCCCGTAACTTCTCCCTTAACTGGGTCAGCTCTCGTTCCTGAGAGTGAACCAGGACTTTATATTGCCTAAGGTGAGACGGTAGAGAAAATTTAAGAGTGGAAAGGGTTGAGTGATCCGTTCAAATATTGCAACAGAGATTTCTGAGACAATGTCCTCAAGGAGACCTCCAAGCAGAAGGTCAGCACATGTTGAAAGGAATGACTGTGGCCAAGAGAAAGAATAGAAAATGGTTTACAGGCTTCCTCTGTATCAGAGAGGGCTCCTGCAAGATCCTCGATGATGTTCCATTCATCTTTCCCTTCTGTAAACAAAAGTAGGTGTCTTCCTAATTCCGTTTCAAAAAGACATCCTTTCAGTTCCTCACTCTGGCCATGGACATTTCCATGTGAAAATACACATAGTGCATCTTGCGGCCACTAGATACAAAGCCATGTACAGAAATGAGGCCAGGTGCAGATGGGGCGAATTGAAAAGACGAAAGAAGAAAAGAATGACAGGGTCGAGAAGGCAACATTGATTGAGTGAAAGAATGAGAAGACGCAGTCAGTCAGAAGGTGATTCTCACTAAGGGTAAGTGGGGTGGCGATGGCACACCATTTTGAGTATACTGAATGCTGCTGTGTGGTTCACACTCCTTTGGTTAATTTTGTGTTATGTAAATTTCACATCAACAATTACTTGTTTGAAAAAGAGAAAACAAGGCTCTAAGAAACAACTGCAACCCATAAATTTTTATTATCCTTCTTCTCTGCTTGATAAATACTTGTGTGTTGCGAGCCTGCCATGGCAATTCCTGCCCTTCCCCTGGCCCAGCTTAGCTCTTACGTCTCCCCACCGAGCTGCTGTACTTCAGAGATTTACACAGCTGCTCCCCTGCCTGCCCCCATGGGGTCCCCTCACCTGAGCTCCTCAGCTTGCTTGAGCTGCTCTGCAAGCTTCTCCTCCTTGAACTGTCGCTCATTCCTCAGCATAAATTTTATGAGGTCTTTACACTCTTCATACTCTGAGAAAAGACAGACACGCCTGCCTCAGTGGAAGGCTGGACATGCTGCTGTGGTCACTGCCTACAGGGCAGAAGCCAGGTCCATCCCAAGGACAAAACTGTCCCCCGTACCAGGCTCTAGGCAGGGATTTCCACATCTTTACTCTTCAGTCTCCTGACTTTCTGGCATCTTATCCTCCAAAATTTAAAGACGAAGAAAGAGAAATTCAAGGCACATCAAGGAAGTTGACAAGATGATTCAACCACAACGAAGTGGAGTCAGAACTCACAGCCCCTGAGGTCTGACTCTGAATGCGGGGCCACTTTCCCAAGACTTGCAGCCTCTCCTCTGAAACACTGCACTGGGGCATGAAGTAGTGATTTCTTGTACAGTTGGGAAGGCCCCTAGGACTATGGGACTGACGGTTTCCCTTTTACTGGGAATTTCAAAGACAAGTATGCGAAAGATTTTAAAAATCTTTGATTTTTAAATCATATCTTCAGTTATGATTTTAAGAATCATATCTGAAGCATAAAGTGTGACACATAACACCATAAGGCCATGAAGGAAATATGCCCAAATATTTTATTAGTATGACAGGCAGCATCAAGATTTAGATTAGTTGTGTTAATTTAGAAACAGCATAAGATTAGTTTGTGTTAATTTAGAAACATCAGAATGAAGAACTAATAGATAGTGTTTACACTGTGCCAATTAATGTTCAAGGAGATTGACAGGAAATACCTCATGTAATTCATTGCAGCAATTTACAGAGGTAGGTATTATTGTAGTACCCTCTGAACAGATGAGGAAACTGAGGGACAGACAAGACAAGCAACTTGGATGGAGCCCAGGAGACAGGCTGAGGGTCCCTGCTTTGCACACTGCACTGCTGCTTCCACACATTCTCGGGTGTGATCTTTCTTCCTCTTTAGGAACAAGAGCCTGTGCACCAGGAAGCAGGACTTCACTCTCACCAAGGTACTCTCTGCTTTTTATTTTTATTTTTGTTTTATTTATCTTTTTGTTTGTTTGTTTTTTGATGAGTCTTGCCCTGTCACCCATGCTGGAGTGCAATAGTGCAATCTTGGCTCACTGCAACATCTGCCTGCTGGGTTCAAAGGATTCTTCTGCCTCAGCCTCCCGATTAGTGGTGATTACAGTTGCCCGCCACGACGCCCATCTACTTTTTGTATTTTTAGTGGAGATGGGGTTTCTCCATGTTGCCCAGGCTAGTCTCAAACTCCTCACCTCGTGCTCTGCCCGCCTCAGCCTCCCAAAGTGCTGAGATTACAGGAGTGAGCCACGTTGCACGGCCCCTACTCCCTGCTCTTGATGCTGTCACTTATAGATAGCACAGGTTCTATTAGGAGCAGACTCCTCTTGAAGCCCCTCAGAGCAGGTACTGGCTACTATCACCAAGTTTCCCTCAGAGTCACTAGAACAGAGCCTTGCCTGTTGGGCCTCAACAGAAACTTGAACTGAATAAAAGTTCACTAGTCTCAGACATTTAGAACAACAGACTAGATGTTATTTGTCTGCAGGATCTTATATGGTACAGAGAGGATTCTTGAAAACATGATTGAGCCTCTTGGAGAAAACAGGTCGTTCTGTGTCTGTGTTAGAAATCAATAACTGTGAGTTTAACTCTAGTCCCACCCCCACCTGATTGCAAACATGGAAAGTTGCTAAATACTTTGGTACCTCTGTCTTCCAACTTTAACAAAATGTTAAAATACCCATTTCTGTTTTCCTAGAAGTACAGGAAGGATGAAATTATTTTTGATGGAGAGAGCATTTAGTGTCTCAGAGAGAAGACAGGACATCATTCATCACTTTCATGATGGTGAGCCTATAGATCTTACTGTATTTCTTCTGTCGGTTGGCCAGGAAGCCGGCCAGTTGAGTTACAAAACATTTCTCTTTGAGGTTTCTGAACTGCTGTTTGTTCTCTGCCAGCTGGGGGCGCAATTTCTCGTTGATTTCTAAAATGTTCGTCTCTGCCTTCTCGCTGGACCAAGGGCCAGCTGATACCACCATGCTGACGTTTGTGGCAGAAGAGGTGGAGCCAGGGACTGGGGAGAAGAAACCCAAACATATGATGGGTTAAAAACTGGTGAAATCAAATAGGTTTAATCAGGACTGAGGGATGTCAGTAACTGAAATTCTTAACTTACTGTTGTGAAAAATGTGATCACTCCCCACAGCACTTTAGGATCCTTCACCACAAAAACAAGGTTCGAGGTGCCTGAACTCAGAGCTGAAAGCACTGCCAGTAGCTCAGACTCTGATAAGAGTGAGGTAGACTGTGGCCAGCGTGCCAGGTAACCGTCTGCAGTTGCAATAACAGAATTAGAAGGTGGGGGTGTCATGGAATCTTAGGAGCCCTGCATTCCAATTGCCCAGGCTTTCCTGAAACACAGGCACCCTAGTCTCACCTGAGGGTCACCACCAATGGGGATCATTCCTTCAGCATTCACTCTCAGTATTCGTGTACCCTTGTGATGATGCCACAGACCCGTGTCTTTCCCAATACATCTAAGCATATTCCTCACTGTTTATCTCTTGTCTGTACAACATCATCAAGGCAGAAACAGTTTCCCAACAGGTTGTATTTTCTTAATGGTAGTCATGAAGTCACCCCACCTGCTCTCAGTTAAAACAGAGCTTAAGGCCTTTCCACAGGTGTAAGATATCAAACTTTTAGCCTGCCCTGATTTCCTCTGGGTCTTCTGCAGTTTTGTCTGTATCCACTAGAAAGTGAATGAATAATTCATTTGTAAAAAATGTTGTCTTTCCTGTCTCAGTATTCTTCTTGCTGTTTCCCATTGTTATGTTGATTTCTTTTTTCTCACTGGGGCACCATCTTTGCTTTTCATTACACTCTAGACCAGTTTGACATCCCTATGTCCAGAGCTCTTCCTCTATGTGGGTTGATTTGGTTTTTGATGTCACTGAGCGCTACATTTTATACTTGTCACTTATGGATGTCATTCTAGTGTCACAAGAGCTCTTTTCAAGGTATCAAGTGATCAAAATCATTTATATAGAGATCTCCTGAAAACATGTGTGACCATCTATCTTGGGAAGTTTCATAAACCTGATGCTATTTTGTTGTTTCCATTTTGTTTTCCCATATACTGAAAAGAACAGGGCCATGAGCGGTTCTTATGCAATATGGTTTGATATATATTTTGTTGAGATGACCTAACACCATTGATTTTGGGTTGCATTCCACTAACAGAACATGGCAAGATCAAGGTTATGGTCACGGTTGGTTGGTGATCCTCAGTGTTGCAGTAGAAGGTGAGTTTGAGATGAGAGGAATGAGTAGGAAAGAGTGATCCCCTGAACCACCTCCTCGCTTTCTCAGCTTTCACCCCACCTAGGTTTTGTGAGCCTGGAACTTGGGAGACTGTTCTGTAGCCCAGGTCTCCTAAGATTGGCTGCTGGACTTGCCTGAGTTGAGGGTGCGGTGGGTTGACCCTGGGCTGCCCAGCATTCATGTGGTAGTGAAGGAAGGAGGACTGGATCAATCCCATTTCAAAGCATGTCTCTCTGCACTCCACACTGTCCTCCAATGACACTGTAAGGAAACCGCTTTAAGACGTATCAACGGCTTTAAGTAAATGTATTTTCTGGCATCTGGGAGACCTGACATTCTGTGTCATAATGAAAATCTGTCATGTTTCTTTATTTTAAAAATGATAAAACTGCAGGTTCACAGAGTTACATGGCTTACTTGAGGTCACACGGGGATGAGTTTTCAGCACTGCCAATAAAAGCAATCACATGAATTATTCAGTAATTATTCATAGGATCCATATAATTCAGTAAATATTCACATAATTATTTACTAGTTGTTCATTGACCAATTCGTACAAGGCATTTTGCTCAAAACTGTGTTTATATTTGGACATTGTATCTTCATCATAATCCTGTGGTAATGCTGTTATCCGTAAGTAACAGGTAAGAAACCTGAAGAGGAGGGATAGCAAATCATGTATTTGGACATATTTCCTTTTTTTTTTTTTGGTTTTTGTGATGCTGGAAGAATGACCAGAATGAGTCATAGGAAGAGTATACATTCCTGTAGTATTTTCCAGGACAGAGGTGTGACCTCCTAGAGTACTGGGACCAAAATTCCCAAGTGTCTGCAACCTTGCTTTAACAGTATGGGAGATCACCTCTATCACCTGGAATTCCCCTGGAACTCTGGAATATACAAGAGAAGTATGAGACTTGGGTCTTCCCTTGGCTGTGTTTAATTCACTCTTCTATGGAATACCAATGATTCTCACTAAGACTGGCCTTTTCATAAGCACAATGTGCATTTTATGGAGAAGATTTTACACTTTGCTCTATTTAGAAAGAATAAATATGAGCAGTGGTTTAGGTTTTATGCCCTGGACTTAATATGTTTCTGATTCCTGTTTTGAGATTAAATTCTCATGTAAATAGAAAAATACTTATTATTTCTCATAAGGCCAAGTTTGTTATTAGTTTGAGTTTTTGAAGATGAAGCACAAACTTTTGATTTTATCTTTGTCTGTCTCTGTCAGCGCCACTCGTTGTCTCTCAGTATGACCTGGACTTGCCCCTGCACTTACCCTTGTCCTGCTGAACCATCTCCATGCACTGTCCAATTCCATCAGTGATTCGGGCTCCTTCCAAGGCTCCCTGAAAAGGGCACAGAGATCAGGACATTAGGCACATTCCGGACACAAAGGCAACCCATACTGTAGAGTGGGCAGCTGTGTTTCCACTTCCCTAATATTCCAGTGATGTCCTCAAACTGAAAGGAACACTTTCCCTTTTTAGGGGTCTGTTCTTCATGTCTCAGTGCCTCTGATCTAGTCAACACAACTGTCCTGAATGTGAAAGAACTTGCTAAATTTCTAGTTTCTTGTTAGGTGGCTAAAATAGATTTATAAGACTTCCTTACTTACCCATGACTGCTGAAGTTTGAATTCTTAGCAGTACGATTCGTTTTCTTGTAAGGTGAGCAGCTTAGGAAAGATTGGCCATCTTCCTGTGCAAAAAGAGGCAAACTTAATTTCTACTCAAAGCATGCTTGAATTTGGAATCAGGGCTTCCACTCTTCCGAAGTTGGAGTGTCACTGCGACAGGCATGTGTCCCGAAGGGCTCGTGTCTCTGCTATACTCAAAGTTTAAATGGAGCCCAGCAAGCCAGATGTCCTTTACTTCTAGGTTCCCTCAACAGTTTCTCCTCCGCTTTAGAGACCGCATTGAAAATATTCTTGTTCTGCTGTTGTGTTTTGGCTTTGGAATGATGTGATGCAGCTCAATGGGTCCCACCCCCAACTTGATCAAAGTAAGAAACAGCTGGGAAAGTCAGTGCAAATACAAGTTCATTGTCCTCCTTGCAGGGATTCTGATTCAGAGGGCTCAGGTGGGGCCTGGAATGTTTGTTAACATGACTCAGATGTGCAGTCAATTTGGGGACTCACTGACAGCATTGACCTTACAGTTTATGGGATGATTCTTTCTGTTTGGTGATGAAGAAACTGAGGCACACAGAGTCTGTAACTTGCCCAAGTTCCCCTTGTTGTAAGTCCTGGAGCCAGATCTCAGGTGGACCAGTGCTTCTCTCCCCTATACCTCATTTCTGAGAAAAGGAAATCTTCTGCAATTTGACTTCTTTCATCTAACACATTTCCTCACAACATGCAGCCAGCATCATATTTTGGCCACTTACTATTAAAGTGAGATGCTTTTTTTTTTTTTTTTTTGAGACAGGGTCTTATTCTGTCACCCAGGGTGGAGTGCACTGGTGATTATAGATCACGGCAATCTTGAACTTCTGGGCTCAAGCGATCCTCCTGCCTCAGCTTTCCAAGTAGTTGGAACTATAGGCACACATCACCATTTCTGGCTAATTTTATATTTTTCATAGAGACAAGGTCTTGCTATGTTGCTCAGGCTGGTTTTGAACTTCTGGCCTCAAGCGATCCTCCCACCTAGGCCTCCAAAAGTGCTGGGATTACAGAAGTTAGCCACTGAACCTGGCCCTGAAATGCTTTTATTTCTTTCTTTTTTTTAATGAAAATACTGGACATGGAGATGTGGAAAGACACCTTGCTTTATTACTTTTGTTGTTATTATTATTTCTACAGTAGAATTTATACATCACAAAATTCACCATTTTTAAGCATACATTTCAGTGTCTTTTACCATATTCCAAAACTTTCGCAACCATCGCCACTACCTAATTCCAGAATATTTTCATAATGCCAAAAAGCATGCCTGTACCTATGGGCAGACACTCTCCAATTCCCCCCTTCTTGCGCTCTCTGACAACCACTAATCTACCTTCTCTATATATTGATGTACTTGTTCTGGGCACTTCCTCTATATGGAATAACAAAGTGTGGTATTTTCTATCTGCTTCTTAGAATATTGTTCTCAAGTTTCATCCTTTCTAGCCTGCGTCAGTACTTCAACTTTTTATGGCCAGATAATATTCCACTATATGGTTATACCACATTTTGTTTATTCATCAACTCATGGTGGTTTAAGATGTTTCCACTTTTTAACTATTAGGAATAATGCTGCTGTGAACAGCTTTGTACAGGTTTTTGAGTGAACATCTGTTTTTCATTTTCTTGGTTATAAACCTAGGAGTGCAATTGCTGCATCATATGTCACTTTATGTTTCACTTTTTGAGGAACTCACACACTGTTTACTAACTTCAGTAGCTATATCATTTTAGATTCCCAATAGTAATATATGAGAATTCCATATTCTCCATCACTTTTGAAACATGTGTTGTCTTTATTTTTTTCTTAAGTCATACTGCTGGGTGTGAAGTGGTATCTCATTTTGGTTTAAATTTACATTTTCCTAATGACGAAAAACATTGAACATCTTTGCATGTGCTTCTTGGCCATTTGTGTGTTTCCTTTAGAGAAACCTCTACTCACAGCTTTTTTTCCCCATTGTTAAATGTGGTTGTCGTTTATTGCTCAGTTATATGAATTCCTTATACACTCTAGGTACTAGACCTGTGTCAAACATACAATTTGGAAATAGTTCTCCCATTATGTGGATTATCTTTTCACTTCCTTGACAGTGTCCTTTGAAGCATACAAGTTTTTTATTTTAATGAAGTCCATTTATCTATTTTTCGGTTGTTTGTGCCTACTTAAAAAATGTCTAATCCAAAATCACAAAGATTTGTACCTAGGTTTCCTTCAAGACATCGTCTTTTGAATGAGAACTTTCCTGGGTTTTAGAGGAGGGTGGACATTGTTTATTGATGCCTCCTGTCCATTACCGATGTTTCTCTTGATTGTTATTCATATGCTCACCACCCCTCCATGGAGCATCCATGGCCTGTGACAGAGCTCTGGGGACTGATATCCTTCCACTGACTTTGGCGCTGGTGAGAGCCCTGGTCATGTGATTCAGCTTGGCCTTAACCCGACCCAGTTGCACATATTCCTCAGGCCCTTTAGAGTTGAAGTCGAGACCTCTCTGAGAACGCTTGCCAGCCCATGCTCTTCTAAGGCTGGAGCAAACTTCCTCCATCTATTCCAGACAGAGGGGACTGCAGGGGTTGGACTCACTCAAGATATCTCTGGTGTTAGAAAGAAGACCTGTTTCAGGCTTTGGGGAAGATTGTTCAATATGAACTAGGTCCTCTCTAATTATTTTTACCGTATGTGTGACTTCTTTCTAGAAACAAGGGAAGAATATTTATGTTAGAACATTTTGTCTATTCTTTGTCAATTGTTGTTTATCTACAATTTTAACATGGATAAAGGAGAGTTCAGTGTCAATATATTCTTAACAACTAATTACGGCTCATGTCCACCGCCATGCGATCATATTTAAATCTGTCAACTATCCTGTTACTTAGGTATTATCCTGTTCCTGATGAGAAAACAAACTCAGAAAGATTGCAAAATTTCCCTAGGTCACAAAACTAGTGAGGAGAGGAGTAAGAATTAGATATCCGTTCCTTTTGGCCTTCAAAGCTAACCTTGTACCATTAGATCAAACTGATTTACATACTTTTGCTGGAATTAGTCTCAGACTTGTGGTTCTCACTTGATTTTCCCAAGGAAACAGTGTGCCACTTTAATATCATTTCAAACTTTGAAATTTAAAACTCTTTTTATTATACTTTTTTGTCTTTGTTCTATTCCGTTGCTTTTGGTTTCTTCTCAACGGATCCCTCTTATTTATATGCTAAATATTTGTTACCTATTTTCTGTCAATTTTCACCTTTTTGAGTGTTTGTTATCTGTCTGTTGTATGCTAACAGTTTTTCACTGAGGTAAAATTTGCGTAGAGTATACTGCAAAAAAACCTAAAGGCACAGCTTAATAAATTTTAATATAATTATAATTGTAAAGTAACACCCAGTTAAAGACAGAGAACATTTTCCCCCATGCCACAAAGTTCTGATGTGGTCCTTGCCAGTCAATACTCATCCCCCAAATGAAGAATATATTCTGAATGTTGTCACTGCCTTAGCCCCTTTGTGTTGCTGGAAAGGAATACCAGAGGCTGGGTAAGTTATCAAGACAAGAGGTGCCTTTTGCTCATAGTTCTGCAGGCTGTACAAGAAGCATGGCCCCCGCATCTGCTCCTAATGAGGGCCTGAGGCTGCTTCCACTTGCAGCAGAAGGTGAAAAGGAACCAGGGTGTGCAGAGATCATATGGCGAGAGAGGAAGCAAAAGAGAGCAAGGAAAGGTGAGAGGCACTTTTTAATAACCAGCTCCTACAGGAACTAAGAGAGTGAGAATTCACTCACTACCTTCTCCCAGGGTGGGGATTCATCTATTCATGAGGGATCCACTCCCATGACCCAAACACCTCCCATTTACCCCCACCTCCAACACTGGGGACCACATTTGAACATGTGATTTGGAGGGGACCAATATTTAAACTTAGCAGCCACCATAGATTCATTTTGCTTGATCATGTGCTTCATAAAAATGGAATCATTTTGGCTGGGCCTGGTGGCTCATGCCTGTAATCCCAAGACTTTGCAAGGCTGAGGCGGGCAGATCACCTGAGGTCAGGCGTTCAAGACCAGCCTGGCCAACATGGTAAAACCCTGCCTCTACTGAAAATACAAAAAATTAGCCAGGCATGGTGGCCGGTGCCTGTAATCCCAGGCACCAGATATGTACTGGTATCTCATATGTACAGGACATGTACTGGTATCTCATTGTTGTATTGATTGATGTTCCTGATGGCTAAACTGTAGAGCATCTTTTCCTATGCTAATTGACCATTCATGTATCTTCTTTTCTTAAGTACCTATTCAAGTCTTTTGAGAAATTGTTTCATTGTGCTGTTTATCTTATTAAACTTATATATATATACATACATATATATACAAATACACTCTAAAAAACCCCTTTGTTGGAAATAAATATATCTCCTATATTGTGGTTTCTTTTAATGTTCTCTTAATGTTCCCTGTTTGGAGATAACGATAGATAATCTTCAAAAAGGTGAATATACACACCCACACCCACCCACACACATACACACACACACACACACACACACACACACACACGTGAGCCACCGGATCCAGCCTGTTGAATTTATTTCTAAGCACAACATGTATTTAGATGTTACTTGAAATGAAATTGTATTTTTATTTCATTTTCCAAATGCTCATTGCTAATACACAGAAATACAAAAGACTACTTCTATTGAGCTTATATTCTGCAACATTACCAAACTCACTAATTACTTTTGGCAGATTTTTATAGATTTCTAGGATTATTAACATACACAGTCATTATCTGTGAATAAAGACAGCTTCAATTCTTTCTTTTCAATCTTTTCAATACTTTTATTTTTCTTACTTTATTGCATTGATTTAGATCTCTAGTATAATGCTGAATTGAAAGAATAACAACAGATATTCTACTTTTTTCTCTGATTTAATAGAAAAGCATTCAATCCTATGCCATTTAATATAATGTTACCTCTGAGTTTTTTTCAAATCTACCCTTAATAGGGTTGAAAGTGTTGCCTTCTCTTCTTATCATGCTGAGAGTTTTCTGGGGTTTGTTTTTATAAATCATGAAAAAAGTTTTCAATTGTGCCAAATGCTTTTACTGTGTATGACAAGGTAATCATATGGTTTTTCTCTTTTGCCCTGATAATACATAACATTACATTTTCTTAAATATAAAAAAGATTTCTTGAATCAAGCTAGGACAGTTTTTTTAATTATAAACTTTTAACAAATATATTGAAATATAACTTACATGCAATTGAGATGCATGAAAGTGTATAATCATTAAAGTGTATAATTTTAAGAGTTTGAGCACACTATACACGAGTCAAAGAGAAAGGACAGAAAATACTAACGATGGCTCAGCACATGTGGTCTATCTTGCTGAATGCTCTATGTGAGTTTGAGAAGAGTTATTTGTTAGCTGTTCTTAGATGTATTTTGCTTAAATATCGACCTGGCTAACATGTGTCATTGATTGTGTGAATTAATTTTGTTCTAGTGGGCAGTAAAATTACTGTCTGATCACTTTGGACTTATGTGGACTGGTTCATGTTTTATTACAACGGATTCATGGAAAGCCCACAGCATTTCCCAAGACCCTCTAATTTGGCAGGACTCAATCACCAATCCACCCCTTTGTGAATTTGTCAGGGTTTGCTTTTAGGCTTTAGCAGGTTGGTCTACAATAGGCCTTATTGAAAAGTGTGACACTTATTCCTAAAGCACATCCATTCTAGTGTCTCAGTTGGATACCTGGGTGCTAATGAGGTGTGCATGAGTTCTTCCCACCATGGATGGCAGAAACTCCATCATACATTCCCCAACCCTCCTCCACCTCAAGTACCTCTGGTCCAAACTCAATTTCATAGCAGCCACCCCTCTGTTAAATCTGTTAGTCTTTTCCTTGTGCAGGTAGAGTCCACTCCTTGATAAGTATGCACATGGAACCCCACATAGACTTTGAGAGCTGCACCTTTGATCAGCTGTCTCCTCACTGGTGCCCTGCCCTGCAGATTGCAGTTGCTTCAGCCGTCTTGAACTCTGATCTCTGCCTTCTCAGCTCAGTGAGCTGCCCTGCCCTGAGTGGACTCTAGCTCACTATGCAGCTGCTGAGAAATTCTCCCCAAACAACTAGGAAATCATGGGGCTTCCCCCTTAAGTTTTCTCTTGGACTGCCTGTTGTACACTGCTGAAAACAATTTTACGTTTGTTTATGGAGGCAGGGTTAGTCTGATATGATTTATTCTAACAGACAGAAGCAGAAATCTGTTATACTCTTTTAATTACTGTGTCTTTATAATATTATGGTAGACAGAATCCTAAGATGACCCCCAGTGATCTTTGCTCTTATATAATCACTTCCTCCTGAGTGTAGACAAAGCTACTGAGGAGATGTCACTCCTGTGATTGTGTTACAATTTATGGCAAAAACAAGTTAACAGATGTAATCGAGATCCCAAATCGGTCCAATTTAAGATAGACAGATTATCTGATGAGCTTGACCTAGTGAACGTGAGTTCCTTGGAGGGACTGAGGACTTCCTGGAGAGATGTGAAGTGCAGGAGGGTTTCCATGCAGGGCGATCCTCCTCTGCTGGCTGGAGGAAGCATGCAGTGGGAACATGGGAGGCCTCTAGGAGCAGCGAGAGGCCCCTGGCTGACAGCCAGCAAGAAAACAGAGATCTCAGTCCTACAGTCACAAGGAACTGAACTCAGCTGACAACCTGAGGAAACTTGAGAGGAAGTTCTTCCCCAGAACCTCCAGAAAGAAACCCAGCCTAATTTCAGCCTGTGAGGCCCTGAGAAGAAGACCCAGAGAATCCAGGCCTGAACTTCTGATCTGTGGACACTGCAAGAAAATAAATCATTCTTATTTTACGCCGCTAATGCTTGCAGTAATTTAGTATGCAGCAATAGAAAATTAATACAAATAAAATGGAGAAGGCTTTGGAGTGGGGACAAGAAGGAAACGGTGGGAGAGGGATGCCTGTATGCTGATATGGTTGATGCCTGTATGGTTGAATTGGGTCTACCGTTCCTCATCTAATTAGCTATGGTCTATTAAGGTGCATAGCTACACACAAATATTGGTACTACGTTCAATTCAGAGGAATAAGATATTGCATTCTTGACAGTAGACAAGAACACCCTGAATTTGGGGTCACTGTATCATAAGTCATGTTATCAGGTCCCTCTAGGAAGGCTTAGAGGAAGATTTCCAGGATACACTTGTGACAACATTGAAGGCTTCTTTTTTCCCCAAAGGGACCCGATCTCCCCTCAGTCGAGAAGCTCCAAGTCTCTGAACTGGATGCCAGGTTATAAATTCCCCCTATACTGACTCCATCAGGCTTCTGTCCTCAGAACTAGAGTTTATCAGTAAAAGATAGACTCATGGGAGTCTAGGCATTTATTCTCTTATTTTATATAAATCAGTTAATGTGCAGGAACAAAACAGACTTTGAAGAAAGACACTCACAGTTGCCACAGGAAAACACCTTCAACATCCTCATGAGTCATCATGGGTGTTCTGTTGGGAGGACTTGATAGGAGGCTTTCCTCCTCACGGGCTAGTGCAGATCCAGGGGAAATGTCATCAAGTCCTCCATTCGGAGGGTAGCAGCTGAGGCTGCTGATTCGTTAGGCCTCCTGCAGCTGGAGATGCAAGTAGTGCATTTTCATGGCCACCGCAGGGCCCTCAGTTTAGCATTCTTCAGAGCCAGCATCCAACAAGCCACAGAAGCTCTGAGTATTTCCCTTTCTTCAGTCACCCACATAAATGGCTTCAGGGCCTTCTGGGGAAGGCCTGAAGGAAGATTTACAGCATACACTTGTGGCAGCATTGAAGGCTTCACTCTTCCTCAAGGGATCCAATCTCCCCTCAGTCAAGAAGCTCCAGGTATCTGAACTGGATGCCAGGTCATAAATTCCCACTATGGTGACTCCATCAGGTCTCTGTCCTCAGAACTAGAGCTTTTCTAAGTGTAACGTAAGTTGATTTCTTAGTAGATGTCCCATCCATTACATTCCCAGACACCTCACAATGATTCGAATGATTAGTAACCACCACATATCCCTGCCTCTCAGGGAAATCCCTCCCGCCTTGTCTCTAGATGGCCAAGTCCCACGGCCTGTCCTCTACTCTTCCAGAACCCTGTTGTTCTCACTGACAGCAGGGAGGGCAAATCCATGCAGCAGCTCCCGCCATGACCTCCAGCCTGCAGAGGATGGGCGCCACAGGACTTTTAAACGCATGCCGCTGTTCCCCTCACCTGTGCATTTCTTAACGCCTTGGTGAGGAGAATGTCTCTGGATCTTCCTTGATGGGAGCTAAAGGAACAAAGGTAAATAATGCTATGGGACCCACTGAGAACTGGGGCTGTGGAAGAGTGGCCACTGAAGTAATAGACAGATGCAGCTATTGCCAGATACTCAGTGCCAGAGCAGGGAGGGACAGGGAAGAAATACGGACCTCACCTTCCTCTCACTTCCAGGATCCATCGGGGGCCCTCCATTGCTAAACCTAACTAGAAGTGTGCACGCACGGGAGCCAGGGATGCATTCTAGGAGGGACGAGCCCCGAGTGGCATGAGACAGGATGGAAATGAGTGGACAGTGGATCTGTGGGAAGAAGGAGGGGATGTTATGGGAAAACAAAAGGAGAATACTAGCTAAGAACGCTAGGTGACATTAATATTCCGAAGTCTGTGCTCATATTCAGCAAAGAAAGTTCAGCATAAAGCACTAAATAAGGAGTCAAGATATTGTACTTCCAACTGTTGTTCCAACAGCTGTATTATGAAGGGCCACTTTATTTCATGCCTTTCTAATTTGACCTAAAGTGCCAGGTGGCACTGGGGCTGGCACAGCCTTGCTCAATTATGTGTTGCAGAGTACACAGAGACTGCCAGGCTGAGGGAAGATGCAAGAGAATAGAAGAGATGCTCTCAGGGAACAAGAGACCACATGGCCCCAGAGTCAGGGGCAGCATCAGCCACTGTCAGCTGCTCATTTTCCCAGACAGAGCCCACAAGCCTCAGCCATGCTTTGCTTCTGCAAGACGCTTCTTCACCTTTTCAATAAACCTGCCTGAATTTAAGCTGACAGGGTTTATTTCTCCTTCATCATAAATGAAATTCTTCACCACAACAATCTCCAATGAATTTTGGGCACAGCAGGCAGGCCCATTTCTGCTTCTGTTCCACTATCTCTCCTGTAGGTTGAAAAGGAGGAGGTACTGAATTACCTCCAAATGTTCCTCTGGCTCTGATATTCTGTTATTCTGGTTCCTTTTTGGCTACTTTGTTTTTGGTAGCGTGTATCCTAAGGCGTCCAGTTGAACAACTTTTGTCTACTGTGTCCAGGCATTCCTGGTGGTATTTCAGATAAGACTCTCTTGGGTTGCTGAACTCACAACCACTGAACCAATTCTATGACCATCTGTTTCATGGCCACATGTTTGCTCATTTTATATGTACATAAAGGGAGGGGACAGACAGCAAACTTGCGTGTTACAAATTGTATCATCTTAAAAAGGAAACAAGGCAACACTTTGCAATAAAACCTTAAGATGCATGAAATTTGAGCCTAATGCAATAAAGGATGCCCATAAAATTCTTATCTAAAGAATGTTTCGAAAATTGTTGTACAAGGACATCATCATTTAAAGTGATATGAAGAAACCTTCTCAGCTAAGCATATGGGCTAGATTAGAGAGAAAAATAAAGGACCCATCTCTGCCCTGGAAAAACTGCTGGTAGCATCTTTCAAAAAGCTCTCTGTGTTTGAGTACGCACCTTGATCCATAGGCTCACATTTGATCCCAACTGGCAGCTGCTTCTTGGCATTAACATTGGATTCCCAACTAGTAAATCTTACCAAGATCTGACTTTCTGCAGATATAATATTATTTTGTTTGACCATCCTTATCTTCAAGGGCTACCAAGAAGGAACCAAGAATTTATTTACCTCCCCAAGGGAAAAGGTTTTACCAATGAGACCCTTTCTCACCATGACCCCAGGACCCCATATGCCCTGTTCACTTGAGTGCCCTGTGTGGCCTGATAGAAGCTCATGCTGGTCACAGGATTCCTTATATGACTAGCCTCCTTCCTGAATCCCAATTTCATGGTGGTGGTCATGACAGGTGTCCTGTATCCCATGCTCATGTCCCTGAAGTCACCAGCCTATCTCCAGTTAGAAAAAATTACATGTATATAGAGAGGCCTCTTTGGAAGGAGCAAAAGCTTTCTCACCTTCGTACACTAATGGTTGGAAGGTACAACAGCATATGCACTTTGGGAAAAAATATCTGGCATATTCTTACAGAAACAAACAACTACCTATTCTATGACTCAGTAATTCCTAAGCATTTATCCAAGAGAAACTAAAACCTATGTCCAGAAAATGACTTATACAAGAATGTTCATAGCAGTTTTATTCATAATACAAAAAACTGGAAACATTCAAGTATCTGTCAATACAAGAATGGATCAATAAACTGTGATACACTCATTCCATGGAATGGCTAAAGGAACAAACTGGTGACACACAGAACAACATGGATGAATCTCAAAAACATTTGGAGTGCGATAGAAGCCATACCCAAAAAAGTGTGAGAAAAAAAGATAAATAATAATGGTTCCAAGAAATGCACAGCAGACAGCCCAGAGGCAAAGACCCACAGGACGGCGGGCCGGTCCCAGGCTGTCGATCCTAATTAAGAAACTTCTGCTGGATTTTGCCCAGCTCCATTTCCAAACTATTTTGGGTCAGTGACTTCTTTATCCCTTCCATGTTGCCTCATTTTGAACTAGAATCACTGTAAGTGTTATTCTATGTCTGTCACATCATTCCACAGTAGGGGCAGATAAGCTGTTTAGAATGGCTAAAATTCAAAAAGGTGAACACACCAAATGCTGTCAAAGATGAGGAGCAACCAGAACTTTCCATCGCTAGTGGAAATCAAAAGGGTACAGTCACTTTGGAAAACTTAAGTTCACTCAAAATCCTGCACAGAAGTACTTACAGCAATTTTATTCATCATCGCCAAAACTTGGAAGTGCCCAAGATGTCTTTCACCAAGCGAAAGAATAAACAAACTGTTGTAGCCATACAAGGAAATCTGATTCACTGATTTTAAAAAACAAGTTATCAAGCCATGAAAAGACATGAAGGAACTTAAAGTACATAATGCTAGAAAGAAGCCAGTCTGGAAACCCACATACTGTACCACTCCAACTCTAGGACATTCTTGGAAAGTCAAAAAGATAGAAGTAGTAAAATGGTGAGTGGTTGTCAGGGGTGGAGGAGAGGAGGACGCGTGAAATGGTGAAGCACAGGGAATTTTCAGCAGTGAAACTCTTTCGCATGATGCTGTATTGGGGATTTAGGACATTATGTAATTGCCAAAACCCATAATCTGTGAAACTCAAAGAATGACCTCTAATGTAAACTATGGACTTTAGTTGATAATGACGTATCAACAGTGGTTCATCAATTGTAATGAATGGACCACACTAATACAACATACTAGTAGGGAAAATTGTGTGCTGGAGGACAGGGGAGCCTAGGAGAACTCTCTGTATTATCCACTCAAGTTTTCTGTAAACCTAGAACTGTTCTAAAAAATAATGTCTATTAACTGTTTTTTTAATTAGGATGCAGCAGCCCCATATCAAGGTTTTGGTGGCATCCTGTAATTGTGTGGTTAGTACTTGGCATTGAAGTGCACCAACCTGGAGTCAGAGCAGTTGGAGATTTCAAGGCCTGTGCCATTTACCTCTAACCCTGGGGTGCCCCTGGAATACAGATAGCAGATCGGTTAAGGAGAAGCAGCCTCAGCAATCTAGACAGTGCAGGTTTCTGGTGAGGACAGGTAAAAACCATCTGGGTGGGCAGAACTTGGTGAAGACCAGAAACCACTGAGACTCAGCAGCTGCCGCAGTGGCACCCACAAATCAAAGGAGGGGGCTGGGAAGAGCTAAGGGCTACTGGATGAGCTCTCTGCCTGCAAGACAGAAGCAGATCCAGAGATTTTGGAAAATAATGTAGGTTTCAGTACAGTGTGATCTCTTCAAAAAAGTAGAGAGAATGAAAAGGAAAGAAAAAGAGAGAGCATGAGAGAGAAAGAAGAAGAAAAGAAGAAAGGAAGAAAGGAAAGAAGGGAGGGAAGGAGGGAAGGAGGAAGGAAGGGAGGGATGGAGGGCGGGTGGGAAGGAGGGAAAGAATAAAAAGAGAGAGAAAGAGAGTTGGAGGGAAGTAGGGAAGGAAGGAAGGAAGGAAGGAAATGAACAAATTTACATGAAGATGAGAACAGTGGGGAAACTTACACCACCAATATTTTCCATTAACAGGAACACGCTAAGTAGTTATTAGAGAAAGACACGCTACTGTAAAACAATATACTGTTTCCATGGGGTACAACAACCCCTTCCTCCTCCTCTGAAACACATTCTATCTCTGGCTCACTGTTGCCAGAGACACTGAGTCTTGTCTTTGGATACGTTCTGGTGCCCACAAGAATGAGATGAGACAGTGGATCCCAGAACACCAGGCCACGAACTTCCCTGTTGCTCCTTGTCCACTCCAGAAGCTACCCAGCTGCAGTTGGGGACCTCAGCCCCTGGGTCTGATGTCATCCATTTGCCTTTCTCAATGGACTTCTCTCCTTGCACTGGCTCCTACTCCCCCAGGACCTGTGGGTGACCACATGAGAAGAACACAAACAGGCCATGCCCCTTTCTTTCTCCCCCTCTCAATGCCTGCAGTAGTGGGTTCCATGGGGTAGTGACCTGAGATTTACTCATTGTGGGGCCTCTAGCCCAGAGCAGGGCCTACTACCTCACAGTCACCCCATGAATGCTCAGTGAAAGAAGACGTCCACCACAAGGTCCTGGGGAACCAAGAATTCCACTGTGGCCCATAAATTCTAAGTCTACAGGATTCTGGAATGGGAGATGGGAAAGGCCTTCAAAAGTGGCCACTTTTAACCCATTATACTGGCAACTGAGCCATGTTTCCCCATCCTGGACACATCCAGAGGGCACTGCCTAAAACCAGACACATCTCCCCACCCAGGACAGTGTAGGAGCCTTAGCCTGGGGGATGCAGGTGGACAGGGAGGGGGTGAGCCACCAAAGCTGAAGAGCAGAAAGCAGGTGAAAGGGGACAGCAGGGTGGAAACAGAGAGAAATGGGGGCAGAGAATGGGGGGTGAGAGGGGAAGAGTGAGGAGAGGGATGCAGATCTAGCTAGTAAGGAAAAGTCCTGGAGAGAACACTGTCCTCTCCTGAAGTAAAATCACTTCTACCTGACCACGGCACTGCAGCTCATGGGCAGCACATGCTGTGGATATTTGTTCATTCATTTAACAAATATTTATTTAATATCTGTTGCATGCCAAGCAAGGCCCTGCAATGTTTAGGGACCTTGACATCTTCCCTTCACATCTGAGTCATAATACAAAGAGGACTCTCTGACCCCACTGAGCTGGCAATGCCTCGGGATTTTTACCTGTTGGATCTGGCAGCTCTTGATGTCAGCCCACACCATGTGAGGCTGCTCTTGGTGCACCCAATGGGGAAGTTTCTACATCAGGGCCTCGGAGAATCCACTGGAAGCCCTGGACAGTGGGAGTCAGCGGCATCCCCAGTGTGGAGGCCAAGAGCACACAGTGCTTAAGCTCCAGGCACCCTCAGGAGGACGGCAAGGGACAATTGGCTGGTGAGAGCCCGGGTCACCGGGAACCTTCGCCTGGGTCTAAACAGGATTTGCCTTCAGATTGCCTGTGAGATAAAAGAGAGAAATCAAGGTTAACGTTGAGATTTAGGGCTTCGGTAACTTGAAGGATGGAGCTGCCATTTACGGAGACTGGGAAGACCCAGGGAAGAGCAGATTGAAAGGTGGTGGGAACTAGAGGTGGTTGGGTTTCTGTCATATGTAATCAACAGTCCTGACCAGCCTGGGCAACATAGTAAGACCCCGTCTGGGAAAAGAAAAAAGGAAAAATAAGCTGAGCATGGTGGTGCACACTTGTAGTCTCAGCTACTTGGGAGGCTGAGGCAGGAGGATTCCTTGAGCCTTCAGTTAGCGGTTAGTGAGCTATGATGGCACCACTGTACTCCAGCCTGGGGGGAAAAAAATAAAGAGTCCTGACTAAATACTAGAGTAGCCAGGGAAGTTTTCACAAAGTAAGTAATATTTGAGGCAGATCTTAGTGAACAAGAATTCCATTATTTCTGTTAGGGAATTAAGAGAGTGTGGGTGTCGTTAGTTAATGCTTATTAAAGTAGCTTTGGAATCTCATCTACTGGTCTAGCTGGTCTATCTGTACACGTATATTGTATATGCTGTCTCTCTGAGCTTTCGCTAGGTTATGCTACGGTAACAAAAGCCCCAAAATCTTAGCAGCTACACATACGAAGGTTTATTTTTCATTGACATGTCCTTTTATGGCAGGTTGACTGTGACTCTACTCTATACAAGCTACTTTATTTGTTAGATGGTGAAAACTGTGATACTCGGAGGTTGTTGAATATGGTATTAGTATGTTCATTCATTCATTCATTTAAGAAATATTTATTCAATATCTGTTTCATGCCAGGCAAGGTCAAGTACTGAGAATACACTGGTGAATCAAAGAGACAAAATCTCTAATTGCCAGGAGCTTATATTGAAAATCAGATTAAACACATACAAAATCATCATAATAACAACAATGAATACTATATTCATAAATAATAGCTGTAAGAGATTTTAGTACATCTTTTAAATTAGAAAAATATAAAAATTATTAAAACTAAAATGGCCAGGTGTGATGGCTCATGCCTGTGATCCCAACACTTTGGGATGCCAAGGTGGGAGGATCATTTCAGCCCAGGAGTTTGAAACCAGTCTGGGCACTACAGGAAAACCCTGTCTACAAAAAGGAGAAAATTAGCCGGGCACAGTGGTGCATGCCTGTAGACCCAGCTACTAAGGAGGCTGAGGTGGGAGGAGTGCTTGAGCCTGAGAGATCAAGGCTGCAGAGAGCCATGATCATACCACTGCACTCCAGCCTGGGCGACAGAGCGAGACACTGTCTCAAGAAAAAAAAAAAAATTATTTGATGTAGTCCTAAAACTATTATGTAGAATACTATTGTTTATATCACAGCACGTGAGCCCCTTAAATGGCTTAACACTTATTTAGGTATGATCCATAAAGCTTTTCTGGTAATTAAGTATACTTAAGAACAATTAAGTATAAAAGAGTTACTGCCTTGACAGGAAGATTGTAAAAATTTTAAAAAGACAAATAAATAAAAGAGTAAAAACTGTAGCTCTGTGAGGCTCAAATAACATCTAATTCAAGTCACAATGAACATCTAGCAATCATTCTGAACACCATATAATTCACTTAATACGTTTTGCCTGAACACCCAACACATCTGAATTACCAACACCCATATGTAGCCAAGAAACTGGCAATCATTTATAAATTATCACCTATGACTCCATCTGCTCTACGCACTTATTTTTTAAATTTTATTCATTTATTTATTATTTTTATTTGTTGTAGAGATGGGATCTCACTATGTTACCCAAGTTGGTCCAGAAACAGAAACAGACCCACACTAATTTCATAAATCAGATGACCATACAGTCATTCGATTTATGAAAAAAAGTGCCACATGGTGCGGAAGGAAAAGGATGGTCTTTTCAATAAATGGTGCTGGATCAAGCAGACACATCCATGTAGTAAAAAGTGAATCATAGCCAGGTGGGGTGGCTCACACCTGTAATTCCAGCACTCTGGGAGGCTGAAGCGGGCAGATTACTTGAGCCCAGGAGTTCGAGACCAACCTGGGAAACATGTTGAATCCCCATCTCTACAAAAAATATGAAAATTAGCCAGGCATGGTGGCACATGCCTATAGTCGCAGCTACTCAGGAGGCTGAGGTGGGAGGATCACTTGAGCCAGGAGATGGAGGTTGAGTGAGCTGAGATCCTGCCACCACACTCTAGCCTGGGCAATAATAGACTGAGGCCCTGTCTGAAAAAAAAAAAAAGCAAAAACTAAAATAAAATCGTTATAAGGTTAACACAGAAAAATGTGTTCATACTCTTAGGTTAGGCATTGATTTCTTAAACAGGACACAAAAAACAGTAACCATAAAGGAAAAGATTAATAAAGTATAATTTCATTAAAATGAAGAATCTCAGGCTGGGTGCAGTGGCTCATGCCTGTAATCCCAACCCTTTGGGAGGCCGAGGCAGGTGTATCACTTGAGCCTAGGAATTCCAGACCAGCCTATGCAACGTGGCAAAACCCATCTCTACTAAAAATACAGAAAACAGCTGAGTGTGGTGGTACTCCCCTGTAGGTCCCAGCTACTTGGGGGCTGAGGCAGGGGGATCACCTGAGCCTTGTGAGGTCAAGGTTGCAGTGAGCTGTGATTGTGCCACTGCACTCCAGCCTGGGCGATGGAGTGAGATCCTGTCTCAAAAAGAAAAAAAAAAAAAGAGAATCTCCCTTCATGAAAAAACACCATAAAAGAGTGAAAACGCAAGCTACAGATTGAAAAAAGGGAAATGCAATACATATAAATCCTAGAAAGGAGGCATATCCAGAATAAAGTATTACAAATCAACAGGAAAACAAGCATATCAATGAAAACTGGATAAAAAGATTTAACAGGCACGTCACAAAAGAGGACATATAAATGGCAATAAAAGATACTCAATCTCAATGAAACCACACTGATATATTACTGCACCCCTACTAGAATGGCAAAATAATTTTTAACTGACAGGTATCAGCGAAGATGTGGGGTAACCAGCATATCCCTGCTAAATGGTACAACTACTTTGGGAAAATGTTCAACAATATGTAATACTAAAGTTTTATCATTCATATACCTCTAAAACCAACAATGCCACCCCTACAAATATACCCCAGACTAGTAATGTTCAATTTCTTGATCTGTGGTGGTTCACTTGGTAAAAATTCATTACTTTTTTTTTTTTTTTTTTGAGACAGGGTCTCACTCTGCCATCCAGGTCGGAGTGCACTGCCATGATCACGGCTCACTGCAATCTCAACCTCCCGGGCTCTGGTGATCCTCCCAACTCAGCCTACCGGGTAGCTGGGACTACAGGCACACGCCACCACACACAGCTAACTTCTGTATTTTTAGTAGAGAAAGGGTTTTGCCACATTGCCCAGGCTGGTCTGGAAATCCTGGGCTCAAGTGATCTACCCACCTTGGCGTCCCAAAGTGCTGGGATTACAGGTGTGATCACTGCGCCCGGGCCACCTGCACATGTAAAATTGTGAACTTCTGTATACTTCAGTAACTTTTCCAAGATTTCTTTGACGCAAAGTTCTCAGAAATCTTAAAGCTAGCATTTCAGAATAGAAAAAGTAGCTTCTGGTTCACTAGTGAAATTTTACCAATAGAATTTAAAAACAAAAAGCTACTAACGCATATCAGCTCAGAACACTACCAGCAGATCTTTTCTTTAACTTCCTGAAGCACTGGGATTCATTCTTTTGGCAAAGAAAGGATGAACAACACTGTAACCCAAAGAAAAGATACCACTGCCAGAAAAGACTTCTTTTCGAAAGCAGCTCTAAGCAAAAGATAGGAGGAAAACAAGGAAGCCAGGCCAAACGTCTTGGTTAACTCTCCGCTGAAAGGACGCCACATGAGATGATCTAAGAAGCCAGCCAGCCAGCCAGACGCAGGGAAATCACAGCAACTCTTTGGAGTGCAAACAGCAACCCCACAATCCAATCTACCCGAAATCCTGCGGTTCATTTGAGGCTTGCCCCGCTAGTCAGGAGGTGATTCAGTGATGGCTACAAATGCTGCTCATGTGCATCCTGGAGCTGGCACACCTGGCTTGCCCATCACCAGCCTGGAGACACCGCCAGGAGCAGAAGCCCGGAGGCCAGTAAAGACCCCAACTTTGCAAGTCAGGGGCGCGAGCGCGCTCGCCTCTCAGGTCCGCAGAGGGAACGGATTTCTGGCCTGGAGGGTGGGGTGCGGGGTCAGTGTCCTCTACAGGATATAGGAGGACGTGCCCCCGAAGCTGCTCCGTCCCTCCACCCCCTGGGATGCCACAGAACACCCGCCAGCGAGTTTCTTCCCCAGCGCCCACGAGAGTTGGGCTGCGGGCGGCAGCGGCAGGCGAAGAATCCAGCGCGGGGAACTCAGGCCCCGGCGGTGCACGACCCCCCACAGCCCCCACCCGCCCCCGCGCTCGCGCAACAAAACTTGCCACGGCCGCGCCTCGACCCAGCTGTGCGCCCGCGGGTCCCGGATTCACCGCCCGCCCAGCCTGGCGCGGCGCCCTCACCTCAGAAACGCTGGGTGGACTTCGCGTAACTTCCCATTCACAGGGCAGCCGGCAGCCGCGCCGCCGCGCCTCGGCCCAGCTCCTGGCGCCGCAGATCGCCCGTCCCGCGTTCCCAAAAGCACCGCGCTCGCTCAGAAGCTCGGGCAGCCTCGCGACCCTCACCTACGCCTCCCAGTACCGCCGCTGTCTCAACCGCCACCCAGCCCCTCGCCTGCGCCTGCGCCTGCAGCCCACTGGCTCCTCAGGATCCCGATGGGCGTGTCAGGAT", 16888922 - 1); //coding region between 34..528 var codingRegion = new CodingRegion(16890438, 16918516, 1007, 4651, 3645); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 28, 16888922, 16890681, 4408, 6167), new TranscriptRegion(TranscriptRegionType.Intron, 27, 16890682, 16891301, 4407, 4408), new TranscriptRegion(TranscriptRegionType.Exon, 27, 16891302, 16891413, 4296, 4407), new TranscriptRegion(TranscriptRegionType.Intron, 26, 16891414, 16892127, 4295, 4296), new TranscriptRegion(TranscriptRegionType.Exon, 26, 16892128, 16892302, 4121, 4295), new TranscriptRegion(TranscriptRegionType.Intron, 25, 16892303, 16893674, 3844, 4121), new TranscriptRegion(TranscriptRegionType.Exon, 25, 16893675, 16893846, 3673, 3844), new TranscriptRegion(TranscriptRegionType.Intron, 24, 16893847, 16894473, 3672, 3673), new TranscriptRegion(TranscriptRegionType.Exon, 24, 16894474, 16894525, 3621, 3672), new TranscriptRegion(TranscriptRegionType.Intron, 23, 16894526, 16895567, 3620, 3621), new TranscriptRegion(TranscriptRegionType.Exon, 23, 16895568, 16895731, 3457, 3620), new TranscriptRegion(TranscriptRegionType.Intron, 22, 16895732, 16899636, 3456, 3457), new TranscriptRegion(TranscriptRegionType.Exon, 22, 16899637, 16899688, 3405, 3456), new TranscriptRegion(TranscriptRegionType.Intron, 21, 16899689, 16900981, 3404, 3405), new TranscriptRegion(TranscriptRegionType.Exon, 21, 16900982, 16901187, 3199, 3404), new TranscriptRegion(TranscriptRegionType.Intron, 20, 16901188, 16901651, 3198, 3199), new TranscriptRegion(TranscriptRegionType.Exon, 20, 16901652, 16901724, 3126, 3198), new TranscriptRegion(TranscriptRegionType.Intron, 19, 16901725, 16902761, 3125, 3126), new TranscriptRegion(TranscriptRegionType.Exon, 19, 16902762, 16902976, 2911, 3125), new TranscriptRegion(TranscriptRegionType.Intron, 18, 16902977, 16903811, 2910, 2911), new TranscriptRegion(TranscriptRegionType.Exon, 18, 16903812, 16903914, 2808, 2910), new TranscriptRegion(TranscriptRegionType.Intron, 17, 16903915, 16905687, 2807, 2808), new TranscriptRegion(TranscriptRegionType.Exon, 17, 16905688, 16905897, 2598, 2807), new TranscriptRegion(TranscriptRegionType.Intron, 16, 16905898, 16907239, 2597, 2598), new TranscriptRegion(TranscriptRegionType.Exon, 16, 16907240, 16907451, 2386, 2597), new TranscriptRegion(TranscriptRegionType.Intron, 15, 16907452, 16907914, 2385, 2386), new TranscriptRegion(TranscriptRegionType.Exon, 15, 16907915, 16907987, 2313, 2385), new TranscriptRegion(TranscriptRegionType.Intron, 14, 16907988, 16909038, 2312, 2313), new TranscriptRegion(TranscriptRegionType.Exon, 14, 16909039, 16909253, 2098, 2312), new TranscriptRegion(TranscriptRegionType.Intron, 13, 16909254, 16910088, 2097, 2098), new TranscriptRegion(TranscriptRegionType.Exon, 13, 16910089, 16910191, 1995, 2097), new TranscriptRegion(TranscriptRegionType.Intron, 12, 16910192, 16911983, 1994, 1995), new TranscriptRegion(TranscriptRegionType.Exon, 12, 16911984, 16912193, 1785, 1994), new TranscriptRegion(TranscriptRegionType.Intron, 11, 16912194, 16913544, 1784, 1785), new TranscriptRegion(TranscriptRegionType.Exon, 11, 16913545, 16913756, 1573, 1784), new TranscriptRegion(TranscriptRegionType.Intron, 10, 16913757, 16914219, 1572, 1573), new TranscriptRegion(TranscriptRegionType.Exon, 10, 16914220, 16914292, 1500, 1572), new TranscriptRegion(TranscriptRegionType.Intron, 9, 16914293, 16915343, 1499, 1500), new TranscriptRegion(TranscriptRegionType.Exon, 9, 16915344, 16915558, 1285, 1499), new TranscriptRegion(TranscriptRegionType.Intron, 8, 16915559, 16916393, 1284, 1285), new TranscriptRegion(TranscriptRegionType.Exon, 8, 16916394, 16916496, 1182, 1284), new TranscriptRegion(TranscriptRegionType.Intron, 7, 16916497, 16918341, 1181, 1182), new TranscriptRegion(TranscriptRegionType.Exon, 7, 16918342, 16918551, 972, 1181), new TranscriptRegion(TranscriptRegionType.Intron, 6, 16918552, 16918653, 971, 972), new TranscriptRegion(TranscriptRegionType.Exon, 6, 16918654, 16918808, 817, 971), new TranscriptRegion(TranscriptRegionType.Intron, 5, 16918809, 16919935, 816, 817), new TranscriptRegion(TranscriptRegionType.Exon, 5, 16919936, 16920062, 690, 816), new TranscriptRegion(TranscriptRegionType.Intron, 4, 16920063, 16921086, 689, 690), new TranscriptRegion(TranscriptRegionType.Exon, 4, 16921087, 16921156, 620, 689), new TranscriptRegion(TranscriptRegionType.Intron, 3, 16921157, 16921425, 619, 620), new TranscriptRegion(TranscriptRegionType.Exon, 3, 16921426, 16921504, 541, 619), new TranscriptRegion(TranscriptRegionType.Intron, 2, 16921505, 16935002, 540, 541), new TranscriptRegion(TranscriptRegionType.Exon, 2, 16935003, 16935274, 269, 540), new TranscriptRegion(TranscriptRegionType.Intron, 1, 16935275, 16939832, 268, 269), new TranscriptRegion(TranscriptRegionType.Exon, 1, 16939833, 16940100, 1, 268) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(5892, 5891, "AAAAAAAAAAAAAAAA"), new RnaEdit(5799, 5799, "T"), new RnaEdit(5675, 5675, "G"), new RnaEdit(5655, 5655, "G"), new RnaEdit(5390, 5390, "G"), new RnaEdit(5174, 5174, "G"), new RnaEdit(5150, 5150, "C"), new RnaEdit(4993, 4993, "A"), new RnaEdit(4828, 4828, "G"), new RnaEdit(4683, 4683, "G"), new RnaEdit(4637, 4637, "G"), new RnaEdit(4530, 4530, "A"), new RnaEdit(3845, 3844, "GAAATTGAAAAGTACCAAGAAGTGGAAGAAGACCAAGACCCATCATGCCCCAGGCTCAGCAGGGAGCTGCTGGATGAGAAAGAGCCTGAAGTCTTGCAGGACTCCCTGGATAGATGTTATTCGACTCCTTCAGGTTATCTTGAACTGCCTGACTTAGGCCAGCCCTACAGAAGTGCTGTTTACTCATTGGAGGAACAGTACCTTGGCTTGGCTCTTGACGTGGACAGAATTAAAAAGGACCAGGAAGAGGAAGAAGACCAAGGCCCACCATGCCCC"), new RnaEdit(3769, 3769, "C"), new RnaEdit(3554, 3554, "C"), new RnaEdit(3207, 3207, "A"), new RnaEdit(3140, 3140, "C"), new RnaEdit(3136, 3136, "T"), new RnaEdit(3107, 3107, "T"), new RnaEdit(3103, 3103, "A"), new RnaEdit(2993, 2993, "C"), new RnaEdit(2944, 2944, "G"), new RnaEdit(2840, 2840, "G"), new RnaEdit(2810, 2810, "T"), new RnaEdit(2706, 2707, "GC"), new RnaEdit(2695, 2695, "T"), new RnaEdit(2692, 2692, "G"), new RnaEdit(2509, 2509, "A"), new RnaEdit(2299, 2299, "A"), new RnaEdit(2294, 2294, "G"), new RnaEdit(2290, 2290, "C"), new RnaEdit(2222, 2222, "G"), new RnaEdit(2009, 2009, "G"), new RnaEdit(1964, 1964, "G"), new RnaEdit(1893, 1894, "GC"), new RnaEdit(1882, 1882, "T"), new RnaEdit(1879, 1879, "G"), new RnaEdit(1696, 1696, "A"), new RnaEdit(1652, 1652, "G"), new RnaEdit(1486, 1486, "A"), new RnaEdit(1481, 1481, "G"), new RnaEdit(1477, 1477, "C"), new RnaEdit(1409, 1409, "G"), new RnaEdit(1405, 1405, "A"), new RnaEdit(1318, 1318, "G"), new RnaEdit(1021, 1021, "C"), new RnaEdit(932, 932, "A") }; const byte startExonPhase = 0; const bool onReverseStrand = true; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, onReverseStrand, startExonPhase, rnaEdits); var expectedCodingSeq = "ATGGTGGTATCAGCCGGCCCTTGGTCCAGCGAGAAGGCAGAGACGAACATTTTAGAAATCAACGAGAAATTGCGCCCCCAGCTGGCAGAGAACAAACAGCAGTTCAGAAACCTCAAAGAGAAATGTTTTGTAACTCAACTGGCCGGCTTCCTGGCCAACCGACAGAAGAAATACAAGTATGAAGAGTGTAAAGACCTCATAAAATTTATGCTGAGGAATGAGCGACAGTTCAAGGAGGAGAAGCTTGCAGAGCAGCTCAAGCAAGCTGAGGAGCTCAGGCAATATAAAGTCCTGGTTCACTCTCAGGAACGGGAGCTGACCCAGTTAAGGGAGAAGTTACGGGAAGGGAGAGATGCCTCCCGCTCATTGAATCAGCATCTCCAGGCCCTCCTCACTCCAGATGAGCCAGACAAGTCCCAGGGGCAGGACCTCCAAGAACAGCTGGCTGAGGGGTGTAGACTGGCACAGCACCTTGTCCAAAAGCTCAGCCCAGAAAATGACGAAGATGAGGATGAAGATGTTCAAGTTGAGGAGGCTGAGAAAGTACTGGAATCATCTGCCCCCAGGGAGGTGCAGAAGGCTGAAGAAAGCAAAGTCCCTGAGGACTCACTGGAGGAATGTGCCATCACTTGTTCAAATAGCCACGGCCCTTGTGACTCCAACCAGCCTCACAAGAACATCAACATCACATTTGAGGAAGACAAAGTCAACTCAACTCTGGTTGTAGACAGAGAATCCTCTCATGATGAATGTCAGGATGCTGTAAACATTCTCCCAGTCCCTGGCCCCACCTCTTCTGCCACAAACGTCAGCATGGTGGTATCAGCCGGCCCTTTGTCCAGCGAGAAGGCAGAGATGAACATTCTAGAAATGAATGAGAAATTGCGCCCCCAGCTGGCAGAGAAGAAACAGCAGTTCAGAAACCTCAAAGAGAAATGTTTTGTAACTCAACTGGCCGGCTTCCTGGCCAACCAGCAGAACAAATACAAATATGAAGAGTGCGAAGACCTCATAAAATCTATGCTGAGGAATGAGCGACAGTTCAAGGAGGAGAAGCTTGCAGAGCAGCTCAAGCAAGCTGAGGAGCTCAGGCAATATAAAGTCCTGGTTCACTCTCAGGAACGAGAGCTGACCCAGTTAAGGGAGAAGTTACGGGAAGGGAGAGATGCCTCCCGCTCATTGAATCAGCATCTCCAGGCCCTCCTCACTCCGGATGAGCCAGACAAGTCCCAGGGGCAGGACCTCCAAGAACAGCTGGCTGAGGGGTGTAGACTGGCACAGCACCTTGTCCAAAAGCTCAGCCCAGAAAATGACGAAGATGAGGATGAAGATGTTCAAGTTGAGGAGGCTGAGAAAGTACTGGAATCATCTGCCCCCAGGGAGGTGCAGAAGGCTGAAGAAAGCAAAGTCCCTGAGGACTCACTGGAGGAATGTGCCATCACTTGTTCAAATAGCCACGGCCCTTGTGACTCCAACCAGCCTCACAAGAACATCAACATCACATTTGAGGAAGACAAAGTCAACTCAGCTCTGGTTGTAGACAGAGAATCCTCTCATGATGAATGTCAGGATGCTGTAAACATTCTCCCAGTCCCTGGCCCCACCTCTTCTGCCACAAACGTCAGCATGGTGGTATCAGCCGGCCCTTTGTCCAGCGAGAAGGCAGAGATGAACATTCTAGAAATGAATGAGAAATTGCGCCCCCAGCTGGCAGAGAAGAAACAGCAGTTCAGAAACCTCAAAGAGAAATGTTTTGTAACTCAACTGGCCTGCTTCCTGGCCAACCAGCAGAACAAATACAAATATGAAGAGTGCAAAGACCTCATAAAATCTGTGCTGAGGAATGAGCGACAGTTCAAGGAGGAGAAGCTTGCAGAGCAGCTCAAGCAAGCTGAGGAGCTCAGGCAATATAAAGTCCTGGTTCACTCTCAGGAACGGGAGCTGACCCAGTTAAGGGAGAAGTTACGGGAAGGGAGAGATGCCTCCCGCTCATTGAATCAGCATCTCCAGGCCCTCCTCACTCCGGATGAGCCAGACAAGTCCCAGGGGCAGGACCTCCAAGAACAGCTGGCTGAGGGGTGTAGACTGGCACAGCAACTTTTCCAAAAGCTCAGCCCAGAAAATGACAATGATCACGATGAAGATGTTCAAGTTGAGGTGGCTGAGAAAGTGCAGAAATCGTCTGCCCCCAGGGAGATGCAGAAGGCTGAAGAAAAGGAAGTCCCTGAGGACTCACTGGAGGAATGTGCCATCACTTGTTCAAATAGCCATGGCCCTTATGACTCCAACCAGCCACATAGGAAAACCAAAATCACATTTGAGGAAGACAAAGTCGACTCAACTCTCATTGGCTCATCCTCTCATGTTGAATGGGAGGATGCTGTACACATTATCCCAGAAAATGAAAGTGATGATGAGGAAGAGGAAGAAAAAGGGCCAGTGTCTCCCAGGAATCTGCAGGAGTCTGAAGAGGAGGAAGTCCCCCAGGAGTCCTGGGATGAAGGTTATTCGACTCTCTCAATTCCTCCTGAAATGTTGGCCTCGTACCAGTCTTACAGCGGCACATTTCACTCATTAGAGGAACAGCAAGTCTGCATGGCTGTTGACATAGGCGGACATCGGTGGGATCAAGTGAAAAAGGAGGACCAAGAGGCAACAGGTCCCAGGCTCAGCAGGGAGCTGCTGGATGAGAAAGGGCCTGAAGTCTTGCAGGACTCACTGGATAGATGTTATTCAACTCCTTCAGGTTATCTTGAACTGACCGACTCATGCCAGCCCTACAGAAGTGCCTTTTACATATTGGAGCAACAGCGTGTTGGCTGGGCTCTTGACATGGATGAAATTGAAAAGTACCAAGAAGTGGAAGAAGACCAAGACCCATCATGCCCCAGGCTCAGCAGGGAGCTGCTGGATGAGAAAGAGCCTGAAGTCTTGCAGGACTCCCTGGATAGATGTTATTCGACTCCTTCAGGTTATCTTGAACTGCCTGACTTAGGCCAGCCCTACAGAAGTGCTGTTTACTCATTGGAGGAACAGTACCTTGGCTTGGCTCTTGACGTGGACAGAATTAAAAAGGACCAGGAAGAGGAAGAAGACCAAGGCCCACCATGCCCCAGGCTCAGCAGGGAGCTGCTGGAGGCAGTAGAGCCTGAAGTCTTGCAGGACTCACTGGATAGATGTTATTCAACTCCTTCCAGTTGTCTTGAACAGCCTGACTCCTGCCTGCCCTATGGAAGTTCCTTTTATGCATTGGAGGAAAAACATGTTGGCTTTTCTCTTGACGTGGGAGAAATTGAAAAGAAGGGGAAGGGGAAGAAAAGAAGGGGAAGAAGATCAACGAAGAAAAGAAGGAGAAGGGGAAGAAAAGAAGGGGAAGAAGATCAAAACCCACCATGCCCCAGGCTCAGCGGCATGCTGATGGAAGTGGAAGAGCCTGAAGTCTTGCAGGACTCACTGGATAGATGTTATTCGACTCCGTCAATGTACTTTGAACTACCTGACTCATTCCAGCACTACAGAAGTGTGTTTTACTCATTTGAGGAACAGCACATCAGCTTCGCCCTTGACGTGGACAATAGGTTTCTTACTTTGATGGGAACAAGTCTCCACCTGGTCTTCCAGATGGGAGTCATATTCCCACAGTAA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void RnaEdits_big_test() { //NM_005960.1, chrom: chr7:100547052-100611619 var genomicSeq = new SimpleSequence( "GCGCTGACGTCTGTCTGTCCGGGTGCAGGGAGAAGGGAGGAAGAGGGGAGAGGTGGGGCGGTGCAAAGGTGAGGCTGTGCTCAGCCCTGACGCTCAGCAAAACCGATAACCAGCACTTTCATTACGTGCACGCCCCAGGGCCACGTCCCTGCCGCTGTCTTGGTCCTGAAGCCTGTTCTGCCCCAGCCCCCTGCCCGCTGGGCCCATGCAGCTGTTGGGGCTCCTCGGCCTCCTCTGGATGCTCAAGGCCTCCCCGTGGGCCACAGGTAAGGGGGAGAGGCGGAAGGGGGTTGGAGAAAAGCTCCTGATGTGATGTTCCAGGAAAGGGGAGGGAAAAGTGGCTGTAAGGCCTGGGGAGGGGGGATAAGAAGGCACCGCTTGGGGCTCTGGGTGCAGGGAGAACCGAGGCACGGCCTGACTGGGGGAGGGGGCGATGAGGAGAGGTTTCTTCCAGAGCTCCAGGTGCAGGGAAAACCCCGAGGTTGGGAAAGAGTGAGGGAGCTGGGTCTCTGCCACTCTCCACCAAGCACTGAGCAGGTTGCAGCGGCTGAGCCCCAATCTGTATCTGCAGCTGGAGGGTAGAGGGTGGGATTTACGTCTTCCCAGAGCAGTGCCCTTCCTGTCTTGACTCCTTCTGTCACCTGCCTCATGCCCCCAGCTTGAGTGTCCCCTTCACACTGGCCTCTCCCTCCCTGACAGCCCTCTAACTTCTACCCCTGGTCTCGGTCCTCTGGTTTCAGCCTCTCTGCCTTTTGTCCCCCGGCGGCTCCTCCCCAGCTCTGCCGTCACTCTCTTACCCCGGCCAGGGCCCATGTGTCTGGGTACAGCTGTTGGTACCAGGGCCGGGACAGGGAGCTCCTGATGTCCACTTTGCTTCCAAGGGTGCGTCTGAGACTACCCGTCGTGGGGTCCCAGGCTTCTCTCTTCTGCCCTGCAGGAACACCTCGCAATTCCTCTATACTTCTCTTTTCTCTGTACTTCAGTGTCTGCTTCTGATCCCCGATCCCAGGCCACCCAGCCTACAGGCCCATGAGTCCCCTTCTCAGTCACCTCCAGGGCCACATCCTGGAGCCAAGGGCTGTAGCCTGGGGATTCTCATAATCCCTGACCCCACTTCCCTGGCACCCACGAGCTAGGTTGAGACGTGACACCCCAGCTCTCAGCCACAAGATGGGCTGTGCCCGAGGTGAGGGGTAGCAGATCGGGTACTTCCCACTTCCCGTCTGCTGTGGCTGCCTGTCTTCCTTGTCCCTGACACCCCCGACAGCCGGATCTCTGATCCTAACTCTGACAAATTGTGAAATGGGTTGAAATCCACATGCTGGGGTTCATGCTTGTAAACTAATGAATCCCACGGCCAAAAGGGAATAGTATAGAAAAATATGTCTATTTGTGTGATGAACACTCACTGCTAAGCCTTAAGGTCTCCAGAACTCATCACGCCTGACTGCTGAGGTAGCTCCTTCTGGAAGTTTCCTTTTCTATGCTGTCTCTCTGCCTCTTCACCTAGTCCTCACTCCATGCCCTTTGAAGTCATTAGTGTGTGAGCTTAGCCTGTTTCTCTTTGGTGTTCTCTGGGCAGTTTTTTGTTTTTGTTTTTGTTTTTGTTTTTGTTTTTGATGGAGTCTTGCTCTGTCACCCAGGCTGGAGTGCAGTGGCACCATCTCAGCTCACTGCAGCCTCTGCTTCCTGGGTTCAAGCGATTCTCCTGCCTCAGCTACAGGTACACGCCACTACGCCTGGCTAATTTTTGTATTTTGTTTTAGTAGAGAAAGGGTTTCACCATGTTGGTCAGACTGGTCTCGAACTCCTGACCTCAAGTGATCTCCTGCCTTGGCCTTTCAAAGTGCTGGGATTACAGGCATGAGCCACCGTGCCTGGCCCTCTTTGTTCAGCTTTCCCTGTCTCCTGGTTTTTGTGATGCGCCCCCTGCCAGGACATGGCTGGGTTCTCTCTTTTTCTCTTTGAAAGCGGAGTCAGCCCAGACAGCAGCAGGGTGCCGGGAGAAGCAGTGGTGGGAGCAGAAGGTTAATGGGGGGATGGGAGCACTCCAGGCAGTAGCAGGGGGAGGAGAGGAAGGGGCAGAGGGAGGAGGAGCCTGGGTGTGACCAGAGGAGGGAAGGGAGGGGAGGGAGGCTCTGCCCAGCTCGGCTATATCAGGACAGGAGGACCTGCCATGACAAGGCCAAGGCCCGGTGAGGAATGAGGGCTCCCATGCCCCACTGCTCCCCTAGGAGCAGACAGGCAGTCGTCTCCAGCACCACAAAGCACCCAGCTCCAAGCTGCCTCTGATGCAGGAGTCAGCTGTAATATGCCCTGCCCTCTGTGATGCTGCCTGGAAAATGGGTGAGTGAGTAGCTTACATGAGTGATGTAACAAAATGACCCACGGATTTACCAGTGGATTCCTCTGCTCTGCCGCCAATGCAGGAACTTTATCCACGGCCACATCCATCTCTCAAGTGCCTTTCCCCAGAGCAGAAGCAGCCAGCGCTGTGCTCAGCAATTCTCCACACTCCAGAGACCTGGCTGGGTGGCCACTTGGTGTCCCCCAGCTCGCCTCTCCTGCTCCTGGCCACAGGGAAAATGCACCTATGACACTCACTACCTCCCCCCATGACACACTCATCTCTGAAACATTGCTCAACTCTCCAGTCAGTTCCAACACCTCAACCACCCCGACGTCCAAGTTTGCCTTCAAGGTTGAAACCACTCCACCCACCGTGTTGGTCTATTCAGCCACCACTGAGTGCGTGTATCCAACGAGCTTTATAATCACCATCTCCCACCCCACCTCCATCTGTGTGACCACGACGCAGGTGGCCTTCACCAGCTCTTACACCTCGACTCCCGTGACACAGAAGCCAGTGACCACCGTCACCAGTACTTACTCTATGACCACTACTGAGAAAGGAACGTCAGCCATGACATCTTCTCCCTCTACCACCACTGCAAGGGAAACTCCCATAGTGACAGTGACACCCTCCTCTGTGTCAGCCACAGACACAACCTTCCACACTACAATCTCATCTACAACTAGAACCACAGAAAGGACTCCCCTGCCCACTGGAAGCATCCATACAACCACGTCCCCAACCCCAGTATTTACTACTCTCAAAACAGCAGTGACTTCCACTTCCCCCATCACTTCTTCAATCACTTCCACAAATACAGTGACTTCTATGACAACGACCGCCTCCCAGCCCACAGCCACTAATACATTGTCATCACCCACTAGGACCATTTTATCTTCCACACCTGTCCTGAGCACAGAAACAATCACCAGTGGTATCACAAACACCACCCCCCTATCCACCTTGGTGACCACACTCCCCACTACCATCAGCAGGTCTACACCTACATCTGAGACCACCTACCCTACTTCTCCCACCAGCACTGTCACAGACTCCACTACCAAAATCGCCTACTCCACAAGTATGACAGGTACATTGTCCACAGAGACTTCTCTCCCACCCACCTCTTCCTCTCTCCCAACCACAGAAACAGCCACGACTCCTATGACAAACTTGGTAACCACCACCACTGAGATCTCCTCCCACAGTACTCCCAGCTTCTCTTCATCAACCATCTACTCCACAGTCACCTCACACAGTACTCCCAGATTCACTTCTTCAATCACCACTACCGAGACCCCCTCACACAGTACTCCCAGATTCACTTCTTCATTCACCAATACCAAGACCACCTCACACAGATCTCCCAGCTTCACTTCTTTGATCACCACCACGGAGACCACCTCACACAGTACTCCCAGCTTCACTTCTTCGATCACCACCACCGAGACCACCTCACACAGTGCTCGCAGCTTCACTTCTTCGATCACCACCACCGAGACCACCTCACACAATACTCGGAGCTTCACTTCTTCGATCACCACCACCGAGACCAACTCTCACAGTACTACCAGCTTCACTTCTTCGATCACCACCACCGAGACCACCTCACACAGTACTCCCAGCTTCAGTTCTTCAATCACCACCACTGAGACCCCCTTACACAGTACTCCTGGCCTCACTTCGTGGGTCACCACCACCAAGACCACCTCACACATTACTCCTGGCCTCACTTCTTCAATCACCACCACTGAGACTACCTCACACAGTACTCCCGGCTTCACTTCTTCAATCACCACCACTGAGACCACCTCAGAGAGTACTCCCAGCCTCAGTTCTTCAACCATCTACTCCACAGTCAGCACATCCACAACTGCCATCACCTCACATTTTACTACCTCAGAGACTGCGGTGACTCCCACACCTGTAACCCCATCTTCTCTGAGTACAGACATCCCGACCACAAGCCTACGAACTCTCACCCCTTCGTCTGTGGGCACCAGCACTTCATTGACTACAACCACAGACTTTCCCTCTATACCCACTGATATCAGTACCTTACCAACTCGAACACACATCATTTCATCTTCTCCCTCCATCCAAAGTACAGAAACCTCATCCCTTGTGGGCACCACCTCTCCCACCATGTCCACTGTGAGAATGACCCTCAGAATTACTGAGAACACCCCAATCAGTTCCTTTAGCACAAGTATTGTTGTTATACCTGAAACCCCAACACAGACCCCTCCTGTACTGACGTCAGCCACTGGGACCCAAACATCTCCTGCACCTACTACTGTCACCTTTGGAAGTACGGATTCCTCCACGTCCACTCTTCATACTCTTACTCCATCAACAGCCTTGAGCACGATCGTGTCAACATCACAGGTTCCTATTCCTAGCACACATTCCTCCACCCTTCAAACAACTCCTTCTACTCCCTCATTGCAAACTTCACTCACATCTACAAGTGAGTTCACTACAGAATCTTTCACTAGGGGAAGTACGTCTACAAATGCAATCTTGACTTCTTTTAGTACCATCATCTGGTCCTCAACACCCACTATTATCATGTCCTCTTCTCCATCTTCTGCCAGCATAACTCCAGTGTTCTCCACTACCATTCATTCTGTTCCTTCTTCACCATACATTTTCAGTACAGAAAATGTGGGCTCCGCTTCTATCACAGGCTTTCCTAGTCTCTCTTCCTCTGCAACTACCAGCACTTCTTCAACCAGCTCCTCTCTGACCACAGCTCTCACTGAAATAACCCCCTTTTCTTATATTTCCCTTCCCTCCACCACACCCTGTCCAGGAACTATAACAATTACCATAGTCCCTGCCTCTCCCACTGATCCATGTGTTGAAATGGATCCCAGCACTGAAGCTACTTCTCCTCCCACCACCCCATTAACAGTCTTTCCCTTTACTACCGAAATGGTCACCTGTCCTACCTCCATCAGTATCCAAACTACTCTTACTACATATATGGACACTTCTTCCATGATGCCAGAAAGTGAGTCCAGCATCTCACCCAATGCTTCCAGTTCCACTGGCACTGGGACTGTACCCACAAACACAGTTTTCACAAGTACTCGACTGCCCACCAGTGAGACCTGGCTGAGCAACAGTTCTGTGATCCCCCTACCTCTTCCTGGCGTCTCTACCATCCCGCTCACCATGAAACCAAGCAGTAGCCTCCCGACCATCCTGAGGACTTCAAGCAAGTCAACACACCCCTCCCCACCCACCACTAGGACTTCAGAGACACCAGTGGCCACTACCCAGACTCCTACCACCCTTACATCACGCAGGACAACTCGCATCACTTCTCAGATGACCACACAGTCCACGTTGACCACCACTGCAGGTTGGACCTTCTGCCTCTCTGTTCCCCTCCTTCCTCCCCTGCAAAATTCCTGTGTCACTGAGGTCAGGCTTTATCCTGAGCTTCCCTTTCTTTCTGTGTTTTCCAGGCACCTGTGACAATGGTGGCACCTGGGAACAGGGACAGTGTGCTTGCCTTCCGGGGTTTTCTGGGGACCGCTGTCAGCTCCAGACCAGATGCCAGAATGGGGGTCAGTGGGATGGCCTCAAATGCCAGTGCCCCAGCACCTTCTATGGTTCCAGTTGTGAGTTTGCTGTGGAACAGGTGGATCTAGGTGAGTTGCCAGAGCTATGCCTTCTGCACTTCCTCCCACAGGGTGTCACTGACTCTCCCCAGACTTATCCCTCTGTGGGGCCTGGAGGCACCCATGCCTTTTTGCCCGGTCCTTCCCTCCCTGCCATCTCTCCCATGCCCTCCGCTGCCCTGTGTCATGCTCCTCTCCGTCCTCACCCTTAGGAGGTGGCTGGGACTACCCTCCCTCCTGGGCCCATCTCCTGACTTGGGCTGCTTGGAGCTGTATCAGTTTCCAACTGCTGCCGGGCCAACAAACACAAATCTGGCTGCTGGAACAACACGACATTATCATGTTAGAATTCTGTAGATTAGAAGTCTGATGTGGGTGTCACTGGGCTGAAATCAAGGCGTCACCAGGGCTGTGTTGTCTTTCAGCGGCTCCAGGGAAGAATCCATTTTTTTGCCCTTTGCAGCTTCTGGAGCCTCCCACAGCAAGGCTGCATCTCTCTGTGTCTTTCTCCCATAGCCTCATCTCCCTCTAATGAACTCTGGCCTCCTCAATTGCTTCTCCCACTGTTAAGGACCCTTGTGATAACTTTGCCTCCTCCCCAAATAGTCTATGTTAATTTTCTCAAGATCAGCTGATTACGCCGGGCGGGGTGGCTCACACCTGTAATCCCAGCAGTTTGGGAGGCTGAGGGGGTAGGATCACCTGAGGTCAGGAGTTGGAGACTAGCCTGGCCAACATGGTGAAACTGTCTCTACTATTAGTTGGGCATGGTGGCAGATGCCTATAATCCTCACTATTCAGGAGGCTGAGGCAGGAAAATCGCTTGAATCCAGGAGGTGGATATTGCAGTAAAGCGAGATTTCGCCACTGCACACTAGCGTAGATGACAGAGGGAGAGTGAGACTCTGTCTTAAAAGAAAAAAAAAAATCAGCTGATTGTCTTATAATCCCTGCACTTTGGAAGGCCGAGGAGGGAGTATCGCTTGAGGCGAGGAGTTCAGGACCAGCCTGGGCAACACAGCGAGACCCTCATCTCCACAAAAAATTTTAAAAACTTACCTGGGCATGGTGGCTCATGCCTGTGGTCCCAGTTATTTGGAAGGCTGAGGTGGGAGAATCACTTGAGCCTGGGAGTTCAAGGCTGCAGTGAGCTATGATCCCCCTATTGCACTTCAGCCTGGATGACAGAATGAGACCCTATCTCAACAATAAAAAAAAGTTAGGCTGATTAGCAATCTAATTCAATCTGCACCCTTGATCCTCCCTTGCCATGTAGTATAGCATAGTCACAGTTCTGGGGATTAGGACATGGACATCTTCCCACTATGGGGGCAGCCAGGAGGGACCACAGGCTGACCGCTATCTTTCTGCCTGCTTTCACTCATCTCCACACAATTCCTTCCTTCCTCTCGCTCTCTTCTTTCTTTTCTTCTTTTCTTTTGTCTCTTTCTCTTTTCTTTTTTTCTTTCTCTTTTTCTTTCTTTCTTTCTCTTTCTTTTCTTTCCTTTCTTTTCTTCTTTCCCTCATCTTCACACAATGCTTTCTCCCTTTCTTTCTTTCCTTCTTTCTTTCTCTTTCTTTCTTTCTCTCTCTCTCTCTCTCTCTCTTTCTTTCTTTCTTTCTTTCTTTTTCTCTCTCTCTCTCTATCTTTCTTTTTCATTTTCTTGAGAGACAGTCTCACTCTGTGGCCCAGGCTGGAGTGCAGTGATGCAATCTCGGCTCACTACAACCTCCGCCTCCTGAGTTCAAGTGATTCTCCTGCCTCAGCCTCCTGAGGCATAATGCATCCTGGAATTCCTACATCTGTTCCTGCCTCTGGGCATCAGTCCTCAGGGATCTTGGAGGGGAGCAGCAGGAGGAGCCTGTGGGTTGGGGTGGTGGTGTTGGTGGCTTCAGACAAAAGCAGACAGAGAAGTGACTGGGGACATGCATGCTCTGTGTAGATGTAGTGGAGACCGAGGTGGGCATGGAAGTGTCTGTGGATCAGCAGTTCTCGCCGGACCTCAATGACAACACTTCCCAGGCCTACAGGGATTTCAACAAGACCTTCTGGAATCAGGTAAAGGGCAAAGAGAGGGGATTTTTTTTTTTTTTGAGGTGTAGTCTCGCACTCTCACCCTGGCTGGAGTGCAATGGTGCGATCTTGGGTCACTGCAACCTCCGCCTCCCGGGTTCACATGATTCTCTTGCCTCAGCCTCCCAAGTAGCTGGGATTACAGGTGCACACCATGACACCTGGCTAATGTTTTGTATTTTTAGTAGAGACGGGTTTTCACTATGTTGGCCAGACTGGTCTCGAACTACTGACTTTGTGATCCACCTGCCTCGTCCTCCCAAAGTGTTGGGATTACAGGCGTGAGCCACGGCACCCGGCCGGGGAGGGGAATTGAAGGGTCTTCCCTGGAGCTGGGGTTGGGCGTCTGGGTCCCCTCAGGTCTGCAGGTTCGGACGTGAGCCCAGGGATCCTTGGTGTTTCAGATGCAGAAGATTTTTGCAGACATGCAGGGCTTCACCTTCAAGGGTGTGGAGATCCTGTCCCTGAGGTAGGAGACCCATCTGGGGATGCGGAGGCGGTGTTGGGTGGGGGAAATGTGCGCACACAAAAAACCCATTCCTTTCTTTTGTAATCATCAGATTTTATAAAGAGGGGTGGAGGGGGTACATAAGGAATCACTCCCTGGGTATTTTTTCGGATCGTTTTCTGGGGCCATTTATCTGGAGGAGGGGTGGCACCTCTCTTCTTCAGCACACTGGAAGGAGAGAAGTTGCAGGGACATGTGGGAAGGTGGTGCCTGGATTGATGACTTTGTCCCCCTCTGGCTGGCCCCTGCTCTACTGAGTGGGTCAGCATTAGAAAGAGAGAGAGAGAAAGAGAGAGAGGGAGAGAACGCACGTCTAGGGGCGCCCGGTGGATGATGGCTTGATGCAACAAGAAGAGAACGTCAGGCCAGATGTGGTGGCTCACACTTGCAATCCTAGCACTTTGGGTAGGCTAGGCGGGTGGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAATTCCTACATCTGTTCCTGCCTCTGGGCATCAGTCCTCAGGGATCTTGGAGGGGAGCAGCAGGAGGAGCCTGTGGGTGGTGGTGGTGGTGTCGGTGGCTTCAGACAAAAGCAGACAGAGAAGTGACTGGGGACATACATGCTCTGTGCAGATGTAGTGGAGACCGAGGTGGGCATGGAAGTGTCTGTGGATCAGCAGTTCTCGCCGGACCTCAATGACAACACTTCCCAGGCCTACAGGGATTTCAACAAGACCTTCTGGAATCAGGTAAAGGGCAAAGAGAGGGGATTTTTTTTTTTTTTTTTTTTTGAGATGGAGTCTCGCACTGTCATCCTGGCTGGAGTGCAATGGCGTGATCTTGGCTCACTGCAACCTCCGCTTCCCAGGTTCACATGATTCTCTTGCCTCAGCCTCCCGAGCAGCTGAGATTACAGGTGCACACCACCACACCTGGCTAATGTTTTGTATTTTTAGTAGAGACGGGGTTTCACTATGTTGGCCAGACTGGTCTCGAACTACTGACTTTGTGATCCGCCTGCCTCGTCCTCCCAAAGTGTTGGGATTACAGGCGTGAGCCACGGCACCCGGCCGGGGAGGGGAATTGAAGGGTCTTCCCTGGAGCTGGGGTTGGGCATCTGGGTCCCCTCAGGTCTGCAGGTTCGGACGTGAGCCCAGGGATCCTTGGTGTTTCAGATGCAGAAGATTTTTGCAGACATGCAGGGCTTCACCTTCAAGGGTGTGGAGATCCTGTCCCTGAGGTAGGAGACCCATCTGGGGATGTGGAGGCAGTGTTGGGTGGGGGAAATGTGCGCACACAAAAAACCCATTCCTTTCTTTTGTAATCATCAGATTTTATAAAGAGAGGGGTGGAGGAGGTACATAAGGAATCGCTCCCTGGGTATTTTTTTGGATCGTTTTCTGGGGCCATTTATCTGGAGGAGGGGTGGCACCTCCCTTCTTCAGCACACTGGAAAGAGAGAAGTTGCAGGGACATGTGGGAAGGTGGTGCCTGGATTGGTGACTTCGTCCCCCTCTGGCTGGCCCCTGCTCTACTGAGTGGGTCAGCATTAGAGAGAGAGAGAGAGAAAGAGAGAGAGGGAGAGAACGCGCGTCCAGGGGCGCCCGGTGGATGATGGCTTGATGCAACGAGAACGTCAGGCCAGATGTGGTGGCTCACACTTGCAATCCTAGCACTTTGGCAAGGTTAGGTGGGTGGATCACCTGAAACCAGTTCAAGACCAGCCTGGGCAACATAGTGAGAACCCATCCCTACAACAATAAAAATAGTAATAATAATAATAATAATAATAAAATGATTATCCAGGCGTGGTAGTGCACACCTGTAGTCCCAGATACTTGGGAGGCTGAGGAGAAAGGATCACTTTAGCCCAGGAGTTGGAGGCTGCAGTGAGCTACAATGATACCACTGCACTCCAGCCTGGGTGACAGCAAGACTTTGTCTCTATAAAACACACAGAGAGAGGAAGTCAATCATGTCAGTCATTCCTTGTCCTGCCTTCCCAGGCAGACCAAGTCAGGAATGCTGGCAGCCCCTTCTGAAAAGGATGCACGTGGCATCCCAACTCATGACCTCTGCCCTCTTTCCCCCTTCTGGTGCACTTTGGGTTGCTTCTGGAGGTGCCCCTCCAAGGACCCATATGTTCCTGGCTGGGGCACTCTCTAAGGCTGTGGACCCCTCAGGAATGGCAGCATCGTGGTGGACTACCTGGTCCTGCTGGAGATGCCCTTCAGCCCCCAGCTGGAGAGCGAGTATGAGCAGGTGAAGACCACGCTGAAGGAGGGGCTCCAGAACGCCAGCCAGGATGCGAACAGCTGCCAGGACTCCCAGAGTGAGCCCGGGCTGGAGGGAGGGGCCAGGGCCTGAGGTGTCACCCCAGCCCACTCCAGCTCAGCCAGGGGGCCACTGGACTCAGGTGCCAGCCCTGTGGTACCTCTGGCAGGTTGGGAGAACGGGAATAAGTCTACACACAATGCCATCAAGAGTGGGGCTAGGGAGGGTCTCCCCAGGACCTGGGTACTGGGGAAGAGACCCCCTGATCGTCAGGCTCAGCATTTCCCGATGGCTGAAGACCTCGGATTATTCAGGGGGGATAAGGGAGAGAACAGGAGTCTTCCCCTGTGGCCCCTCCACACTCCCCCAGACGGAGAGAGCCCTCACTGCCCTCCCTGTGCCTATCCTGCTTCCTGGCCCTAACCCCTTGACCTGCCCCGCCCATTCCATCTGTGCCTGTGTTTCCGCAGCCCTGTGTTTTAAGCCTGACTCCATCAAGGTGAACAACAACAGCAAGACAGAGCTGACCCCGGAAGGTGAGGGTGGGGTAAAGGGCTGAGTGGTCTCCCATGGCCATGACCCCTGCCACCAGGGACATTTGCCCATTGAAGCCTGTGGGCAGGGAGAGACCTTTGCGGGAGGCAAGTCATGTGGCCTAGGGAGGCTCTTCCTGGCGTTGGTTAGTGGCTTCCACCTGAGGACAGCAGGGGCCACGAGGAGAGGGTGAGGGTGCTGGGGGTGGCCTCCCCTCATCGAATCCCAGGGTCTACCCCACAGCATCCCACCTCGGAAATGGAATCCTCCTCGCGCATATTCAGAGGCACCATTATCAGGCCCCTGAATAGAATGGATGAGGTCCTTGTCTCTGTGCATACCCCTCCCCAACCCCCAGCCATCTGCCGCCGCGCCGCTCCCACGGGCTATGAAGAGTTCTACTTCCCTCTGGTGGAGGCCACCCGGCTCCGCTGTGTCACCAAATGCACGTCGGGCGTGGACAACGCCATCGACTGTCACCAGGGCCAGTGCGTTCTAGAGACGAGCGGTCCCGCGTGTCGGTAAGGCCCCGCTCACCATCAGCATCAGTCGAGCCCCGCCCACTCATTCTAGGATGAAGCCTAGCCTCACGCGCCGCCCCGGCTCTGCCCCCAGGCCCTACAGTGGAGCCTCGTCCCCAGAGTCCCGCTCCAAGCCCATCCCCGTTGCCCTACAGTGGAGCCCTGCCCTGGAGCTCTGCTCCGTCGCCCTAAAGTGTAGCCCCGCCTCCTTGATGGGGTTGAGTCCAATCCCCTGGTTCTGGGATAGACCCCGCCCACTCATTCTAGGGTGGGGCCCCGCCCCTTCGTTCTAGGGCTGAACCTTGCCCCCTTCTTCTGGGGTGGAGCCCCGCCCCCTTGTTCTAGGGTGGATCCCCGCCCCCTCCTTTTAGGGTGAAGCCCTGCCCACTTGATCTAAAGTGGAATCCCGCCCCCTCACCTAGGGTAGAGCCCCGCCCCCTCGTTCTAGGGTGGAGACCCGTCCGCTTGTTCTACGGTGGATTCCGGCCGCTTGTCTAGGGTGGAACCCCCCAGCTTGCCCTAGGGTGGAACCCCCCCGCTGCCCTAGGCTGGAGCCCCGCCCCCTCACCCGCCCCCGCGGGGCCCAGGTGCACGCGTGGACCCCGAGCCCGGAGGTGAAGAGGGTCTGACCCTGCGATCTCCCGCAGCTGCTACTCCACCGACACGCACTGGTTCTCTGGCCCGCGCTGCGAGGTGGCCGTCCACTGGAGGGCGCTGGTCGGGGGCCTGACGGCCGGCGCCGCGCTGCTGGTGCTGCTGCTGCTGGCGCTGGGCGTCCGGGCGGTGCGCTCCGGATGGTGGGGCGGCCAGCGCCGAGGCCGGTGAGCGTGCGGGGGGCGGGGCCGGGGGGCGAGGGCAGCCAAGGGGTCCCAGGCGGGCCGGCTCTGTCTGACCGCGCGGCGGCCCCACCTAGGTCCTGGGACCAGGACAGGAAATGGTTCGAGACCTGGGATGAGGAAGTCGTGGGCACTTTTTCAAACTGGGGTTTCGAGGACGACGGAACAGGTGAGTCCTGCCTCCTGGGGAAGCAGGCAGAGGCTTTCCTGGGCACCACTGCGAGGACAGACGCCCTCCTTGCCTTCCTCGCATTTACTCTGTCCCCCTCTCCCTTCCGTCCCCTCCCTCTCCCCTTCTATTTCTCCGCTCCTCTCTCTCTCTAGACAAGGATACAAATTTCCATGTGGCCTTGGAGAACGTGGACACCACTATGAAGGTGAGGGGCTAAAGAGGGGGACCCCAAGGAACTCTCCCAGCCTCCATTCCAGAATCCCTCCCCGACCCCCACCAGGGCAGGGAGGGGGCTGGGCTCGGATCAGCAGTGACCTCCCTGTCAGCCCAAACCAGTGGCTCCGCGTTCCCGTCCCTCACTGTGACTCTGACAGGTGCACATCAAGAGACCCGAGATGACCTCGTCCTCAGTGTGAGCCCTGCGGGGCCCCTTCACCACCCCCTCCGCCCTGCCCCGGACACAAGGGTCTGCATTGCGTCCATTTCAAGAGGTGGCCCCAGGACGCGGGCAGCCCAGGCTCCTGCTGTTCTTGGGCAAGATGAGACTGTTCCCCCAAATCCCATCCTTCTCCTTCCAACTTGGCTGAAACCCACCTGGAGACGCAGTTCACGTCCAGGCTCTTCCACTGTGGAATCTTGGGCAAGTCAGTAACGAGCCTCAGTTTCCTCACCTGCAAAACGGGTACAGCATTCCTGTATGATAGCTCACGCCGTCGTTGTGAAAACCACATAGACTTGGTCAATTCTCGGTCCTACTCTGCCCTCCCGTCTCAGCCCTCGTGTTGCCATTGCCTCTCTCGGATCCTCCAATCCTCACGTCCTTCACCTGGTCTCTGGCCCTGGTTCTTATTTTCTCTCAATTCCCTACTGCCTGTTTCTTACTTTGAACCTGGAGGCAGCCTGCAGCCCCATCCCATCTCCTGCCCTCTCCTGATCTAACTCCCTGCTGCATCTCTTGCTCTCATTCCTTAGACGTCCTCCCCTTTTGACCCCGTTCCTTCATCCATCCTGCACCCCAGTCCCCCAGCCCTAAATCCTCCCTCCTCTCCTCACATCCTGGTCCCTAGCAAGGTATAGATAGCCTCTGTGTCTTAGGATACCCCGGGTGCTGTTCCCTCGGTCACCCTGTTGCCCAGTTCCCCGTTTCTCTTGCTCTCATTCCTTGTATCCTTTCCCCTTTTGAGCCCGTCCATTCATCGGTTCTGCCCCCGACTCCCCCTGACCTAAATACCCCAGCTCCTAATTCCCCCCTCACCCCGTTGCTCAATTCCCCGTTTCTCTTGCTCTCATTCCTTGTATCTTCTCCCCTTCTGAGCCTGTCCATTCATCGGTGGTTCTGCCCCTACTCCCCCAGCCCTAAATACCCCAGCTGCTGTTCCTCCCCATCACCCAGCCACCGGATTCTCCATTCACCCCTTTCTCTCACCCCTGGAGCCCCGTGGGTGGGGGCAGGGCATGAGTTCCCCAGTCCCCAAGGAAAGGCAGCCCCCTCAGTCTCCCTCCTCCTCATTCCCTTCCATCTCCCTCCCCTCTGCCTTTTAAACCCATCCCCTCCGATTCCCCTCCTCCCCCCTCTCTCCCTGGTGTCAACTCGATTCCTGCGGTAACTCTGAGCCCTGAAATCCTCAGTCTCCTTGGCGGGGAAGATTGGCTTTGGGAACAGGAAGTCGGCACATCTCCAGGTCTCCATGTGCACAATATAGAGTTTATTGTAAAAAGC", 100547052 - 1); var codingRegion = new CodingRegion(100547257, 100610315, 206, 10177, 9972); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 100547052, 100547317, 1, 266), new TranscriptRegion(TranscriptRegionType.Intron, 1, 100547318, 100549480, 266, 267), new TranscriptRegion(TranscriptRegionType.Exon, 2, 100549481, 100550652, 267, 1438), new TranscriptRegion(TranscriptRegionType.Intron, 2, 100550653, 100550655, 1438, 5957), new TranscriptRegion(TranscriptRegionType.Exon, 3, 100550656, 100550784, 5957, 6085), new TranscriptRegion(TranscriptRegionType.Intron, 3, 100550785, 100550787, 6085, 6698), new TranscriptRegion(TranscriptRegionType.Exon, 4, 100550788, 100550814, 6698, 6724), new TranscriptRegion(TranscriptRegionType.Intron, 4, 100550815, 100550817, 6724, 7115), new TranscriptRegion(TranscriptRegionType.Exon, 5, 100550818, 100552774, 7115, 9071), new TranscriptRegion(TranscriptRegionType.Intron, 5, 100552775, 100552880, 9071, 9072), new TranscriptRegion(TranscriptRegionType.Exon, 6, 100552881, 100553066, 9072, 9257), new TranscriptRegion(TranscriptRegionType.Intron, 6, 100553067, 100554979, 9257, 9258), new TranscriptRegion(TranscriptRegionType.Exon, 7, 100554980, 100555095, 9258, 9373), new TranscriptRegion(TranscriptRegionType.Intron, 7, 100555096, 100555514, 9373, 9374), new TranscriptRegion(TranscriptRegionType.Exon, 8, 100555515, 100555579, 9374, 9438), new TranscriptRegion(TranscriptRegionType.Intron, 8, 100555580, 100607745, 9438, 9439), new TranscriptRegion(TranscriptRegionType.Exon, 9, 100607746, 100607894, 9439, 9587), new TranscriptRegion(TranscriptRegionType.Intron, 9, 100607895, 100608306, 9587, 9588), new TranscriptRegion(TranscriptRegionType.Exon, 10, 100608307, 100608372, 9588, 9653), new TranscriptRegion(TranscriptRegionType.Intron, 10, 100608373, 100608728, 9653, 9654), new TranscriptRegion(TranscriptRegionType.Exon, 11, 100608729, 100608891, 9654, 9816), new TranscriptRegion(TranscriptRegionType.Intron, 11, 100608892, 100609538, 9816, 9817), new TranscriptRegion(TranscriptRegionType.Exon, 12, 100609539, 100609712, 9817, 9990), new TranscriptRegion(TranscriptRegionType.Intron, 12, 100609713, 100609804, 9990, 9991), new TranscriptRegion(TranscriptRegionType.Exon, 13, 100609805, 100609896, 9991, 10082), new TranscriptRegion(TranscriptRegionType.Intron, 13, 100609897, 100610051, 10082, 10083), new TranscriptRegion(TranscriptRegionType.Exon, 14, 100610052, 100610104, 10083, 10135), new TranscriptRegion(TranscriptRegionType.Intron, 14, 100610105, 100610273, 10135, 10136), new TranscriptRegion(TranscriptRegionType.Exon, 15, 100610274, 100611004, 10136, 10866), new TranscriptRegion(TranscriptRegionType.Gap, 15, 100611005, 100611005, 10866, 10867), new TranscriptRegion(TranscriptRegionType.Exon, 15, 100611006, 100611075, 10867, 10936), new TranscriptRegion(TranscriptRegionType.Gap, 15, 100611076, 100611076, 10936, 10937), new TranscriptRegion(TranscriptRegionType.Exon, 15, 100611077, 100611169, 10937, 11029), new TranscriptRegion(TranscriptRegionType.Gap, 15, 100611170, 100611173, 11029, 11030), new TranscriptRegion(TranscriptRegionType.Exon, 15, 100611174, 100611176, 11030, 11032), new TranscriptRegion(TranscriptRegionType.Intron, 15, 100611177, 100611299, 11032, 11033), new TranscriptRegion(TranscriptRegionType.Exon, 16, 100611300, 100611307, 11033, 11040), new TranscriptRegion(TranscriptRegionType.Gap, 16, 100611308, 100611311, 11040, 11041), new TranscriptRegion(TranscriptRegionType.Exon, 16, 100611312, 100611428, 11041, 11157), new TranscriptRegion(TranscriptRegionType.Exon, 16, 100611429, 100611517, 11170, 11258), new TranscriptRegion(TranscriptRegionType.Exon, 16, 100611518, 100611619, 11260, 11361) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(865, 865, "G"), new RnaEdit(1214, 1214, "A"), new RnaEdit(1439, 1438, "AGCACATCCACAACTGCCATCTCCTCACTTCCCCCTACCTCAGGTACTATGGTGACTTCCACAACCATGACCCCATCTTCTCTGAGTACAGACATCCCTTTCACAACACCAACAACTATCACCCACCATTCTGTGGGCTCTACCGGTTTCCTGACTACAGCAACAGACCTCACATCAACATTCACGGTTTCCAGTTCCTCAGCAATGTCCACGAGTGTCATTCCATCTTCCCCCAGCATTCAGAATACAGAAACCTCATCCCTTGTCAGCATGACCTCTGCCACTACTCCCAATGTGAGACCAACTTTTGTAAGTACACTCAGCACTCCTACAAGTTCCCTCCTGACGACCTTCCCAGCAACATATTCATTTTCATCTTCCATGTCTGCCAGCAGTGCTGGGACCACTCACACAGAGAGTATCTCCTCACCTCCAGCCAGCACCAGTACACTCCACACAACAGCTGAATCCACCCTGGCACCCACTACCACCACCTCATTCACAACTTCCACAACTATGGAACCACCTTCAACCACTGCAGCAACTACAGGAACAGGTCAGACCACCTTCACCAGCTCTACAGCCACATTTCCTGAGACCACCACACCGACTCCTACAACTGACATGTCCACAGAATCTCTCACAACAGCCATGACTTCTCCTCCCATCACTTCATCAGTCACTTCCACAAATACAGTGACTTCTATGACAACTACGACCTCTCCTCCCACAACCACCAATTCTTTTACATCACTGACCAGTATGCCTCTGTCTTCTACACCTGTCCCAAGCACAGAAGTAGTCACCAGTGGCACCATAAACACAATCCCTCCATCTATCTTGGTGACCACACTCCCCACTCCAAATGCTTCATCTATGACTACATCTGAGACCACCTATCCTAATTCTCCGACTGGTCCTGGTACAAACTCCACGACGGAAATCACCTATCCCACCACTATGACAGAGACATCATCCACTGCCACCTCTCTTCCACCCACCTCTCCCTTGGTCTCAACTGCAAAAACAGCCAAAACTCCTACCACAAACTTGGTAACCACCACCACCAAGACCACCTCACATAGTACCACCAGCTTCACTTCTTCAACCGTCTACTCCACAGCCAGCACATACACAACTGCCATCACCTCAGTTCCCACTACGTTGGGTACCATGGTAACTTCTACATCCATGATCTCATCTACTGTGAGTACAGGTATCCCTACCTCACAACCAACAACCATCACTCCCTCATCCGTGGGCATCAGTGGTTCATTACCTATGATGACAGACCTCACCTCAGTGTACACAGTCTCTAACATGTCTGCAAGGCCAACAACTGTCATTCCCTCATCTCCCACTGTCCAGAATACAGAAATCTCAATCTCTGTTAGCATGACGTCTGCTACCACTCCCAGTGGAGGACCAACTTTCACAAGTACTGAGAACACTCCAACAAGGTCCCTCCTGACAAGCTTTCCAATGACACATTCATTCTCTTCTTCTATGTCTGAAAGTAGTGCTGGGACCACTCACACAGAGAGTATCTCCTCACCTCGAGGCACCACCAGTACACTCCACACAACAGTTGAATCCACCCCATCACCCACTACCACCACCTCATTTACCACATCCACAATGATGGAACCACCTTCATCCACTGTATCAACTACAGGCAGAGGTCAGACCACCTTTCCCAGCTCTACAGCCACATTCCCTGAGACCACTACACTGACTCCTACAACTGACATTTCTACAGTATCTCTCACAACAGCCATGACTTCTCCTCCCCCCGTCAGTTCTTCAATCACTCCCACCAATACAATGACTTCTATGAGAACTACAACCTATTGGCCCACAGCCACTAATACATTATCACCACTCACCAGTAGCATTTTATCTTCTACACCTGTCCCAAGCACAGAAATGATCACCAGTCATACCACAAACACCACCCCTCTATCCACCTTGGTGACTACACTCCTCACTACCATCACCAGATCTACACCTACATCTGAGACCACCTACCCTACTTCTCCCACCAGCATTGTCTCAGACTCCACGACTGAAATCACCTATTCCACAAGTATAACAGGTACATTGTCCACTGCCACTACTCTCCCACCCACCTCTTCCTCTCTCCCAACCACAGAAACAGCCACGATGACTCCTACCACAACCTTGATAACCACCACCCCTAATACCACCTCCCTTAGTACCCCCAGCTTCACTTCTTCAACCATCTACTCCACAGTCAGCACATCCACAACTGCCATCTCCTCAGCTTCCCCTACCTCAGGTACCATGGTAACTTCCACAACCATGACCCCATCTTCTCTGAGTACAGACACCCCTTCCACAACACCAACAACTATCACCTACCCTTCTGTGGGCTCTACCGGTTTCCTGACTACAGCAACAGACCTCACATCAACATTCACTGTTTCCAGTTCCTCAGCAATGTCCACAAGTGTCATTCCATCTTCCCCCAGCATCCAGAATACAGAAACCTCATCCCTTGTCAGCATGACCTCTGCCACCACTCCCAGTTTGAGACCAACTATCACAAGTACTGACAGCACTCTAACAAGTTCCCTCCTGACGACCTTCCCAAGTACATATTCATTTTCATCTTCCATGTCTGCCAGCAGTGCAGGGACCACTCACACAGAGACTATTTCCTCACTTCCAGCCAGCACCAATACAATCCACACAACAGCTGAATCCGCCCTGGCACCCACTACCACCACCTCATTCACCACATCCCCAACGATGGAACCACCTTCAACCACTGTAGCGACTACAGGCACAGGTCAGACCACCTTCCCCAGCTCTACAGCCACATTCCTTGAGACCACCACACTGACTCCTACAACTGACTTTTCTACAGAATCTCTCACAACAGCCATGACTTCTACTCCCCCCATCACTTCTTCAATCACTCCCACCGATACAATGACTTCTATGAGAACTACGACCTCTTGGCCCACAGCCACTAATACGTTATCACCACTCACCAGTAGCATTTTATCTTCTACACCTGTCCCAAGCACAGAGGTGACCACCAGTCATACCACAAACACCAATCCTGTATCCACGTTGGTGACTACACTCCCCATTACCATCACCAGGTCTACACTTACATCTGAGACCGCCTACCCTAGTTCTCCCACAAGCACTGTCACAGAGTCCACAACTGAAATCACCTATCCCACCACTATGACAGAGACATCATCTACTGCCACCTCTCTTCCACCCACCTCTTCCTTGGTCTCAACCGCAGAAACAGCCAAAACTCCTACCACAAACTTGGTAACCACCACCACCAAGACCACCTCACATAGTACCACCAGCTTCACTTCTTCAACCATCTACTCCACAGCCAGCACACCCACCACTGCCATCACCTCAGTTCCCACTACCTTGGGTACCATGGTGACTTCTACATCCATGATCCCATCTACTGTGAGTACAGGTATCCCTACCTCACAACCAACAACTATTACTCCCTCATCCGTGGGCATCAGTGGTTCATTACCTATGATGACAGACCTCACCTCAGTGTACACAGTCTCCAGCATGTCTGCAAGGCCAACAAGTGTCATTCCTTCATCTCCCACTGTCCAGAATACAGAAACCTCAATCTTTGTTAGCATGATGTCTGCTACCACTCCCAGTGGAGGACCAACTTTCACAAGTACTGAGAACACTCCAACAAGGTCCCTCCTGACAAGCTTTCCAGTGACACATTCATTTTCCTCTTCCATGTCTGCCAGCAGTGTAGGGACCACTCACACCCAGAGTATCTCCTCACCCCCAGCCATCACCAGTACACTCCACACAACAGCTGAATCCACCCCATCACCTACAACCACCATGTCATTCACAACATTTACAAAGATGGAAACACCTTCATCCACTGTAGCAACTACAGGCACAGGTCAGACTACATTCACCAGTTCAACAGCCACATCCCCTAAGACCACCACACTGACTCCTACCTCTGACATTTCCACAGGATCTTTCAAAACAGCCGTGAGTTCTACTCCCCCCATCACTTCTTCAATCACCTCCACATATACGGTGACTTCGATGACAACTACCACCCCTCTAGGGCCCACAGCCACTAATACGTTACCATCATTTACCAGTAGCGTTTCATCTTCTACGCCTGTCCCAAGTACAGAAGCGATCACCAGTGGTACCACAAACACCACCCCTCTATCTACATTGGTGACCACATTCTCCAATTCCGACACCAGTTCTACACCTACATCTGAGACCACCTACCCTACTTCTCTTACTAGTGCTCTCACAGATTCCACGACCAGAACCACCTATTCCACCAATATGACAGGTACATTGTCCACTGTGACCTCTCTTCGACCCACCTCTTCCTCTCTCCTCACCACAGTAACAGCCACAGTTCCAACAACAAACTTGGTAACCACGACCACCAAGATCACCTCACACAGTACTCCTAGCTTCACTTCTTCAATCGCAACCACCGAGACCCCC"), new RnaEdit(1520, 1520, "A"), new RnaEdit(1549, 1549, "C"), new RnaEdit(1568, 1567, "TCGATCACCACCACCGAGACCACATCCCACAATACTCCCAGCCTCACTTCTTCAATCACCACCACCAAGACCACCTCACACAGTACTCCCAGCTACACTTCTTTGATCACCACAACCACCACCACCTCACACAGTACTCCCAGCTTCACTTCTTCCATCACCACCACTGAGACCACATCCCACAATACTCCCAGCTTGACTTCTTCGATCACAACCACCGAGACCACATCCCATAGTACTCCCAGCTTCACTTCTTCGATCACCACCGAGACCACATCCCACAGTACTCCCAGCTTCACTTCATTGATCACCATCACCGAGATCACCTCACACAGTACTCTCAGCTACACTACCTCAATCACCACCACCGAGACCCCCTCACACAGTACTCTCAGCTTCACTTCTTCAATCACCACCACTGAGACCACCTCACACAGTACTCCCAGCTTCACTTCCTCAATCACCACCTCTGAGATGCCCTCACACAGTACTCCCAGCTTCACTTCTTCGATCACCACCACTGAGAACGCCACACACAGTACTCCCAACTTCACTTCTTCAATCACCACCACCGAGACCACATCCCACAGTACTCCCAGCTTCACTTCTTTG"), new RnaEdit(1595, 1594, "AGGTGGGGGACCACCGAGACCACATCCTACAGTACTCCCAGCTTCACTTCTTCAAATACCATCACTGAGACCACCTCACACAGTACTCCCAGCTACATTACCTCAATCACCACCACCGAGACCCCCTCAAGCAGTACTCCCAGCTTCAGTTCTTCGATCACCACCACTGAGACCACATCCCACAGTACTCCCGGCTTCACTTCTTCAATCACCACCACTGAGACTACATCCCACAGTACTCCCAGCTTCACTTCTTCGATCACCACCACTGAGACCACCTCACATGATACTCCCAGCTTCACTTCTTCAATCACCACCAGTGAGACCCCCTCACACAGTACTCCCAGCTCCACTTCTTTAATCACCACCACCAAGACCACCTCACACAGT"), new RnaEdit(1627, 1627, "G"), new RnaEdit(1650, 1650, "A"), new RnaEdit(1702, 1702, "C"), new RnaEdit(1957, 1957, "T"), new RnaEdit(2794, 2794, "T"), new RnaEdit(3028, 3028, "C"), new RnaEdit(3586, 3586, "C"), new RnaEdit(4024, 4024, "G"), new RnaEdit(4044, 4044, "T"), new RnaEdit(4131, 4131, "C"), new RnaEdit(4183, 4184, "CT"), new RnaEdit(4231, 4231, "G"), new RnaEdit(4273, 4273, "G"), new RnaEdit(4289, 4289, "A"), new RnaEdit(4580, 4580, "T"), new RnaEdit(4956, 4956, "T"), new RnaEdit(5344, 5344, "A"), new RnaEdit(5347, 5347, ""), new RnaEdit(5356, 5356, "T"), new RnaEdit(5358, 5358, "C"), new RnaEdit(5360, 5360, "C"), new RnaEdit(5368, 5368, "C"), new RnaEdit(5411, 5411, "T"), new RnaEdit(5413, 5413, "C"), new RnaEdit(5418, 5418, ""), new RnaEdit(5421, 5421, "C"), new RnaEdit(5480, 5480, "G"), new RnaEdit(5483, 5484, "GT"), new RnaEdit(5486, 5486, "C"), new RnaEdit(5492, 5492, "A"), new RnaEdit(5499, 5499, "T"), new RnaEdit(5501, 5501, "C"), new RnaEdit(5505, 5505, "C"), new RnaEdit(5512, 5515, ""), new RnaEdit(5527, 5530, ""), new RnaEdit(5556, 5556, "T"), new RnaEdit(5558, 5558, "C"), new RnaEdit(5630, 5630, "T"), new RnaEdit(5636, 5636, "G"), new RnaEdit(5640, 5640, "C"), new RnaEdit(5648, 5647, "TTCTTGCCTCCC"), new RnaEdit(5691, 5691, "T"), new RnaEdit(5697, 5697, "T"), new RnaEdit(5708, 5708, "C"), new RnaEdit(5711, 5711, "G"), new RnaEdit(5721, 5721, "A"), new RnaEdit(5737, 5736, "T"), new RnaEdit(5750, 5750, "G"), new RnaEdit(5777, 5777, "G"), new RnaEdit(5805, 5805, "T"), new RnaEdit(5819, 5819, "C") }; const byte startExonPhase = 0; const bool onReverseStrand = false; var codingSequence = new CodingSequence(genomicSeq, codingRegion, regions, onReverseStrand, startExonPhase, rnaEdits); var expectedCodingSeq = "ATGCAGCTGTTGGGGCTCCTCGGCCTCCTCTGGATGCTCAAGGCCTCCCCGTGGGCCACAGGAACTTTATCCACGGCCACATCCATCTCTCAAGTGCCTTTCCCCAGAGCAGAAGCAGCCAGCGCTGTGCTCAGCAATTCTCCACACTCCAGAGACCTGGCTGGGTGGCCACTTGGTGTCCCCCAGCTCGCCTCTCCTGCTCCTGGCCACAGGGAAAATGCACCTATGACACTCACTACCTCCCCCCATGACACACTCATCTCTGAAACATTGCTCAACTCTCCAGTCAGTTCCAACACCTCAACCACCCCGACGTCCAAGTTTGCCTTCAAGGTTGAAACCACTCCACCCACCGTGTTGGTCTATTCAGCCACCACTGAGTGCGTGTATCCAACGAGCTTTATAATCACCATCTCCCACCCCACCTCCATCTGTGTGACCACGACGCAGGTGGCCTTCACCAGCTCTTACACCTCGACTCCCGTGACACAGAAGCCAGTGACCACCGTCACCAGTACTTACTCTATGACCACTACTGAGAAAGGAACGTCAGCCATGACATCTTCTCCCTCTACCACCACTGCAAGGGAAACTCCCATAGTGACAGTGACACCCTCCTCTGTGTCAGCCACAGACACAACCTTCCACACTACAATCTCGTCTACAACTAGAACCACAGAAAGGACTCCCCTGCCCACTGGAAGCATCCATACAACCACGTCCCCAACCCCAGTATTTACTACTCTCAAAACAGCAGTGACTTCCACTTCCCCCATCACTTCTTCAATCACTTCCACAAATACAGTGACTTCTATGACAACGACCGCCTCCCAGCCCACAGCCACTAATACATTGTCATCACCCACTAGGACCATTTTATCTTCCACACCTGTCCTGAGCACAGAAACAATCACCAGTGGTATCACAAACACCACCCCCCTATCCACCTTGGTGACCACACTCCCCACTACCATCAGCAGGTCTACACCTACATCTGAGACCACCTACACTACTTCTCCCACCAGCACTGTCACAGACTCCACTACCAAAATCGCCTACTCCACAAGTATGACAGGTACATTGTCCACAGAGACTTCTCTCCCACCCACCTCTTCCTCTCTCCCAACCACAGAAACAGCCACGACTCCTATGACAAACTTGGTAACCACCACCACTGAGATCTCCTCCCACAGTACTCCCAGCTTCTCTTCATCAACCATCTACTCCACAGTCAGCACATCCACAACTGCCATCTCCTCACTTCCCCCTACCTCAGGTACTATGGTGACTTCCACAACCATGACCCCATCTTCTCTGAGTACAGACATCCCTTTCACAACACCAACAACTATCACCCACCATTCTGTGGGCTCTACCGGTTTCCTGACTACAGCAACAGACCTCACATCAACATTCACGGTTTCCAGTTCCTCAGCAATGTCCACGAGTGTCATTCCATCTTCCCCCAGCATTCAGAATACAGAAACCTCATCCCTTGTCAGCATGACCTCTGCCACTACTCCCAATGTGAGACCAACTTTTGTAAGTACACTCAGCACTCCTACAAGTTCCCTCCTGACGACCTTCCCAGCAACATATTCATTTTCATCTTCCATGTCTGCCAGCAGTGCTGGGACCACTCACACAGAGAGTATCTCCTCACCTCCAGCCAGCACCAGTACACTCCACACAACAGCTGAATCCACCCTGGCACCCACTACCACCACCTCATTCACAACTTCCACAACTATGGAACCACCTTCAACCACTGCAGCAACTACAGGAACAGGTCAGACCACCTTCACCAGCTCTACAGCCACATTTCCTGAGACCACCACACCGACTCCTACAACTGACATGTCCACAGAATCTCTCACAACAGCCATGACTTCTCCTCCCATCACTTCATCAGTCACTTCCACAAATACAGTGACTTCTATGACAACTACGACCTCTCCTCCCACAACCACCAATTCTTTTACATCACTGACCAGTATGCCTCTGTCTTCTACACCTGTCCCAAGCACAGAAGTAGTCACCAGTGGCACCATAAACACAATCCCTCCATCTATCTTGGTGACCACACTCCCCACTCCAAATGCTTCATCTATGACTACATCTGAGACCACCTATCCTAATTCTCCGACTGGTCCTGGTACAAACTCCACGACGGAAATCACCTATCCCACCACTATGACAGAGACATCATCCACTGCCACCTCTCTTCCACCCACCTCTCCCTTGGTCTCAACTGCAAAAACAGCCAAAACTCCTACCACAAACTTGGTAACCACCACCACCAAGACCACCTCACATAGTACCACCAGCTTCACTTCTTCAACCGTCTACTCCACAGCCAGCACATACACAACTGCCATCACCTCAGTTCCCACTACGTTGGGTACCATGGTAACTTCTACATCCATGATCTCATCTACTGTGAGTACAGGTATCCCTACCTCACAACCAACAACCATCACTCCCTCATCCGTGGGCATCAGTGGTTCATTACCTATGATGACAGACCTCACCTCAGTGTACACAGTCTCTAACATGTCTGCAAGGCCAACAACTGTCATTCCCTCATCTCCCACTGTCCAGAATACAGAAATCTCAATCTCTGTTAGCATGACGTCTGCTACCACTCCCAGTGGAGGACCAACTTTCACAAGTACTGAGAACACTCCAACAAGGTCCCTCCTGACAAGCTTTCCAATGACACATTCATTCTCTTCTTCTATGTCTGAAAGTAGTGCTGGGACCACTCACACAGAGAGTATCTCCTCACCTCGAGGCACCACCAGTACACTCCACACAACAGTTGAATCCACCCCATCACCCACTACCACCACCTCATTTACCACATCCACAATGATGGAACCACCTTCATCCACTGTATCAACTACAGGCAGAGGTCAGACCACCTTTCCCAGCTCTACAGCCACATTCCCTGAGACCACTACACTGACTCCTACAACTGACATTTCTACAGTATCTCTCACAACAGCCATGACTTCTCCTCCCCCCGTCAGTTCTTCAATCACTCCCACCAATACAATGACTTCTATGAGAACTACAACCTATTGGCCCACAGCCACTAATACATTATCACCACTCACCAGTAGCATTTTATCTTCTACACCTGTCCCAAGCACAGAAATGATCACCAGTCATACCACAAACACCACCCCTCTATCCACCTTGGTGACTACACTCCTCACTACCATCACCAGATCTACACCTACATCTGAGACCACCTACCCTACTTCTCCCACCAGCATTGTCTCAGACTCCACGACTGAAATCACCTATTCCACAAGTATAACAGGTACATTGTCCACTGCCACTACTCTCCCACCCACCTCTTCCTCTCTCCCAACCACAGAAACAGCCACGATGACTCCTACCACAACCTTGATAACCACCACCCCTAATACCACCTCCCTTAGTACCCCCAGCTTCACTTCTTCAACCATCTACTCCACAGTCAGCACATCCACAACTGCCATCTCCTCAGCTTCCCCTACCTCAGGTACCATGGTAACTTCCACAACCATGACCCCATCTTCTCTGAGTACAGACACCCCTTCCACAACACCAACAACTATCACCTACCCTTCTGTGGGCTCTACCGGTTTCCTGACTACAGCAACAGACCTCACATCAACATTCACTGTTTCCAGTTCCTCAGCAATGTCCACAAGTGTCATTCCATCTTCCCCCAGCATCCAGAATACAGAAACCTCATCCCTTGTCAGCATGACCTCTGCCACCACTCCCAGTTTGAGACCAACTATCACAAGTACTGACAGCACTCTAACAAGTTCCCTCCTGACGACCTTCCCAAGTACATATTCATTTTCATCTTCCATGTCTGCCAGCAGTGCAGGGACCACTCACACAGAGACTATTTCCTCACTTCCAGCCAGCACCAATACAATCCACACAACAGCTGAATCCGCCCTGGCACCCACTACCACCACCTCATTCACCACATCCCCAACGATGGAACCACCTTCAACCACTGTAGCGACTACAGGCACAGGTCAGACCACCTTCCCCAGCTCTACAGCCACATTCCTTGAGACCACCACACTGACTCCTACAACTGACTTTTCTACAGAATCTCTCACAACAGCCATGACTTCTACTCCCCCCATCACTTCTTCAATCACTCCCACCGATACAATGACTTCTATGAGAACTACGACCTCTTGGCCCACAGCCACTAATACGTTATCACCACTCACCAGTAGCATTTTATCTTCTACACCTGTCCCAAGCACAGAGGTGACCACCAGTCATACCACAAACACCAATCCTGTATCCACGTTGGTGACTACACTCCCCATTACCATCACCAGGTCTACACTTACATCTGAGACCGCCTACCCTAGTTCTCCCACAAGCACTGTCACAGAGTCCACAACTGAAATCACCTATCCCACCACTATGACAGAGACATCATCTACTGCCACCTCTCTTCCACCCACCTCTTCCTTGGTCTCAACCGCAGAAACAGCCAAAACTCCTACCACAAACTTGGTAACCACCACCACCAAGACCACCTCACATAGTACCACCAGCTTCACTTCTTCAACCATCTACTCCACAGCCAGCACACCCACCACTGCCATCACCTCAGTTCCCACTACCTTGGGTACCATGGTGACTTCTACATCCATGATCCCATCTACTGTGAGTACAGGTATCCCTACCTCACAACCAACAACTATTACTCCCTCATCCGTGGGCATCAGTGGTTCATTACCTATGATGACAGACCTCACCTCAGTGTACACAGTCTCCAGCATGTCTGCAAGGCCAACAAGTGTCATTCCTTCATCTCCCACTGTCCAGAATACAGAAACCTCAATCTTTGTTAGCATGATGTCTGCTACCACTCCCAGTGGAGGACCAACTTTCACAAGTACTGAGAACACTCCAACAAGGTCCCTCCTGACAAGCTTTCCAGTGACACATTCATTTTCCTCTTCCATGTCTGCCAGCAGTGTAGGGACCACTCACACCCAGAGTATCTCCTCACCCCCAGCCATCACCAGTACACTCCACACAACAGCTGAATCCACCCCATCACCTACAACCACCATGTCATTCACAACATTTACAAAGATGGAAACACCTTCATCCACTGTAGCAACTACAGGCACAGGTCAGACTACATTCACCAGTTCAACAGCCACATCCCCTAAGACCACCACACTGACTCCTACCTCTGACATTTCCACAGGATCTTTCAAAACAGCCGTGAGTTCTACTCCCCCCATCACTTCTTCAATCACCTCCACATATACGGTGACTTCGATGACAACTACCACCCCTCTAGGGCCCACAGCCACTAATACGTTACCATCATTTACCAGTAGCGTTTCATCTTCTACGCCTGTCCCAAGTACAGAAGCGATCACCAGTGGTACCACAAACACCACCCCTCTATCTACATTGGTGACCACATTCTCCAATTCCGACACCAGTTCTACACCTACATCTGAGACCACCTACCCTACTTCTCTTACTAGTGCTCTCACAGATTCCACGACCAGAACCACCTATTCCACCAATATGACAGGTACATTGTCCACTGTGACCTCTCTTCGACCCACCTCTTCCTCTCTCCTCACCACAGTAACAGCCACAGTTCCAACAACAAACTTGGTAACCACGACCACCAAGATCACCTCACACAGTACTCCTAGCTTCACTTCTTCAATCGCAACCACCGAGACCCCCTCACACAGTACTCCCAGATTCACTTCTTCAATCACCACTACCGAGACCCCCTCACACAGTACTCCCAGATTCACTTCTTCAATCACCAATACCAAGACCACCTCACACAGCTCTCCCAGCTTCACTTCTTCGATCACCACCACCGAGACCACATCCCACAATACTCCCAGCCTCACTTCTTCAATCACCACCACCAAGACCACCTCACACAGTACTCCCAGCTACACTTCTTTGATCACCACAACCACCACCACCTCACACAGTACTCCCAGCTTCACTTCTTCCATCACCACCACTGAGACCACATCCCACAATACTCCCAGCTTGACTTCTTCGATCACAACCACCGAGACCACATCCCATAGTACTCCCAGCTTCACTTCTTCGATCACCACCGAGACCACATCCCACAGTACTCCCAGCTTCACTTCATTGATCACCATCACCGAGATCACCTCACACAGTACTCTCAGCTACACTACCTCAATCACCACCACCGAGACCCCCTCACACAGTACTCTCAGCTTCACTTCTTCAATCACCACCACTGAGACCACCTCACACAGTACTCCCAGCTTCACTTCCTCAATCACCACCTCTGAGATGCCCTCACACAGTACTCCCAGCTTCACTTCTTCGATCACCACCACTGAGAACGCCACACACAGTACTCCCAACTTCACTTCTTCAATCACCACCACCGAGACCACATCCCACAGTACTCCCAGCTTCACTTCTTTGATCACCACCACGGAGACCACCTCACACAGGTGGGGGACCACCGAGACCACATCCTACAGTACTCCCAGCTTCACTTCTTCAAATACCATCACTGAGACCACCTCACACAGTACTCCCAGCTACATTACCTCAATCACCACCACCGAGACCCCCTCAAGCAGTACTCCCAGCTTCAGTTCTTCGATCACCACCACTGAGACCACATCCCACAGTACTCCCGGCTTCACTTCTTCAATCACCACCACTGAGACTACATCCCACAGTACTCCCAGCTTCACTTCTTCGATCACCACCACTGAGACCACCTCACATGATACTCCCAGCTTCACTTCTTCAATCACCACCAGTGAGACCCCCTCACACAGTACTCCCAGCTCCACTTCTTTAATCACCACCACCAAGACCACCTCACACAGTACTCCCAGCTTCACTTCTTCGATCACCACCACGGAGACCACCTCACACAGTGCTCACAGCTTCACTTCTTCGATCACCACCACCGAGACCACCTCACACAATACTCGCAGCTTCACTTCTTCGATCACCACCACCGAGACCAACTCTCACAGTACTACCAGCTTCACTTCTTCGATCACCACCACCGAGACCACCTCACACAGTACTCCCAGCTTCAGTTCTTCAATCACCACCACTGAGACCCCCTTACACAGTACTCCTGGCCTCACTTCGTGGGTCACCACCACCAAGACCACCTCACACATTACTCCTGGCCTCACTTCTTCAATCACCACCACTGAGACTACCTCACACAGTACTCCTGGCTTCACTTCTTCAATCACCACCACTGAGACCACCTCAGAGAGTACTCCCAGCCTCAGTTCTTCAACCATCTACTCCACAGTCAGCACATCCACAACTGCCATCACCTCACATTTTACTACCTCAGAGACTGCGGTGACTCCCACACCTGTAACCCCATCTTCTCTGAGTACAGACATCCCGACCACAAGCCTACGAACTCTCACCCCTTCGTCTGTGGGCACCAGCACTTCATTGACTACAACCACAGACTTTCCCTCTATACCCACTGATATCAGTACCTTACCAACTCGAACACACATCATTTCATCTTCTCCCTCCATCCAAAGTACAGAAACCTCATCCCTTGTGGGCACCACCTCTCCCACCATGTCCACTGTGAGAATGACCCTCAGAATTACTGAGAACACCCCAATCAGTTCCTTTAGCACAAGTATTGTTGTTATACCTGAAACCCCAACACAGACCCCTCCTGTACTGACGTCAGCCACTGGGACCCAAACATCTCCTGCACCTACTACTGTCACCTTTGGAAGTACGGATTCCTCCACGTCCACTCTTCATACTCTTACTCCATCAACAGCCTTGAGCACGATCGTGTCAACATCACAGGTTCCTATTCCTAGCACACATTCCTCCACCCTTCAAACAACTCCTTCTACTCCCTCATTGCAAACTTCACTCACATCTACAAGTGAGTTCACTACAGAATCTTTCACTAGGGGAAGTACGTCTACAAATGCAATCTTGACTTCTTTTAGTACCATCATCTGGTCCTCAACACCCACTATTATCATGTCCTCTTCTCCATCTTCTGCCAGCATAACTCCAGTGTTTTCCACTACCATTCATTCTGTTCCTTCTTCACCATACATTTTCAGTACAGAAAATGTGGGCTCCGCTTCTATCACAGGCTTTCCTAGTCTCTCTTCCTCTGCAACTACCAGCACTTCTTCAACCAGCTCCTCTCTGACCACAGCTCTCACTGAAATAACCCCCTTTTCTTATATTTCCCTTCCCTCCACCACACCCTGTCCAGGAACTATAACAATTACCATAGTCCCTGCCTCCCCCACTGATCCATGTGTTGAAATGGATCCCAGCACTGAAGCTACTTCTCCTCCCACCACCCCATTAACAGTCTTTCCCTTTACTACCGAAATGGTCACCTGTCCTACCTCCATCAGTATCCAAACTACTCTTACTACATATATGGACACTTCTTCCATGATGCCAGAAAGTGAGTCCAGCATCTCACCCAATGCTTCCAGTTCCACTGGCACTGGGACTGTACCCACAAACACAGTTTTCACAAGTACTCGACTGCCCACCAGTGAGACCTGGCTGAGCAACAGTTCTGTGATCCCCCTACCTCTTCCTGGCGTCTCTACCATCCCGCTCACCATGAAACCAAGCAGTAGCCTCCCGACCATCCTGAGGACTTCAAGCAAGTCAACACACCCCTCCCCACCCACCACTAGGACTTCAGAGACACCAGTGGCCACTACCCAGACTCCTACCACCCTTACATCACGCAGGACAACTCGCATCACTTCTCAGATGACCACACAGTCCACGTTGACCACCACTGCAGGCACCTGTGACAATGGTGGCACCTGGGAACAGGGCCAGTGTGCTTGCCTTCCGGGGTTTTCTGGGGACCGCTGTCAGCTCCAGACCAGATGCCAGAATGGGGGTCAGTGGGATGGCCTCAAATGCCAGTGCCCCAGCACCTTCTATGGTTCCAGTTGTGAGTTTGCTGTGGAACAGGTGGATCTAGATGTAGTGGAGACCGAGGTGGGCATGGAAGTGTCTGTGGATCAGCAGTTCTCGCCGGACCTCAATGACAACACTTCCCAGGCCTACAGGGATTTCAACAAGACCTTCTGGAATCAGATGCAGAAGATTTTTGCAGACATGCAGGGCTTCACCTTCAAGGGTGTGGAGATCCTGTCCCTGAGGAATGGCAGCATCGTGGTGGACTACCTGGTCCTGCTGGAGATGCCCTTCAGCCCCCAGCTGGAGAGCGAGTATGAGCAGGTGAAGACCACGCTGAAGGAGGGGCTGCAGAACGCCAGCCAGGATGTGAACAGCTGCCAGGACTCCCAGACCCTGTGTTTTAAGCCTGACTCCATCAAGGTGAACAACAACAGCAAGACAGAGCTGACCCCGGCAGCCATCTGCCGCCGCGCCGCTCCCACGGGCTATGAAGAGTTCTACTTCCCCTTGGTGGAGGCCACCCGGCTCCGCTGTGTCACCAAATGCACGTCGGGGGTGGACAACGCCATCGACTGTCACCAGGGCCAGTGCGTTCTGGAGACGAGCGGTCCCACGTGTCGCTGCTACTCCACCGACACGCACTGGTTCTCTGGCCCGCGCTGCGAGGTGGCCGTCCACTGGAGGGCGCTGGTCGGGGGCCTGACGGCCGGCGCCGCGCTGCTGGTGCTGCTGCTGCTGGCGCTGGGCGTCCGGGCGGTGCGCTCCGGATGGTGGGGCGGCCAGCGCCGAGGCCGGTCCTGGGACCAGGACAGGAAATGGTTCGAGACCTGGGATGAGGAAGTCGTGGGCACTTTTTCAAACTGGGGTTTCGAGGACGACGGAACAGACAAGGATACAAATTTCTATGTGGCCTTGGAGAACGTGGACACCACTATGAAGGTGCACATCAAGAGACCCGAGATGACCTCGTCCTCAGTGTGA"; Assert.Equal(expectedCodingSeq, codingSequence.GetCodingSequence()); } [Fact] public void GetCodingSequence_RnaEditInsertion_StartsCds_EndsUtr() { // NM_001220765.1, chr7:50344378-50367353 var genomicSeq = new SimpleSequence( "CGCGGCGCATCCCAGCCTGGGCGGGACGCTCGGCCGCGGCGAGGCGGGCAAGCCTGGCAGGGCAGAGGGAGCCCCGGCTCCGAGGTTGCTCTTCGCACCCGAGGATCAGTCTTGGCCCCAAAGCGCGACGCACAAATCCACGTGAGTGTTTTCAAATTGAATTTCAATAGGAAAACTTGGGGTAACTGGTGAATTTAAAAAAAAAAAAACACAGTAAAGAAAAGCGGTAAGGTTGGTAGACCCTGGTGTCGCTCAGGTCCGCCTCTCTTTTCTGAGGACAGTGAGAGAGTTCACTTCTGTCAAGCGTCTGTTGCTCTGCACTGTGCCAGCAGGTGCAGGACCAGGCCGACATGGGACACTTCTGAGCAGCCCCGCTGTCACCAGGAGAGGAGTTCTAGCTCCCAACCATATTTAAATTTATGTAGACCTACATATACCCACGGAAGTCAGCCTTTATAAAGTCGTGTGTAAAGAGTTTTCCTTATATTTGAGCCGGGAGCTTTCTTTTTATACTATAAATATGATGAGATCGAGTCTGAACTTAATTTCTGCAAGAGAGGAATTATCCCGGCTTTGAAAAGTTAGTCCTTTTGCTGACCGCAGGTTTGACGCTCAAGTCACCAAACCTTCTCAGGAAAACCCTTAGTAATATTAAGGCATCAGGTTACTTGCGGTTATATTTGAAATGTATTTTAAATATTTGTCAAGCATCGCTGCTGATGCCTAAGGAACCTCGTGAGGGCTTGTTTTTCCTTCTAATTTGGAGGCATCTAATGACCGAAAACCGTAGCGATTCCATAGGGTCTGACCAGGCACAGCTTTCAAATGCAGCTTCCCTCTCTCTAGGGACTGCAGCCCACCCAGACTGAATTTCAATGCGGTGCGCTTTGCTTAGGTTACCCACTCACAATTTCCCACTGCGCCGCAGGCAGTATATTTCAGCTTTGAGATACCTTGTTTTAAAATTCCAGACAAAATGGTGTTGAGGAAATGTCTCCTTACTAGTCCCATCAACTTCTGTTAAAAGAGGAAAATTTATGGAATTTGAAAATACTGCGTATGATATTTAAACTTTCATAGACATTCAAATGCTTTTAAGGCCAGGTTCAATTTGGTTATGAGTCGAGGGGTGGGGGGGACCCACATAGAAATGTCCTGGGTCCTCTTGAGTTTATTTCTTTGTTTGAAGATGTTTGTTCAATGAGTTTTATTGTACTCATCTTTTATATGGAATTTTAAAAAGTAACAATTTCAGTATTATTTATATTAGAATGTGTCAGAATTATTTCCGTGACAAATCAGATCATTTGGGCTATGGCTTAAAATGTACACGAGGCAAATATTCATGACAAGAAGATTCACCTTCTTACGCTGGCATCTTGTAAAATGCAGAACAAGTTAAAGAAATAATGTGTACACATACAAATAATGATGTCACATTAAAAATACTACACTATTCTTGCTTGATGGAATGTATCTGATTTCCAATTTCACCATGAACATATTTCATACATTTTTTACATGAAAAAAAACGTGACTCTTAAGTCTCACAGTCAATCAGAGCTGGTGACCAGAACATTTTATTGAACTAAATGGTCATGTTTTCTTCCCCTTTTGTTTCACGGTGAGAGTTGAAGGAAGGAGTTTAGAAACTCTCCAGTACTTGTTTAATTCATCAGTGTTCTAATTAGAGTGGTACCTCTTGGAAAACTACACACCCCCCTAATGCAGAAACATCATAGCAATAATCACCCACCCTCAGGGTCTCCAGGAGACCACAAGGGCTGCAGATAAAAGTCTGGATGTGTTAGGTTTGACCCTTTCGAAGAGTTTTACACAGGCTCCTAAAGAGAAGATCAGCTGTGGCCGTTTGTAGCCATTTCCTTTGTCGAAAAACTAAGATCGCAGTGAATGTATTAGCCAAGAGGTCTAAAGCCCTGTTGTACTGCAGGCCACTGTCTTCCTTGTTTGACTAGAGACTTGGAGTTTGAGAACAGTGGTTCTTTGGTTTGGATACATTTTTTGTTCTTGATTTGGATGTGTGTGTTTCATGCGTGGTTAATATAGCATATTTTCAATATAAATGTCAAAAATTTTGAAATAGGAAAGAACTCTCTATATATTAATGTACTTATACACACACTTCAAGATTATGCATTTATTAACAGATACATGAAATAAATTCCATGTGCATATGCACATATGCACACAGAGCGTGCACACACACAGCATGCACACAGCGTGGAGTGAGAGGCATGGGGCAGTGTGGAAGAGTTTTAACATCAAACAGACCTGAAATGAGTATTAAAGGCCCCCTTTATTTTTAAACTTTTACTAAAACAAGATGGATTTCCCTATGTTATATAATGGTGAATTTTAGGCATAAATAACGTTTTTTGAGTGTTGCATAATTGTACGTATTAATGTAATGTAACTGTGGTTAACGAAGAATTCATCAAGGATATCACTGTTTTGTGGCATTTTTTTTTTCCTCCTCTAATCTTTGGACTTGTGAAATAATTTCACTATGAAATAAATGTTGGTTCTTGTCATATTCTAAGGGAGATTGATGTAAGTGGCTCCACTCCAGCTTACAGAAGGTAAACCACGACCTTTTTGCGTTCTCTGAAAACGCTTGTCTTCCGATGCCTCTGTTTCTAAGACTGACAAGCACTCTGGGGGCACTGTGACGCCTGCTTCTAGCGGCAGAGTTGCTGCAGCTCCTGTCCTGGCTGTGAACATTGTTCTCTCTCTGGTGTCTCTATGTTCATAACTACAGAGACTTCAGCTCTATTCCATTTCATATTTGTGCTGAATAATCATTCCATTTTATGGGAGAAAACACAAGATGTAAAAGCAACAAGTGACCCATCCTTTGAAGCTTACAAGAAGAGAAACATTAATCTATTTCACGTCTTGAAAACAGATCAGTTTTATTTTGCTCAAAAAGGGCACATGTACATTTTTGATCTAGGTCTTAGAAACGTAGAGTTTCAGAGGATCAGCATTATACACACTGTCACACACACACACACTTAAAATTCAGATGAGGAACAAGATAGGAATGAGGTTTTGTTAGGGACGCAGAGCACCTAAAACCAAAGGATATCGACAGTAACAAAGCTGTTTTTACTGTAGTGCTGACTGAACACTCATGCTGGTGTCTTCATGTGGACCATGGCTTTCTTGTATTTCTTTGCAGTTTAATAAATGACTTCATATCTCAGGTTACCTTTCCACATCTCCTGGAATATATGTTTATGTCCTTAAAGTTTCAGTGTCGTCACTTTAGTAGCTTTAGTTTGAGTTTTTAAATGTTTGGTAATATTCCAACAAATATTTTTTAAGACATTATGAAACCTTATGAAGTGCCATATATTACAAGTGAGATAAAACAGCAAGCAAAAGAAGGTTTGCAGAAGGTTTTTAAGTGGCGAAGTGCGGGCCTGCCCATTTTGGTGTCTCCTTGGTGGTTACTCCTGAGAAGGGCCTGGAGGAAGAGCAACTGAGGCCTAATCTACAGGCAACTGCCAAATTGTTTCAGTTGACGTTTTTCCCTCTCATGTTTGACTATAATAAATAGGTAGTTGCCAGTGGAGCCTTCAGCCAACCACCTGGTAATAAACTGTTAAAAATGGTGCAAACCCTAGGTCACAGGTGTGGGGGCCATTTGTCTTGCCTGTTAACAGGCCTGGCCTTAATTCTTTTCTCCCATGGCCATTTCTGCCTTTGGGGAACTCACAATTCCTGTTGACTAAAAGAGCACCCTTTTCCACCACAAGCCTGACAAATCAGACGTCCACATAATTTCTGAACTCGTTTTGGTTAGGACAGGAAGCACAGGCTCCCTTCCTGTCTGTGTTTTCCTAAGAGAAAACGGTCTTCCCTCCTTTTTTGCATATTTGGCAAGTGGTTCCACCTTTCTCTGCACCCTGGTGGAGTGTGAAGGCAGCAGAGGAACCTTTTGGAGGAGGAAGAGGACACAGAGGCCCTGTAGCCAGGCACCAAGATCCCTCCCAGGTGGCTGGGTCTGAGGGGAACTCCGAGCAGCCCTAGGTCCTCAAAGTCTGGATTTGTGTGGAAAAGGCAGCTCTCACTTGGCCTTGGCGAGGCCTCGGTTGGTTGGTGAGTGCCACACGGTTTCTTTGTGTGCTTGCATGGATTGGAATAGCCATTGTGTTCTTCCGTCTTCCCTGCTGGTGTTTCCACAGTGGGTGGCCTGAGCCCAGAGCAGCTCCCCATATCCCTGTGCAGGCCACCTGTCTCGGGTGATGGAGAGCATCATTATGCTCCGTCTGAACGCTCTGCTTTCGGATGGCCCCATGCTCCACCTCCTGATAGCTCGTGGCGCGGGGCCACGGCTTAACAAATGGCTGAAAATGGGTCCTAATTAGTGGAAAAGTGCTTTCTTCATATTTTCTCACTCGAGTGTGCAGTGATTCATTTTTCTTCTGCAATCAGCTCACTGCTAAAGTAAATCTGACTCTCTTCCCGCCATTGCACACCAAAAGTTAACTCTAATGGGTAGGAGGTTAGGTTTGTTGAGAGAGCAATGCAGTAAAAAGAGGGGATCCAATGTGGTCTTGTCTGTCTGGTCTTCCTTTCTTCGTTTTTTCCTCCCTTGTCTTCTCTGTCATTCCCTTCCCTCCATTTGCCTTGCCTTTCCTGTCCTTCCCTTCCCTTCCTTCCCCTCTTTCTTTCTATAATTGGTGGGGGGTTTGCACAGACTGCCAAAACACTAAGAACTGTGTAAAGTGTTTTTGAATGGCCTTACACATATTGAAGTAGATTTTTATGCTCCATTTTTGAGATCACACACTAAAATCTATACCTTTAAAGCATTTTCTGTTAGTTTGAAACTATTTGAAAATGAACAATGTGGTTTAGATTAGAGTCCTGTTCTGAAGCTAGGAGTTCCACTATGAATATTGATTTATCAGTTTTTGACAAATTTTTGTTGTTATACCAGATTTTCACTGGCAAACCTAGAGCAAATAAAATTCCACATAAGATACTTCCCTAGACCTAATGGGAAAAATGTTTAATTTAGAGTCTTTAGGAGAAATGAGAATGAGGAATTGACCTTTTGTAAGCTTACTTCTGAGGCACTCTGAAGTGTGTTCCAGTGCTTTTAATGGAAACTAGAGAGAGCCAGCAACCCCCTAGTGTGAGCCCCACTTTTAACCGGAAAAAGTGACCTTTTCCTCCTCCTTTGTGCTGAGTTTTGCGTAGGGCAGAAAATTAAGCTGATATTCAAAGAGATTCACTGCAAAAACATATTGATAAATCGTATATTCTATTTCATTAAATTAAAACCATACTGCTAATTATCTCAGGTTGTTAAACATAAGGCAATTAATTATCATTTTAAAAGTTGGTAGGAAGTTGTGAGTACTTTTGCAGTATGAGTGTTTTCCCGCTTTAGTATGAGGTTGTGTATGTTTGCTTGAATTTACAGAATTTTCACTTTAAGAGCAGACAATGTTTTGTTAAAGAAATGAAATTTGCTAAAAAGGAGCATGTAAAGTGAAACATTAAAAATAAATAATTTCAACTTACTTAAGAGCTGCAGAAAAATCTGATTGCTGTGTTTAAAATGAATTTTCCCACATTTCGCTCTCTTATGGACAGGAGCATTTTCTGTCAGGTTATAAATAAAGACATGCCCATTTTTTGTACCCCCACAAATGAGGAAGTTGTAAGCTCTCTGAGGTTTTACTGATGAGCCCCCTCCCCCTGGGTTTGCATGAAGAGATCATAGGCCACAAATAAAGGACTACAAAATGGGGTCTAAACTATCCTGGTGGGGCCTGATACCCACGTTTCGCATGGACCTTACGATGTGATGAATGGTTTTGGCATGAGTGTCTTAAGAATGCTTCCAGATTCGGGTTACAGGACAGCCAGCGCTGAGCTCCCTATTGCAGAACAAAGTAGGAATCTAGAACTTTCTTGCTAACAGGATCCAGCTAAAACACCAAGTTAGATTCTTAAATGATGTTCTTTTCTGTCATTATTTGATTGTTGTCAGTAGCAGTAATTGTTACCAAGCCATTGATGCTTCTATTCTTCCCTTTGCCCTTCTGAGACACAGCTCATTTTGACTTCAGTGGAACCCCTCGAAGGTGGGGTGATGAGCAAGGTGAATTTTCAAAGTAAAGCTACTAAGAGACCAAACTACAATTTAAGGAACCTGATTTTTGAATCAAATTCCATATACTGTGGGTATAGTTCAACATAGATTAATTTCTTATAGTTATTATGAAAAAAATCTCATCTTGATGATAGCTGATAATTTTGTGGGTGTCGTAAACAAAACAGAGGTCAGAATTCAGTCCCTTGGGGAAAATTTCCAATTAGTAGGAAACCAAGTGGCCTACCTTAGTTTGAAGACACCCATCAGGATGTCTGCACCTTTTCATCCTCTCTGGAGGAAAGACTAAATACCCATTATTGTATATAGGTCAGGCCAAAGCAGCCTTTTATATTGCAAGGAATAAGAGGTAAATAGATATATGTGCAACAATGAATCCCCTAATGTGTTTACTCTAGAACACATGTTCTTTCTGTATTTATATGTAGATTTTGTAGATCTTGTCTTACCACCTGCTAATGGTAGATACTGTATCTAAATAAGTTGAGGAAAATTTATAGTACCTAGGAATGTGTCCTCAGTGGGCCAATCAATCAATCATGACTTCAGGTTATTTTTAATAAATATACACGTATGGGTTCATAAACAATGGGATGTTCTTGTGAAGATCTAAATAATTTTACTTCTTTGGGACTAAATAAAATATAGCTTTTGCCAAATAAACTCACACAAGCACTTATTTTAATAGAAGTCAAATGGCTTTGCAGAAACTTCAGTTTTACAGGTGCATTGTTTGAAATGTTACGGGTATACAAGTGGATTTCTCTATTATGTACAGTGTTAAGTTTGAGTTTCAAAATGTCCACCTGAAATGATTTACTTGTACGTTAAGATAATTTAACTGCTAAGAAGGCAAGATAAAGCATTCTTTGTGACACCATATGGCCTTGCTGAGGGAAAAACTTACTGTTATAAGTTTGTGTTTATCTCTCTTTTTAAAAAAAAATGAAGAAAAAAACGTTTAAAATAATGGGAACACAGCAGTTCCTGGGGTCCTCTGTCTCTTTATCTTATTATAGTAAATTACCAAAAAAATAATGACCTGGGGCATGTCTGTGTGGACCCTTCTTTTAGAGGCAGTTTCTGTGTTTTGTAAAGCTGTAGGTTCTATTTTCATTGCACTTCATATTGCTGCACAGCTCCTGACCATGCATGAAGGTCCTCTGAAATCGGTAAGAGGGCAGAAGAAAATGATTCTAAACTTAGATTTTTTTAACTTAAGTGATGAAGTGTGAAACGCCATTTATATTTGAGGAAGCTACCTAGGAAGTGGCTCATGTCGATGGCCCAAATCAGAAGAGGGCCTGTAAAAGCTTCTATCAATTTTGACTGTGTATGCTTCTACCATGGCGGCTCAATAAACAGCAGTATTAGTTTAAGAGTGGATGGTACAGTAGTATAGACGGGAAGCCTCTCCTCTCCGTGTGAACCGTGCACCCCTATGAGAGGGTAGAGACAATACAATATGCCTGTAACGTCAGGACAGACAGTCATGGCCAGCTTGAACTCCAGCCCTGGGCTTCTTGCAGCAACAAACGTGAACACAGAGGACTGTCTCCAACTCCACTTTCTCTATTTTTAAAACAACTTTTTGAATACAGTATCTGCCATCTTTTCTTATACCTCACTTTGAAACAGGTGGCTCCACTGTGGCATTTAAAATGTTCTGTTTCTTTTCCCTCTGTATCAAATACCTCTTTACCAAGAAAACATTCAAACAGCATAGTTTTTAACTGTATTTTGAAAGGTTTCCTTAGTTCCCTTTGACCCTTCCTCTTTTGCATATCAGTTCCTGGCCATAAAAATAAAAAATGCTAGGACAGAATTGCACATCTGAGCTGATTTGCCCTCAAAAAGTTTCACAGTGGAACAAACCGCAGGAGGAGTTTTCTGTGGCTCAGTTAAATGTCGGGGGAGGGTGGTGTGAAAGCCAAATTGGATTCCTGCTTTCCTGTTTAAATCTTGTTTTTCATTGTTATTTGCACCAGCAATACTCTGTGGAATAATCATGAAAATGTGTAGATTGGCAGCTAATTTTTGAAAAATGAAAAGAATCAGAAATGAAATAAGAGTGCTCGGAAGTTTTTATGTTCTCTCAACCTGTTTTGTCAAATTGTTACGAAAACCTATAAGGTCTCTTTGACTAGATACAAAGACTTTGCACATTGCCTTAGCTTTCTCTTGAAGCATTTCCTTTTTTAAAATACAGTGTAATTCACAGTGATATGATAGATTTGCAAAAGTAAAATCTACCAGTCTGAAGATGAAAGGACTTGTCTCTTAGCAGGAATAATGGGTTTTATTAAAGAGGTCTGTGACCTAAGGCATTTTAAATAAATTACAGGCTTGGTCCCTGTCTCCCCCATGTATCTACTCCCTTCAATATAAGCATCATTGAGTATTTAAGGAAATAACCCCAAATGTAACTCTAGTGTAGCTTCACTTGTCAGGGAGGAAAAAGTAAATAGCATACATTTGGCCAAATAACCAGAACTTTACTGTAGAAGTTTTATGATGAAATTTGCCTTTAGTGCAGAGTATTACAAAGATCATGTTTAGTTTCTAGCAGTATATAAGTAGCATCCATCCTTATCTGTCATGCATTTGGAGTGTGCGACCCCTGCACTGGGCTGCAACATTCTGATGGGCAAGAGTGCTAGGGAGAAAGAGGCATCACCATCAGACTGCACGGGTTCAAGTGTCAGCTCTGTGGTTGATTAGCTGTGTGACCTGGGGAAAGCTATTTCTCTTAGCCTTGGTTCTCTCATCTATAAAATGGAGATAATGATGCAGATGCCTTGGGTTTAATTGGGAGAGTTAAAGACACATTTACATATTTAGCAAGTAGGTGTTGAATTCTAGCTCTACATTGGACACTATGCCAGGTGCTCAAATAAACAAGTGGACAAGACAGACAACACCCATGGTCTTATGAGGCTTAACCATTTGCCTCTTCAATGCCAGAAACTTAGTAGGTTGATTAGATAAAGCCAGTGAGTACCAGTATCCTTTTCTTTGCAGCCTTTTCCTGGCACACTAAAAATACTCAGTACATATGAAATATCACTGGACAAAGAATCCCCCTTAGAGTACCAGTGGAGAAGGAAGGCATTTGCTTAAAAGCAAACCAACAGAAAGACATTGTAAGGCAGTTGTTTAAGTCTCAGAGACCTATAATTTTTTTCTTTTTTCTTTTTTTTCATCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCACAATCTCAGCTCACTGCAAGCTCCACCTTCCGGGTTCATGCCATTCTTCTGCCTCAGCCTCCCAAGTAGCAGAGACTACAGGCGCCCGCCACCACACCTGGCTAATTTTTTGTATTTTTAGTGGAGACGGGGTTTCGCCGTGTTAGCCAGGATGGTCTTGATCTCCTGACCTCATGATCCGCCTGCCTCGGCCTCCCAAAGTGCTGGGATTACTGGCATGAGCCACCACGCCCGGCAACTACAATTGTTCTTAAAGCTTGTAGAATTACTGTGTGCTACCAACAGACAGGCTAATTTTGAGTGACCCTCAGTACTTTGTACAGTTAATTTGGCACGCTGTGTACTTAGTGGCTTTTTAACAGCTATAAATTTGGGCTGCTAGAAAAGTAGTAAAGTTGTGATTCTTGACAGGCATCTATCTGCATTTTCATTTTTACTTCATTTGTCTAGACTCAGCTTGTCAGAATTATGGAAGAGACTCCTTGTGTCAGGGCAAGCACTGTGAAGAGAGGTATTCACTGTCAGAAAAGAGAGGGGAGCTGGAGGCAGCTCAGAGGCCTGAGACCCGCCTCCACAGGAGCCCCAGCAGGTTCGGTGGAGCTCTGGCCACACTCTCCTTTGGGATGCTGAAGTCAGAATGAGTTCACTTCCCAGCCAGTCTTGCCAAGGCTCCTCACCTGGAAGCAGCAACTGCCCAGGGCTGTTGGATGTTTCTCCCCAGGGGACAGCCAGGTCCCAGTCCCGCCTCGGTGTGGAAGGAGGAAAGGCAGGGTCCAGGAAGCTGTTTCAGGACAGGCCCAAGGTCCCCCAGGGATGCCTTTCAGGGTCAGCGGAGGCTGTAAATCAGCAGGGCCCACACGGCCTGGAAGAGGCCCCTGTGCTGTCGGCTTGCCCGGCTTGCCCGGCTCCTAGTCCGGCTTCTGCTCCTCCTTTGTAAAGTTATGGATATGCTAATAGTTTCCAACTGAGACTAGGAAAGTAAGTCCTACTTGACACTGTTTGGTCAGAAAGAGGGAGAGAAAGGAGAAGGACAGAGAGAGACTGAGAGAGAGACAGTCTCAGACAAAGGGAGACGGAGGGAGGGAGGGAGAGACAGAGAAAGAGATGGGAGGTAGGTGTGGGAGGAGGGAGAGATGCAGAAGGCAGAGGAAAGACAGACAGAGATTTAGACCTCCCAAGTCAGTGAGCAGTCCAGAGTTGGAGTGGAGGGTGCCTGGTGGCTTGTGACTGCAGACTCCACTCCCCGCTCCTAGAGGCACAGCCATGGACAGCTTCTGTCACGTTGGCCCTGCACTTATCTCTGCATCTATTTCCCCTTGTGCAAGATTCAGAACTGCATGCTCCAAAAAAACAATAAAAGCATTCATGTTCATAAGAATTGCACAGGTAAAAGGTAGTTTGCTGATATTGTTGTATTTTTTACTATCGCTTCTTTTAGGTCTTGCCTGAAATTGTTTGGGTTTCCCAGGCAAAGTAGAAAACTGCGGTACGTTTCTGTGAAATAATTATTCCTTCTGGCATCTCCCTTTACAGACCTACTGATCTTGATTTTTCATTTAGGTGAAAGTTTGTGAAAACATGCCATTAGCTTGCTTTGTGATTAACTCCTTTTACTGAATGTGAGCTCCTTTTAAATTGAGGCCATATCAAGCTTAAATTCCATATTTTACCCGGCACTCTGCATTTCTTCCATGTGGGAGAGGAGGGGCTCAGTAAGTGCTTTGTAAAATACACAGCCGAAGTGATGCACGTGCTAACAAAGGAGTGTGACAGGACTTAAGTGCCCTTCTAGACACTTCAGGCTCCCCTTTGTAAGCTGTCTTGGAAGAGGCCACATTTCCTTTCCCTCAAACAGTTTCTCATTGTTTGATTATTCTTTTAGCCTTTCTCTGGAAGCAAAGCCACTTTTACGAGAAAGTCACTGCTTTTTCATCTCAAGAGATGCAAGTTTGGAGTTTGGGGAAGTTTTCAGGTGCCCGTCAAGTCATCCTTTATGATGTCAGACGAGTCAGGCCACAGAATTCACAGGGCTCAGTGCAGACCGAAAACTTGAGGCCTCTTGTTCAGAAATTATTAAAAATTTTGGTGAACATCACCCCAAGCAAAGAGATCCCCTAAGCACCAGCCCCCAAGCAACTGCACTCATAAGCCCATGAAGCCCCCTGCTGTCAGAAACAATGTGGTTGAAATTGTGTATGCACTTGGAAGTGAGATGGATTGCAAAACACAGGTCTCCATGCTGGGGCAGGAGTGGTGATAGGGCATGGAGTGGAAATGTCCAGCAGGCCCACGTGCGAAAATGCAGAGCTCTCTGGCTCTTGCAGACTTGGCTGCTGACAATAGACGCGCTCCAGGAAGGTGCTCGCTGTGGTGTGATCTGCTGCCCACCCCTAGCTCCCTCCAGGAGACTGGTGCGGGGACTGTTTGCAAATGACTGCAAAAGTAAGAAGGTTCCCACAGAGCAGAGCTTGATTTGGGGACCAGCCGAGGGCAGTTTGTCAGGATTCCGGCTTGAAACTGTTCTCACATCTCACCGCCTGAAAGGACGAGTGTGTCCAGAGGACTTAGCATTGATCACCTCTGTCTCCATGCAGCAAACTCAGAGGCTCAGCCCGCATTCCACTGGAAGGGCGTTTGCCAGTGGTGTTGGTTGGAAGAGCCTTGACTTTGCCTTAGGAAACATCTTTTTTTAAGAATTGAAAATAACTTGAGTATGCAACAGTAGGGCATTTGTTATATAAATTAGTTGACTAGTGTGTAGCCAGTAAAATGATGATGGTGGTGTGTATTTGTTAAATAAAAAGATATGTGTGGTATTAAATTAAAAAATATTTTAAAACAACATATTTGTAATCTGTTTAGTGTCCTCTTTTTGTAAAAAGTACAGAAATAAATATACAGAAAAAATAGTAGTCCTAAGTGGTAGAAATTATGAGCATTTTCTTGCCTTTAAAAAAAGTTGTAAAAGATTGTATCATTTATGTAGCAAAAAGTTTTAAGTCAGCATTCTAAAAATTTCGTGTTGTTATAGTTGCTGTGACAAGATTTAACTTCTGTATGCTTCACCAATCAATACAGAGGTATTTAAGACCCGGTGTGTGATAGGCCGCGCTAAAATACTATACACATCTTCAGAAAACTAGAGAACTAACTTCTAACTTCCTATATTAGTGTGGCACGGCTGTTACAAAGATTTTTCTCATTTGAGTCTATCTTGCTTCTTTATCATTGTTTTGACAGTTTCAGAAGAATCGTGGCTTTTCCCCTTTTTTACAGTAAAGGTACCTGAGACTCTTGACGTATTGCTTTTTGGAAATGCTTGTGCTGGTCACATGCTTGCATCTGGGCTAGTGTGTCTGGCTTCCGTGTGCTGGTGGATGCTTACTCTGTTTTCTGAAATACTTTTTCTGTACAGTGGCCACTAGCTGTACTCCTAAGCCACACACCTACCTTGAAAATTCATGTCACTTTTAGAAATAGATAAAAGCCCCTCCCATCCAGAAAAAGTGACTATCATGTATATCCTCATCATGACTAATACTGATATTCCTGAAATTGAAAATACATATTCCATATGTACCATAAAAGGTATTAAAGATATATGGAGTGATAGATATATTATATATAACACTTCTACCCTCACAGTTTTCAGCCTAATTGAGAGGGTAAGATCCCTGAATCATCCATCAGTTTTTCAGGTCTCTGCTGAAAGCAGGCCACAGCTCAGATCCACACATCTGAACCAGAGACAGAGGTGGCCAAAAATAAAAAGGGGGACAGGGGGACAACCTGGTTTAGAGTCAACAAATAGACTGCATTTTCTGGTTAGTGAAGGAGCTCTCCTGAAAGTCATATACCAGAGCATAAATGAGCAGATTTCCTTGAGGTCACCTTCTGCTGGCCATAGCTTTCTTATCTGTGGAGCTGCCAGCTGTCATCCACTTTGGGGCACCTGAGACTGCCGAGCGGCAGGCCAGGACCCAAGTGCGAAAACACAGAACACCTTTTTGTTTCTACTCCACTGATGCTGGGGTTCTCTCCCTGGTGTTTGTGGCTCGTAGTACACTCTGTGGAACATTCACTATGGTCATCGAAGGGCAGCATCTTCCCAGTTGTTTCTTTCTTTTCTTTTTTTTTTTTAATTTAAACCGATCTGAGAAGCCAGCCATCTGTCAGCAAAACAGGAAGGCTCGGGCTGTCTCCTGGGCTCGTTTTGCTGCCGTAGTGAGCGTCACTTCTCCCCGTGTAAGAGTGCTGGTGAAGGCTGAGGCAAGGGCCCAGAAAGATTGAGGGACAAAGACAGGAGCGCCCGCATTGCCCATCTGCCAGGCTGGAGGTGTATTCATTATTGATGGAGGTAGTGCAGTTGCTGCTCAGATATGCAGCCCTGCCTGGGTAAATGAGACATTCTTCAGCAAATTGCTTCGTTTTTTGATTGCTGATTGTACGCGTGTCACCAAGCTGACTCAAGGTTCATCGATGCATGCTCAGTAAATTAGAAAGAACATAACTATGGATCAGCCAAGAGAATGAATTCTGTGCCTACAATGACCCAGGGCCATTTAATTTTCTGCTTAATTTTGTTGCAGTCAGTTTGCATTTTGGGTTATTATGCAGTAGGAAATTAACAATAAATAACAAATTTGGTCCTCCTGTGCTTGTAATGATATTTTTATAAATCTTTGTAATGCTGTTTTTAAAAGGATCAAGGTCTGTGCCAGTCTGATACTCCAGCAAGTATGTGAGGAGGAAAATGCATTATTCTTGCTAGATAACCTTGTTGTTAAATAGCATAGGGGTTCTTTATCTCTCTCTCTTTCTCATATCTTATTAGTATTTTTGCTTTAAACTAAAATCCCTTCCTCTCTTTCTCAGATAACCTGAGGACCATGGATGCTGATGAGGGTCAAGACATGTCCCAAGTTTCAGGTGAGACCTTATGAGATAGCTGTGTGGGAAGTTCATGAGAAAAGCTTCCCTGGGGCCGGAAGTCACAGTGCTTGGTATGCTCATGGGGGAGGAATAGGGGCTATTCTGCAAAAGAAAAGACCATGATGGAATTTGCCTGAGTGTTTCCTTCACCTGTTACAAATTATCTCACTTTGAGCTGAACAGAAAGCCTCCAAGATGAAATTAGTTTTACTGTTAAACTTCAGGAAAAAAAAACGGGAAGAGTTAAATACATTTTTGTACTGTTGGAAGGAAAAATGGCTGATTGGTTTAAAACCCAAACACATGCCAATGATGGTACTTAAAGAGAGAGAGAGAGAGAAGCTTGAAAAACATAATTGTTGGGCACAGTCATGACTGTTTGTTCATTAAGCATGGACACAACATTGCTCCCCTTTGCCATATATCTTTTCAAGCCGTATTGGATATAGCTCTTCTCATCCAGGAGACCCAGGAAGTGGAGAAGTCTGTAGTAGGAAAAGCCTAAGGGTAGGTCACAGACTGTGACCATTTGGCAGCACTGAGGGTGGACGGCGAGCCAGTCCAACAAAACCGCACAGTTCCCCAGTGCATGGACATAGGAAGACAGCTTTCTATCTGGCCCTGTATCCAGAGGCGTCAGCCCCAGTAGCAGCTTTCATGGACTTTGGGGTTTTCGGTATTTCATATTTTTGAGCCTCACAGACTCACAGCCAGCCCCAGAGGCTGACTTATATTTGAGAAAGTTCTCAGTGGCACCTTGCCTTGGCTGAGCGCCCTCGTGTTTTGAAGTTTCTATGGGATTCTACAAGTTGGTGCTCCTGATGAAGACCAGGACCTATGTGTGGCTGCTCCCCTGCTTGGTGGTTTCCCTGGGGAAGGTGCAGGAGAGGATCTTCTGAGTTCCATGGAACTGGAGATAGATCTGCCAATCACAGGCTTCCTTCTCCACCACTCCTCAGCCGCTCTATTCATGTTTCAGATTTTGGACTTAAACTCTCCCAGGTGCAAAGAACAAACAAAAGGCTAGCTTATTTTTCTTTTAGAGTGAGGCTTCGTATTTATTACAATATAATTGCCACATTCTTTGTGTAATTCTCACATTTATATCTTAAATATAATTCTCATGAATGAGAATTATATAATTCTCTTTTTGTATATCATTGAATATTTTCACTTAATTTTTAATTTTTTTAATCGTCACAAAATAATTGTGTACATAGACACAAAATAATTGGGTACATAGTGATGTTGTGATATATACAATGTATAGTAATCGGATCAGGTAAATCAGCATATTCATCATCTCAAACATTTATCGTTTCTTTGTATTAGGAACATTCGACATCTTCCTTCTAGCTATTTGAAACTATATATTATTGTTGACTACAGTCATCCTGCAATGGTGTAGAACACTAGAACTTATTCTTCCTACCTAGCTGTAATTTTGTCTCCTTTAACAAATCTCTCCCTATCTTCCACTCCCCCGACCTTTCCAGCCTCTATTAGCCTCTGTCCTACTTTCTACTTATAATGATGACAGCAGCATTTGTTAGTTTCCACATGTGAGTGAGAACATGTGGCTTTTTAACTTTTAGAATGTGGTATTCAGGCACTTCATGGTACAGTTGGTAAAAGTGAAAATGTGTCCAAAAGTTTGTGATTATCTATATAAACAAAAATGGTATAAATACAAATATCAATTTTGCATTGAAGAACTTACCTTAGAGGTATATTCTCACAAGTGCACAGAGCATTTAAGCATTTGTTCACTGCAGCATTGTTATCAGTATTTTAAAACTATGGTACATCCATGTACTTCCACATACAGCTCTTAAAAATAAGGAGGATATGAATGAACTAGTATGAAAAGAAGTCCAAATACATGTGAAAGTGAGAATAGCATGGTTCTGGATGGTATGCAAAGTATGATCTCGTTCTTTTAAAAGAAAATAAATTACATACACATACATATTTTCTATATGCTTGCCCATAACGTTTAGGAAAATTCTTGGGTGATATTTATTAACCTGGACTTCCTCTTGGAAGACTGATGGTAGAAGGAAGGGGACGAGTTAGGGAAGAGGAGGAGAAGGAAAACTTTGCTTTTCATCTTCTACCTTTTAGCATTATTTGAATTTATTTTCCTTAAGCGTTTACTTTGTTTCGTAAACAAAAAAGCACAAAAACAAAAAACGAGTTAAATGGGAAAAAAAGCAGTTTAGCTCTTTATAGCCTCTCATTTGGCTTCGCCAGCCTCTCACTGCAGCCTCAGAGAGCTGGTCTGGGAAACACTGGTAGATGAGGACTGTAATCCTCACTCATGGAAGAGGATCTCATTCACTGGGTTTGCTGACTGTGACTAGAAGTGATTAGGGTGTCAAAAAACCCAAGCATGTTAAAAATTTCCAGAGGCCAAAAAGATGCTTTCATTGTTCTGCTCTTCTTTTCCTTGTCGCTTTCACTTTGGGTAGCTTCTAAATTGGTATTTTGCATGGTGCATTTAAAGAAAATGAGACCCCTTTGGCCAATGCAGGAGTCTACACTCTGATATTCTAGAGTCAAAGCTGAATGCTGACACCTAGGAATTCATCTCTAGAATGTTTATATAAGGAATAGCCCCTCAGTATTCCGATCTCGTATCTTAGTAACGAAACTAACAAAAGCCTGATTCTCCTCTGGTAGTTTTCTTGTCTTTACCATAATACAAAATAAGTAATTTGTTCTGCACCCTGACTGTTCAAAGGATAGGGTAGCTGGGGGCGGGGACAAGAATGGAGACCTTATTACATAAGACTTCCTGAAAAAGGAAACTCTGTTTTTGTTTGAAATGATTTGGTCTGAAATTTAGTTTGTGTACACTTACCAAAGGGATTCCTATTTCTAAAACACTCATACTGCTTTTGATTCCTGTTAACCTTTGAGCACTCTACGTAATGATGAGAGCACTTAAAGAGTCATGTCACTTTTAGTAAAGAATCAAAGGATACTTTTTCTACTTCTTCGAGTTTGATCTCTGCTTCTCCAGTTAAAACCAGTATTTGTTTTTTTCATTTCTAAAGTTGGAAGAAATGACAGTTAGTTATGGCATAAGGATGTACATTTAACCAAATAGGAGTTGACATTCTTGGTAAGAAATCTTACCAAGATTATGTTATAGATTATAAGAAATCTTAACAAGAATATGTTCCTAAATCATCCTCTTTTCCCATAAAATATTAAAGTATCAGCAATTTCATAGGATTCAACCTAATGTATGCGAAATGCTAGATAAACAGATAAATACTTAATATCTGGCTTTTTTTCAAAGCACTGGGTTATTTGTTCCTTGAGATTTATCCTAAATGTGGGCTATACCCTGGTTTACAGTGTCTCACAGATGTGTAGTAGTAGACACTCCATAAGTGTTTACTGACTTGAATCCACAGGGTACTGAGAAAATGCTACTGATAGACTTGGAGGAGAGCATATCTAAAGCAAGCTACCCTTTCCTTTAGGGCACGTCTCACTAATTCTTTGGGTAAAGCGTATTTTTCTTCCTTTTGTGTTTTTGGCAGTCTTTCCAAAAATACGTGTTATACCTATGCATTATTTTTTGGTTTGGTTTCTAAAGAAAGAGTCAGCCGGTGGGAAAGTGAAGGATGTGGGAACTGAGAGATCTGCATCAGCATCCCACCTCTACCTCCCACGATGGGACCTGAGACAGTTATTTTTGCCTCCTGGACCACTATAGTATCATCTGTAACAGGAGGGACTTGAGCCAGTTGATCTCTAAGGTTCCTCTGGCACCTGTGACCCTAAATAGATATTGGATATTGGTTTAATGCTATTTGTAGTGTGTTTTTTTGGGGATATGGAAACCAGAAGTTTGTTTCCATAAACATAAACATAAACTGTATATATCTAAAGGATATGGAAACCTTTAGATATATATAATCTGCTTACGTAAAGAAGGTTTGTATATATTGCAGTGTCAATGGGAATATTTTATCAAGTTAAGCATAGTAAATCACATTGATTAAATGCTTTGTATTTACCAAACATTACCCAAAGTGTTTTCTCCTTTCAACCTCACAAGGACCCACAGAAGAAAATACAGTTATCATTTCCAACCTGCAGGGAGCTGAGACACAGAGAATTTAAGCAACTGACCGGAAGTCCAACAGGGAGTCAGAGATTGCTCTGGGGTGTGATCCCCACTTGGACCCTAGAGTGGAAGCTTCTCCACTACTTTATAGAGTTGAGATTCTATATTTTGAGCTTGTATTTACCCAGAGAATTATATCCTCTTGGGCAATTGTGTATAATAAAACCTCATGCATTTAGGAGAGGCGGGATGACAGAACTTTGTTGAGTGAATTATAATCTACTTGAGAAATTATTTGCTTACATTTTATAAGCTAATTATACCATATCTCATCCAGTTTTCCCAGAACACTTCTCATAGGTAATGCTTTATTTGAAACATAGGCCATAGGTAAGTTAAGTGTAAATGTGTATTTTTATAATTTAACCAGAAGTTTATTTCATTTTTCTAAATAAGTGAAATTGTATTGCATCTTCTAAATTATTCTATTTAAACACTTGATGTCTTGCTGTCTCCGTCTCTGTGTGTTTGCATGTCATTGTACATGTTCTTAGGAAAAGTGTGGGAGCTTGACGCAATATATACCTTATGTTTCTATGTGCATATAGTTTACCAAATAATACCATAAGTTTACTTAGCATATTAGAATCCATGCACATTATTTTTATTTTATCTTCACCGCAACCCTGTGGGATAGACCAAAATCATGCTTTTCAGCCTCCTTTTTCCACTTGAGGAAAGGAGTCTTAAAAAAGGGACCAGTCTCATGTTCCCATTCGTCTTACAACTAATTGGTCAAGCCAGAAAGCCAGAACTATGTCCTGGGTCACTAACTCCTAGTCACTGTGTGTTAGTATTTGAGATGCCTGTTGGCTTGATTTAGTCATTTATTTTTTAGTGTTTTATAATCCTTGCATACTTTTACATTTTAAATGGTTAACCAGGCAAATTGGTTTAAAATCAGTGCATAAAAATACTGTGCCTATCATGATGGGTTTCATGAAGTGATAACTTTTCATCATGGAGATCCTCAGCTGTCACAGAAGATGAGGGGCCCTGGGTACAGAGGCTCACGTGAGGGATGAAAGTCTCAGCAGCCCGGACTTACACTTTGGGGCTTTTAGGCAAATCAGACAACCTCTTAAGAACTATCACTGAGTTCAGGCAAGGCGAGCTTGAATTAACACAGGGCCCTTGGTGGGCATGTGAATATATCTCACTTCACTACCATCCAGTTCTGACTCTTTACTAGATGCCCCTGTACATACCAAGACTGATTTTTTATTCTCCCTTCTCCCCATGTGGTTTCTTCTGCATAGAGAGTTCCTATTGATCAGTCTGACCCATGGTATTTTAGAATTGCGATCCCTACTGTTTCATTATTCCTTTTTCTCCCCCATGTTGAAAAAAATAAATGTCCTGAGATGCAAGATCAGGGACACTGGAGCACTGACATTTAGTTCAGTGCAGGAACTGAAGGCAGATGTAATTCTTAAGAAGCGTACCTGTTATTATGAACCATCCTCAACAAATTGTAGTGGATCTTGTTTTCTCATAGATACAGCAGTTAAATTTTTTAATAAAAGTAACTAAGAGTTATTTGGATGTATTTTAGCATGCACTGAGCGGAAAGTACGACATTTCTTCATTGGGTAAGTCCTGATTCTTTATGATCCTCACTTGGTTCCAGGGCCCCATGCATCTAAGGGTGTCTCAGAGCATCCTGCAGTGCTCCAGCATGATCGCAGGGAAAAGCTATAGGAGGAAAAGAGTCAATAAAGTTTAGTTTCTCAACCTCCCACCTCCACCCCATAATAATGACAGCTGGTTAATCATGAGACGCGTGCACACCCCACACGCCCTGTACATGTTTACTCATTGGGATAGCATGTCAGGCCAGAAGGCTCCATGGTCATTTCTATGAAGGTACTTTAGCAGGTCTTCAAGAAGGCAAGTGGCCTGGGTCCCTGCCTCCCCAAATTGCAAGCTCCCTGCTTTATGTAGGAGACCTATGTGTATATTACAGTTCTGTGTAAGATTATTTTGTTATTCTTACCCCCACACCCACCCCCCAACCCCCCGCTGCCACCAAAAAAAAAAAAAAAAAATTCCTCTGACAACCTTCATAAAGTCCTGGGAGTTTGAACACCATTGCTCTAGGAAGTCATCTTATACAAAAATAAGAGTTGTGAGGTGGTTCATATACCTCCTGCGTTCTCCTATTTGGAGTTTTTCCCCATTTATGAAAGAGGTGAAAACGCTAAGATATTTAGCAATTATTACTTTAAACATTTTCTATTTATAGGCCGGGCGCAGTGGCTCATGCCTGTAATCCCAGCGCTTGGGAGGCCAAGGCAGGCAGATCACGAGGTCAGGAGATCGAGACCATCTTGGCTAACACGGTGAAACCCTGTCTCTACTAAAAAAATACAAAAATTTAGTTGGGCATGGTGGGGGATACCTGTGGTCCCAGCTACTCGGGAGGCTGAGACAGGAGAATGGCTTGAACCTGGGAGGCCGAGCTTGCAGTGAGCCAAGATCGCGCCACTGCACTCCAGCCTGGGTGACAGAGCAAGACTCTGTCTCGAAAAAAAAAAAAAATTCTATTTACAGCAGTGAAAATAGTAGTGACTTAATGCACATTGCCAAGGCTTTAGCATAACATGAACACTTTCACTCAATGTCTCTCTGGCCTTTTGTTTTTCCTTGGGAAATTCTTATAATCCTGCTCCGTCTTTAACTATTCATTTTGTATTGGCTATCCAAATATACCCAATAATGCTCTTTCTGAAAATATGCCAATTGTGGTAATTACAGCTAAGCTGGAATATTAAATTGTGATGTCTGTTTTCCAGAGAATGAAGTAGTATTCCCCAGAGCATAGGCTTGGTGCCTGTGCAGGTTCTATTTTAAATATTCCAGGAAGGGTTGTTTTATATACTGAGGATGATTTTACTGGTCTTGCCAGTCGTCTGAAATGCTGGTATTACTCTTGTGGAAGGTTTATTCAAACAAACAAGGACATTTCACACAATACCTAGTCATGTTTTTCAGACATTTTAATGTTTGGTTCATCATTTGCACACACTCTCAAAAATCTAGGTTTGTCTATGTGTTCATATCATTTTGCCTGTTGCCAGCTCAGTCAGCAGGCACACTCTCCCAGGCTGTTGCTGTTTTGTTAGACTTCTTCAGGACCTTCATCTAAAATGGTCTTCCACACGTAGCTATACTGCATAAGTTCACATCATCTGTTTCTTGCATGTGGGTTGTGTCTCAACTCAAGTTTAAGTTAGATTTGGAAGGGCGGAAACTATAGGAGTTGCAGCTTCAGTGGAGAAAAGAGCATTTCCTACTAGTTATGGCTTCCCAAGGAAGGTTAGATTCCTCAGAGTAGGAGTGATTCCCCAATGCTAGAACCTTTGGTCAAATATAATTCTAATCCAGTCAAAATAAATACAGGTATTCTGTAAAACCCGATTTCATTTTGTAAATCCTACTTTGTATAGTATAAGCAATTTTTGTATTTGTGTGGATTATATTTTATTTTCCTATTTCAAAGAGAAGAATTTGTATTAGCAGACTCCCTTTGCATGCGGAGAGGGGATCATTTTCCCAGTAGGCATGGGGTTCCCTTCCATTCCTTGTCCAGTCTTCTTTTCCCCACTAAGTTAAGTCAAACTAAGCAGCTGGTAAGATATTCCCTGGTTCTTGCAAAGAAAGTGAGCAGATGGCAGAATGTATAGCTCTAAGCAGAATACCTGGTGTGGTATCCTCAAACACAAATTGACAGGAGGGTGTGGTGTGGCAAGCTCATTGTGGGGGTAAATTGGAATAAGCTTACAGGGGGAAGAGTTGACAAAAGATAGGAAGAACCTTAAAAATATAGATGCCTTTTATGCAGTGATAAAATGTCTAGATATTTATACTGTGGTGATTATTAGGAATATGTGCAAAGATTGGCTATTAGGATGTTCATTACAGTGTTGTTTAATAATTATAAAAGGACAGAAAGCAATGTGGACTCAAAAATAGGAAAAGAATTTAAATAAATCCTAGTGTACCCGTTATACATGAAATTATGGAAATATGACCCTGAGCATGGAAATATGTACATGAGAATGTCTAAAAGCTAGTTCATTTTGAAAAACAAAATAATGTCACCTCATATTATTTATAGTATATAAAGATGATTTTAAGAGTGGCAGTGTCTGGGATTATAGGTGATTGTATTTCTTCCCTTTTGCACATCTATGTTCTCTCATTTGTATTGTGTGGGGAGAAGTGACTTTTTTTATAAAAAGAAAAAGGTATATGCATCCCAGCAGAGAAGCACTGGCTCCACCCAGTACCTGCCTCCTCATGCCACCCTCTCAAGCCAAAAGCCGGGGGAAGCCCAGGCACCTTGACCATGACCGCCCGAGACTCACACTTCTTCTTTCTCATCAGGGAAGGAAAGCCCCCCTGTAAGCGATACTCCAGATGAGGGCGATGAGCCCATGCCGATCCCCGAGGACCTCTCCACCACCTCGGGAGGACAGCAAAGCTCCAAGAGTGACAGAGTCGTGG", 50344378 - 1); // the stuff from the genome represents a small part, most of it is brought in through the RNA-edit insertion var codingRegion = new CodingRegion(50358658, 50367353, 169, 1602, 1434); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 50344378, 50344383, 13, 17), new TranscriptRegion(TranscriptRegionType.Exon, 1, 50344384, 50344518, 19, 154), new TranscriptRegion(TranscriptRegionType.Intron, 1, 50344519, 50358643, 154, 155), new TranscriptRegion(TranscriptRegionType.Exon, 2, 50358644, 50358697, 155, 208), new TranscriptRegion(TranscriptRegionType.Intron, 2, 50358698, 50367233, 208, 209), new TranscriptRegion(TranscriptRegionType.Exon, 3, 50367234, 50367353, 209, 328) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(1, 0, "GAATTCCGGCGT"), new RnaEdit(6, 5, "A"), new RnaEdit(16, 16, "T"), new RnaEdit(97, 97, "C"), new RnaEdit(316, 315, "CCAGTAATGTTAAAGTAGAGACTCAGAGTGATGAAGAGAATGGGCGTGCCTGTGAAATGAATGGGGAAGAATGTGCGGAGGATTTACGAATGCTTGATGCCTCGGGAGAGAAAATGAATGGCTCCCACAGGGACCAAGGCAGCTCGGCTTTGTCGGGAGTTGGAGGCATTCGACTTCCTAACGGAAAACTAAAGTGTGATATCTGTGGGATCATTTGCATCGGGCCCAATGTGCTCATGGTTCACAAAAGAAGCCACACTGGAGAACGGCCCTTCCAGTGCAATCAGTGCGGGGCCTCATTCACCCAGAAGGGCAACCTGCTCCGGCACATCAAGCTGCATTCCGGGGAGAAGCCCTTCAAATGCCACCTCTGCAACTACGCCTGCCGCCGGAGGGACGCCCTCACTGGCCACCTGAGGACGCACTCCGTCATTAAAGAAGAAACTAATCACAGTGAAATGGCAGAAGACCTGTGCAAGATAGGATCAGAGAGATCTCTCGTGCTGGACAGACTAGCAAGTAACGTCGCCAAACGTAAGAGCTCTATGCCTCAGAAATTTCTTGGGGACAAGGGCCTGTCCGACACGCCCTACGACAGCAGCGCCAGCTACGAGAAGGAGAACGAAATGATGAAGTCCCACGTGATGGACCAAGCCATCAACAACGCCATCAACTACCTGGGGGCCGAGTCCCTGCGCCCGCTGGTGCAGACGCCCCCGGGCGGTTCCGAGGTGGTCCCGGTCATCAGCCCGATGTACCAGCTGCACAAGCCGCTCGCGGAGGGCACCCCGCGCTCCAACCACTCGGCCCAGGACAGCGCCGTGGAGAACCTGCTGCTGCTCTCCAAGGCCAAGTTGGTGCCCTCGGAGCGCGAGGCGTCCCCGAGCAACAGCTGCCAAGACTCCACGGACACCGAGAGCAACAACGAGGAGCAGCGCAGCGGTCTCATCTACCTGACCAACCACATCGCCCCGCACGCGCGCAACGGGCTGTCGCTCAAGGAGGAGCACCGCGCCTACGACCTGCTGCGCGCCGCCTCCGAGAACTCGCAGGACGCGCTCCGCGTGGTCAGCACCAGCGGGGAGCAGATGAAGGTGTACAAGTGCGAACACTGCCGGGTGCTCTTCCTGGATCACGTCATGTACACCATCCACATGGGCTGCCACGGCTTCCGTGATCCTTTTGAGTGCAACATGTGCGGCTACCACAGCCAGGACCGGTACGAGTTCTCGTCGCACATAACGCGAGGGGAGCACCGCTTCCACATGAGCTAAAGCCCTCCCGCGCCCCCACCCCAGACCCCGAGCCACCCCAGGAAAAGCACAAGGACTGCCGCCTTCTCGCTCCCGCCAGCAGCATAGACTGGACTGGACCAGACAATGTTGTGTTTGGATTTGTAACTGTTTTTTGTTTTTTGTTTGAGTTGGTTGATTGGGGTTTGATTTGCTTTTGAAAAGATTTTTATTTTTAGAGGCAGGGCTGCATTGGGAGCATCCAGAACTGCTACCTTCCTAGATGTTTCCCCAGACCGCTGGCTGAGATTCCCTCACCTGTCGCTTCCTAGAATCCCCTTCTCCAAACGATTAGTCTAAATTTTCAGAGAGAAATAGATAAAACACGCCACAGCCTGGGAAGGAGCGTGCTCTACCCTGTGCTAAGCACGGGGTTCGCGCACCAGGTGTCTTTTTCCAGTCCCCAGAAGCAGAGAGCACAGCCCCTGCTGTGTGGGTCTGCAGGTGAGCAGACAGGACAGGTGTGCCGCCACCCAAGTGCCAAGACACAGCAGGGCCAACAACCTGTGCCCAGGCCAGCTTCGAGCTACATGCATCTAGGGCGGAGAGGCTGCACTTGTGAGAGAAAATACTATTTCAAGTCATATTCTGCGTAGGAAAATGAATTGGTTGGGGAAAGTCGTGTCTGTCAGACTGCCCTGGGTGGAGGGAGACGCCGGGCTAGAGCCTTTGGGATCGTCCTGGATTCACTGGCTTTGCGGAGGCTGCTCAGATGGCCTGAGCCTCCCGAGGCTTGCTGCCCCGTAGGAGGAGACTGTCTTCCCGTGGGCATATCTGGGGAGCCCTGTTCCCCGCTTTTTCACTCCCATACCTTTAATGGCCCCCAAAATCTGTCACTACAATTTAAACACCAGTCCCGAAATTTGGATCTTCTTTCTTTTTGAATCTCTCAAACGGCAACATTCCTCAGAAACCAAAGCTTTATTTCAAATCTCTTCCTTCCCTGGCTGGTTCCATCTAGTACCAGAGGCCTCTTTTCCTGAAGAAATCCAATCCTAGCCCTCATTTTAATTATGTACATCTGTTTGTAGCCACAAGCCTGAATTTCTCAGTGTTGGTAAGTTTCTTTACCTACCCTCACTATATATTATTCTCGTTTTAAAACCCATAAAGGAGTGATTTAGAACAGTCATTAATTTTCAACTCAATGAAATATGTGAAGCCCAGCATCTCTGTTGCTAACACACAGAGCTCACCTGTTTGAAACCAAGCTTTCAAACATGTTGAAGCTCTTTACTGTAAAGGCAAGCCAGCATGTGTGTCCACACATACATAGGATGGCTGGCTCTGCACCTGTAGGATATTGGAATGCACAGGGCAATTGAGGGACTGAGCCAGACCTTCGGAGAGTAATGCCACCAGATCCCCTAGGAAAGAGGAGGCAAATGGCACTGCAGGTGAGAACCCCGCCCATCCGTGCTATGACATGGAGGCACTGAAGCCCGAGGAAGGTGTGTGGAGATTCTAATCCCAACAAGCAAGGGTCTCCTTCAAGATTAATGCTATCAATCATTAAGGTCATTACTCTCAACCACCTAGGCAATGAAGAATATACCATTTCAAATATTTACAGTACTTGTCTTCACCAACACTGTCCCAAGGTGAAATGAAGCAACAGAGAGGAAATTGTACATAAGTACCTCAGCATTTAATCCAAACAGGGGTTCTTAGTCTCAGCACTATGACATTTTGGGCTGACTACTTATTTGTTAGGCGGGAGCTCTCCTGTGCATTGTAGGATAATTAGCAGTATCCCTGGTGGCTACCCAATAGACGCCAGTAGCACCCCGAATTGACAACCCAAACTCTCCAGACATCACCAACTGTCCCCTGCGAGGAGAAATCACTCCTGGGGGAGAACCACTGACCCAAATGAATTCTAAACCAATCAAATGTCTGGGAAGCCCTCCAAGAAAAAAAATAGAAAAGCACTTGAAGAATATTCCCAATATTCCCGGTCAGCAGTATCAAGGCTGACTTGTGTTCATGTGGAGTCATTATAAATTCTATAAATCAATTATTCCCCTTCGGTCTTAAAAATATATTTCCTCATAAACATTTGAGTTTTGTTGAAAAGATGGAGTTTACAAAGATACCATTCTTGAGTCATGGATTTCTCTGCTCACAGAAGGGTGTGGCATTTGGAAACGGGAATAAACAAAATTGCTGCACCAATGCACTGAGTGAAGGAAGAGAGACAGAGGATCAAGGGCTTTAGACAGCACTCCTTCAATATGCAATCACAGAGAAAGATGCGCCTTATCCAAGTTAATATCTCTAAGGTGAGAGCCTTCTTAGAGTCAGTTTGTTGCAAATTTCACCTACTCTGTTCTTTTCCATCCATCCCCCTGAGTCAGTTGGTTGAAGGGAGTTATTTTTTCAAGTGGAATTCAAACAAAGCTCAAACCAGAACTGTAAATAGTGATTGCAGGAATTCTTTTCTAAACTGCTTTGCCCTTTCCTCTCACTGCCTTTTATAGCCAATATAAATGTCTCTTTGCACACCTTTTGTTGTGGTTTTATATTGTAACACCATTTTTCTTTGAAACTATTGTATTTAAAGTAAGGTTTCATATTATGTCAGCAAGTAATTAACTTATGTTTAAAAGGTGGCCATATCATGTACCAAAAGTTGCTGAAGTTTCTCTTCTAGCTGGTAAAGTAGGAGTTTGCATGACTTCACACTTTTTTTGCGTAGTTTCTTCTGTTGTATGATGGCGTGAGTGTGTGTCTTGGGTACCGCTGTGTACTACTGTGTGCCTAGATTCCATGCACTCTCGTTGTGTTTGAAGTAAATATTGGAGACCGGAGGGTAACAGGTTGGCCTGTTGATTACAGCTAGTAATCGCTGTGTCTTGTTCCGCCCCCTCCCTGACACCCCAGCTTCCCAGGATGTGGAAAGCCTGGATCTCAGCTCCTTGCCCCATATCCCTTCTGTAATTTGTACCTAAAGAGTGTGATTATCCTAATTCAAGAGTCACTAAAACTCATCACATTATCATTGCATATCAGCAAAGGGTAAAGTCCTAGCACCAATTGCTTCACATACCAGCATGTTCCATTTCCAATTTAGAATTAGCCACATAATAAAATCTTAGAATCTTCCTTGAGAAAGAGCTGCCTGAGATGTAGTTTTGTTATATGGTTCCCCACCGACCATTTTTGTGCTTTTTTCTTGTTTTGTTTTGTTTTGACTGCACTGTGAGTTTTGTAGTGTCCTCTTCTTGCCAAAACAAACGCGAGATGAACTGGACTTATGTAGACAAATCGTGATGCCAGTGTATCCTTCCTTTCTTCAGTTCCAGCAATAATGAATGGTCAACTTTTTTAAAATCTAGATCTCTCTCATTCATTTCAATGTATTTTTACTTTAAGATGAACCAAAATTATTAGACTTATTTAAGATGTACAGGCATCAGAAAAAAGAAGCACATAATGCTTTTGGTGCGATGGCACTCACTGTGAACATGTGTAACCACATATTAATATGCAATATTGTTTCCAATACTTTCTAATACAGTTTTTTATAATGTTGTGTGTGGTGATTGTTCAGGTCGAATCTGTTGTATCCAGTACAGCTTTAGGTCTTCAGCTGCCCTTCTGGCGAGTACATGCACAGGATTGTAAATGAGAAATGCAGTCATATTTCCAGTCTGCCTCTATGATGATGTTAAATTATTGCTGTTTAGCTGTGAACAAGGGATGTACCACTGGAGGAATAGAGTATCCTTTTGTACACATTTTGAAATGCTTCTTCTGTAGTGATAGAACAAATAAATGCAACGAATACTCTGTCTGCCCTATCCCGTGAAGTCCACACTGGCGTAAGAGAAGGCCCAGCAGAGCAGGAATCTGCCTAGACTTTCTCCCAATGAGATCCCAATATGAGAGGGAGAAGAGATGGGCCTCAGGACAGCTGCAATACCACTTGGGAACACATGTGGTGTCTTGATGTGGCCAGCGCAGCAGTTCAGCACAACGTACCTCCCATCTACAACAGTGCTGGACGTGGGAATTCTAAGTCCCAGTCTTGAGGGTGGGTGGAGATGGAGGGCAACAAGAGATACATTTCCAGTTCTCCACTGCAGCATGCTTCAGTCATTCTGTGAGTGGCCGGGCCCAGGGCCCTCACAATTTCACTACCTTGTCTTTTACATAGTCATAAGAATTATCCTCAACATAGCCTTTTGACGCTGTAAATCTTGAGTATTCATTTACCCTTTTCTGATCTCCTGGAAACAGCTGCCTGCCTGCATTGCACTTCTCTTCCCGAGGAGTGGGGTAAATTTAAAAGTCAAGTTATAGTTTGGATGTTAGTATAGAATTTTGAAATTGGGAATTAAAAATCAGGACTGGGGACTGGGAGACCAAAAATTTCTGATCCCATTTCTGATGGATGTGTCACACCTTTTCTGTCAAAATAAAATGTCTTGGAGGTTATGACTCCTTGGTGAAAAAAAAAAAAAAAAAA") }; string actualCds = new CodingSequence(genomicSeq, codingRegion, regions, false, 0, rnaEdits).GetCodingSequence(); const string expectedCds = "ATGGATGCTGATGAGGGTCAAGACATGTCCCAAGTTTCAGGGAAGGAAAGCCCCCCTGTAAGCGATACTCCAGATGAGGGCGATGAGCCCATGCCGATCCCCGAGGACCTCTCCACCACCTCGGGAGGACAGCAAAGCTCCAAGAGTGACAGAGTCGTGGCCAGTAATGTTAAAGTAGAGACTCAGAGTGATGAAGAGAATGGGCGTGCCTGTGAAATGAATGGGGAAGAATGTGCGGAGGATTTACGAATGCTTGATGCCTCGGGAGAGAAAATGAATGGCTCCCACAGGGACCAAGGCAGCTCGGCTTTGTCGGGAGTTGGAGGCATTCGACTTCCTAACGGAAAACTAAAGTGTGATATCTGTGGGATCATTTGCATCGGGCCCAATGTGCTCATGGTTCACAAAAGAAGCCACACTGGAGAACGGCCCTTCCAGTGCAATCAGTGCGGGGCCTCATTCACCCAGAAGGGCAACCTGCTCCGGCACATCAAGCTGCATTCCGGGGAGAAGCCCTTCAAATGCCACCTCTGCAACTACGCCTGCCGCCGGAGGGACGCCCTCACTGGCCACCTGAGGACGCACTCCGTCATTAAAGAAGAAACTAATCACAGTGAAATGGCAGAAGACCTGTGCAAGATAGGATCAGAGAGATCTCTCGTGCTGGACAGACTAGCAAGTAACGTCGCCAAACGTAAGAGCTCTATGCCTCAGAAATTTCTTGGGGACAAGGGCCTGTCCGACACGCCCTACGACAGCAGCGCCAGCTACGAGAAGGAGAACGAAATGATGAAGTCCCACGTGATGGACCAAGCCATCAACAACGCCATCAACTACCTGGGGGCCGAGTCCCTGCGCCCGCTGGTGCAGACGCCCCCGGGCGGTTCCGAGGTGGTCCCGGTCATCAGCCCGATGTACCAGCTGCACAAGCCGCTCGCGGAGGGCACCCCGCGCTCCAACCACTCGGCCCAGGACAGCGCCGTGGAGAACCTGCTGCTGCTCTCCAAGGCCAAGTTGGTGCCCTCGGAGCGCGAGGCGTCCCCGAGCAACAGCTGCCAAGACTCCACGGACACCGAGAGCAACAACGAGGAGCAGCGCAGCGGTCTCATCTACCTGACCAACCACATCGCCCCGCACGCGCGCAACGGGCTGTCGCTCAAGGAGGAGCACCGCGCCTACGACCTGCTGCGCGCCGCCTCCGAGAACTCGCAGGACGCGCTCCGCGTGGTCAGCACCAGCGGGGAGCAGATGAAGGTGTACAAGTGCGAACACTGCCGGGTGCTCTTCCTGGATCACGTCATGTACACCATCCACATGGGCTGCCACGGCTTCCGTGATCCTTTTGAGTGCAACATGTGCGGCTACCACAGCCAGGACCGGTACGAGTTCTCGTCGCACATAACGCGAGGGGAGCACCGCTTCCACATGAGCTAA"; Assert.Equal(expectedCds, actualCds); } [Fact] public void GetCodingSequence_RnaEditSnv_StartsUtr() { // NM_001135635.1, chr11:65684281-65686531 var genomicSeq = new SimpleSequence( "TTTTAAAAAACACTCAAGACACAGACCCAAGCCGGGTTTTATTGAAATGCCAGGAGCAGGCACATGTCAAAGTAGCCAAGGAAGGGGGGACAGTGGTACAGGCTGTGTAAGTTGGCAGGGATGGGCAAGCCTCATGTCCATGGTCCTGGCATCCCCTCTGCCAGGGGATAAGTAGGCACAACTACCCTCCCCTCAAAATGGCATGCTCAGGCCAGTGGGGCCCCTACCCCTGGACCATGAAGGCTCCAAGAAGGGCTGGAAGCACTAAGTTTTCTCTCTCCTGAGGGGGAAGGAAAGAAGGGGAGATGCAGGAGGAAGGGGAGGTATAGCGGGGGATGAGCGTTCCAAGAAGTCTCTCCTTCTAGGTGTCTGCACCCAACTCATGGTGCTGGGCAGTGGAGAGGAGCAGCATTACAAAGGGAGGCTGAAGGCTCATCCCTCAGGGAACCGGAGCCCCCCAGCCTGTGGGGCTTGTGTCAGCCCTGAACAGAGGGCAGAAGTTCAAGGGGACTGAAGATGCAGGTAGTTCCCAAGTGACCTAGGAGTCCCCAGAGCTGGGGGGTGTGGCCTTCATAGGACAAGGAGGAAGACAGGAGGATCCAACCCCAGCATGGAGGGGGGAGTGGGCAGTCTCCCCAATTTGGCCCCCCTAGGTCAGTTCCACGTTGTTGGCACGGTCAAGCACTCGGGAGCCACGGGCACTACCCCCAAGCTGGAAACGGCTCTCATAGAGAGTGGGGCAGAGGTGCCAGCGATTGGCCCGGTAGATGCCCAGGTAGGTGTAGACATCAGGCTTGTAGGTGAGCAGGCACTTAATGCCCGCTGCACGGATGGCTGAATCCGCCTCCAGTACACCCAAGCGGTCCGTGAAGTCGTCCGTGTAAACACAGATGACCTGGCGCCCACCCTCCTTGGCACGTGGGCTCACCTTGGCCACCTGAAGCTGGCCTTCAACCACGGCCCGGGCAATGCCAGCCCAGGCGTGGTCCAGCTTGAAGCCCGGTGCCAGATGCATAAGCCACTTGCCCGAGAGCACGTGGTGGGTGATGGCGAGCTGGCGCAGGGTACCCGGTGTGATGGGCCGCCCACTGGTCTGCAGAGCTTCCCAGGCTGCCTGCAGGCCCTGCACGTCCCCGGAGTTGGGGCTGTAGCCCTGCCCATACACTGCAATCCAGCCCACAGGCTCTGAGTTGGGTGAACCGGGGTCCCCATAGCGGGTAACTTGGGATGGTGGGTACTTGGCCAGCCAGGCATCCAGCTCAGTGGCAGGCGTTGTGCGGGCATCAAACACTAGCCAGGGGTCCATGTCAGCTGCCATGGCCTCTGCAGCCAGGTGCTCGGCGGTGAAGCCATCCTCACGGCCACCTGGAGAGCCCTCCTCTTCCAGCTCCTCACCTGGTTCCATCCTGCTGTGAGGGAACCGAGTCAGGGCAGGGTCTGAGACAATAACTACAGATGCCAGGCACTGGATTAAACTGTGGCCTTGAGTAAGAGTTACTGTCGATGCGCCTCAGTTGCCTCATCTTTACAATGGGATAACAACTGTTCCTGTCCCGTAGATCTGCTATGAAAATTAGATGCCTGAGGAGTCAGCGCTCCAGAAGGGTTGCTGCAGTTATTACTATTCTCCTTGACTTACAGAAAAGGAAACTGAGGCTGAGAAAAAGGACTTGCCCAAGGTCACACCTGCAGTGCGTGGCAGGGCCGAAGGGTGAATCCAGGCGTGGGAGCAACCAGCCCCAGCTACACCTCCCGGCCCTGCCAAGGCCCCCTTTTCCTGGCAGGTATCCGGTGCGCTGGCATTTAATAGAGGAACGCAAAGAAGCGCACGTTCGCGCAGCTCCCGAGGCCGGCTCTGTAAGGCCAGGCCTCCCAGGCAGGCGTTATCGGGCCCACTTACAGACGAGGACGCTGAAGTCCAGAGAGGTTACAGGCCGTTCCGAGGCCAATGGGGCGGTTCCCAGACTCGAACCAGGGCTTGTTAGAGCCTGCAGGAGAGCCAGGCTCCGGCCGTGCCGCGCCCGCCGCCATTAACGCCCACGGGCCCGAGCTGTGCTCCCGCCCCGGCCCTGCCCTGCCCCTCCCGCCGCCCGCAGTCACCTCCGGCCTTCGCTGCGTTCGACGCCGGCCCAGCCCCGGGCCCGGCTCCGCTCCTGCCGTGGCTCCGCGCCACCGCCACCGCGCCCCACCCCCGCCACGGCCGCCGCCGCCGCCGCCGCCATCTTAGCGCCGCGCCACCTCAACAACAACT", 65684281 - 1); // the stuff from the genome represents a small part, most of it is brought in through the RNA-edit insertion var codingRegion = new CodingRegion(65684930, 65686502, 30, 911, 882); var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 65684281, 65685689, 152, 1560), new TranscriptRegion(TranscriptRegionType.Intron, 1, 65685690, 65686380, 151, 152), new TranscriptRegion(TranscriptRegionType.Exon, 2, 65686381, 65686531, 1, 151) }; var rnaEdits = new IRnaEdit[] { new RnaEdit(912, 912, "A"), new RnaEdit(986, 986, "C"), new RnaEdit(1561, 1560, "AAAAAAAAAAA") }; string actualCds = new CodingSequence(genomicSeq, codingRegion, regions, true, 0, rnaEdits).GetCodingSequence(); const string expectedCds = "ATGGCGGCGGCGGCGGCGGCGGCCGTGGCGGGGGTGGGGCGCGGTGGCGGTGGCGCGGAGCCACGGCAGGAGCGGAGCCGGGCCCGGGGCTGGGCCGGCGTCGAACGCAGCGAAGGCCGGAGCAGGATGGAACCAGGTGAGGAGCTGGAAGAGGAGGGCTCTCCAGGTGGCCGTGAGGATGGCTTCACCGCCGAGCACCTGGCTGCAGAGGCCATGGCAGCTGACATGGACCCCTGGCTAGTGTTTGATGCCCGCACAACGCCTGCCACTGAGCTGGATGCCTGGCTGGCCAAGTACCCACCATCCCAAGTTACCCGCTATGGGGACCCCGGTTCACCCAACTCAGAGCCTGTGGGCTGGATTGCAGTGTATGGGCAGGGCTACAGCCCCAACTCCGGGGACGTGCAGGGCCTGCAGGCAGCCTGGGAAGCTCTGCAGACCAGTGGGCGGCCCATCACACCGGGTACCCTGCGCCAGCTCGCCATCACCCACCACGTGCTCTCGGGCAAGTGGCTTATGCATCTGGCACCGGGCTTCAAGCTGGACCACGCCTGGGCTGGCATTGCCCGGGCCGTGGTTGAAGGCCAGCTTCAGGTGGCCAAGGTGAGCCCACGTGCCAAGGAGGGTGGGCGCCAGGTCATCTGTGTTTACACGGACGACTTCACGGACCGCTTGGGTGTACTGGAGGCGGATTCAGCCATCCGTGCAGCGGGCATTAAGTGCCTGCTCACCTACAAGCCTGATGTCTACACCTACCTGGGCATCTACCGGGCCAATCGCTGGCACCTCTGCCCCACTCTCTATGAGAGCCGTTTCCAGCTTGGGGGTAGTGCCCGTGGCTCCCGAGTGCTTGACCGTGCCAACAACGTGGAACTGACCTAG"; Assert.Equal(expectedCds, actualCds); } [Fact] public void GetCodingSequence_NonZeroStartExonPhase_CdsBeforeFirstExon() { // NM_001220775.1, chr7: var genomicSeq = new SimpleSequence( "ACTTTAGTCATTAAAGAAGAAACTAATCACAGTGAAATGGCAGAAGACCTGTGCAAGATAGGATCAGAGAGATCTCTCGTGCTGGACAGACTAGCAAGTAACGTCGCCAAACGTAAGAGCTCTATGCCTCAGAAATTTCTTGGTAAGAGTTAAATGTTTGCTGTCTCTTAAAAAAAAACTATGTGGGTGTTTTAGATGCAAGTAGAAATGAGTTGAGGGTGGAAGAAAGGGAAAAAAATCTTATTTTTTCAAAAGGAAAAATTGGTAAGCTTAACATTCCTTAAATATCTTAGAATTTTTTCCAATAAGTATCTTAAAAATAACAAACCTCCCATCAGTTTTTCCTAGATTTGATTTTGCAGCATCTGGGGCCTGCCCTGTGATCTGCCTGTGGACATCGCTCTTAGGGGCGGCTGCACCAGCGTGCACAGGGTGGAGAGTTTGGGCCTGGCTCGTCCGGGGGACACCACACTGCAGGACACTCCAGGCCTGGCCGGCTTCTCAGAGCTTCAGATCCTCATTTTTCATATGAAGCTCCTAATGCTCCCCTTATGGGGGACTCTGAAGGGTTAATGGGAGGAATCATACAGTGACTGACCCCTGAGAAGTGTCCAGTGAAGACAGGGCTTAGCTAGGATTGCTGTTTTGCCTAATGCTCTGCGGGATTAAAAAAAAAGAAGAAGAAGAACAAGACCATTCGTCTCTCTAGGAGCATTGCCCAGAGTAGGTATTAGACACACCAACACCACCATCCAGCCAGACGCTGCAGGGACAGTGAGCCAGGGTCCGAGTGGAAAGGCGCTAGGCTTGGGAACCAGCTCAGAGTCAATACAGAGCCACCGCCACTCACCAACTCTGTCAGCTTAGTAAAATGGCTCTGCCCCTAGAGCCCTGGTTCCATCCTTTAGTATCTCACAGGGTGATTGTGAATATCCCATGACTCCAAGATTGAGAAAACGTTTAGAATCCCTCGGTGTGAAGGTTAACTCTGTCCGGAAAGAGGACCAGTAAAAGCTTCATGAGGCTGAGATGCACTTTGGAAGAGGAATAGAGTTTCAGCACATTCTAGGTGTTGGAGGAATGGGGGAATCTAGGCAGATGTTTAAAATCAATGAGAAACCAGAATGCTGACCATGAGGGTTGGAGTGGGGGCCTAAGGACATGACGGAGGAGCAGGGTGTGTTCCCAGCTTAACTCAGGTACCCATGGGGAAGCAGGAAAAGTGAAGGTGTCCTAGGCAGCTCTGCCACAGGATGAATGGCTTCAGATGCCAGGTGAGCGAGGGACCCTTCATTCAGTCAGCAGGAAAGAAGCACTGGCATATTTTTTATGAGAACAAAGGCTAGGATAGTAAAGACAGCAAGTACCAAAAAATGACTGGAAAAGGGAGACTGTGGAGGCAGTGGCAGCAGGCATGGAAAGAAGGGCTTGTGAAGGGGAAGGGGTGGTGTCAGAGGAACATAGGGCTGGGGGCAGGGATTAGGTGAGGGAAACCATGAGTCACACTGATTCTAGAGTAGTGTGCCCTTGATGAAAAGGATAACACCAGGTTCTAGGAAAAGATGGGGTTCTGTTTTTGACGTGTTGATTTTCAAGGACTTCTGGTGTTTGTGACACATGGGGAAATTGTGGTGGGAGAGAGGTGGGGCCAGAACAGGGGCTGGTGAGGCCAAGGGTCCCAGAGGGCACCTGTTGACCTGCAGGATGACATGAAGGGGGAAGGACAGAGGCAAGGCCAAGTCCTGGGCACCAGCCTCCCTCTTGCAGCTTCAAATAGGGCTCCATTTTGACCTTTTGATTAATTAGAGGTTTGTCATAGGTTGGGGGTTGAGAGGAGCAAGGGAGAGAAGGATTCAGTGTACAAAAAGAATGAAAGCCACTGGCTGAGCCAGTGGGGAGTTGTCCACACACACATGAGCCTTTGGACCATGAGAACGAGGGAGGCCTTGCCTTCCTGAACGGAGTAGGAGTGAGGTCCTGTGCTGAGCGTAAGCAGTGGGATTCCCACAGCACTGGGCACAGAGCCCACGGGCTGCCTCCTGAGCAGCCAGCATCTGCCTGGGGTGGACACAGTGACAGAGAGATGGGTGGTGACTGGGGTATGGGCAGAGATAAGGCAGCAAGTGTGTGCAAGGGGAGTGAAGGGTTACTGACCTTAAGAAGCAGGGATGGCGTCCTCTGTCAGGTGAGGAGCCTGGAGAATGCTTTGGTGAATGAACGTTTGCAGCCCCTTTTAGCTTTTGGAGACTTGAAACCAAAGGAGAGATTCATCTGTGAAACTCTACTGGAGCCACTCCCCAACCCCCACCCTTGTGAGACCACAATGTGGGCGTTGGCTTGAGATGCTTCTGTGTTAGTAGAAGAAATAAACAACACAGTGCTCTGATGAGGCAAAGCGAAGATGAAAAAGGAGTTCCCAGGGGACATAGTAGGAACAGTGGACGAGGGTAGCAGAAGAGGAGTTTGGAGCAAAAGACTCACAAGCAGCTGCATAATCTGTTGGTGCTTGGCAGTTCATTTGTAAAAATGATGCCTCTTCCTGCCCTAAAATACCTACCTTACCCCCGCTTCAACTTGATGAGATTTCCATCAGTCACTCCCAATGTGTCACAGCTTCTGCAGCCCTAAAATTAAAAGGTGAGTGAGTCTCTGAGGCCCCTCTCCACTTCTCGGATGCTGAGTTTAGCCTTCATGTGAATGTGGAAAGACTAGGAATACAGCTGTTATCACACAAGCTGGCCCAATAGTGGTTCAGTTGAGAGAGCCCCATCCTTCAGAGTCAGCTCCAGCTAGGAGTGACTGGTGGCCTTGAGCATGGTGCTGGGCTTAGTGTTGCCATCTGTGGAATGGGTGTGGGTCTGTTGCCCTGCCTCCTCCCAGAGCTATTCTGAGGCTCAGAAGGGGTGATGGATGTGATGGTGCTCCCAACACTAGAAAGCATCTTAAGAATGTAAGATTTTCATGATGACTGTTGCTCAGAGTGGCTATTATAGTTTTGCTTTATTGTTCTATAACCTATGATTAAAATTTTTACCTTAAACTTTGACGTGAGTGTGAATAAGTATTTGTTTTGCCAGCAACATTCCTCACCACTGGGGCCATTAAAGATCTCCCCCTCTGAGACCATCAAATACAGGTCAACAGGACTGATTAATCTAATTAGAAAAGGGCTTGTATTAAATAGCAATGATAATTGTTGTTTTTAGTCTGTCTGGTGTTTGACTTGGGAACGTTTTTAAAATAGAGAAAAGCACAAAGAGGAAAACAACAATTACCAATATTCCTGCTACCCATTATAATTATCTAGGTATATTTTCTTCTTTTGTAAGAAAAAGAAACCCTGTTATATTGTTAAAATAACACAAAGTTAATATAAAGAATTTTAATGCAAAGATTAATGTTTTCAAATCACCACAAAACCCAACATCCAGAAATTACCAATATTAAAAGTAGAAAAGTATCATTCTAAATATTTTCTGTTGCATATGTATGTGAGTGGATAGGCTGATGAATTAGGTGGATTGATGGATAGGTAAATATGAAATAAATACTTTCATAAATATTCCAACTTATCATACATGCCTTAAATTCAAGAGGTGAAAAAAGACCCAAACAAAACTAGAGAAGCGGCTTATTTTAAATATCCTCTGACATAAAGGAATATTATATTTAAAGGATCCTCTAAGATTAAAAATATGTACTATGAAAAACATTAAGAAATTTGAATTTTTTTTAATCCATTTGTTTCAATTTAAGCAGCATCTACTGGCTCACTGCTTTGAAAAATAAGGACAGTATTCCAGTTCACATTCAGTGTTCCAGTGTTCACATTATCTTATTATTTTTACATTGTCCAGCTTTGTAATATTCACATTCTATTCTGTAATCATAATTCATAGTAGTTTAGTTATTTATTACTAACTCTATTTAAATAGATTCAAGGATCAGACCCTGCCCTTTTCTTCTTATTTATGTTTATTTTGATTAATCTCTTAATTGATTGGACTTTACATTCAAGCAACTTTTTTAAAAAAAAGTTTCTATAGATGTTCTATTTCTATCATTGTATTGTTTTTGAGGATGTTGGCCTGTTGCCTTTGTATTTGATGAGCATTTTGACAGAGTCTATGGTCTTGGGCCACTCTTTCTTTTTCTCCCTTGAGAACTTTTTAGATTTTGCTGATGGCATTGCTTGTTGAATGTTGCTGTGGAAACATCAAGTCTAGTGTAACTGTTTCTTCTTCAAGGTGATTTGCATTTTATTCCTGAATGCCTGAGGGTTCTTTATTTAACCTTGAAGTTAAATACCCTAATTAGGATGTATCTTGGTCTATTCATTCGGAATAAAAAATTCCTGCCATTTTGTCTAGAGAGTCCCTTTTTTTTCTCTTTATTTCTGGGAAATTCTCTTTTATATAAATATGTTTTGTTCCATCTATTGTGATCTCTGTTGAGGGATACCAGTTGTCCATATGTTAGATAATTTGTCTTCCATATCTGTTAACAGTTCTTAAAGTTTTTTGTTTATTTCTTTGTCTATTTTTACATTTACTCACTGTTCTCTTGTGGTTTTCCTCTGTCAGTAATTTAATTTTTAGTAGTTCCTGTTCTATTACTTGCTATTTTTAATCCATGCATTAATTTTATAATAATATTATTTTGCTCCTTATTTTGTCTCCTGAGACCCGAAATCTCTTTTTTCCTCTTACTCTGTTGCTTTTGCATTTTATTTTGAATACTTTTAAAATTGATTCCATGTTATGAAGCAATTATGAGGCATTTCCTCTCTTGTTGGAATTAACGATTTTTTCCCCTAGGAGGGACTCTATGGTCTGTGTTTTACTTCCTTTCTTCCCCTGTATTTCTAGAAAATATTTTCCTAGTAACCCTGACATTTCTTTTCATCTTGCTTATTCTAGTTGGTCTGATATAGCTTGATTGACATTTCAGCCTTCTTCCCACTATATTTTTTTTTCCTGTGAGAGCTATTGGGTTTTCTAAATCCTGAAAGAATGCCAAAGATGGGGTTGGAGGAGTTTGGTGAGGCAAAGTGCAGCCTTTGTTAAAATACTTTTCCTTTGCTCTCTCTCCCTCATCTGAAATTTAGTTAAATACCCTAAGCCATCAGCACTGTACCTAGTTGGGGAATGCTTTCATCCCCACAGGAGATTCTCTGGGGCTTTGGGCCATCTTCCCCTTCAGTGTAGACCACAGAGGACTTTGCTTCTGTCCCAGGGAGCCCGCAGGGGCTCACTTCTCCATGTTCATCTGATTCTTGTCAGCCAAGGTTTCAAATGCTTTTCTGATCAGAACAGGGAAAAGATACCTATCTGAATCATGTCTTTATAGATATGAGGCTATGAGGGAAAATTCTGAGGTTATTCTTGACTCACACCTAAAGATTTGGAAATGAGATTAGCAGCAAAGCTTTGCCCTACATCTCATGTCAGAATTTTCTGTTTCTTTCTAGTCTTTGAGTGTATGTGTGTTCTCACACACGCCATAATGAAATGCATATTATATATAATTATGTGTATATATAATATTCTATGACTATACATGACATGTTCCTTTAGCTGATTGCTGTTAAGAGAAATTTATAGGTTTTTATTTTTCTTGTTTTGTTGGGTATTAAGGAAGAGAAATTCTATGGTAATTTTCATGTGGCACAGTAATCTGGCATATATGTTGATTTTTTTCCTACACCCATTTGTTGTGATACCAAGTTTGAAAACAACAGATTTCAGTGGTTGCTTGGGAAACCACAGAACCATGACTTGGGGAGAGACAGGATGATTAGGTGGGAAAGCACCCTTTTGGTGGGGCTGTAAAGACTTTTATATTTAGCAAAATTGGCTACAAAGTCCATTCCCCTCCTTTTCTTGCCTTGATTTGGTAGAGGGATAGACTTGGATACAAACTAGAATGGATTCATTCTTCTCTGGAGTTAGTGTAACAAGACATTTAGCTGCTCAACACAAAAACAGAAACAAAAAAATTGTGTGGTTTCAGCAGTGCTATACAATTACTTTTTCTGACCTTTAATGGAGAGAAACACCACTTCTTTGGTCCCTACCATCAGCTTCATAGGGTTTTCATCCTGTTCTGTTTCTGGGAGGGCGTAACTGGCCATGCACAAGTTTTTTTTCTCTAATCAGAGTATGTGCCACTTCTGACCACCAGTAGATGAAAACGAATGGAAACCAGGCTATTATATGATACATATCCATTACAAAATAAGACATGAAACTCAAAGGTACTTTATGGTATAATGGGGCATATATTCCTGGACAATTCTTAATGGTCACAGATTTTATAAAAGGACTATTAGTAAATGTATGAATTACAGAGTAATTTATCCTTCTGTTAGTAAGAACCAGCTGATGACCTCAGTGTCAGGTGCATCGTGGAAGGTGTTGGGACCTTCCCTTGCCACCACCCTCACCAGCCATCATCAGCCATAACCTGCACATTGGGGAAGTTTTGACTTATCCCTCACTTTTGCCCCTCTTCAAGCTGTTCTTTCCACAGTGAATGAGAAGGCCACTTCTTCCTTCAAACCTTTCAGTGGTTTCCATTTTCCTTTAGACAAAGTCTCTGCCTAGCTGGCCTCTGCCTGCCCCTCCTGCCTACCTCTCGAGCACTGCCCCCACCTAGGGCTCTGGTTCCCCAACCTTCACTCGGTCCTGCCACACCTCCCAGCCCCTTCTCCCTTCAGAACTTTCCTTCTTGTTGTCCCCAACACTGGGACACAAAACCCTCCTTATCAACCCTCCTTATCTGGCTGACTCTTACAAGATCAGAAACCTGTGTAATGCTCTCATGGCACGCTCCCCTTGTCTTCGTGGATTTCTCAGATGGGAAGGAATTATCCATGCAATCACACATAAACTTCTACCTACCCTCCCCTAGTAGCTGTCTGCTGCTAAGGATGGGGACCATTCTCACTTACTCACTGTTCTGTCCCTCTGCCCAGTCCAGATGTGTTGAAGGATGGAAATATACAGAGTAGTGGTAAAATATAAACCGTTCAGACATTCCAAGGATGGGCTCATGTGCTTTGACTCATTAATGTACCACTGCTGAAAACAGAACACAGCCGCAGTCTTGCCAGTAAGAGTGCAGTTACTGTAATTAATGAATTTGCTAATTAAGCCATGATTTCATACTGAACTTATGACCAACATATTGAGAAGGTGTGTCTTCAAGAAAATTTATTTTTTGTATTAAGATATTTACTCCAAAGCTAATTGAAGAAGCCAAATCTAGGCTCTGGTTTCACCATTGCCAGGGAAATGAGCTCATGGACTCCTATGAACTGATGATGTTAGATCAGAAGTTTCTCAAGGCCAGGGCCCAATCACTGCTGAGGCGTCAACAGTAGTTCCTTGTACATCAATAATTCTCATTACTTTTAAAAAATAACAGATGAATAGCAACTATTTTCCCTGTAGCTCCCTTGCTGTGCCTCCTACCCTCCACCACATGTTTCTGGGGAGCCCTGCTTCGGGCCTGCCAACTACAGAGAATTACTTTTGAGTATCCCTTCCACTCTCATCTCAAGACAGAGTTCATCTACCTTTGGGTTATTTGTCAAAAATGTGTCATTTTATTACAAAAAATATACAATCATCATGTATTTTGATTAAATTTTACACTAGATTATTAAAATTATTAAATACAATTATTAAAATTAATAATTTAACATATCACATATTTTAAATATATTGTATATAATGAATAATAATATAATTATTGTCTATTTTAATTCAATAAATGTATAGTAAGTTAGCCAGTTGTAAATTACTGAGAACACTCTACTGAAAAAGCATCATTTCAAATACACTATTTAAAATATTAAATGAAATACAATAACATAATTAAACTAATCTTTGGTTCCCCTATTTATGTATTCATTTATCCAACAAAATCTCCTTAAGTGCTTATAATGGGTAGGTCCTGGCTCGGTGTCCCCTAGACAGACGCATGGGCCTTCCCCCAGCCCGTCAGTATGGTGCAGGTGTGATGTGTCCGCAGGTGTGTGTGTATGTGTGCAGGTGTGGGGTCCGCAGGCGTGCTGGGCCCCCAGGCCGTGTTCCCCTTCCCCTCCCCGGTTGTAGATTTCAGCTGTTGCTGCCAGACCTGACCGGTTCCGGAGGTGGCCGCGCCCCACTCACTGTCGCCTGCTTTCCACAGGGGACAAGGGCCTGTCCGACACGCCCTACGACAGCAGCGCCAGCTACGAGAAGGAGAACGAAATGATGAAGTCCCACGTGATGGACCAAGCCATCAACAACGCCATCAACTACCTGGGGGCCGAGTCCCTGCGCCCGCTGGTGCAGACGCCCCCGGGCGGTTCCGAGGTGGTCCCGGTCATCAGCCCGATGTACCAGCTGCACAAGCCGCTCGCGGAGGGCACCCCGCGCTCCAACCACTCGGCCCAGGACAGCGCCGTGGAGAACCTGCTGCTGCTCTCCAAGGCCAAGTTGGTGCCCTCGGAGCGCGAGGCGTCCCCGAGCAACAGCTGCCAAGACTCCACGGACACCGAGAGCAACAACGAGGAGCAGCGCAGCGGTCTCATCTACCTGACCAACCACATCGCCCCGCACGCGCGCAACGGGCTGTCGCTCAAGGAGGAGCACCGCGCCTACGACCTGCTGCGCGCCGCCTCCGAGAACTCGCAGGACGCGCTCCGCGTGGTCAGCACCAGCGGGGAGCAGATGAAGGTGTACAAGTGCGAACACTGCCGGGTGCTCTTCCTGGATCACGTCATGTACACCATCCACATGGGCTGCCACGGCTTCCGTGATCCTTTTGAGTGCAACATGTGCGGCTACCACAGCCAGGACCGGTACGAGTTCTCGTCGCACATAACGCGAGGGGAGCACCGCTTCCACATGAGCTAAAGCCCTCCCGCGCCCCCACCCCAGACCCCGAGCCACCCCAGGAAAAGCACAAGGACTGCCGCCTTCTCGCTCCCGCCAGCAGCATAGACTGGACTGGACCAGACAATGTTGTGTTTGGATTTGTAACTGTTTTTTGTTTTTTGTTTGAGTTGGTTGATTGGGGTTTGATTTGCTTTTGAAAAGATTTTTATTTTTAGAGGCAGGGCTGCATTGGGAGCATCCAGAACTGCTACCTTCCTAGATGTTTCCCCAGACCGCTGGCTGAGATTCCCTCACCTGTCGCTTCCTAGAATCCCCTTCTCCAAACGATTAGTCTAAATTTTCAGAGAGAAATAGATAAAACACGCCACAGCCTGGGAAGGAGCGTGCTCTACCCTGTGCTAAGCACGGGGTTCGCGCACCAGGTGTCTTTTTCCAGTCCCCAGAAGCAGAGAGCACAGCCCCTGCTGTGTGGGTCTGCAGGTGAGCAGACAGGACAGGTGTGCCGCCACCCAAGTGCCAAGACACAGCAGGGCCAACAACCTGTGCCCAGGCCAGCTTCGAGCTACATGCATCTAGGGCGGAGAGGCTGCACTTGTGAGAGAAAATACTATTTCAAGTCATATTCTGCGTAGGAAAATGAATTGGTTGGGGAAAGTCGTGTCTGTCAGACTGCCCTGGGTGGAGGGAGACGCCGGGCTAGAGCCTTTGGGATCGTCCTGGATTCACTGGCTTTGCGGAGGCTGCTCAGATGGCCTGAGCCTCCCGAGGCTTGCTGCCCCGTAGGAGGAGACTGTCTTCCCGTGGGCATATCTGGGGAGCCCTGTTCCCCGCTTTTTCACTCCCATACCTTTAATGGCCCCCAAAATCTGTCACTACAATTTAAACACCAGTCCCGAAATTTGGATCTTCTTTCTTTTTGAATCTCTCAAACGGCAACATTCCTCAGAAACCAAAGCTTTATTTCAAATCTCTTCCTTCCCTGGCTGGTTCCATCTAGTACCAGAGGCCTCTTTTCCTGAAGAAATCCAATCCTAGCCCTCATTTTAATTATGTACATCTGTTTGTAGCCACAAGCCTGAATTTCTCAGTGTTGGTAAGTTTCTTTACCTACCCTCACTATATATTATTCTCGTTTTAAAACCCATAAAGGAGTGATTTAGAACAGTCATTAATTTTCAACTCAATGAAATATGTGAAGCCCAGCATCTCTGTTGCTAACACACAGAGCTCACCTGTTTGAAACCAAGCTTTCAAACATGTTGAAGCTCTTTACTGTAAAGGCAAGCCAGCATGTGTGTCCACACATACATAGGATGGCTGGCTCTGCACCTGTAGGATATTGGAATGCACAGGGCAATTGAGGGACTGAGCCAGACCTTCGGAGAGTAATGCCACCAGATCCCCTAGGAAAGAGGAGGCAAATGGCACTGCAGGTGAGAACCCCGCCCATCCGTGCTATGACATGGAGGCACTGAAGCCCGAGGAAGGTGTGTGGAGATTCTAATCCCAACAAGCAAGGGTCTCCTTCAAGATTAATGCTATCAATCATTAAGGTCATTACTCTCAACCACCTAGGCAATGAAGAATATACCATTTCAAATATTTACAGTACTTGTCTTCACCAACACTGTCCCAAGGTGAAATGAAGCAACAGAGAGGAAATTGTACATAAGTACCTCAGCATTTAATCCAAACAGGGGTTCTTAGTCTCAGCACTATGACATTTTGGGCTGACTACTTATTTGTTAGGCGGGAGCTCTCCTGTGCATTGTAGGATAATTAGCAGTATCCCTGGTGGCTACCCAATAGACGCCAGTAGCACCCCGAATTGACAACCCAAACTCTCCAGACATCACCAACTGTCCCCTGCGAGGAGAAATCACTCCTGGGGGAGAACCACTGACCCAAATGAATTCTAAACCAATCAAATGTCTGGGAAGCCCTCCAAGAAAAAAAATAGAAAAGCACTTGAAGAATATTCCCAATATTCCCGGTCAGCAGTATCAAGGCTGACTTGTGTTCATGTGGAGTCATTATAAATTCTATAAATCAATTATTCCCCTTCGGTCTTAAAAATATATTTCCTCATAAACATTTGAGTTTTGTTGAAAAGATGGAGTTTACAAAGATACCATTCTTGAGTCATGGATTTCTCTGCTCACAGAAGGGTGTGGCATTTGGAAACGGGAATAAACAAAATTGCTGCACCAATGCACTGAGTGAAGGAAGAGAGACAGAGGATCAAGGGCTTTAGACAGCACTCCTTCAATATGCAATCACAGAGAAAGATGCGCCTTATCCAAGTTAATATCTCTAAGGTGAGAGCCTTCTTAGAGTCAGTTTGTTGCAAATTTCACCTACTCTGTTCTTTTCCATCCATCCCCCTGAGTCAGTTGGTTGAAGGGAGTTATTTTTTCAAGTGGAATTCAAACAAAGCTCAAACCAGAACTGTAAATAGTGATTGCAGGAATTCTTTTCTAAACTGCTTTGCCCTTTCCTCTCACTGCCTTTTATAGCCAATATAAATGTCTCTTTGCACACCTTTTGTTGTGGTTTTATATTGTAACACCATTTTTCTTTGAAACTATTGTATTTAAAGTAAGGTTTCATATTATGTCAGCAAGTAATTAACTTATGTTTAAAAGGTGGCCATATCATGTACCAAAAGTTGCTGAAGTTTCTCTTCTAGCTGGTAAAGTAGGAGTTTGCATGACTTCACACTTTTTTTGCGTAGTTTCTTCTGTTGTATGATGGCGTGAGTGTGTGTCTTGGGTACCGCTGTGTACTACTGTGTGCCTAGATTCCATGCACTCTCGTTGTGTTTGAAGTAAATATTGGAGACCGGAGGGTAACAGGTTGGCCTGTTGATTACAGCTAGTAATCGCTGTGTCTTGTTCCGCCCCCTCCCTGACACCCCAGCTTCCCAGGATGTGGAAAGCCTGGATCTCAGCTCCTTGCCCCATATCCCTTCTGTAATTTGTACCTAAAGAGTGTGATTATCCTAATTCAAGAGTCACTAAAACTCATCACATTATCATTGCATATCAGCAAAGGGTAAAGTCCTAGCACCAATTGCTTCACATACCAGCATGTTCCATTTCCAATTTAGAATTAGCCACATAATAAAATCTTAGAATCTTCCTTGAGAAAGAGCTGCCTGAGATGTAGTTTTGTTATATGGTTCCCCACCGACCATTTTTGTGCTTTTTTCTTGTTTTGTTTTGTTTTGACTGCACTGTGAGTTTTGTAGTGTCCTCTTCTTGCCAAAACAAACGCGAGATGAACTGGACTTATGTAGACAAATCGTGATGCCAGTGTATCCTTCCTTTCTTCAGTTCCAGCAATAATGAATGGTCAACTTTTTTAAAATCTAGATCTCTCTCATTCATTTCAATGTATTTTTACTTTAAGATGAACCAAAATTATTAGACTTATTTAAGATGTACAGGCATCAGAAAAAAGAAGCACATAATGCTTTTGGTGCGATGGCACTCACTGTGAACATGTGTAACCACATATTAATATGCAATATTGTTTCCAATACTTTCTAATACAGTTTTTTATAATGTTGTGTGTGGTGATTGTTCAGGTCGAATCTGTTGTATCCAGTACAGCTTTAGGTCTTCAGCTGCCCTTCTGGCGAGTACATGCACAGGATTGTAAATGAGAAATGCAGTCATATTTCCAGTCTGCCTCTATGATGATGTTAAATTATTGCTGTTTAGCTGTGAACAAGGGATGTACCACTGGAGGAATAGAGTATCCTTTTGTACACATTTTGAAATGCTTCTTCTGTAGTGATAGAACAAATAAATGCAACGAATACTCTGTCTGCCCTATCCCGTGAAGTCCACACTGGCGTAAGAGAAGGCCCAGCAGAGCAGGAATCTGCCTAGACTTTCTCCCAATGAGATCCCAATATGAGAGGGAGAAGAGATGGGCCTCAGGACAGCTGCAATACCACTTGGGAACACATGTGGTGTCTTGATGTGGCCAGCGCAGCAGTTCAGCACAACGTACCTCCCATCTACAACAGTGCTGGACGTGGGAATTCTAAGTCCCAGTCTTGAGGGTGGGTGGAGATGGAGGGCAACAAGAGATACATTTCCAGTTCTCCACTGCAGCATGCTTCAGTCATTCTGTGAGTGGCCGGGCCCAGGGCCCTCACAATTTCACTACCTTGTCTTTTACATAGTCATAAGAATTATCCTCAACATAGCCTTTTGACGCTGTAAATCTTGAGTATTCATTTACCCTTTTCTGATCTCCTGGAAACAGCTGCCTGCCTGCATTGCACTTCTCTTCCCGAGGAGTGGGGTAAATTTAAAAGTCAAGTTATAGTTTGGATGTTAGTATAGAATTTTGAAATTGGGAATTAAAAATCAGGACTGGGGACTGGGAGACCAAAAATTTCTGATCCCATTTCTGATGGATGTGTCACACCTTTTCTGTCAAAATAAAATGTCTTGGAGGTTATGACTCCTTGGTGAAACC", 50459420 - 1); var codingRegion = new CodingRegion(50459422, 50468325, 169, 1053, 885); var rnaEdits = new IRnaEdit[] { new RnaEdit(1, 0, "GAATTCCGGCGTCGCGGACGCATCCCAGTCTGGGCGGGACGCTCGGCCGCGGCGAGGCGGGCAAGCCTGGCAGGGCAGAGGGAGCCCCGGCTCCGAGGTTGCTCTTCGCCCCCGAGGATCAGTCTTGGCCCCAAAGCGCGACGCACAAATCCACATAACCTGAGGACCATGGATGCTGATGAGGGTCAAGACATGTCCCAAG"), new RnaEdit(4, 3, "C"), new RnaEdit(5325, 5324, "AAAAAAAAAAAAAAA") }; var regions = new ITranscriptRegion[] { // insertion new TranscriptRegion(TranscriptRegionType.Exon, 1, 50459422, 50459424, 204, 206), new TranscriptRegion(TranscriptRegionType.Exon, 1, 50459425, 50459561, 208, 343), new TranscriptRegion(TranscriptRegionType.Exon, 2, 50467616, 50472799, 344, 5527) }; string actualCdna = new CdnaSequence(genomicSeq, codingRegion, regions, false, rnaEdits).GetCdnaSequence(); const string expectedCdna = "GAATTCCGGCGTCGCGGACGCATCCCAGTCTGGGCGGGACGCTCGGCCGCGGCGAGGCGGGCAAGCCTGGCAGGGCAGAGGGAGCCCCGGCTCCGAGGTTGCTCTTCGCCCCCGAGGATCAGTCTTGGCCCCAAAGCGCGACGCACAAATCCACATAACCTGAGGACCATGGATGCTGATGAGGGTCAAGACATGTCCCAAGTTTCAGTCATTAAAGAAGAAACTAATCACAGTGAAATGGCAGAAGACCTGTGCAAGATAGGATCAGAGAGATCTCTCGTGCTGGACAGACTAGCAAGTAACGTCGCCAAACGTAAGAGCTCTATGCCTCAGAAATTTCTTGGGGACAAGGGCCTGTCCGACACGCCCTACGACAGCAGCGCCAGCTACGAGAAGGAGAACGAAATGATGAAGTCCCACGTGATGGACCAAGCCATCAACAACGCCATCAACTACCTGGGGGCCGAGTCCCTGCGCCCGCTGGTGCAGACGCCCCCGGGCGGTTCCGAGGTGGTCCCGGTCATCAGCCCGATGTACCAGCTGCACAAGCCGCTCGCGGAGGGCACCCCGCGCTCCAACCACTCGGCCCAGGACAGCGCCGTGGAGAACCTGCTGCTGCTCTCCAAGGCCAAGTTGGTGCCCTCGGAGCGCGAGGCGTCCCCGAGCAACAGCTGCCAAGACTCCACGGACACCGAGAGCAACAACGAGGAGCAGCGCAGCGGTCTCATCTACCTGACCAACCACATCGCCCCGCACGCGCGCAACGGGCTGTCGCTCAAGGAGGAGCACCGCGCCTACGACCTGCTGCGCGCCGCCTCCGAGAACTCGCAGGACGCGCTCCGCGTGGTCAGCACCAGCGGGGAGCAGATGAAGGTGTACAAGTGCGAACACTGCCGGGTGCTCTTCCTGGATCACGTCATGTACACCATCCACATGGGCTGCCACGGCTTCCGTGATCCTTTTGAGTGCAACATGTGCGGCTACCACAGCCAGGACCGGTACGAGTTCTCGTCGCACATAACGCGAGGGGAGCACCGCTTCCACATGAGCTAAAGCCCTCCCGCGCCCCCACCCCAGACCCCGAGCCACCCCAGGAAAAGCACAAGGACTGCCGCCTTCTCGCTCCCGCCAGCAGCATAGACTGGACTGGACCAGACAATGTTGTGTTTGGATTTGTAACTGTTTTTTGTTTTTTGTTTGAGTTGGTTGATTGGGGTTTGATTTGCTTTTGAAAAGATTTTTATTTTTAGAGGCAGGGCTGCATTGGGAGCATCCAGAACTGCTACCTTCCTAGATGTTTCCCCAGACCGCTGGCTGAGATTCCCTCACCTGTCGCTTCCTAGAATCCCCTTCTCCAAACGATTAGTCTAAATTTTCAGAGAGAAATAGATAAAACACGCCACAGCCTGGGAAGGAGCGTGCTCTACCCTGTGCTAAGCACGGGGTTCGCGCACCAGGTGTCTTTTTCCAGTCCCCAGAAGCAGAGAGCACAGCCCCTGCTGTGTGGGTCTGCAGGTGAGCAGACAGGACAGGTGTGCCGCCACCCAAGTGCCAAGACACAGCAGGGCCAACAACCTGTGCCCAGGCCAGCTTCGAGCTACATGCATCTAGGGCGGAGAGGCTGCACTTGTGAGAGAAAATACTATTTCAAGTCATATTCTGCGTAGGAAAATGAATTGGTTGGGGAAAGTCGTGTCTGTCAGACTGCCCTGGGTGGAGGGAGACGCCGGGCTAGAGCCTTTGGGATCGTCCTGGATTCACTGGCTTTGCGGAGGCTGCTCAGATGGCCTGAGCCTCCCGAGGCTTGCTGCCCCGTAGGAGGAGACTGTCTTCCCGTGGGCATATCTGGGGAGCCCTGTTCCCCGCTTTTTCACTCCCATACCTTTAATGGCCCCCAAAATCTGTCACTACAATTTAAACACCAGTCCCGAAATTTGGATCTTCTTTCTTTTTGAATCTCTCAAACGGCAACATTCCTCAGAAACCAAAGCTTTATTTCAAATCTCTTCCTTCCCTGGCTGGTTCCATCTAGTACCAGAGGCCTCTTTTCCTGAAGAAATCCAATCCTAGCCCTCATTTTAATTATGTACATCTGTTTGTAGCCACAAGCCTGAATTTCTCAGTGTTGGTAAGTTTCTTTACCTACCCTCACTATATATTATTCTCGTTTTAAAACCCATAAAGGAGTGATTTAGAACAGTCATTAATTTTCAACTCAATGAAATATGTGAAGCCCAGCATCTCTGTTGCTAACACACAGAGCTCACCTGTTTGAAACCAAGCTTTCAAACATGTTGAAGCTCTTTACTGTAAAGGCAAGCCAGCATGTGTGTCCACACATACATAGGATGGCTGGCTCTGCACCTGTAGGATATTGGAATGCACAGGGCAATTGAGGGACTGAGCCAGACCTTCGGAGAGTAATGCCACCAGATCCCCTAGGAAAGAGGAGGCAAATGGCACTGCAGGTGAGAACCCCGCCCATCCGTGCTATGACATGGAGGCACTGAAGCCCGAGGAAGGTGTGTGGAGATTCTAATCCCAACAAGCAAGGGTCTCCTTCAAGATTAATGCTATCAATCATTAAGGTCATTACTCTCAACCACCTAGGCAATGAAGAATATACCATTTCAAATATTTACAGTACTTGTCTTCACCAACACTGTCCCAAGGTGAAATGAAGCAACAGAGAGGAAATTGTACATAAGTACCTCAGCATTTAATCCAAACAGGGGTTCTTAGTCTCAGCACTATGACATTTTGGGCTGACTACTTATTTGTTAGGCGGGAGCTCTCCTGTGCATTGTAGGATAATTAGCAGTATCCCTGGTGGCTACCCAATAGACGCCAGTAGCACCCCGAATTGACAACCCAAACTCTCCAGACATCACCAACTGTCCCCTGCGAGGAGAAATCACTCCTGGGGGAGAACCACTGACCCAAATGAATTCTAAACCAATCAAATGTCTGGGAAGCCCTCCAAGAAAAAAAATAGAAAAGCACTTGAAGAATATTCCCAATATTCCCGGTCAGCAGTATCAAGGCTGACTTGTGTTCATGTGGAGTCATTATAAATTCTATAAATCAATTATTCCCCTTCGGTCTTAAAAATATATTTCCTCATAAACATTTGAGTTTTGTTGAAAAGATGGAGTTTACAAAGATACCATTCTTGAGTCATGGATTTCTCTGCTCACAGAAGGGTGTGGCATTTGGAAACGGGAATAAACAAAATTGCTGCACCAATGCACTGAGTGAAGGAAGAGAGACAGAGGATCAAGGGCTTTAGACAGCACTCCTTCAATATGCAATCACAGAGAAAGATGCGCCTTATCCAAGTTAATATCTCTAAGGTGAGAGCCTTCTTAGAGTCAGTTTGTTGCAAATTTCACCTACTCTGTTCTTTTCCATCCATCCCCCTGAGTCAGTTGGTTGAAGGGAGTTATTTTTTCAAGTGGAATTCAAACAAAGCTCAAACCAGAACTGTAAATAGTGATTGCAGGAATTCTTTTCTAAACTGCTTTGCCCTTTCCTCTCACTGCCTTTTATAGCCAATATAAATGTCTCTTTGCACACCTTTTGTTGTGGTTTTATATTGTAACACCATTTTTCTTTGAAACTATTGTATTTAAAGTAAGGTTTCATATTATGTCAGCAAGTAATTAACTTATGTTTAAAAGGTGGCCATATCATGTACCAAAAGTTGCTGAAGTTTCTCTTCTAGCTGGTAAAGTAGGAGTTTGCATGACTTCACACTTTTTTTGCGTAGTTTCTTCTGTTGTATGATGGCGTGAGTGTGTGTCTTGGGTACCGCTGTGTACTACTGTGTGCCTAGATTCCATGCACTCTCGTTGTGTTTGAAGTAAATATTGGAGACCGGAGGGTAACAGGTTGGCCTGTTGATTACAGCTAGTAATCGCTGTGTCTTGTTCCGCCCCCTCCCTGACACCCCAGCTTCCCAGGATGTGGAAAGCCTGGATCTCAGCTCCTTGCCCCATATCCCTTCTGTAATTTGTACCTAAAGAGTGTGATTATCCTAATTCAAGAGTCACTAAAACTCATCACATTATCATTGCATATCAGCAAAGGGTAAAGTCCTAGCACCAATTGCTTCACATACCAGCATGTTCCATTTCCAATTTAGAATTAGCCACATAATAAAATCTTAGAATCTTCCTTGAGAAAGAGCTGCCTGAGATGTAGTTTTGTTATATGGTTCCCCACCGACCATTTTTGTGCTTTTTTCTTGTTTTGTTTTGTTTTGACTGCACTGTGAGTTTTGTAGTGTCCTCTTCTTGCCAAAACAAACGCGAGATGAACTGGACTTATGTAGACAAATCGTGATGCCAGTGTATCCTTCCTTTCTTCAGTTCCAGCAATAATGAATGGTCAACTTTTTTAAAATCTAGATCTCTCTCATTCATTTCAATGTATTTTTACTTTAAGATGAACCAAAATTATTAGACTTATTTAAGATGTACAGGCATCAGAAAAAAGAAGCACATAATGCTTTTGGTGCGATGGCACTCACTGTGAACATGTGTAACCACATATTAATATGCAATATTGTTTCCAATACTTTCTAATACAGTTTTTTATAATGTTGTGTGTGGTGATTGTTCAGGTCGAATCTGTTGTATCCAGTACAGCTTTAGGTCTTCAGCTGCCCTTCTGGCGAGTACATGCACAGGATTGTAAATGAGAAATGCAGTCATATTTCCAGTCTGCCTCTATGATGATGTTAAATTATTGCTGTTTAGCTGTGAACAAGGGATGTACCACTGGAGGAATAGAGTATCCTTTTGTACACATTTTGAAATGCTTCTTCTGTAGTGATAGAACAAATAAATGCAACGAATACTCTGTCTGCCCTATCCCGTGAAGTCCACACTGGCGTAAGAGAAGGCCCAGCAGAGCAGGAATCTGCCTAGACTTTCTCCCAATGAGATCCCAATATGAGAGGGAGAAGAGATGGGCCTCAGGACAGCTGCAATACCACTTGGGAACACATGTGGTGTCTTGATGTGGCCAGCGCAGCAGTTCAGCACAACGTACCTCCCATCTACAACAGTGCTGGACGTGGGAATTCTAAGTCCCAGTCTTGAGGGTGGGTGGAGATGGAGGGCAACAAGAGATACATTTCCAGTTCTCCACTGCAGCATGCTTCAGTCATTCTGTGAGTGGCCGGGCCCAGGGCCCTCACAATTTCACTACCTTGTCTTTTACATAGTCATAAGAATTATCCTCAACATAGCCTTTTGACGCTGTAAATCTTGAGTATTCATTTACCCTTTTCTGATCTCCTGGAAACAGCTGCCTGCCTGCATTGCACTTCTCTTCCCGAGGAGTGGGGTAAATTTAAAAGTCAAGTTATAGTTTGGATGTTAGTATAGAATTTTGAAATTGGGAATTAAAAATCAGGACTGGGGACTGGGAGACCAAAAATTTCTGATCCCATTTCTGATGGATGTGTCACACCTTTTCTGTCAAAATAAAATGTCTTGGAGGTTATGACTCCTTGGTGAAAAAAAAAAAAAAAAAA"; Assert.Equal(expectedCdna, actualCdna); // var cdsTemp = actualCdna.Substring(codingRegion.CdnaStart - 1, codingRegion.Length); const string expectedCds = "ATGGATGCTGATGAGGGTCAAGACATGTCCCAAGTTTCAGTCATTAAAGAAGAAACTAATCACAGTGAAATGGCAGAAGACCTGTGCAAGATAGGATCAGAGAGATCTCTCGTGCTGGACAGACTAGCAAGTAACGTCGCCAAACGTAAGAGCTCTATGCCTCAGAAATTTCTTGGGGACAAGGGCCTGTCCGACACGCCCTACGACAGCAGCGCCAGCTACGAGAAGGAGAACGAAATGATGAAGTCCCACGTGATGGACCAAGCCATCAACAACGCCATCAACTACCTGGGGGCCGAGTCCCTGCGCCCGCTGGTGCAGACGCCCCCGGGCGGTTCCGAGGTGGTCCCGGTCATCAGCCCGATGTACCAGCTGCACAAGCCGCTCGCGGAGGGCACCCCGCGCTCCAACCACTCGGCCCAGGACAGCGCCGTGGAGAACCTGCTGCTGCTCTCCAAGGCCAAGTTGGTGCCCTCGGAGCGCGAGGCGTCCCCGAGCAACAGCTGCCAAGACTCCACGGACACCGAGAGCAACAACGAGGAGCAGCGCAGCGGTCTCATCTACCTGACCAACCACATCGCCCCGCACGCGCGCAACGGGCTGTCGCTCAAGGAGGAGCACCGCGCCTACGACCTGCTGCGCGCCGCCTCCGAGAACTCGCAGGACGCGCTCCGCGTGGTCAGCACCAGCGGGGAGCAGATGAAGGTGTACAAGTGCGAACACTGCCGGGTGCTCTTCCTGGATCACGTCATGTACACCATCCACATGGGCTGCCACGGCTTCCGTGATCCTTTTGAGTGCAACATGTGCGGCTACCACAGCCAGGACCGGTACGAGTTCTCGTCGCACATAACGCGAGGGGAGCACCGCTTCCACATGAGCTAA"; // Assert.Equal(expectedCds, cdsTemp); string actualCds = new CodingSequence(genomicSeq, codingRegion, regions, false, 0, rnaEdits).GetCodingSequence(); Assert.Equal(expectedCds, actualCds); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/CodonsTests.cs ================================================ using Genome; using Moq; using UnitTests.TestDataStructures; using VariantAnnotation.AnnotatedPositions.Transcript; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class CodonsTests { [Fact] public void Assign_WhenIntervalsNull_ReturnNull() { var sequence = new SimpleSequence("AAA"); var codons = Codons.GetCodons("G", -1, -1, -1, -1, sequence); Assert.Equal("", codons.Reference); Assert.Equal("", codons.Alternate); } [Fact] public void Assign_SNV_SuffixLenTooBig() { var sequence = new Mock(); sequence.SetupGet(x => x.Length).Returns(89); sequence.Setup(x => x.Substring(87, 1)).Returns("t"); sequence.Setup(x => x.Substring(88, 1)).Returns("C"); var codons = Codons.GetCodons("T", 89, 89, 30, 30, sequence.Object); Assert.Equal("tC", codons.Reference); Assert.Equal("tT", codons.Alternate); } [Fact] public void Assign_SNV() { var sequence = new Mock(); sequence.SetupGet(x => x.Length).Returns(100); sequence.Setup(x => x.Substring(21, 2)).Returns("CA"); sequence.Setup(x => x.Substring(23, 1)).Returns("A"); var codons = Codons.GetCodons("G", 24, 24, 8, 8, sequence.Object); Assert.Equal("caA", codons.Reference); Assert.Equal("caG", codons.Alternate); } [Fact] public void Assign_MNV() { var sequence = new Mock(); sequence.SetupGet(x => x.Length).Returns(100); sequence.Setup(x => x.Substring(21, 2)).Returns("CA"); sequence.Setup(x => x.Substring(28, 2)).Returns("GG"); sequence.Setup(x => x.Substring(23, 5)).Returns("GTGCT"); var codons = Codons.GetCodons("ACCGA", 24, 28, 8, 10, sequence.Object); Assert.Equal("caGTGCTgg", codons.Reference); Assert.Equal("caACCGAgg", codons.Alternate); } [Fact] public void GetCodon_NullPrefixAndSuffix() { const string allele = "GAA"; var observedResult = Codons.GetCodon(allele, "", ""); Assert.Equal(allele, observedResult); } [Theory] [InlineData(3, true)] [InlineData(1, false)] public void IsTriplet(int len, bool expectedResult) { var observedResult = Codons.IsTriplet(len); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(-33, 4, -11, 2, "ACGTca")] [InlineData(95, 101, 32, 34, "gGCTGA")] public void GetCodons_OutOfRangeIndexes_Adjusted(int cdsStart, int cdsEnd, int proteinBegin, int proteinEnd, string expectedRefCodons) { var sequence = new Mock(); sequence.SetupGet(x => x.Length).Returns(99); sequence.Setup(x => x.Substring(0, 0)).Returns(""); sequence.Setup(x => x.Substring(0, 4)).Returns("ACGT"); sequence.Setup(x => x.Substring(4, 2)).Returns("CA"); sequence.Setup(x => x.Substring(94, 5)).Returns("GCTGA"); sequence.Setup(x => x.Substring(93, 1)).Returns("G"); var codons = Codons.GetCodons("", cdsStart, cdsEnd, proteinBegin, proteinEnd, sequence.Object); Assert.Equal(expectedRefCodons, codons.Reference); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/CompactIdTests.cs ================================================ using System; using System.IO; using System.Text; using IO; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class CompactIdTests { [Fact] public void Convert_ENSG() { var id = CompactId.Convert("ENSG00000223972"); Assert.Equal("ENSG00000223972", id.ToString()); } [Fact] public void Convert_ENST() { var id = CompactId.Convert("ENST00000456328", 2); Assert.Equal("ENST00000456328.2", id.WithVersion); } [Fact] public void Convert_ENSP() { var id = CompactId.Convert("ENSP00000334393", 3); Assert.Equal("ENSP00000334393.3", id.WithVersion); } [Fact] public void Convert_ENSESTG() { var id = CompactId.Convert("ENSESTG00000027277"); Assert.Equal("ENSESTG00000027277", id.WithVersion); } [Fact] public void Convert_ENSESTP() { var id = CompactId.Convert("ENSESTP00000068714", 1); Assert.Equal("ENSESTP00000068714.1", id.WithVersion); } [Fact] public void Convert_ENSR() { var id = CompactId.Convert("ENSR00001576074", 4); Assert.Equal("ENSR00001576074.4", id.WithVersion); } [Fact] public void Convert_CCDS() { var id = CompactId.Convert("CCDS30555", 1); Assert.Equal("CCDS30555.1", id.WithVersion); } [Fact] public void Convert_NR() { var id = CompactId.Convert("NR_074509", 1); Assert.Equal("NR_074509.1", id.WithVersion); } [Fact] public void Convert_NM() { var id = CompactId.Convert("NM_001029885", 1); Assert.Equal("NM_001029885.1", id.WithVersion); } [Fact] public void Convert_NP() { var id = CompactId.Convert("NP_001025056", 1); Assert.Equal("NP_001025056.1", id.WithVersion); } [Fact] public void Convert_XR() { var id = CompactId.Convert("XR_246629", 1); Assert.Equal("XR_246629.1", id.WithVersion); } [Fact] public void Convert_XM() { var id = CompactId.Convert("XM_005244723", 1); Assert.Equal("XM_005244723.1", id.WithVersion); } [Fact] public void Convert_XP() { var id = CompactId.Convert("XP_005244780", 1); Assert.Equal("XP_005244780.1", id.WithVersion); } [Fact] public void Convert_NullInput_ReturnsEmptyId() { var id = CompactId.Convert(null); Assert.True(id.IsEmpty()); Assert.Null(id.WithVersion); id = CompactId.Convert(string.Empty); Assert.True(id.IsEmpty()); Assert.Null(id.WithVersion); } [Fact] public void Convert_Unknown() { var id = CompactId.Convert("ABC123"); Assert.True(id.IsEmpty()); Assert.Null(id.WithVersion); } [Fact] public void Convert_OnlyNumbers() { var id = CompactId.Convert("268435455"); Assert.Equal("268435455", id.WithoutVersion); } [Fact] public void Convert_OnlyNumbers_ThrowException_NumberTooLarge() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable var id = CompactId.Convert("268435456"); }); } [Fact] public void CompactId_IO_EndToEnd() { const string expectedResults = "ENSP00000334393.3"; var id = CompactId.Convert("ENSP00000334393", 3); ICompactId observedId; using (var ms = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { id.Write(writer); } ms.Position = 0; using (var reader = new BufferedBinaryReader(ms)) { observedId = CompactId.Read(reader); } } Assert.NotNull(observedId); Assert.Equal(expectedResults, observedId.WithVersion); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/FeatureVariantEffectsTests.cs ================================================ using Intervals; using VariantAnnotation.AnnotatedPositions.Transcript; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class FeatureVariantEffectsTests { [Theory] [InlineData(VariantType.deletion, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, true)] [InlineData(VariantType.copy_number_loss, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, true)] [InlineData(VariantType.deletion, OverlapType.Partial, EndpointOverlapType.Start, false)] [InlineData(VariantType.copy_number_loss, OverlapType.Partial, EndpointOverlapType.End, false)] [InlineData(VariantType.copy_number_loss, OverlapType.CompletelyWithin, EndpointOverlapType.None, false)] public void Ablation(VariantType variantType, OverlapType overlapType, EndpointOverlapType endpointOverlapType, bool expectResult) { var featureEffect = new FeatureVariantEffects(overlapType, endpointOverlapType, false, variantType, true); bool observedResult = featureEffect.Ablation(); Assert.Equal(expectResult, observedResult); } [Theory] [InlineData(VariantType.copy_number_gain, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, true)] [InlineData(VariantType.duplication, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, true)] [InlineData(VariantType.tandem_duplication, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, true)] [InlineData(VariantType.copy_number_gain, OverlapType.Partial, EndpointOverlapType.Start, false)] [InlineData(VariantType.duplication, OverlapType.CompletelyWithin, EndpointOverlapType.None, false)] [InlineData(VariantType.tandem_duplication, OverlapType.Partial, EndpointOverlapType.End, false)] public void Amplification(VariantType variantType, OverlapType overlapType, EndpointOverlapType endpointOverlapType, bool expectedResult) { var featureEffect = new FeatureVariantEffects(overlapType, endpointOverlapType, false, variantType, true); bool observedResult = featureEffect.Amplification(); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(VariantType.deletion, OverlapType.Partial, EndpointOverlapType.Start, true)] [InlineData(VariantType.copy_number_loss, OverlapType.Partial, EndpointOverlapType.End, true)] [InlineData(VariantType.copy_number_loss, OverlapType.CompletelyWithin, EndpointOverlapType.None, true)] [InlineData(VariantType.deletion, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, false)] [InlineData(VariantType.copy_number_loss, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, false)] public void Truncation(VariantType variantType, OverlapType overlapType, EndpointOverlapType endpointOverlapType, bool expectedResult) { var featureEffect = new FeatureVariantEffects(overlapType, endpointOverlapType, false, variantType, true); bool observedResult = featureEffect.Truncation(); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(VariantType.copy_number_gain, OverlapType.CompletelyWithin, EndpointOverlapType.None, true)] [InlineData(VariantType.duplication, OverlapType.CompletelyWithin, EndpointOverlapType.None, true)] [InlineData(VariantType.tandem_duplication, OverlapType.CompletelyWithin, EndpointOverlapType.None, true)] [InlineData(VariantType.insertion, OverlapType.CompletelyWithin, EndpointOverlapType.None, true)] [InlineData(VariantType.copy_number_gain, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, false)] [InlineData(VariantType.duplication, OverlapType.Partial, EndpointOverlapType.Start, false)] [InlineData(VariantType.tandem_duplication, OverlapType.Partial, EndpointOverlapType.End, false)] public void Elongation(VariantType variantType, OverlapType overlapType, EndpointOverlapType endpointOverlapType, bool expectedResult) { var featureEffect = new FeatureVariantEffects(overlapType, endpointOverlapType, false, variantType, true); bool observedResult = featureEffect.Elongation(); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(VariantType.copy_number_gain, OverlapType.Partial, EndpointOverlapType.End, false, false)] [InlineData(VariantType.copy_number_gain, OverlapType.Partial, EndpointOverlapType.End, true, true)] [InlineData(VariantType.copy_number_gain, OverlapType.CompletelyWithin, EndpointOverlapType.None, false, false)] [InlineData(VariantType.copy_number_gain, OverlapType.Partial, EndpointOverlapType.Start, false, true)] [InlineData(VariantType.copy_number_gain, OverlapType.Partial, EndpointOverlapType.Start, true, false)] [InlineData(VariantType.copy_number_gain, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, false, false)] [InlineData(VariantType.duplication, OverlapType.Partial, EndpointOverlapType.End, true, true)] [InlineData(VariantType.tandem_duplication, OverlapType.Partial, EndpointOverlapType.End, true, true)] [InlineData(VariantType.duplication, OverlapType.Partial, EndpointOverlapType.Start, true, false)] [InlineData(VariantType.tandem_duplication, OverlapType.Partial, EndpointOverlapType.Start, true, false)] [InlineData(VariantType.duplication, OverlapType.CompletelyWithin, EndpointOverlapType.Start, false, true)] [InlineData(VariantType.duplication, OverlapType.CompletelyWithin, EndpointOverlapType.End, true, true)] public void FivePrimeDuplicatedTranscript(VariantType variantType, OverlapType overlapType, EndpointOverlapType endpointOverlapType, bool onReverseStrand, bool expectedResult) { var featureEffect = new FeatureVariantEffects(overlapType, endpointOverlapType, onReverseStrand, variantType, true); bool observedResult = featureEffect.FivePrimeDuplicatedTranscript(); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(VariantType.copy_number_gain, OverlapType.Partial, EndpointOverlapType.End, false, true)] [InlineData(VariantType.copy_number_gain, OverlapType.Partial, EndpointOverlapType.End, true, false)] [InlineData(VariantType.copy_number_gain, OverlapType.CompletelyWithin, EndpointOverlapType.None, false, false)] [InlineData(VariantType.copy_number_gain, OverlapType.Partial, EndpointOverlapType.Start, false, false)] [InlineData(VariantType.copy_number_gain, OverlapType.Partial, EndpointOverlapType.Start, true, true)] [InlineData(VariantType.copy_number_gain, OverlapType.CompletelyOverlaps, EndpointOverlapType.Both, false, false)] [InlineData(VariantType.duplication, OverlapType.Partial, EndpointOverlapType.End, true, false)] [InlineData(VariantType.tandem_duplication, OverlapType.Partial, EndpointOverlapType.End, true, false)] [InlineData(VariantType.duplication, OverlapType.Partial, EndpointOverlapType.Start, true, true)] [InlineData(VariantType.tandem_duplication, OverlapType.Partial, EndpointOverlapType.Start, true, true)] [InlineData(VariantType.duplication, OverlapType.CompletelyWithin, EndpointOverlapType.End, false, true)] [InlineData(VariantType.duplication, OverlapType.CompletelyWithin, EndpointOverlapType.Start, true, true)] public void ThreePrimeDuplicatedTranscript(VariantType variantType, OverlapType overlapType, EndpointOverlapType endpointOverlapType, bool onReverseStrand, bool expectedResult) { var featureEffect = new FeatureVariantEffects(overlapType, endpointOverlapType, onReverseStrand, variantType, true); bool observedResult = featureEffect.ThreePrimeDuplicatedTranscript(); Assert.Equal(expectedResult, observedResult); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/MappedPositionUtilitiesTests.cs ================================================ using Intervals; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class MappedPositionUtilitiesTests { private readonly ITranscriptRegion[] _forwardTranscriptRegions; private readonly ITranscriptRegion[] _reverseTranscriptRegions; private const int ForwardVariantStart = 78024346; private const int ForwardVariantEnd = 78024345; // Mother.vcf: chr2 313885 . CTGATTTGCTATGAAA C private const int ReverseVariantStart = 313886; private const int ReverseVariantEnd = 313900; // NM_033517.1, SHANK3 private readonly ITranscriptRegion[] _regionsNm33517 = { new TranscriptRegion(TranscriptRegionType.Exon, 1, 51113070, 51113132, 1, 63), new TranscriptRegion(TranscriptRegionType.Intron, 1, 51113133, 51113475, 63, 64), new TranscriptRegion(TranscriptRegionType.Exon, 2, 51113476, 51113679, 64, 267), new TranscriptRegion(TranscriptRegionType.Intron, 2, 51113680, 51115049, 267, 268), new TranscriptRegion(TranscriptRegionType.Exon, 3, 51115050, 51115121, 268, 339), new TranscriptRegion(TranscriptRegionType.Intron, 3, 51115122, 51117012, 339, 340), new TranscriptRegion(TranscriptRegionType.Exon, 4, 51117013, 51117121, 340, 448), new TranscriptRegion(TranscriptRegionType.Intron, 4, 51117122, 51117196, 448, 449), new TranscriptRegion(TranscriptRegionType.Exon, 5, 51117197, 51117348, 449, 600), new TranscriptRegion(TranscriptRegionType.Intron, 5, 51117349, 51117446, 600, 601), new TranscriptRegion(TranscriptRegionType.Exon, 6, 51117447, 51117614, 601, 768), new TranscriptRegion(TranscriptRegionType.Intron, 6, 51117615, 51117739, 768, 769), new TranscriptRegion(TranscriptRegionType.Exon, 7, 51117740, 51117856, 769, 885), new TranscriptRegion(TranscriptRegionType.Intron, 7, 51117857, 51121767, 885, 886), new TranscriptRegion(TranscriptRegionType.Exon, 8, 51121768, 51121845, 886, 963), new TranscriptRegion(TranscriptRegionType.Intron, 8, 51121846, 51123012, 963, 964), new TranscriptRegion(TranscriptRegionType.Exon, 9, 51123013, 51123079, 964, 1030), new TranscriptRegion(TranscriptRegionType.Intron, 9, 51123080, 51133202, 1030, 1031), new TranscriptRegion(TranscriptRegionType.Exon, 10, 51133203, 51133474, 1031, 1302), new TranscriptRegion(TranscriptRegionType.Intron, 10, 51133475, 51135984, 1302, 1342), new TranscriptRegion(TranscriptRegionType.Exon, 11, 51135985, 51135989, 1342, 1346), new TranscriptRegion(TranscriptRegionType.Gap, 11, 51135990, 51135991, 1346, 1347), new TranscriptRegion(TranscriptRegionType.Exon, 11, 51135992, 51136143, 1347, 1498), new TranscriptRegion(TranscriptRegionType.Intron, 11, 51136144, 51137117, 1498, 1499), new TranscriptRegion(TranscriptRegionType.Exon, 12, 51137118, 51137231, 1499, 1612), new TranscriptRegion(TranscriptRegionType.Intron, 12, 51137232, 51142287, 1612, 1613), new TranscriptRegion(TranscriptRegionType.Exon, 13, 51142288, 51142363, 1613, 1688), new TranscriptRegion(TranscriptRegionType.Intron, 13, 51142364, 51142593, 1688, 1689), new TranscriptRegion(TranscriptRegionType.Exon, 14, 51142594, 51142676, 1689, 1771), new TranscriptRegion(TranscriptRegionType.Intron, 14, 51142677, 51143165, 1771, 1772), new TranscriptRegion(TranscriptRegionType.Exon, 15, 51143166, 51143290, 1772, 1896), new TranscriptRegion(TranscriptRegionType.Intron, 15, 51143291, 51143391, 1896, 1897), new TranscriptRegion(TranscriptRegionType.Exon, 16, 51143392, 51143524, 1897, 2029), new TranscriptRegion(TranscriptRegionType.Intron, 16, 51143525, 51144499, 2029, 2030), new TranscriptRegion(TranscriptRegionType.Exon, 17, 51144500, 51144580, 2030, 2110), new TranscriptRegion(TranscriptRegionType.Intron, 17, 51144581, 51150042, 2110, 2111), new TranscriptRegion(TranscriptRegionType.Exon, 18, 51150043, 51150066, 2111, 2134), new TranscriptRegion(TranscriptRegionType.Intron, 18, 51150067, 51153344, 2134, 2135), new TranscriptRegion(TranscriptRegionType.Exon, 19, 51153345, 51153475, 2135, 2265), new TranscriptRegion(TranscriptRegionType.Intron, 19, 51153476, 51154096, 2265, 2266), new TranscriptRegion(TranscriptRegionType.Exon, 20, 51154097, 51154181, 2266, 2350), new TranscriptRegion(TranscriptRegionType.Intron, 20, 51154182, 51158611, 2350, 2351), new TranscriptRegion(TranscriptRegionType.Exon, 21, 51158612, 51160865, 2351, 4604), new TranscriptRegion(TranscriptRegionType.Intron, 21, 51160866, 51169148, 4604, 4605), new TranscriptRegion(TranscriptRegionType.Exon, 22, 51169149, 51171640, 4605, 7096) }; // NM_000682.6 private readonly ITranscriptRegion[] _regionsNm682 = { new TranscriptRegion(TranscriptRegionType.Exon, 1, 96778623, 96780986, 1008, 3371), new TranscriptRegion(TranscriptRegionType.Exon, 1, 96780987, 96781984, 1, 998) }; // NM_001317107.1 private readonly ITranscriptRegion[] _regionsNm1317107 = { new TranscriptRegion(TranscriptRegionType.Exon, 1, 22138125, 22138561, 670, 1106), new TranscriptRegion(TranscriptRegionType.Gap, 1, 22138562, 22138563, 669, 670), new TranscriptRegion(TranscriptRegionType.Exon, 1, 22138564, 22139232, 1, 669) }; public MappedPositionUtilitiesTests() { _forwardTranscriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 77997792, 77998025, 1, 234), new TranscriptRegion(TranscriptRegionType.Intron, 1, 77998026, 78001531, 234, 235), new TranscriptRegion(TranscriptRegionType.Exon, 2, 78001532, 78001723, 235, 426), new TranscriptRegion(TranscriptRegionType.Intron, 2, 78001724, 78024286, 426, 427), new TranscriptRegion(TranscriptRegionType.Exon, 3, 78024287, 78024416, 427, 556) }; // ENST00000591244 _reverseTranscriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 5, 309218, 309407, 622, 811), new TranscriptRegion(TranscriptRegionType.Intron, 4, 309408, 310214, 621, 622), new TranscriptRegion(TranscriptRegionType.Exon, 4, 310215, 310499, 337, 621), new TranscriptRegion(TranscriptRegionType.Intron, 3, 310500, 312956, 336, 337), new TranscriptRegion(TranscriptRegionType.Exon, 3, 312957, 313157, 136, 336), new TranscriptRegion(TranscriptRegionType.Intron, 2, 313158, 313873, 135, 136), new TranscriptRegion(TranscriptRegionType.Exon, 2, 313874, 313892, 117, 135), new TranscriptRegion(TranscriptRegionType.Intron, 1, 313893, 314242, 116, 117), new TranscriptRegion(TranscriptRegionType.Exon, 1, 314243, 314358, 1, 116) }; } [Fact] public void FindRegion_Forward_Insertion() { var observedStart = MappedPositionUtilities.FindRegion(_forwardTranscriptRegions, ForwardVariantStart); var observedEnd = MappedPositionUtilities.FindRegion(_forwardTranscriptRegions, ForwardVariantEnd); Assert.Equal(4, observedStart.Index); Assert.Equal(4, observedEnd.Index); Assert.NotNull(observedStart.Region); Assert.NotNull(observedEnd.Region); } [Fact] public void FindRegion_Reverse_Deletion() { var observedStart = MappedPositionUtilities.FindRegion(_reverseTranscriptRegions, ReverseVariantStart); var observedEnd = MappedPositionUtilities.FindRegion(_reverseTranscriptRegions, ReverseVariantEnd); Assert.Equal(6, observedStart.Index); Assert.Equal(7, observedEnd.Index); Assert.NotNull(observedStart.Region); Assert.NotNull(observedEnd.Region); } [Fact] public void GetCdnaPositions_Forward_Insertion() { var variant = new Interval(ForwardVariantStart, ForwardVariantEnd); var observed = MappedPositionUtilities.GetCdnaPositions(_forwardTranscriptRegions[4], _forwardTranscriptRegions[4], variant, false, true); Assert.Equal(486, observed.CdnaStart); Assert.Equal(485, observed.CdnaEnd); } [Fact] public void GetCdnaPositions_Reverse_Deletion() { var variant = new Interval(ReverseVariantStart, ReverseVariantEnd); var observed = MappedPositionUtilities.GetCdnaPositions(_reverseTranscriptRegions[6], _reverseTranscriptRegions[7], variant, true, false); Assert.Equal(123, observed.CdnaStart); Assert.Equal(-1, observed.CdnaEnd); } [Fact] public void GetCdnaPosition_Snv_AfterOutFrameRnaEditDeletion() { // NM_001317107.1 var variant = new Interval(22138550, 22138550); var observed = MappedPositionUtilities.GetCdnaPositions(_regionsNm1317107[0], _regionsNm1317107[0], variant, true, false); Assert.Equal(681, observed.CdnaStart); } [Fact] public void GetCdnaPosition_Snv_AfterInframeRnaEditInsertion() { // NM_000682.6 var variant = new Interval(96780984, 96780984); var observed = MappedPositionUtilities.GetCdnaPositions(_regionsNm682[0], _regionsNm682[0], variant, true, false); Assert.Equal(1010, observed.CdnaStart); } [Fact] public void GetCdnaPosition_Snv_AfterOutframeRnaEditInsertion() { // NM_033517.1 var variant = new Interval(51135986, 51135986); var observed = MappedPositionUtilities.GetCdnaPositions(_regionsNm33517[20], _regionsNm33517[20], variant, false, false); Assert.Equal(1343, observed.CdnaStart); } [Fact] public void GetCoveredCdnaPositions_Forward_StartBefore_EndExon() { var observedResults = _forwardTranscriptRegions.GetCoveredCdnaPositions(-1, -1, 300, 2, false); Assert.Equal(1, observedResults.Start); Assert.Equal(300, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Forward_StartIntron_EndExon() { var observedResults = _forwardTranscriptRegions.GetCoveredCdnaPositions(-1, 1, 300, 2, false); Assert.Equal(235, observedResults.Start); Assert.Equal(300, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Forward_StartExon_EndIntron() { var observedResults = _forwardTranscriptRegions.GetCoveredCdnaPositions(250, 2, -1, 3, false); Assert.Equal(250, observedResults.Start); Assert.Equal(426, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Forward_StartExon_EndAfter() { var observedResults = _forwardTranscriptRegions.GetCoveredCdnaPositions(-1, ~_forwardTranscriptRegions.Length, 300, 2, false); Assert.Equal(300, observedResults.Start); Assert.Equal(556, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Forward_StartBefore_EndAfter() { var observedResults = _forwardTranscriptRegions.GetCoveredCdnaPositions(-1, -1, -1, ~_forwardTranscriptRegions.Length, false); Assert.Equal(1, observedResults.Start); Assert.Equal(556, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Forward_Insertion_StartAfter_EndExon() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 10, 19, 10, 19), new TranscriptRegion(TranscriptRegionType.Intron, 1, 20, 29, 19, 20), new TranscriptRegion(TranscriptRegionType.Exon, 2, 30, 39, 20, 29), new TranscriptRegion(TranscriptRegionType.Intron, 2, 40, 49, 29, 30), new TranscriptRegion(TranscriptRegionType.Exon, 3, 50, 59, 30, 39), new TranscriptRegion(TranscriptRegionType.Intron, 3, 60, 69, 39, 40), new TranscriptRegion(TranscriptRegionType.Exon, 4, 70, 79, 40, 49), new TranscriptRegion(TranscriptRegionType.Intron, 4, 80, 89, 49, 50), new TranscriptRegion(TranscriptRegionType.Exon, 5, 90, 4834618, 50, 1676), new TranscriptRegion(TranscriptRegionType.Intron, 5, 4834619, 4842604, 1676, 1677), new TranscriptRegion(TranscriptRegionType.Exon, 6, 4842605, 4852594, 1677, 11666) }; var observedResults = regions.GetCoveredCdnaPositions(-1, -12, 11666, 1, false); Assert.Equal(11666, observedResults.Start); Assert.Equal(11666, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Reverse_StartBefore_EndExon() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 4, 103288513, 103288696, 522, 705) }; // ClinVar ENST00000546844 103288512 var observedResults = regions.GetCoveredCdnaPositions(523, -1, -1, 0, true); Assert.Equal(523, observedResults.Start); Assert.Equal(705, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Reverse_StartIntron_EndExon() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Intron, 3, 103271329, 103288512, 825, 826), new TranscriptRegion(TranscriptRegionType.Exon, 3, 103288513, 103288696, 642, 825) }; // ClinVar ENST00000553106 103288512 var observedResults = regions.GetCoveredCdnaPositions(643, 0, -1, 1, true); Assert.Equal(643, observedResults.Start); Assert.Equal(825, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Reverse_StartExon_EndIntron() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 3, 103288513, 103288696, 642, 825), new TranscriptRegion(TranscriptRegionType.Intron, 2, 103288697, 103288999, 641, 642) }; var observedResults = regions.GetCoveredCdnaPositions(-1, 0, 666, 1, true); Assert.Equal(642, observedResults.Start); Assert.Equal(666, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Reverse_StartExon_EndAfter() { var regions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 103288513, 103288696, 1, 825) }; // synthetic var observedResults = regions.GetCoveredCdnaPositions(-1, ~1, -1, -1, true); Assert.Equal(1, observedResults.Start); Assert.Equal(825, observedResults.End); } [Fact] public void GetCoveredCdnaPositions_Reverse_StartBefore_EndAfter() { var observedResults = _reverseTranscriptRegions.GetCoveredCdnaPositions(-1, -1, -1, ~_reverseTranscriptRegions.Length, true); Assert.Equal(1, observedResults.Start); Assert.Equal(811, observedResults.End); } [Fact] public void GetCdsPosition_Forward_Insertion() { var codingRegion = new CodingRegion(78001559, 78024355, 262, 495, 234); const byte startExonPhase = 0; var (cdsStart, cdsEnd) = MappedPositionUtilities.GetCdsPositions(codingRegion, 486, 485, startExonPhase, true); Assert.Equal(225, cdsStart); Assert.Equal(224, cdsEnd); } [Fact] public void GetCdsPosition_Snv_AfterOutFrameRnaEditDeletion() { // NM_001317107.1 var codingRegion = new CodingRegion(22138201, 22139150, 83, 1030, 948); const byte startExonPhase = 0; (int cdsStart, _) = MappedPositionUtilities.GetCdsPositions(codingRegion, 681, 681, startExonPhase, false); Assert.Equal(599, cdsStart); } [Fact] public void GetCdsPosition_Snv_AfterInframeRnaEditInsertion() { // NM_000682.6 var codingRegion = new CodingRegion(96780545, 96781888, 97, 1449, 1344); const byte startExonPhase = 0; (int cdsStart, _) = MappedPositionUtilities.GetCdsPositions(codingRegion, 1010, 1010, startExonPhase, false); Assert.Equal(914, cdsStart); } [Fact] public void GetCdsPosition_Snv_AfterOutframeRnaEditInsertion() { // NM_033517.1 var codingRegion = new CodingRegion(51113070, 51169740, 1, 5196, 5157); const byte startExonPhase = 0; (int cdsStart, _) = MappedPositionUtilities.GetCdsPositions(codingRegion, 1343, 1343, startExonPhase, false); Assert.Equal(1343, cdsStart); } [Fact] public void GetCdsPosition_Forward_Insertion_WithStartExonPhase() { var codingRegion = new CodingRegion(6413107, 6415837, 1, 953, 953); const byte startExonPhase = 1; var (cdsStart, cdsEnd) = MappedPositionUtilities.GetCdsPositions(codingRegion, 29, 28, startExonPhase, true); Assert.Equal(30, cdsStart); Assert.Equal(29, cdsEnd); } [Fact] public void GetCdsPosition_Reverse_NoCodingRegion_Deletion() { const byte startExonPhase = 0; var (cdsStart, cdsEnd) = MappedPositionUtilities.GetCdsPositions(null, -1, 123, startExonPhase, false); Assert.Equal(-1, cdsStart); Assert.Equal(-1, cdsEnd); } [Fact] public void GetCdsPosition_SilenceOutput_InsertionAfterCodingRegion_Forward() { // variant: [6647337, 6647336] insertion after coding region var codingRegion = new CodingRegion(6643999, 6647336, 667, 1674, 1008); const byte startExonPhase = 0; var (cdsStart, cdsEnd) = MappedPositionUtilities.GetCdsPositions(codingRegion, 1675, 1674, startExonPhase, true); Assert.Equal(-1, cdsStart); Assert.Equal(-1, cdsEnd); } [Fact] public void GetCdsPosition_SilenceOutput_InsertionAfterCodingRegion_Reverse() { // variant: [103629803, 103629804] insertion after coding region var codingRegion = new CodingRegion(103113259, 103629803, 161, 10543, 10383); const byte startExonPhase = 0; var (cdsStart, cdsEnd) = MappedPositionUtilities.GetCdsPositions(codingRegion, 161, 160, startExonPhase, true); Assert.Equal(-1, cdsStart); Assert.Equal(-1, cdsEnd); } [Fact] public void GetCdsPosition_SilenceOutput_InsertionBeforeCodingRegion_Reverse() { // variant: [37480320, 37480319] insertion after coding region var codingRegion = new CodingRegion(37480320, 37543667, 556, 3228, 2673); const byte startExonPhase = 0; var (cdsStart, cdsEnd) = MappedPositionUtilities.GetCdsPositions(codingRegion, 3229, 3228, startExonPhase, true); Assert.Equal(-1, cdsStart); Assert.Equal(-1, cdsEnd); } [Fact] public void GetCdsPosition_DoNotSilenceOutput_Reverse() { // variant: [179315139, 179315692] var codingRegion = new CodingRegion(179308070, 179315170, 617, 942, 326); const byte startExonPhase = 0; var (cdsStart, cdsEnd) = MappedPositionUtilities.GetCdsPositions(codingRegion, 95, 648, startExonPhase, false); Assert.Equal(-1, cdsStart); Assert.Equal(32, cdsEnd); } [Fact] public void GetProteinPosition_Forward_Insertion() { int proteinStart = MappedPositionUtilities.GetProteinPosition(225); int proteinEnd = MappedPositionUtilities.GetProteinPosition(224); Assert.Equal(75, proteinStart); Assert.Equal(75, proteinEnd); } [Fact] public void GetProteinPosition_Reverse_Deletion() { int proteinStart = MappedPositionUtilities.GetProteinPosition(-1); int proteinEnd = MappedPositionUtilities.GetProteinPosition(-1); Assert.Equal(-1, proteinStart); Assert.Equal(-1, proteinEnd); } private static ITranscriptRegion GetExon() => new TranscriptRegion(TranscriptRegionType.Exon, 0, 10001, 10199, 1, 199); private static ITranscriptRegion GetIntron() => new TranscriptRegion(TranscriptRegionType.Intron, 0, 10200, 10299, 199, 200); [Fact] public void FoundExonEndpointInsertion_NotInsertion_ReturnFalse() { Assert.False(MappedPositionUtilities.FoundExonEndpointInsertion(false, -1, 100, GetExon(), GetIntron())); } [Fact] public void FoundExonEndpointInsertion_BothExons_ReturnFalse() { Assert.False(MappedPositionUtilities.FoundExonEndpointInsertion(true, -1, 100, GetExon(), GetExon())); } [Fact] public void FoundExonEndpointInsertion_BothIntrons_ReturnFalse() { Assert.False(MappedPositionUtilities.FoundExonEndpointInsertion(true, -1, 100, GetIntron(), GetIntron())); } [Fact] public void FoundExonEndpointInsertion_BothDefinedCdnaPositions_ReturnFalse() { Assert.False(MappedPositionUtilities.FoundExonEndpointInsertion(true, 100, 110, GetExon(), GetIntron())); } [Fact] public void FoundExonEndpointInsertion_BothUndefinedCdnaPositions_ReturnFalse() { Assert.False(MappedPositionUtilities.FoundExonEndpointInsertion(true, -1, -1, GetExon(), GetIntron())); } [Fact] public void FoundExonEndpointInsertion_UndefinedRegion_ReturnFalse() { Assert.False(MappedPositionUtilities.FoundExonEndpointInsertion(true, -1, -1, null, GetIntron())); } [Fact] public void FoundExonEndpointInsertion_OneIntron_OneExon_OneUndefinedPosition_ReturnTrue() { Assert.True(MappedPositionUtilities.FoundExonEndpointInsertion(true, 108, -1, GetExon(), GetIntron())); } [Fact] public void FixExonEndpointInsertion_VariantEnd_ExonEnd_Reverse() { var startRegion = new TranscriptRegion(TranscriptRegionType.Intron, 7, 243736351, 243776972, 762, 763); var endRegion = new TranscriptRegion(TranscriptRegionType.Exon, 8, 243736228, 243736350, 763, 885); var result = MappedPositionUtilities.FixExonEndpointInsertion(-1, 763, true, startRegion, endRegion, new Interval(243736351, 243736350)); Assert.Equal(762, result.CdnaStart); Assert.Equal(763, result.CdnaEnd); } [Fact] public void FixExonEndpointInsertion_VariantStart_ExonStart_Reverse() { // N.B. this configuration has never been spotted in the wild var startRegion = new TranscriptRegion(TranscriptRegionType.Exon, 2, 2000, 2199, 1, 200); var endRegion = new TranscriptRegion(TranscriptRegionType.Intron, 2, 1999, 1000, 200, 201); var result = MappedPositionUtilities.FixExonEndpointInsertion(200, -1, true, startRegion, endRegion, new Interval(2000, 1999)); Assert.Equal(200, result.CdnaStart); Assert.Equal(201, result.CdnaEnd); } [Fact] public void FixExonEndpointInsertion_VariantEnd_ExonEnd_Forward() { var startRegion = new TranscriptRegion(TranscriptRegionType.Intron, 16, 89521770, 89528546, 3071, 3072); var endRegion = new TranscriptRegion(TranscriptRegionType.Exon, 16, 89521614, 89521769, 2916, 3071); var result = MappedPositionUtilities.FixExonEndpointInsertion(-1, 3071, false, startRegion, endRegion, new Interval(89521770, 89521769)); Assert.Equal(3072, result.CdnaStart); Assert.Equal(3071, result.CdnaEnd); } [Fact] public void FixExonEndpointInsertion_VariantStart_ExonStart_Forward() { var startRegion = new TranscriptRegion(TranscriptRegionType.Exon, 2, 99459243, 99459360, 108, 225); var endRegion = new TranscriptRegion(TranscriptRegionType.Intron, 1, 99456512, 99459242, 107, 108); var result = MappedPositionUtilities.FixExonEndpointInsertion(108, -1, false, startRegion, endRegion, new Interval(99459243, 99459242)); Assert.Equal(108, result.CdnaStart); Assert.Equal(107, result.CdnaEnd); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/StringExtensionsTests.cs ================================================ using VariantAnnotation.AnnotatedPositions.Transcript; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class StringExtensionsTests { [Theory] [InlineData(null,null,0)] [InlineData("abc",null,0)] [InlineData("abc", "abgg", 2)] [InlineData("abcfdg", "abgg", 2)] public void CommonPrefixLength(string a, string b, int expResult) { Assert.Equal(expResult,a.CommonPrefixLength(b)); } [Theory] [InlineData(null, null, 0)] [InlineData("abc", null, 0)] [InlineData("abc", "abgg", 0)] [InlineData("abcfdg", "abgg", 1)] public void CommonSuffixLength(string a, string b, int expResult) { Assert.Equal(expResult, a.CommonSuffixLength(b)); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/TranscriptPositionalEffectTests.cs ================================================ using Intervals; using Moq; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class TranscriptPositionalEffectTests { private readonly Mock _forwardTranscript; // use info from "ENST00000455979.1" with modification private readonly Mock _reverseTranscript; // use info from "ENST00000385042" private readonly ITranscriptRegion[] _otherTranscriptRegions; public TranscriptPositionalEffectTests() { const int start = 874655; const int end = 879639; _otherTranscriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 200, 300, 1, 186), new TranscriptRegion(TranscriptRegionType.Intron, 1, 301, 400, 186, 187), new TranscriptRegion(TranscriptRegionType.Exon, 2, 401, 699, 187, 349), new TranscriptRegion(TranscriptRegionType.Intron, 2, 700, 709, 359, 360), new TranscriptRegion(TranscriptRegionType.Exon, 3, 710, 800, 350, 465) }; var forwardTranscriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 874655, 874840, 1, 186), new TranscriptRegion(TranscriptRegionType.Intron, 1, 874841, 876523, 186, 187), new TranscriptRegion(TranscriptRegionType.Exon, 2, 876524, 876686, 187, 349), new TranscriptRegion(TranscriptRegionType.Intron, 2, 876687, 877515, 349, 350), new TranscriptRegion(TranscriptRegionType.Exon, 3, 877516, 877631, 350, 465), new TranscriptRegion(TranscriptRegionType.Intron, 3, 877632, 877789, 465, 466), new TranscriptRegion(TranscriptRegionType.Exon, 4, 877790, 877868, 466, 544), new TranscriptRegion(TranscriptRegionType.Intron, 4, 877869, 877938, 544, 545), new TranscriptRegion(TranscriptRegionType.Exon, 5, 877939, 878438, 545, 1044), new TranscriptRegion(TranscriptRegionType.Intron, 5, 878439, 878632, 1044, 1045), new TranscriptRegion(TranscriptRegionType.Exon, 6, 878633, 878757, 1045, 1169), new TranscriptRegion(TranscriptRegionType.Intron, 6, 878758, 879077, 1169, 1170), new TranscriptRegion(TranscriptRegionType.Exon, 7, 879078, 879639, 1170, 1731) }; var reverseTranscriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 3477259, 3477354, 1, 96) }; var translation = new Mock(); translation.SetupGet(x => x.CodingRegion).Returns(new CodingRegion(874655, 879533, 1, 1625, 1625)); var gene = new Mock(); gene.SetupGet(x => x.OnReverseStrand).Returns(false); _forwardTranscript = new Mock(); _forwardTranscript.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); _forwardTranscript.SetupGet(x => x.Start).Returns(start); _forwardTranscript.SetupGet(x => x.End).Returns(end); _forwardTranscript.SetupGet(x => x.Gene).Returns(gene.Object); _forwardTranscript.SetupGet(x => x.TranscriptRegions).Returns(forwardTranscriptRegions); _forwardTranscript.SetupGet(x => x.Translation).Returns(translation.Object); _forwardTranscript.SetupGet(x => x.TotalExonLength).Returns(1731); _reverseTranscript = new Mock(); _reverseTranscript.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); _reverseTranscript.SetupGet(x => x.Start).Returns(3477259); _reverseTranscript.SetupGet(x => x.Start).Returns(3477354); _reverseTranscript.SetupGet(x => x.Gene.OnReverseStrand).Returns(true); _reverseTranscript.SetupGet(x => x.Translation).Returns((ITranslation)null); _reverseTranscript.SetupGet(x => x.BioType).Returns(BioType.miRNA); _reverseTranscript.SetupGet(x => x.TranscriptRegions).Returns(reverseTranscriptRegions); _reverseTranscript.SetupGet(x => x.MicroRnas).Returns(new IInterval[] { new Interval(61, 81) }); } [Fact] public void DetermineIntronicEffect_NullIntrons() { var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineIntronicEffect(null, new Interval(400, 400), VariantType.SNV); Assert.False(positionalEffect.IsEndSpliceSite); Assert.False(positionalEffect.IsStartSpliceSite); Assert.False(positionalEffect.IsWithinFrameshiftIntron); Assert.False(positionalEffect.IsWithinIntron); Assert.False(positionalEffect.IsWithinSpliceSiteRegion); Assert.False(positionalEffect.HasExonOverlap); Assert.False(positionalEffect.AfterCoding); Assert.False(positionalEffect.BeforeCoding); Assert.False(positionalEffect.WithinCdna); Assert.False(positionalEffect.WithinCds); Assert.False(positionalEffect.HasFrameShift); Assert.False(positionalEffect.IsCoding); } [Fact] public void DetermineIntronicEffect_NotWithinFrameshiftIntron() { var transcriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 201342300, 201342340, 1, 186), new TranscriptRegion(TranscriptRegionType.Intron, 1, 201342340, 201342343, 186, 187), new TranscriptRegion(TranscriptRegionType.Exon, 2, 201342344, 201342400, 187, 349) }; IInterval variant = new Interval(201342344, 201342344); var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineIntronicEffect(transcriptRegions, variant, VariantType.SNV); Assert.True(positionalEffect.IsWithinSpliceSiteRegion); } [Fact] public void DetermineIntronicEffect_IsEndSpliceSite() { var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineIntronicEffect(_otherTranscriptRegions, new Interval(400, 400), VariantType.SNV); Assert.True(positionalEffect.IsEndSpliceSite); } [Fact] public void DetermineIntronicEffect_IsStartSpliceSite() { var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineIntronicEffect(_otherTranscriptRegions, new Interval(300, 303), VariantType.deletion); Assert.True(positionalEffect.IsStartSpliceSite); } [Fact] public void DetermineIntronicEffect_IsWithinFrameshiftIntron_NotInSpliceSite() { var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineIntronicEffect(_otherTranscriptRegions, new Interval(701, 709), VariantType.deletion); Assert.True(positionalEffect.IsWithinFrameshiftIntron); Assert.False(positionalEffect.IsStartSpliceSite); Assert.False(positionalEffect.IsEndSpliceSite); } [Fact] public void DetermineIntronicEffect_IsWithinIntron() { IInterval variant = new Interval(300, 302); var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineIntronicEffect(_otherTranscriptRegions, variant, VariantType.deletion); Assert.False(positionalEffect.IsWithinIntron); IInterval variant2 = new Interval(303, 303); var positionalEffect2 = new TranscriptPositionalEffect(); positionalEffect2.DetermineIntronicEffect(_otherTranscriptRegions, variant2, VariantType.deletion); Assert.True(positionalEffect2.IsWithinIntron); } [Fact] public void DetermineIntronicEffect_IsWithinSpliceSiteRegion() { var positionalEffect = new TranscriptPositionalEffect(); IInterval variant = new Interval(298, 302); positionalEffect.DetermineIntronicEffect(_otherTranscriptRegions, variant, VariantType.deletion); Assert.True(positionalEffect.IsWithinSpliceSiteRegion); } [Fact] public void DetermineExonicEffect_HasExonOverlap() { IInterval variant = new Interval(876686, 876686); var position = new MappedPosition(349, 349, 349, 349, 117, 117, 2, 2, -1, -1, 2, 2); var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineExonicEffect(_forwardTranscript.Object, variant, position, 349, 349, 349, 349, "G", false); Assert.True(positionalEffect.HasExonOverlap); } [Fact] public void DetermineExonicEffect_AfterCoding() { IInterval variant = new Interval(879600, 879600); var position = new MappedPosition(1692, 1692, -1, -1, -1, -1, 7, 7, -1, -1, 12, 12); var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineExonicEffect(_forwardTranscript.Object, variant, position, 1692, 1692, -1, -1, "G", false); Assert.True(positionalEffect.AfterCoding); } [Fact] public void DetermineExonicEffect_WithinCdna() { IInterval variant = new Interval(879600, 879600); var position = new MappedPosition(1692, 1692, -1, -1, -1, -1, 7, 7, -1, -1, 12, 12); var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineExonicEffect(_forwardTranscript.Object, variant, position, 1692, 1692, -1, -1, "G", false); Assert.True(positionalEffect.WithinCdna); } [Fact] public void DetermineExonicEffect_WithinCds() { IInterval variant = new Interval(876543, 876543); var position = new MappedPosition(206, 206, 206, 206, 69, 69, 2, 2, -1, -1, 2, 2); var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineExonicEffect(_forwardTranscript.Object, variant, position, 206, 206, 206, 206, "G", false); Assert.True(positionalEffect.WithinCdna); } [Fact] public void DetermineExonicEffect_OverlapWithMicroRna() { IInterval variant = new Interval(3477284, 3477284); var position = new MappedPosition(71, 71, -1, -1, -1, -1, 1, 1, -1, -1, 0, 0); var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineExonicEffect(_reverseTranscript.Object, variant, position, 71, 71, -1, -1, "G", false); Assert.True(positionalEffect.OverlapWithMicroRna); } [Fact] public void ExonOverlaps_NoOverlap() { var transcriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 100, 200, 300, 400) }; IInterval variant = new Interval(201, 500); var observedResult = transcriptRegions[0].Overlaps(variant); Assert.False(observedResult); } [Fact] public void IsMatureMirnaVariant_NullMirnas() { var observedResult = TranscriptPositionalEffect.IsMatureMirnaVariant(-1, -1, null, true); Assert.False(observedResult); } [Fact] public void IsWithinCds_ReturnFalse() { var positionalEffect = new TranscriptPositionalEffect(); var observedResult = positionalEffect.IsWithinCds(-1, -1, null, null); Assert.False(observedResult); } [Fact] public void IsWithinCds_ReturnTrue() { var positionalEffect = new TranscriptPositionalEffect(); var observedResult = positionalEffect.IsWithinCds(180, 180, null, null); Assert.True(observedResult); } [Fact] public void IsWithinCds_IsWithinFrameshiftIntron_OverlapCodingRegion_ReturnTrue() { var variant = new Interval(100, 101); var codingRegion = new Interval(90, 120); var positionalEffect = new TranscriptPositionalEffect { IsWithinFrameshiftIntron = true }; var observedResult = positionalEffect.IsWithinCds(-1, -1, codingRegion, variant); Assert.True(observedResult); } [Fact] public void IsWithinCds_IsWithinFrameshiftIntron_ReturnFalse() { var variant = new Interval(100, 101); var codingRegion = new Interval(102, 120); var positionalEffect = new TranscriptPositionalEffect { IsWithinFrameshiftIntron = true }; var observedResult = positionalEffect.IsWithinCds(-1, -1, codingRegion, variant); Assert.False(observedResult); } [Fact] public void IsAfterCoding_True_WhenInsertion() { var observedResult = TranscriptPositionalEffect.IsAfterCoding(101, 100, 100, 100); Assert.True(observedResult); } [Fact] public void IsBeforeCoding_True_WhenInsertion() { var observedResult = TranscriptPositionalEffect.IsBeforeCoding(101, 100, 100, 101); Assert.True(observedResult); } [Theory] [InlineData(100, 200, 300, true)] [InlineData(500, 600, 300, false)] public void IsWithinCdna(int cdnaStart, int cdnaEnd, int totalExonLen, bool expectedResult) { var observedResult = TranscriptPositionalEffect.IsWithinCdna(cdnaStart, cdnaEnd, totalExonLen); Assert.Equal(expectedResult, observedResult); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/TranscriptUtilitiesTests.cs ================================================ using Genome; using Intervals; using Moq; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class TranscriptUtilitiesTests { private readonly ISequence _refSequence = new SimpleSequence("ACTTCGGGC", 12340); [Fact] public void IsDuplicateWithinInterval_not_intertion() { var simpleVar = GenSimpleDeletionMock(); Assert.False(HgvsUtilities.IsDuplicateWithinInterval(_refSequence, simpleVar.Object, new Interval(1, 3), false)); } [Fact] public void IsDuplicateWithinInterval_outside_interval() { var simpleVar = GenSimpleInsertionMock(); // forward strand Assert.False(HgvsUtilities.IsDuplicateWithinInterval(_refSequence, simpleVar.Object, new Interval(12344, 12370), false)); // reverse strand Assert.False(HgvsUtilities.IsDuplicateWithinInterval(_refSequence, simpleVar.Object, new Interval(12340, 12347), true)); } [Fact] public void IsDuplicateWithinInterval_insertion_not_dup() { var simpleVar = GenSimpleInsertionMock(); // forward strand Assert.False(HgvsUtilities.IsDuplicateWithinInterval(new SimpleSequence("ACTTCGGGC", 12340), simpleVar.Object, new Interval(12300, 12400), false)); // reverse strand Assert.False(HgvsUtilities.IsDuplicateWithinInterval(new SimpleSequence("ACTTCGGGC", 12340), simpleVar.Object, new Interval(12300, 12400), true)); } [Fact] public void IsDuplicateWithinInterval_insertion_is_dup() { var simpleVar = GenSimpleInsertionMock(); // forward strand Assert.True(HgvsUtilities.IsDuplicateWithinInterval(new SimpleSequence("ACCTGGGGC", 12340), simpleVar.Object, new Interval(12300, 12400), false)); // reverse strand Assert.True(HgvsUtilities.IsDuplicateWithinInterval(new SimpleSequence("ACTTCCTGC", 12340), simpleVar.Object, new Interval(12300, 12400), true)); } private static Mock GenSimpleDeletionMock() { var simpleVar = new Mock(); simpleVar.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr21); simpleVar.SetupGet(x => x.Start).Returns(12345); simpleVar.SetupGet(x => x.End).Returns(12348); simpleVar.SetupGet(x => x.RefAllele).Returns("CTG"); simpleVar.SetupGet(x => x.AltAllele).Returns(""); simpleVar.SetupGet(x => x.Type).Returns(VariantType.deletion); return simpleVar; } private static Mock GenSimpleInsertionMock() { var simpleVar = new Mock(); simpleVar.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr21); simpleVar.SetupGet(x => x.Start).Returns(12346); simpleVar.SetupGet(x => x.End).Returns(12345); simpleVar.SetupGet(x => x.RefAllele).Returns(""); simpleVar.SetupGet(x => x.AltAllele).Returns("CTG"); simpleVar.SetupGet(x => x.Type).Returns(VariantType.insertion); return simpleVar; } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatedPositions/Transcript/VariantEffectTests.cs ================================================ using Moq; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.AnnotatedPositions.Transcript { public sealed class VariantEffectTests { [Theory] [InlineData(false,false,false,false)] [InlineData(false,false,true,true)] [InlineData(true, false, true, false)] [InlineData(true, true, true, true)] public void IsSpliceAcceptorVariant(bool onReverseStrand,bool isStartSpliceSite, bool isEndSpliceSite,bool expectedResult) { var positionalEffect = new TranscriptPositionalEffect { IsStartSpliceSite = isStartSpliceSite, IsEndSpliceSite = isEndSpliceSite }; var variant = new Mock(); var transcript = new Mock(); variant.SetupGet(x => x.AltAllele).Returns("G"); variant.SetupGet(x => x.RefAllele).Returns("C"); var variantEffect = new VariantEffect(positionalEffect, variant.Object, transcript.Object, "", "", "", "", null, "", ""); var gene = new Mock(); transcript.SetupGet(x => x.Gene).Returns(gene.Object); gene.SetupGet(x => x.OnReverseStrand).Returns(onReverseStrand); Assert.Equal(expectedResult, variantEffect.IsSpliceAcceptorVariant()); } [Theory] [InlineData(false, false, false, false)] [InlineData(false, true, true, true)] [InlineData(true, false, false, false)] [InlineData(true, true, true, true)] public void IsSpliceDonorVariant(bool onReverseStrand, bool isStartSpliceSite, bool isEndSpliceSite, bool expectedResult) { var positionalEffect = new TranscriptPositionalEffect { IsStartSpliceSite = isStartSpliceSite, IsEndSpliceSite = isEndSpliceSite }; var variant = new Mock(); var transcript = new Mock(); variant.SetupGet(x => x.AltAllele).Returns("G"); variant.SetupGet(x => x.RefAllele).Returns("C"); var variantEffect = new VariantEffect(positionalEffect, variant.Object, transcript.Object, "", "", "", "", null, "", ""); var gene = new Mock(); transcript.SetupGet(x => x.Gene).Returns(gene.Object); gene.SetupGet(x => x.OnReverseStrand).Returns(onReverseStrand); Assert.Equal(expectedResult, variantEffect.IsSpliceDonorVariant()); } [Theory] [InlineData(1, "M", "KM", "", "TCT", true)] [InlineData(2, "M", "Mk", "", "TCT", false)] [InlineData(1, "K", "MK", "", "ATG", true)] public void IsStartRetainedVariant(int proteinBegin, string refAminoAcids, string altAminoAcids, string refAllele, string altAllele, bool isStartRetained) { var variant = new Mock(); var transcript = new Mock(); variant.SetupGet(x => x.AltAllele).Returns(refAllele); variant.SetupGet(x => x.RefAllele).Returns(altAllele); var variantEffect = new VariantEffect(null, variant.Object, transcript.Object, refAminoAcids, altAminoAcids , "", "", proteinBegin, refAminoAcids, altAminoAcids); if (isStartRetained) Assert.True(variantEffect.IsStartRetained()); else Assert.False(variantEffect.IsStartRetained()); } [Theory] [InlineData(false, true, false, false, false)] [InlineData(false, true, true, true, true)] [InlineData(false, false, true, true, false)] [InlineData(true, true, false, false, false)] [InlineData(true, true, true, true, true)] [InlineData(true, false, true, true, false)] public void IsFivePrimeUtrVariant(bool onReverseStrand, bool withinCdna, bool beforeCoding, bool afterCoding, bool expectedResult) { var positionalEffect = new TranscriptPositionalEffect { BeforeCoding = beforeCoding, AfterCoding = afterCoding, WithinCdna = withinCdna }; var variant = new Mock(); var transcript = new Mock(); variant.SetupGet(x => x.AltAllele).Returns("G"); variant.SetupGet(x => x.RefAllele).Returns("C"); var variantEffect = new VariantEffect(positionalEffect, variant.Object, transcript.Object, "", "", "", "", null, "", ""); var gene = new Mock(); transcript.SetupGet(x => x.Gene).Returns(gene.Object); gene.SetupGet(x => x.OnReverseStrand).Returns(onReverseStrand); var translation = new Mock(); transcript.SetupGet(x => x.Translation).Returns(translation.Object); Assert.Equal(expectedResult, variantEffect.IsFivePrimeUtrVariant()); } [Fact] public void IsStopLost_DeletionOverStopCodon_ReturnTrue() { var positionalEffect = new TranscriptPositionalEffect { BeforeCoding = false, AfterCoding = true, WithinCdna = true }; var variant = new Mock(); variant.SetupGet(x => x.AltAllele).Returns("ATAGCCC"); variant.SetupGet(x => x.RefAllele).Returns("A"); var variantEffect = new VariantEffect(positionalEffect, variant.Object, null, "", "", "", "", null, "*", "X"); Assert.True(variantEffect.IsStopLost()); } [Theory] [InlineData(ConsequenceTag.start_retained_variant)] [InlineData(ConsequenceTag.incomplete_terminal_codon_variant)] public void IsCodingSequenceVariant_WithMoreSpecificConsequence_ReturnFalse(ConsequenceTag ct) { var positionalEffect = new TranscriptPositionalEffect { BeforeCoding = false, AfterCoding = true, WithinCdna = true }; var cache = new VariantEffectCache(); cache.Add(ct, true); var variant = new Mock(); variant.SetupGet(x => x.AltAllele).Returns("ATAGCCC"); variant.SetupGet(x => x.RefAllele).Returns("A"); var variantEffect = new VariantEffect(positionalEffect, variant.Object, null, "", "", "", "", null, null, null, cache); Assert.False(variantEffect.IsCodingSequenceVariant()); } } } ================================================ FILE: UnitTests/VariantAnnotation/AnnotatorTests.cs ================================================ using ErrorHandling.Exceptions; using Genome; using Moq; using UnitTests.TestUtilities; using VariantAnnotation; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.GeneAnnotation; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using Variants; using Xunit; namespace UnitTests.VariantAnnotation { public sealed class AnnotatorTest { private static IVariant[] GetVariants() { var variant = new Mock(); variant.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); variant.SetupGet(x => x.Type).Returns(VariantType.SNV); variant.SetupGet(x => x.Start).Returns(949523); variant.SetupGet(x => x.End).Returns(949523); variant.SetupGet(x => x.RefAllele).Returns("C"); variant.SetupGet(x => x.AltAllele).Returns("T"); variant.SetupGet(x => x.Behavior).Returns(AnnotationBehavior.SmallVariants); return new[] { variant.Object }; } private static IVariant[] GetMitoVariants() { var variant = new Mock(); variant.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.ChrM); variant.SetupGet(x => x.Type).Returns(VariantType.SNV); variant.SetupGet(x => x.Start).Returns(9495); variant.SetupGet(x => x.End).Returns(9495); variant.SetupGet(x => x.RefAllele).Returns("C"); variant.SetupGet(x => x.AltAllele).Returns("T"); variant.SetupGet(x => x.Behavior).Returns(AnnotationBehavior.SmallVariants); return new[] { variant.Object }; } [Fact] public void Annotate_conservation_annotation() { var position = new Mock(); position.SetupGet(x => x.Variants).Returns(GetMitoVariants); position.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); var csProvider = new Mock(); csProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); csProvider.Setup(x => x.Annotate(It.IsAny())). Callback((IAnnotatedPosition x) => { x.CytogeneticBand = "testCytoBand"; }); var taProvider = new Mock(); taProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); taProvider.Setup(x => x.Annotate(It.IsAny())).Callback((IAnnotatedPosition x) => { });//do nothing var annotator = new Annotator(taProvider.Object, null, null, csProvider.Object, null, null, null, null); var annotatedPosition = annotator.Annotate(position.Object); Assert.Equal("testCytoBand", annotatedPosition.CytogeneticBand); } [Fact] public void Annotate_mito_hg19() { var position = new Mock(); position.SetupGet(x => x.Variants).Returns(GetVariants); position.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.ChrM); var csProvider = new Mock(); csProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); csProvider.Setup(x => x.Annotate(It.IsAny())). Callback((IAnnotatedPosition x) => { x.CytogeneticBand = "testCytoBand"; }); var taProvider = new Mock(); taProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); taProvider.Setup(x => x.Annotate(It.IsAny())).Callback((IAnnotatedPosition x) => { });//do nothing var annotator = new Annotator(taProvider.Object, null, null, csProvider.Object, null, null, null, null); var annotatedPosition = annotator.Annotate(position.Object); Assert.Null(annotatedPosition.CytogeneticBand); } [Fact] public void Annotate_mito_GRCh37() { var position = new Mock(); position.SetupGet(x => x.Variants).Returns(GetVariants); position.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.ChrM); var csProvider = new Mock(); csProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); csProvider.Setup(x => x.Annotate(It.IsAny())). Callback((IAnnotatedPosition x) => { x.CytogeneticBand = "testCytoBand"; }); var taProvider = new Mock(); taProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); taProvider.Setup(x => x.Annotate(It.IsAny())).Callback((IAnnotatedPosition x) => { });//do nothing var annotator = new Annotator(taProvider.Object, null, null, csProvider.Object, null, null, null, null); annotator.EnableMitochondrialAnnotation(); var annotatedPosition = annotator.Annotate(position.Object); Assert.NotNull(annotatedPosition.CytogeneticBand); } [Fact] public void Annotate_null_position() { var annotator = new Annotator(null, null, null, null, null, null, null, null); var annotatedPosition = annotator.Annotate(null); Assert.Null(annotatedPosition); } //[Fact] //public void TrackAffectedGenes() //{ // var taProvider = new Mock(); // taProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); // taProvider.Setup(x => x.Annotate(It.IsAny())).Callback((IAnnotatedPosition x) => { });//do nothing // var geneAnnotationProvider = new Mock(); // geneAnnotationProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); // var annotator = new Annotator(taProvider.Object, null, null, null, geneAnnotationProvider.Object); // var annotatedPosition = new Mock(); // var annotatedVariant = new Mock(); // var ensembleTranscript = new Mock(); // annotatedVariant.SetupGet(x => x.EnsemblTranscripts) // .Returns(new List { ensembleTranscript.Object }); // ensembleTranscript.SetupGet(x => x.Transcript.Gene.Symbol).Returns("ensembl1"); // var refSeqTranscript = new Mock(); // annotatedVariant.SetupGet(x => x.RefSeqTranscripts) // .Returns(new List { refSeqTranscript.Object }); // refSeqTranscript.SetupGet(x => x.Transcript.Gene.Symbol).Returns("refseq1"); // annotatedPosition.SetupGet(x => x.AnnotatedVariants).Returns(new[] { annotatedVariant.Object }); // annotator.TrackAffectedGenes(annotatedPosition.Object); // var geneAnnotation = new Mock(); // geneAnnotationProvider.Setup(x => x.Annotate("ensembl1")).Returns(geneAnnotation.Object); // geneAnnotationProvider.Setup(x => x.Annotate("refseq1")).Returns((string)null); // var annotatedGenes = annotator.GetGeneAnnotations(); // Assert.Equal(1, annotatedGenes.Count); //} [Fact] public void CheckAssemblyConsistency_Consistent() { var taProvider = new Mock(); taProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); var saProvider = new Mock(); saProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); var csProvider = new Mock(); csProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); var omimProvider = new Mock(); omimProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); var annotator = new Annotator(taProvider.Object, null, saProvider.Object, csProvider.Object, null, omimProvider.Object, null, null); Assert.NotNull(annotator); } [Fact] public void CheckAssemblyConsistency_Inconsistent() { var taProvider = new Mock(); taProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); var saProvider = new Mock(); saProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); var csProvider = new Mock(); csProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh38); var omimProvider = new Mock(); omimProvider.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); Assert.Throws(() => new Annotator(taProvider.Object, null, saProvider.Object, csProvider.Object, null, omimProvider.Object, null, null)); } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/DataStructures/EncodedTranscriptDataTests.cs ================================================ using System.IO; using System.Text; using IO; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.Caches.DataStructures { public sealed class EncodedTranscriptDataTests { [Fact] public void EncodedTranscriptData_EndToEnd() { const BioType expectedBiotype = BioType.Y_RNA; const bool expectedCdsStartNotFound = true; const bool expectedCdsEndNotFound = true; const Source expectedSource = Source.BothRefSeqAndEnsembl; const bool expectedCanonical = true; const bool expectedSift = true; const bool expectedPolyPhen = true; const bool expectedMirnas = true; const bool expectedRnaEdits = true; const bool expectedSelenocysteines = true; const bool expectedTranscriptRegions = true; const bool expectedTranslation = true; const byte expectedStartExonPhase = 3; // ReSharper disable ConditionIsAlwaysTrueOrFalse var encodedData = EncodedTranscriptData.GetEncodedTranscriptData(expectedBiotype, expectedCdsStartNotFound, expectedCdsEndNotFound, expectedSource, expectedCanonical, expectedSift, expectedPolyPhen, expectedMirnas, expectedRnaEdits, expectedSelenocysteines, expectedTranscriptRegions, expectedTranslation, expectedStartExonPhase); // ReSharper restore ConditionIsAlwaysTrueOrFalse EncodedTranscriptData observedEncodedTranscriptData; using (var ms = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { encodedData.Write(writer); } ms.Position = 0; using (var reader = new BufferedBinaryReader(ms)) { observedEncodedTranscriptData = EncodedTranscriptData.Read(reader); } } Assert.NotNull(observedEncodedTranscriptData); Assert.Equal(expectedBiotype, observedEncodedTranscriptData.BioType); Assert.Equal(expectedSource, observedEncodedTranscriptData.TranscriptSource); Assert.Equal(expectedCanonical, observedEncodedTranscriptData.IsCanonical); Assert.Equal(expectedSift, observedEncodedTranscriptData.HasSift); Assert.Equal(expectedPolyPhen, observedEncodedTranscriptData.HasPolyPhen); Assert.Equal(expectedMirnas, observedEncodedTranscriptData.HasMirnas); Assert.Equal(expectedTranscriptRegions, observedEncodedTranscriptData.HasTranscriptRegions); Assert.Equal(expectedTranslation, observedEncodedTranscriptData.HasTranslation); Assert.Equal(expectedStartExonPhase, observedEncodedTranscriptData.StartExonPhase); } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/DataStructures/GeneTests.cs ================================================ using System.IO; using System.Text; using Genome; using IO; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.Caches.DataStructures { public sealed class GeneTests { [Fact] public void Gene_EndToEnd() { const int expectedStart = int.MaxValue; const int expectedEnd = int.MinValue; Chromosome expectedChromosome = ChromosomeUtilities.Chr1; const bool expectedReverseStrand = true; const string expectedSymbol = "anavrin"; const string expectedEntrezGeneId = "7157"; const string expectedEnsemblId = "ENSG00000141510"; const int expectedHgncId = int.MaxValue; // ReSharper disable ConditionIsAlwaysTrueOrFalse var gene = new Gene(expectedChromosome, expectedStart, expectedEnd, expectedReverseStrand, expectedSymbol, expectedHgncId, CompactId.Convert(expectedEntrezGeneId), CompactId.Convert(expectedEnsemblId)); // ReSharper restore ConditionIsAlwaysTrueOrFalse IGene observedGene; using (var ms = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { gene.Write(writer); } ms.Position = 0; using (var reader = new BufferedBinaryReader(ms)) { observedGene = Gene.Read(reader, ChromosomeUtilities.RefIndexToChromosome); } } Assert.NotNull(observedGene); Assert.Equal(expectedStart, observedGene.Start); Assert.Equal(expectedEnd, observedGene.End); Assert.Equal(expectedChromosome.Index, observedGene.Chromosome.Index); Assert.Equal(expectedReverseStrand, observedGene.OnReverseStrand); Assert.Equal(expectedSymbol, observedGene.Symbol); Assert.Equal(expectedEntrezGeneId, observedGene.EntrezGeneId.ToString()); Assert.Equal(expectedEnsemblId, observedGene.EnsemblId.ToString()); Assert.Equal(expectedHgncId, observedGene.HgncId); } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/DataStructures/RegulatoryRegionTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Text; using Genome; using IO; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using Xunit; namespace UnitTests.VariantAnnotation.Caches.DataStructures { public sealed class RegulatoryRegionTests { [Fact] public void RegulatoryRegion_EndToEnd() { Chromosome expectedChromosome = ChromosomeUtilities.Chr1; const int expectedStart = int.MaxValue; const int expectedEnd = int.MinValue; const string expectedId = "ENST00000540021"; const RegulatoryRegionType expectedType = RegulatoryRegionType.open_chromatin_region; var indexToChromosome = new Dictionary { [expectedChromosome.Index] = expectedChromosome }; // ReSharper disable ConditionIsAlwaysTrueOrFalse var regulatoryRegion = new RegulatoryRegion(expectedChromosome, expectedStart, expectedEnd, CompactId.Convert(expectedId), expectedType); // ReSharper restore ConditionIsAlwaysTrueOrFalse IRegulatoryRegion observedRegulatoryRegion; using (var ms = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { regulatoryRegion.Write(writer); } ms.Position = 0; using (var reader = new BufferedBinaryReader(ms)) { observedRegulatoryRegion = RegulatoryRegion.Read(reader, indexToChromosome); } } Assert.NotNull(observedRegulatoryRegion); Assert.Equal(expectedStart, observedRegulatoryRegion.Start); Assert.Equal(expectedEnd, observedRegulatoryRegion.End); Assert.Equal(expectedId, observedRegulatoryRegion.Id.WithoutVersion); Assert.Equal(expectedType, observedRegulatoryRegion.Type); Assert.Equal(expectedChromosome.Index, observedRegulatoryRegion.Chromosome.Index); } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/DataStructures/TranscriptRegionExtensionsTests.cs ================================================ using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.Caches.DataStructures { public sealed class TranscriptRegionExtensionsTests { private readonly ITranscriptRegion[] _forwardTranscriptRegions; private readonly ITranscriptRegion[] _reverseTranscriptRegions; public TranscriptRegionExtensionsTests() { _forwardTranscriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 77997792, 77998025, 1, 234), new TranscriptRegion(TranscriptRegionType.Intron, 1, 77998026, 78001531, 234, 235), new TranscriptRegion(TranscriptRegionType.Exon, 2, 78001532, 78001723, 235, 426), new TranscriptRegion(TranscriptRegionType.Intron, 2, 78001724, 78024286, 426, 427), new TranscriptRegion(TranscriptRegionType.Exon, 3, 78024287, 78024416, 427, 556) }; _reverseTranscriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 3, 312957, 313157, 136, 336), new TranscriptRegion(TranscriptRegionType.Intron, 2, 313158, 313873, 135, 136), new TranscriptRegion(TranscriptRegionType.Exon, 2, 313874, 313892, 117, 135), new TranscriptRegion(TranscriptRegionType.Intron, 1, 313893, 314242, 116, 117), new TranscriptRegion(TranscriptRegionType.Exon, 1, 314243, 314358, 1, 116) }; } [Theory] [InlineData(77997792, 0)] [InlineData(78001723, 2)] [InlineData(78024416, 4)] [InlineData(78001724, 3)] public void BinarySearch_Nominal(int position, int expectedResult) { int observedResult = _forwardTranscriptRegions.BinarySearch(position); Assert.Equal(expectedResult, observedResult); } [Theory] [InlineData(77997791, -1)] [InlineData(78024417, -6)] // the binary search method returns the bitwise complement of the next larger element public void BinarySearch_ReturnNegative_BeyondExons(int position, int expectedResult) { int observedResult = _forwardTranscriptRegions.BinarySearch(position); Assert.Equal(expectedResult, observedResult); } [Fact] public void GetExonsAndIntrons_Forward_Internal() { var observedResults = _forwardTranscriptRegions.GetExonsAndIntrons(1, 3); Assert.Equal(2, observedResults.ExonStart); Assert.Equal(2, observedResults.ExonEnd); Assert.Equal(1, observedResults.IntronStart); Assert.Equal(2, observedResults.IntronEnd); } [Fact] public void GetExonsAndIntrons_Reverse_Gap_NIR_3592() { var transcriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 19, 16606122, 16606679, 3404, 3961), new TranscriptRegion(TranscriptRegionType.Gap, 19, 16606680, 16606680, 3403, 3404), new TranscriptRegion(TranscriptRegionType.Exon, 19, 16606681, 16607898, 2186, 3403) }; var observedResults = transcriptRegions.GetExonsAndIntrons(1, 1); Assert.Equal(19, observedResults.ExonStart); Assert.Equal(19, observedResults.ExonEnd); Assert.Equal(-1, observedResults.IntronStart); Assert.Equal(-1, observedResults.IntronEnd); } [Fact] public void GetExonsAndIntrons_Reverse_Internal() { var observedResults = _reverseTranscriptRegions.GetExonsAndIntrons(2, 4); Assert.Equal(1, observedResults.ExonStart); Assert.Equal(2, observedResults.ExonEnd); Assert.Equal(1, observedResults.IntronStart); Assert.Equal(1, observedResults.IntronEnd); } [Fact] public void GetExonsAndIntrons_Forward_StartBefore() { var observedResults = _forwardTranscriptRegions.GetExonsAndIntrons(-1, 3); Assert.Equal(1, observedResults.ExonStart); Assert.Equal(2, observedResults.ExonEnd); Assert.Equal(1, observedResults.IntronStart); Assert.Equal(2, observedResults.IntronEnd); } [Fact] public void GetExonsAndIntrons_Forward_EndAfter() { var observedResults = _forwardTranscriptRegions.GetExonsAndIntrons(2, -6); Assert.Equal(2, observedResults.ExonStart); Assert.Equal(3, observedResults.ExonEnd); Assert.Equal(2, observedResults.IntronStart); Assert.Equal(2, observedResults.IntronEnd); } [Fact] public void GetExonsAndIntrons_Reverse_StartBefore_EndAfter() { var observedResults = _reverseTranscriptRegions.GetExonsAndIntrons(-1, -6); Assert.Equal(1, observedResults.ExonStart); Assert.Equal(3, observedResults.ExonEnd); Assert.Equal(1, observedResults.IntronStart); Assert.Equal(2, observedResults.IntronEnd); } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/DataStructures/TranscriptRegionTests.cs ================================================ using System.IO; using System.Text; using CacheUtils.TranscriptCache.Comparers; using IO; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.Caches.DataStructures { public sealed class TranscriptRegionTests { [Fact] public void TranscriptRegion_EndToEnd() { var expectedResults = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 13, 100, 200, 300, 400), new TranscriptRegion(TranscriptRegionType.Gap, 0, 120, 230, 10, 20), new TranscriptRegion(TranscriptRegionType.Intron, 14, 130, 230, 330, 430) }; var observedResults = new ITranscriptRegion[expectedResults.Length]; using (var ms = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { foreach(var region in expectedResults) region.Write(writer); } ms.Position = 0; using (var reader = new BufferedBinaryReader(ms)) { for (int i = 0; i < expectedResults.Length; i++) { observedResults[i] = TranscriptRegion.Read(reader); } } } var comparer = new TranscriptRegionComparer(); Assert.Equal(expectedResults, observedResults, comparer); } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/DataStructures/TranscriptTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Text; using Genome; using Intervals; using IO; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.Caches.DataStructures { public sealed class TranscriptTests { [Fact] public void Transcript_EndToEnd() { Chromosome expectedChromosome = ChromosomeUtilities.Chr1; const int expectedStart = int.MaxValue; const int expectedEnd = int.MinValue; const string expectedId = "ENST00000540021"; const byte expectedVersion = 7; const BioType expectedBioType = BioType.IG_J_pseudogene; const bool expectedCanonical = true; const Source expectedSource = Source.BothRefSeqAndEnsembl; const bool expectedCdsStartNotFound = true; const bool expectedCdsEndNotFound = true; var expectedIdAndVersion = expectedId + "." + expectedVersion; ICodingRegion expectedCodingRegion = new CodingRegion(10001, 10200, 1, 200, 200); ITranscriptRegion[] expectedTranscriptRegions = GetTranscriptRegions(); const byte expectedNumExons = 3; const int expectedTotalExonLength = 300; const byte expectedStartExonPhase = 3; const int expectedSiftIndex = 11; const int expectedPolyPhenIndex = 13; IInterval[] expectedMicroRnas = GetMicroRnas(); ITranslation expectedTranslation = new Translation(expectedCodingRegion, CompactId.Convert("ENSP00000446475", 17), "VEIDSD"); IGene expectedGene = new Gene(expectedChromosome, 100, 200, true, "TP53", 300, CompactId.Convert("7157"), CompactId.Convert("ENSG00000141510")); var genes = new IGene[1]; genes[0] = expectedGene; var peptideSeqs = new string[1]; peptideSeqs[0] = expectedTranslation.PeptideSeq; var geneIndices = CreateIndices(genes); var transcriptRegionIndices = CreateIndices(expectedTranscriptRegions); var microRnaIndices = CreateIndices(expectedMicroRnas); var peptideIndices = CreateIndices(peptideSeqs); var indexToChromosome = new Dictionary { [expectedChromosome.Index] = expectedChromosome }; // ReSharper disable ConditionIsAlwaysTrueOrFalse var transcript = new Transcript(expectedChromosome, expectedStart, expectedEnd, CompactId.Convert(expectedId, expectedVersion), expectedTranslation, expectedBioType, expectedGene, expectedTotalExonLength, expectedStartExonPhase, expectedCanonical, expectedTranscriptRegions, expectedNumExons, expectedMicroRnas, expectedSiftIndex, expectedPolyPhenIndex, expectedSource, expectedCdsStartNotFound, expectedCdsEndNotFound, null, null); // ReSharper restore ConditionIsAlwaysTrueOrFalse ITranscript observedTranscript; using (var ms = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { transcript.Write(writer, geneIndices, transcriptRegionIndices, microRnaIndices, peptideIndices); } ms.Position = 0; using (var reader = new BufferedBinaryReader(ms)) { observedTranscript = Transcript.Read(reader, indexToChromosome, genes, expectedTranscriptRegions, expectedMicroRnas, peptideSeqs); } } Assert.NotNull(observedTranscript); Assert.Equal(expectedStart, observedTranscript.Start); Assert.Equal(expectedEnd, observedTranscript.End); Assert.Equal(expectedIdAndVersion, observedTranscript.Id.WithVersion); Assert.Equal(expectedBioType, observedTranscript.BioType); Assert.Equal(expectedCanonical, observedTranscript.IsCanonical); Assert.Equal(expectedSource, observedTranscript.Source); Assert.Equal(expectedTotalExonLength, observedTranscript.TotalExonLength); Assert.Equal(expectedStartExonPhase, observedTranscript.StartExonPhase); Assert.Equal(expectedSiftIndex, observedTranscript.SiftIndex); Assert.Equal(expectedPolyPhenIndex, observedTranscript.PolyPhenIndex); Assert.Equal(expectedChromosome.Index, observedTranscript.Chromosome.Index); Assert.Equal(expectedGene.Symbol, observedTranscript.Gene.Symbol); Assert.Equal(expectedTranslation.PeptideSeq, observedTranscript.Translation.PeptideSeq); Assert.Equal(expectedTranscriptRegions.Length, observedTranscript.TranscriptRegions.Length); Assert.Equal(expectedMicroRnas.Length, observedTranscript.MicroRnas.Length); } private static Dictionary CreateIndices(T[] objects) { var indexDict = new Dictionary(); for (int i = 0; i < objects.Length; i++) indexDict[objects[i]] = i; return indexDict; } private static ITranscriptRegion[] GetTranscriptRegions() { var regions = new ITranscriptRegion[5]; regions[0] = new TranscriptRegion(TranscriptRegionType.Exon, 1, 100, 199, 300, 399); regions[1] = new TranscriptRegion(TranscriptRegionType.Intron, 1, 200, 299, 400, 499); regions[2] = new TranscriptRegion(TranscriptRegionType.Exon, 2, 300, 399, 500, 599); regions[3] = new TranscriptRegion(TranscriptRegionType.Intron, 2, 400, 499, 600, 699); regions[4] = new TranscriptRegion(TranscriptRegionType.Exon, 3, 500, 599, 700, 799); return regions; } private static IInterval[] GetMicroRnas() { var introns = new IInterval[1]; introns[0] = new Interval(100, 200); return introns; } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/DataStructures/TranslationTests.cs ================================================ using System.IO; using System.Text; using IO; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.Caches.DataStructures { public sealed class TranslationTests { [Fact] public void Translation_EndToEnd() { ICodingRegion expectedCodingRegion = new CodingRegion(100, 200, 300, 400, 101); const string expectedProteinId = "ENSP00000446475.7"; const string expectedPeptideSeq = "VEIDSD"; string[] peptideSeqs = { expectedPeptideSeq }; ITranslation expectedTranslation = new Translation(expectedCodingRegion, CompactId.Convert(expectedProteinId, 7), expectedPeptideSeq); ITranslation observedTranslation; using (var ms = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { expectedTranslation.Write(writer, 0); } ms.Position = 0; using (var reader = new BufferedBinaryReader(ms)) { observedTranslation = Translation.Read(reader, peptideSeqs); } } Assert.NotNull(observedTranslation); Assert.Equal(expectedCodingRegion.CdnaStart, observedTranslation.CodingRegion.CdnaStart); Assert.Equal(expectedProteinId, observedTranslation.ProteinId.WithVersion); Assert.Equal(expectedPeptideSeq, observedTranslation.PeptideSeq); } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/TranscriptCacheTests.cs ================================================ using System; using System.Collections.Generic; using System.Linq; using CacheUtils.TranscriptCache; using Genome; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using Xunit; namespace UnitTests.VariantAnnotation.Caches { public sealed class TranscriptCacheTests { private readonly ITranscriptCache _cache; private readonly IEnumerable _expectedDataSourceVersions; private const GenomeAssembly ExpectedAssembly = GenomeAssembly.hg19; public TranscriptCacheTests() { _expectedDataSourceVersions = GetDataSourceVersions(); var transcriptIntervalArrays = GetTranscripts().ToIntervalArrays(11); var regulatoryRegionIntervalArrays = GetRegulatoryRegions().ToIntervalArrays(11); _cache = new TranscriptCache(_expectedDataSourceVersions, ExpectedAssembly, transcriptIntervalArrays, regulatoryRegionIntervalArrays); } [Fact] public void GetOverlappingFlankingTranscripts_TwoOverlaps() { var interval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 100, 200); ITranscript[] overlappingTranscripts = _cache.TranscriptIntervalForest.GetAllFlankingValues(interval); Assert.NotNull(overlappingTranscripts); Assert.Equal(2, overlappingTranscripts.Length); } [Fact] public void GetOverlappingFlankingTranscripts_NoOverlaps() { var interval = new ChromosomeInterval(ChromosomeUtilities.Chr11, 5000, 5001); ITranscript[] overlappingTranscripts = _cache.TranscriptIntervalForest.GetAllFlankingValues(interval); Assert.Null(overlappingTranscripts); } [Fact] public void GetOverlappingRegulatoryRegions_OneOverlap() { var overlappingRegulatoryRegions = _cache.RegulatoryIntervalForest.GetAllOverlappingValues(ChromosomeUtilities.Chr1.Index, 100, 200); Assert.NotNull(overlappingRegulatoryRegions); Assert.Single(overlappingRegulatoryRegions); } [Fact] public void GetOverlappingRegulatoryRegions_NoOverlaps() { var overlappingRegulatoryRegions = _cache.RegulatoryIntervalForest.GetAllOverlappingValues(ChromosomeUtilities.Chr1.Index, 5000, 5001); Assert.Null(overlappingRegulatoryRegions); } [Fact] public void Assembly_Get() { var observedAssembly = _cache.Assembly; Assert.Equal(ExpectedAssembly, observedAssembly); } [Fact] public void DataSourceVersions_Get() { var observedDataSourceVersions = _cache.DataSourceVersions.ToArray(); Assert.Single(observedDataSourceVersions); var expectedDataSourceVersion = _expectedDataSourceVersions.ToArray()[0]; var observedDataSourceVersion = observedDataSourceVersions[0]; Assert.Equal(expectedDataSourceVersion.Name, observedDataSourceVersion.Name); } [Fact] private IEnumerable GetDataSourceVersions() { return new List { new DataSourceVersion("VEP", "87", DateTime.Now.Ticks, Source.BothRefSeqAndEnsembl.ToString()) }; } private static IRegulatoryRegion[] GetRegulatoryRegions() { var regulatoryRegions = new IRegulatoryRegion[3]; regulatoryRegions[0] = new RegulatoryRegion(ChromosomeUtilities.Chr11, 11000, 12000, CompactId.Empty, RegulatoryRegionType.promoter); regulatoryRegions[1] = new RegulatoryRegion(ChromosomeUtilities.Chr1, 120, 180, CompactId.Empty, RegulatoryRegionType.promoter); regulatoryRegions[2] = new RegulatoryRegion(ChromosomeUtilities.Chr1, 300, 320, CompactId.Empty, RegulatoryRegionType.promoter); return regulatoryRegions; } internal static ITranscript[] GetTranscripts() { return new ITranscript[] { new Transcript(ChromosomeUtilities.Chr11, 11000, 12000, CompactId.Empty, null, BioType.other, null, 0, 0, false, null, 0, null, 0, 0, Source.None, false, false, null, null), new Transcript(ChromosomeUtilities.Chr1, 120, 180, CompactId.Empty, null, BioType.other, null, 0, 0, false, null, 0, null, 0, 0, Source.None, false, false, null, null), new Transcript(ChromosomeUtilities.Chr1, 300, 320, CompactId.Empty, null, BioType.other, null, 0, 0, false, null, 0, null, 0, 0, Source.None, false, false, null, null) }; } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/Utilities/RnaEditUtilitiesTests.cs ================================================ using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Caches.Utilities; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.Caches.Utilities { public sealed class RnaEditUtilitiesTests { [Theory] [InlineData(100, 100, "G", VariantType.SNV)] [InlineData(100, 101, "GT", VariantType.MNV)] [InlineData(101, 100, "GCTA", VariantType.insertion)] [InlineData(100, 100, "", VariantType.deletion)] [InlineData(100, 101, null, VariantType.deletion)] public void RnaEditTypes(int start, int end, string bases, VariantType expectedType) { var rnaEdit = new RnaEdit(start, end, bases); Assert.Equal(expectedType, RnaEditUtilities.GetRnaEditType(rnaEdit)); } } } ================================================ FILE: UnitTests/VariantAnnotation/Caches/Utilities/TranscriptUtilitiesTests.cs ================================================ using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Caches.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.Caches.Utilities { public sealed class TranscriptUtilitiesTests { private readonly ITranscriptRegion[] _transcriptRegions; public TranscriptUtilitiesTests() { _transcriptRegions = GetTranscriptRegions(); } [Fact] public void GetTotalExonLength_MultipleExons() { const int expectedLength = 300; int observedLength = ExonUtilities.GetTotalExonLength(_transcriptRegions); Assert.Equal(expectedLength, observedLength); } private static ITranscriptRegion[] GetTranscriptRegions() { return new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 100, 199, 0, 99), new TranscriptRegion(TranscriptRegionType.Gap, 0, 200, 299, 99, 100), new TranscriptRegion(TranscriptRegionType.Exon, 1, 300, 399, 100, 199), new TranscriptRegion(TranscriptRegionType.Intron, 1, 400, 499, 199, 200), new TranscriptRegion(TranscriptRegionType.Exon, 2, 500, 599, 200, 299) }; } } } ================================================ FILE: UnitTests/VariantAnnotation/GeneFusions/Calling/GeneFusionCallerTests.cs ================================================ using System.Collections.Generic; using System.Text; using CacheUtils.TranscriptCache; using Genome; using Intervals; using UnitTests.MockedData; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.GeneFusions.Calling; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Pools; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.GeneFusions.Calling { public sealed class GeneFusionCallerTests { private readonly ITranscript[] _forwardTranscripts = {Transcripts.ENST00000370673}; private readonly ITranscript[] _forwardNonCodingTranscripts = {Transcripts.ENST00000427819}; private readonly ITranscript[] _reverseTranscripts = {Transcripts.ENST00000615053}; [Fact] public void AddGeneFusionsToDictionary_ForwardFirst5PrimeUtr_ReverseFirstCds_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 84298366, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130509235, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000370673.7"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(1, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_-192::ENST00000615053.3(POTEI):r.1_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ForwardLast5PrimeUtr_ReverseIntronCds_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 84298557, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130508713, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000370673.7"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Null(geneFusion.exon); Assert.Equal(1, geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_-1::ENST00000615053.3(POTEI):r.521+2_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ForwardFirstCds_ReverseFirst3PrimeUtr_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 84298558, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130465652, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000370673.7"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(12, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_1::ENST00000615053.3(POTEI):r.*1_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ForwardIntronCds_ReverseLastCds_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 84298569, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130465653, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000370673.7"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(12, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_10+2::ENST00000615053.3(POTEI):r.1527_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ForwardLastCds_ReverseLast3PrimeUtr_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 84349774, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130463799, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000370673.7"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(13, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_351::ENST00000615053.3(POTEI):r.*347_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ForwardFirst3PrimeUtr_ReverseFirst5PrimeUtr_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 84349775, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130509287, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000370673.7"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(1, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_*1::ENST00000615053.3(POTEI):r.-52_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ForwardLast3PrimeUtr_ReverseLast5PrimeUtr_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 84350798, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130509236, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000370673.7"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(1, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_*1024::ENST00000615053.3(POTEI):r.-1_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ReverseFirst5PrimeUtr_ForwardLastCds_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr2, 130509287, false); var partner = new BreakPoint(ChromosomeUtilities.Chr1, 84349774, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _reverseTranscripts, _forwardTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000615053.3"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000370673.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(4, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_351::ENST00000615053.3(POTEI):r.-52_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ReverseLast5PrimeUtr_ForwardFirstCds_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr2, 130509236, false); var partner = new BreakPoint(ChromosomeUtilities.Chr1, 84298558, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _reverseTranscripts, _forwardTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000615053.3"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000370673.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(1, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_1::ENST00000615053.3(POTEI):r.-1_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ReverseFirstCds_ForwardFirst3PrimeUtr_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr2, 130509235, false); var partner = new BreakPoint(ChromosomeUtilities.Chr1, 84349775, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _reverseTranscripts, _forwardTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000615053.3"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000370673.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(4, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_*1::ENST00000615053.3(POTEI):r.1_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ReverseLastCds_ForwardLast3PrimeUtr_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr2, 130465653, false); var partner = new BreakPoint(ChromosomeUtilities.Chr1, 84350798, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _reverseTranscripts, _forwardTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000615053.3"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000370673.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(4, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_*1024::ENST00000615053.3(POTEI):r.1527_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ReverseFirst3PrimeUtr_ForwardFirst5PrimeUtr_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr2, 130465652, false); var partner = new BreakPoint(ChromosomeUtilities.Chr1, 84298366, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _reverseTranscripts, _forwardTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000615053.3"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000370673.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(1, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_-192::ENST00000615053.3(POTEI):r.*1_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ReverseLast3PrimeUtr_ForwardLast5PrimeUtr_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr2, 130463799, false); var partner = new BreakPoint(ChromosomeUtilities.Chr1, 84298557, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _reverseTranscripts, _forwardTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000615053.3"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000370673.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(1, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_-1::ENST00000615053.3(POTEI):r.*347_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ForwardNonCodingFirstCdna_ReverseFirstCds_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 85276715, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130509235, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardNonCodingTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000427819.5"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(1, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000427819.5(AL078459.1):r.?_1::ENST00000615053.3(POTEI):r.1_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ForwardNonCodingLastCdna_ReverseLastCds_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 85399963, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130465653, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardNonCodingTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000427819.5"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(12, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000427819.5(AL078459.1):r.?_1950::ENST00000615053.3(POTEI):r.1527_?", geneFusion.hgvsr); } [Fact] public void AddGeneFusionsToDictionary_ForwardCds_ReverseCds_InFrame_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 84298558, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130509234, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardTranscripts, _reverseTranscripts, false); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000370673.7"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(1, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_1::ENST00000615053.3(POTEI):r.2_?", geneFusion.hgvsr); Assert.True(geneFusion.isInFrame); } [Fact] public void AddGeneFusionsToDictionary_ForwardCds_ReverseCds_Imprecise_NotInFrame_ActualFusion() { var origin = new BreakPoint(ChromosomeUtilities.Chr1, 84298558, false); var partner = new BreakPoint(ChromosomeUtilities.Chr2, 130509234, true); var adjacency = new BreakEndAdjacency(origin, partner); var transcriptIdToGeneFusions = new Dictionary(); GeneFusionCaller.AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, _forwardTranscripts, _reverseTranscripts, true); IAnnotatedGeneFusion[] actualGeneFusions = transcriptIdToGeneFusions["ENST00000370673.7"]; Assert.Single(actualGeneFusions); IAnnotatedGeneFusion geneFusion = actualGeneFusions[0]; Assert.Equal(Transcripts.ENST00000615053.Id.WithVersion, geneFusion.transcript.Id.WithVersion); Assert.Equal(1, geneFusion.exon); Assert.Null(geneFusion.intron); Assert.Equal("ENST00000370673.7(SAMD13):r.?_1::ENST00000615053.3(POTEI):r.2_?", geneFusion.hgvsr); Assert.False(geneFusion.isInFrame); } [Fact] public void FoundViableGeneFusion_ReturnTrue() { var adjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 100, false), new BreakPoint(ChromosomeUtilities.Chr2, 100, true)); var originInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 100, 200); var partnerInterval = new ChromosomeInterval(ChromosomeUtilities.Chr2, 100, 200); bool actualResult = GeneFusionCaller.FoundViableGeneFusion(adjacency, Genes.SAMD13, originInterval, Source.Ensembl, Genes.POTEI, partnerInterval, Source.Ensembl); Assert.True(actualResult); } [Fact] public void FoundViableGeneFusion_AffectedByOriginAdjacency_ReturnTrue() { var adjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr7, 26241365, true), new BreakPoint(ChromosomeUtilities.Chr15, 40854180, false)); var originInterval = new ChromosomeInterval(ChromosomeUtilities.Chr7, 26240782, 26252976); var partnerInterval = new ChromosomeInterval(ChromosomeUtilities.Chr15, 40820882, 40857210); var originGene = new Gene(ChromosomeUtilities.Chr7, 26240782, 26253227, false, "CBX3", 1553, CompactId.Convert("11335"), CompactId.Convert("ENSG00000122565")); var partnerGene = new Gene(ChromosomeUtilities.Chr15, 40820882, 40857256, true, "CCDC32", 28295, CompactId.Convert("90416"), CompactId.Convert("ENSG00000128891")); bool actualResult = GeneFusionCaller.FoundViableGeneFusion(adjacency, originGene, originInterval, Source.Ensembl, partnerGene, partnerInterval, Source.Ensembl); Assert.True(actualResult); } [Fact] public void FoundViableGeneFusion_SameGeneSymbol_ReturnFalse() { var adjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 100, false), new BreakPoint(ChromosomeUtilities.Chr2, 100, true)); var originInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 100, 200); var partnerInterval = new ChromosomeInterval(ChromosomeUtilities.Chr2, 100, 200); bool actualResult = GeneFusionCaller.FoundViableGeneFusion(adjacency, Genes.SAMD13, originInterval, Source.Ensembl, Genes.SAMD13, partnerInterval, Source.Ensembl); Assert.False(actualResult); } [Fact] public void FoundViableGeneFusion_DifferentOriginOrientation_ReturnFalse() { var adjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 100, true), new BreakPoint(ChromosomeUtilities.Chr2, 100, true)); var originInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 100, 200); var partnerInterval = new ChromosomeInterval(ChromosomeUtilities.Chr2, 100, 200); bool actualResult = GeneFusionCaller.FoundViableGeneFusion(adjacency, Genes.SAMD13, originInterval, Source.Ensembl, Genes.POTEI, partnerInterval, Source.Ensembl); Assert.False(actualResult); } [Fact] public void FoundViableGeneFusion_DifferentPartnerOrientation_ReturnFalse() { var adjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 100, false), new BreakPoint(ChromosomeUtilities.Chr2, 100, false)); var originInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 100, 200); var partnerInterval = new ChromosomeInterval(ChromosomeUtilities.Chr2, 100, 200); bool actualResult = GeneFusionCaller.FoundViableGeneFusion(adjacency, Genes.SAMD13, originInterval, Source.Ensembl, Genes.POTEI, partnerInterval, Source.Ensembl); Assert.False(actualResult); } [Fact] public void FoundViableGeneFusion_DifferentTranscriptSource_ReturnFalse() { var adjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 100, false), new BreakPoint(ChromosomeUtilities.Chr2, 100, true)); var originInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 100, 200); var partnerInterval = new ChromosomeInterval(ChromosomeUtilities.Chr2, 100, 200); bool actualResult = GeneFusionCaller.FoundViableGeneFusion(adjacency, Genes.SAMD13, originInterval, Source.RefSeq, Genes.POTEI, partnerInterval, Source.Ensembl); Assert.False(actualResult); } [Fact] public void FoundViableGeneFusion_TranscriptsAlreadyOverlap_ReturnFalse() { var adjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 100, false), new BreakPoint(ChromosomeUtilities.Chr1, 100, true)); var originInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 100, 200); var partnerInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 105, 205); bool actualResult = GeneFusionCaller.FoundViableGeneFusion(adjacency, Genes.SAMD13, originInterval, Source.Ensembl, Genes.POTEI, partnerInterval, Source.Ensembl); Assert.False(actualResult); } private sealed class GetCodonPositionData : TheoryData { public GetCodonPositionData() { Add(84298557, 0, null); // UTR Add(84298558, 0, 1); Add(84298559, 0, 2); Add(84298560, 0, 3); Add(84298561, 0, 1); Add(84298562, 0, 2); Add(84298563, 0, 3); Add(84298568, 1, null); // intron } } [Theory] [ClassData(typeof(GetCodonPositionData))] public void GetCodonPosition_Forward_ExpectedResults(int genomicPosition, int regionIndex, byte? expectedCodonPosition) { ITranscript transcript = Transcripts.ENST00000370673; byte? actualCodonPosition = GeneFusionCaller.GetCodonPosition(transcript.TranscriptRegions[regionIndex], transcript.Translation, transcript.StartExonPhase, transcript.Gene.OnReverseStrand, genomicPosition); Assert.Equal(expectedCodonPosition, actualCodonPosition); } [Theory] [InlineData(84298558, 130509234, true)] // 1 -> 2 [InlineData(84298559, 130509233, true)] // 2 -> 3 [InlineData(84298560, 130509232, true)] // 3 -> 1 [InlineData(84298561, 130509231, true)] // 1 -> 2 [InlineData(84298562, 130509230, true)] // 2 -> 3 [InlineData(84298563, 130509229, true)] // 3 -> 1 [InlineData(84298561, 130509227, false)] // 1 -> 3 [InlineData(84298562, 130509228, false)] // 2 -> 2 [InlineData(84298563, 130509225, false)] // 3 -> 2 [InlineData(84298564, 130509226, false)] // 1 -> 1 [InlineData(84298565, 130509223, false)] // 2 -> 1 [InlineData(84298566, 130509221, false)] // 3 -> 3 public void DetermineInFrameFusion_ExpectedResults(int firstGenomicPosition, int secondGenomicPosition, bool expectedResult) { var first = new BreakPointTranscript(Transcripts.ENST00000370673, firstGenomicPosition, 0); var second = new BreakPointTranscript(Transcripts.ENST00000615053, secondGenomicPosition, 24); bool actualResult = GeneFusionCaller.DetermineInFrameFusion(first, second); Assert.Equal(expectedResult, actualResult); } [Fact] public void GetGeneSymbols_SameChromosome() { IGene a = new Gene(ChromosomeUtilities.Chr1, 1000, 2000, false, "A", 0, CompactId.Empty, CompactId.Empty); IGene b = new Gene(ChromosomeUtilities.Chr1, 900, 1900, false, "B", 0, CompactId.Empty, CompactId.Empty); var expectedFirstGeneSymbol = "B"; var expectedSecondGeneSymbol = "A"; (ulong _, string actualFirstGeneSymbol, uint _, string actualSecondGeneSymbol, uint _) = GeneFusionCaller.GetGeneAndFusionKeys(a, b); Assert.Equal(expectedFirstGeneSymbol, actualFirstGeneSymbol); Assert.Equal(expectedSecondGeneSymbol, actualSecondGeneSymbol); } [Fact] public void GetGeneSymbols_DifferentChromosomes() { IGene a = new Gene(ChromosomeUtilities.Chr1, 1000, 2000, false, "A", 0, CompactId.Empty, CompactId.Empty); IGene b = new Gene(ChromosomeUtilities.Chr3, 900, 1900, false, "B", 0, CompactId.Empty, CompactId.Empty); var expectedFirstGeneSymbol = "A"; var expectedSecondGeneSymbol = "B"; (ulong _, string actualFirstGeneSymbol, uint _, string actualSecondGeneSymbol, uint _) = GeneFusionCaller.GetGeneAndFusionKeys(a, b); Assert.Equal(expectedFirstGeneSymbol, actualFirstGeneSymbol); Assert.Equal(expectedSecondGeneSymbol, actualSecondGeneSymbol); } [Fact] public void AddGeneFusions_ExpectedResults() { const string expectedConsequences = "\"consequence\":[\"unidirectional_gene_fusion\"]"; const string expectedGeneFusionJson = "\"geneFusions\":[{\"transcript\":\"ENST00000615053.3\",\"bioType\":\"protein_coding\",\"exon\":1,\"geneId\":\"ENSG00000196834\",\"hgnc\":\"POTEI\",\"hgvsr\":\"ENST00000370673.7(SAMD13):r.?_1::ENST00000615053.3(POTEI):r.2_?\",\"inFrame\":true}]}"; IntervalForest transcriptIntervalForest = GetTranscriptIntervalForest(); IAnnotatedVariant[] annotatedVariants = GetAnnotatedVariants(); var geneFusionCaller = new GeneFusionCaller(ChromosomeUtilities.RefNameToChromosome, transcriptIntervalForest); geneFusionCaller.AddGeneFusions(annotatedVariants, false, false, false); IAnnotatedVariant annotatedVariant = annotatedVariants[0]; var sb = new StringBuilder(); annotatedVariant.Transcripts[0].SerializeJson(sb); var json = sb.ToString(); VariantPool.Return((Variant)annotatedVariant.Variant); AnnotatedTranscriptPool.Return((AnnotatedTranscript) annotatedVariant.Transcripts[0]); AnnotatedVariantPool.Return((AnnotatedVariant)annotatedVariant); Assert.Contains(expectedConsequences, json); Assert.Contains(expectedGeneFusionJson, json); } private IAnnotatedVariant[] GetAnnotatedVariants() { var variant = VariantPool.Get(ChromosomeUtilities.Chr1, 84298558, 84298558, "A", "A]chr2:130509234]", VariantType.translocation_breakend, "1-84298558-A-A]chr2:130509234]", false, false, false, null, AnnotationBehavior.StructuralVariants, true); var annotatedTranscript = AnnotatedTranscriptPool.Get(Transcripts.ENST00000370673, null, null, null, null, null, null, null, null, null, new List(), null); var annotatedVariant = AnnotatedVariantPool.Get(variant); annotatedVariant.Transcripts.Add(annotatedTranscript); return new IAnnotatedVariant[] {annotatedVariant}; } private IntervalForest GetTranscriptIntervalForest() { var transcripts = new List(); transcripts.AddRange(_forwardTranscripts); transcripts.AddRange(_reverseTranscripts); IntervalArray[] intervalArrays = transcripts.ToIntervalArrays(2); return new IntervalForest(intervalArrays); } } } ================================================ FILE: UnitTests/VariantAnnotation/GeneFusions/IO/GeneFusionIndexEntryTests.cs ================================================ using VariantAnnotation.GeneFusions.IO; using Xunit; namespace UnitTests.VariantAnnotation.GeneFusions.IO { public sealed class GeneFusionIndexEntryTests { [Theory] [InlineData(1000, 1)] [InlineData(2000, 0)] [InlineData(3000, -1)] public void Compare_ExpectedResults(ulong otherGeneKey, int expectedResult) { var indexEntry = new GeneFusionIndexEntry(2000, 0); int actualResult = indexEntry.Compare(otherGeneKey); Assert.Equal(expectedResult, actualResult); } } } ================================================ FILE: UnitTests/VariantAnnotation/GeneFusions/IO/GeneFusionSourceReaderTests.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Text; using IO.v2; using SAUtils.FusionCatcher; using UnitTests.SAUtils.FusionCatcher; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.GeneFusions.SA; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.Providers; using Xunit; namespace UnitTests.VariantAnnotation.GeneFusions.IO { public sealed class GeneFusionSourceReaderTests { [Theory] [InlineData(FileType.FusionCatcher, GeneFusionSourceReader.SupportedFileFormatVersion, true)] [InlineData(FileType.GeneFusionJson, GeneFusionSourceReader.SupportedFileFormatVersion, false)] [InlineData(FileType.FusionCatcher, 0, false)] public void CheckHeader_ExpectedResults(FileType fileType, ushort fileFormatVersion, bool expectedIsValid) { Exception ex = Record.Exception(() => { GeneFusionSourceReader.CheckHeader(fileType, fileFormatVersion); }); bool actualIsValid = ex == null; Assert.Equal(expectedIsValid, actualIsValid); } [Fact] public void AddAnnotations_ExpectedResults() { const string expectedJson = "[{\"genes\":{\"first\":{\"hgnc\":\"A\",\"isOncogene\":true},\"second\":{\"hgnc\":\"B\"},\"isParalogPair\":true},\"germlineSources\":[\"1000 Genomes Project\",\"Healthy (strong support)\",\"Illumina Body Map 2.0\"],\"somaticSources\":[\"Alaei-Mahabadi 18 cancers\",\"DepMap CCLE\"]},{\"genes\":{\"first\":{\"hgnc\":\"E\"},\"second\":{\"hgnc\":\"F\"}},\"somaticSources\":[\"CCLE Vellichirammal\",\"Cancer Genome Project\"]}]"; using var ms = new MemoryStream(); WriteGeneFusionSourceFile(ms); var supplementaryAnnotations = new List(); IGeneFusionPair[] fusionPairs = { new GeneFusionPair(1000, "A", 123, "B", 456), new GeneFusionPair(1500, "C", 234, "D", 567), // no matching SA new GeneFusionPair(3000, "E", 345, "F", 678) }; using (var reader = new GeneFusionSourceReader(ms)) { reader.LoadAnnotations(); reader.AddAnnotations(fusionPairs, supplementaryAnnotations); } Assert.Single(supplementaryAnnotations); ISupplementaryAnnotation sa = supplementaryAnnotations[0]; var sb = new StringBuilder(); sa.SerializeJson(sb); var actualJson = sb.ToString(); Assert.Equal("fusionCatcher", sa.JsonKey); Assert.Equal(expectedJson, actualJson); } [Fact] public void AddAnnotations_NoResults() { using var ms = new MemoryStream(); WriteGeneFusionSourceFile(ms); var supplementaryAnnotations = new List(); IGeneFusionPair[] fusionPairs = { new GeneFusionPair(1500, "C", 234, "D", 567) // no matching SA }; using (var reader = new GeneFusionSourceReader(ms)) { reader.LoadAnnotations(); reader.AddAnnotations(fusionPairs, supplementaryAnnotations); } Assert.Empty(supplementaryAnnotations); } private static void WriteGeneFusionSourceFile(MemoryStream ms) { (uint[] expectedOncogeneKeys, GeneFusionSourceCollection[] expectedIndex, GeneFusionIndexEntry[] expectedIndexEntries) = GeneFusionSourceWriterTests.GetKeyToGeneFusion(); IDataSourceVersion expectedVersion = new DataSourceVersion("FusionCatcher", "1.33", DateTime.Now.Ticks, "gene fusions"); const string expectedJsonKey = "fusionCatcher"; using (var writer = new GeneFusionSourceWriter(ms, expectedJsonKey, expectedVersion, true)) { writer.Write(expectedOncogeneKeys, expectedIndex, expectedIndexEntries); } ms.Position = 0; } } } ================================================ FILE: UnitTests/VariantAnnotation/GeneFusions/SA/GeneFusionPairTests.cs ================================================ using VariantAnnotation.GeneFusions.SA; using VariantAnnotation.Interface.AnnotatedPositions; using Xunit; namespace UnitTests.VariantAnnotation.GeneFusions.SA { public sealed class GeneFusionPairTests { private readonly GeneFusionPair _fusionPair = new(1000, "A", 123, "B", 456); private readonly GeneFusionPair _fusionPairDup = new(1000, "A", 123, "B", 456); private readonly GeneFusionPair _fusionPairDiff = new(2000, "A", 123, "B", 456); [Fact] public void Equals_ExpectedResults() { Assert.False(_fusionPair.Equals(null)); Assert.Equal(_fusionPair, _fusionPair); Assert.Equal(_fusionPair, _fusionPairDup); Assert.NotEqual(_fusionPair, _fusionPairDiff); } [Fact] public void Equals_IGeneFusionPair_ExpectedResults() { IGeneFusionPair fusionPair = _fusionPair; IGeneFusionPair fusionPairDup = _fusionPairDup; IGeneFusionPair fusionPairDiff = _fusionPairDiff; Assert.False(fusionPair.Equals(null)); Assert.Equal(fusionPair, fusionPair); Assert.Equal(fusionPair, fusionPairDup); Assert.NotEqual(fusionPair, fusionPairDiff); } [Fact] public void GetHashCode_ExpectedResults() { Assert.Equal(_fusionPair.GetHashCode(), _fusionPairDup.GetHashCode()); Assert.NotEqual(_fusionPair.GetHashCode(), _fusionPairDiff.GetHashCode()); } } } ================================================ FILE: UnitTests/VariantAnnotation/GeneFusions/SA/GeneFusionSourceCollectionTests.cs ================================================ using VariantAnnotation.GeneFusions.SA; using Xunit; namespace UnitTests.VariantAnnotation.GeneFusions.SA { public sealed class GeneFusionSourceCollectionTests { private readonly GeneFusionSourceCollection _sourceCollection = new(false, false, false, new[] {GeneFusionSource.Healthy}, new[] {GeneFusionSource.Bao_gliomas, GeneFusionSource.Robinson_prostate_cancers}); private readonly GeneFusionSourceCollection _sourceCollectionDup = new(false, false, false, new[] {GeneFusionSource.Healthy}, new[] {GeneFusionSource.Bao_gliomas, GeneFusionSource.Robinson_prostate_cancers}); private readonly GeneFusionSourceCollection _sourceCollectionDiff = new(false, true, false, new[] {GeneFusionSource.Healthy}, new[] {GeneFusionSource.Bao_gliomas, GeneFusionSource.Robinson_prostate_cancers}); [Fact] public void Equals_ExpectedResults() { Assert.False(_sourceCollection.Equals(null)); Assert.Equal(_sourceCollection, _sourceCollection); Assert.Equal(_sourceCollection, _sourceCollectionDup); Assert.NotEqual(_sourceCollection, _sourceCollectionDiff); } [Fact] public void GetJsonEntry_ExpectedResults() { const string expectedJson = "\"genes\":{\"first\":{\"hgnc\":\"A\"},\"second\":{\"hgnc\":\"B\"}},\"germlineSources\":[\"Healthy\"],\"somaticSources\":[\"Bao gliomas\",\"Robinson prostate cancers\"]"; var geneFusionPair = new GeneFusionPair(100, "A", 100, "B", 200); string actualJson = _sourceCollection.GetJsonEntry(geneFusionPair, new uint[] {123}); Assert.Equal(expectedJson, actualJson); } [Fact] public void GetHashCode_ExpectedResults() { Assert.Equal(_sourceCollection.GetHashCode(), _sourceCollectionDup.GetHashCode()); Assert.NotEqual(_sourceCollection.GetHashCode(), _sourceCollectionDiff.GetHashCode()); } } } ================================================ FILE: UnitTests/VariantAnnotation/GeneFusions/SA/GeneFusionSourceUtilitiesTests.cs ================================================ using VariantAnnotation.GeneFusions.SA; using Xunit; namespace UnitTests.VariantAnnotation.GeneFusions.SA { public sealed class GeneFusionSourceUtilitiesTests { [Theory] [InlineData(GeneFusionSource.Babiceanu_NonCancerTissues, "Babiceanu non-cancer tissues")] [InlineData(GeneFusionSource.Bailey_pancreatic_cancers, "Bailey pancreatic cancers")] [InlineData(GeneFusionSource.Bao_gliomas, "Bao gliomas")] [InlineData(GeneFusionSource.CACG, "CACG")] [InlineData(GeneFusionSource.ConjoinG, "ConjoinG")] [InlineData(GeneFusionSource.COSMIC, "COSMIC")] [InlineData(GeneFusionSource.Duplicated_Genes_Database, "Duplicated Genes Database")] [InlineData(GeneFusionSource.GTEx_healthy_tissues, "GTEx healthy tissues")] [InlineData(GeneFusionSource.Healthy, "Healthy")] [InlineData(GeneFusionSource.Healthy_prefrontal_cortex, "Healthy prefrontal cortex")] [InlineData(GeneFusionSource.Human_Protein_Atlas, "Human Protein Atlas")] [InlineData(GeneFusionSource.NonTumorCellLines, "non-tumor cell lines")] [InlineData(GeneFusionSource.Robinson_prostate_cancers, "Robinson prostate cancers")] [InlineData(GeneFusionSource.TumorFusions_normal, "TumorFusions normal")] [InlineData(GeneFusionSource.TCGA_oesophageal_carcinomas, "TCGA oesophageal carcinomas")] [InlineData(GeneFusionSource.TCGA_Tumor, "TCGA tumor")] public void Convert_ExpectedResults(GeneFusionSource source, string expectedResult) { string actualResult = GeneFusionSourceUtilities.Convert(source); Assert.Equal(expectedResult, actualResult); } [Fact] public void Convert_UnknownSource_ReturnsNull() { string actualResult = GeneFusionSourceUtilities.Convert(GeneFusionSource.None); Assert.Null(actualResult); } } } ================================================ FILE: UnitTests/VariantAnnotation/GeneFusions/Utilities/GeneFusionKeyTests.cs ================================================ using VariantAnnotation.GeneFusions.Utilities; using Xunit; namespace UnitTests.VariantAnnotation.GeneFusions.Utilities { public sealed class GeneFusionKeyTests { [Fact] public void Create_ExpectedResults() { const string geneA = "ENSG00000006210"; const string geneB = "ENSG00000102962"; const ulong expectedFusionKey = 26671747011122; ulong actualFusionKey = GeneFusionKey.Create(GeneFusionKey.CreateGeneKey(geneA), GeneFusionKey.CreateGeneKey(geneB)); Assert.Equal(expectedFusionKey, actualFusionKey); } [Theory] [InlineData("ENSG00000006210", null)] [InlineData(null, "ENSG00000102962")] [InlineData(null, null)] public void Create_OneGeneIsNull_ReturnZero(string geneA, string geneB) { const ulong expectedFusionKey = 0; ulong actualFusionKey = GeneFusionKey.Create(GeneFusionKey.CreateGeneKey(geneA), GeneFusionKey.CreateGeneKey(geneB)); Assert.Equal(expectedFusionKey, actualFusionKey); } } } ================================================ FILE: UnitTests/VariantAnnotation/GeneFusions/Utilities/IndexEntryExtensionsTests.cs ================================================ using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.GeneFusions.Utilities; using Xunit; namespace UnitTests.VariantAnnotation.GeneFusions.Utilities { public sealed class IndexEntryExtensionsTests { private readonly GeneFusionIndexEntry[] _indexEntries; public IndexEntryExtensionsTests() { _indexEntries = new GeneFusionIndexEntry[] { new(1000, 1), new(1001, 2), new(2000, 3), new(3000, 4), new(3100, 5) }; } [Theory] [InlineData(1000, 1)] [InlineData(1001, 2)] [InlineData(2000, 3)] [InlineData(3000, 4)] [InlineData(3100, 5)] public void GetIndex_Matches_ExpectedResults(ulong geneKey, ushort expectedIndex) { ushort? actualIndex = _indexEntries.GetIndex(geneKey); Assert.NotNull(actualIndex); Assert.Equal(expectedIndex, actualIndex); } [Theory] [InlineData(100)] [InlineData(1002)] [InlineData(4000)] public void GetIndex_NotFound_ReturnNull(ulong geneKey) { ushort? actualIndex = _indexEntries.GetIndex(geneKey); Assert.Null(actualIndex); } } } ================================================ FILE: UnitTests/VariantAnnotation/IO/Caches/CacheConstantsTests.cs ================================================ using IO; using Xunit; namespace UnitTests.VariantAnnotation.IO.Caches { public sealed class CacheConstantsTests { [Fact] public void TranscriptPath_Null_WithNullPrefix() { var observedResult = CacheConstants.TranscriptPath(null); Assert.Null(observedResult); } [Fact] public void TranscriptPath_NominalCase() { const string expectedResult = "bob.transcripts.ndb"; var observedResult = CacheConstants.TranscriptPath("bob"); Assert.Equal(expectedResult, observedResult); } [Fact] public void SiftPath_NominalCase() { const string expectedResult = "bob.sift.ndb"; var observedResult = CacheConstants.SiftPath("bob"); Assert.Equal(expectedResult, observedResult); } [Fact] public void PolyPhenPath_NominalCase() { const string expectedResult = "bob.polyphen.ndb"; var observedResult = CacheConstants.PolyPhenPath("bob"); Assert.Equal(expectedResult, observedResult); } } } ================================================ FILE: UnitTests/VariantAnnotation/IO/Caches/CacheHeaderTests.cs ================================================ using System.IO; using System.Text; using Genome; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO.Caches; using Xunit; namespace UnitTests.VariantAnnotation.IO.Caches { public sealed class CacheHeaderTests { [Fact] public void CacheHeader_EndToEnd() { const Source expectedTranscriptSource = Source.BothRefSeqAndEnsembl; const long expectedCreationTimeTicks = long.MaxValue; const GenomeAssembly expectedAssembly = GenomeAssembly.hg19; const ushort expectedVepVersion = ushort.MaxValue; var expectedBaseHeader = new Header("VEP", 1, 2, expectedTranscriptSource, expectedCreationTimeTicks, expectedAssembly); var expectedCustomHeader = new TranscriptCacheCustomHeader(expectedVepVersion, 0); var expectedHeader = new CacheHeader(expectedBaseHeader, expectedCustomHeader); CacheHeader observedHeader; using (var ms = new MemoryStream()) { using (var writer = new BinaryWriter(ms, Encoding.UTF8, true)) { expectedHeader.Write(writer); } ms.Position = 0; observedHeader = CacheHeader.Read(ms); } Assert.NotNull(observedHeader); Assert.Equal(expectedTranscriptSource, observedHeader.Source); Assert.Equal(expectedCreationTimeTicks, observedHeader.CreationTimeTicks); Assert.Equal(expectedAssembly, observedHeader.Assembly); Assert.Equal(expectedVepVersion, observedHeader.Custom.VepVersion); } } } ================================================ FILE: UnitTests/VariantAnnotation/IO/Caches/TranscriptCacheReaderTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using CacheUtils.TranscriptCache; using Genome; using Intervals; using IO; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.IO.Caches; using Xunit; namespace UnitTests.VariantAnnotation.IO.Caches { public sealed class TranscriptCacheReaderTests { private readonly TranscriptCacheData _expectedCacheData; private readonly CacheHeader _expectedHeader; public TranscriptCacheReaderTests() { const GenomeAssembly genomeAssembly = GenomeAssembly.GRCh38; var baseHeader = new Header("test", 2, 3, Source.BothRefSeqAndEnsembl, 4, genomeAssembly); var customHeader = new TranscriptCacheCustomHeader(1, 2); _expectedHeader = new CacheHeader(baseHeader, customHeader); var transcriptRegions = new ITranscriptRegion[] { new TranscriptRegion(TranscriptRegionType.Exon, 1, 100, 199, 300, 399), new TranscriptRegion(TranscriptRegionType.Intron, 1, 200, 299, 399, 400), new TranscriptRegion(TranscriptRegionType.Exon, 2, 300, 399, 400, 499) }; var mirnas = new IInterval[2]; mirnas[0] = new Interval(100, 200); mirnas[1] = new Interval(300, 400); var peptideSeqs = new[] { "MASE*" }; var genes = new IGene[1]; genes[0] = new Gene(ChromosomeUtilities.Chr3, 100, 200, true, "TP53", 300, CompactId.Convert("7157"), CompactId.Convert("ENSG00000141510")); var regulatoryRegions = new IRegulatoryRegion[2]; regulatoryRegions[0] = new RegulatoryRegion(ChromosomeUtilities.Chr3, 1200, 1300, CompactId.Convert("123"), RegulatoryRegionType.enhancer); regulatoryRegions[1] = new RegulatoryRegion(ChromosomeUtilities.Chr3, 1250, 1450, CompactId.Convert("456"), RegulatoryRegionType.enhancer); var regulatoryRegionIntervalArrays = regulatoryRegions.ToIntervalArrays(3); var transcripts = GetTranscripts(ChromosomeUtilities.Chr3, genes, transcriptRegions, mirnas); var transcriptIntervalArrays = transcripts.ToIntervalArrays(3); _expectedCacheData = new TranscriptCacheData(_expectedHeader, genes, transcriptRegions, mirnas, peptideSeqs, transcriptIntervalArrays, regulatoryRegionIntervalArrays); } [Fact] public void TranscriptCacheReader_EndToEnd() { TranscriptCacheData observedCache; using (var ms = new MemoryStream()) { using (var writer = new TranscriptCacheWriter(ms, _expectedHeader, true)) { writer.Write(_expectedCacheData); } ms.Position = 0; using (var reader = new TranscriptCacheReader(ms)) { observedCache = reader.Read(ChromosomeUtilities.RefIndexToChromosome); } } Assert.NotNull(observedCache); Assert.Equal(_expectedCacheData.PeptideSeqs, observedCache.PeptideSeqs); CheckChromosomeIntervals(_expectedCacheData.Genes, observedCache.Genes); CheckIntervalArrays(_expectedCacheData.RegulatoryRegionIntervalArrays, observedCache.RegulatoryRegionIntervalArrays); CheckIntervalArrays(_expectedCacheData.TranscriptIntervalArrays, observedCache.TranscriptIntervalArrays); CheckIntervals(_expectedCacheData.TranscriptRegions, observedCache.TranscriptRegions); CheckIntervals(_expectedCacheData.Mirnas, observedCache.Mirnas); } private static void CheckIntervalArrays(IntervalArray[] expected, IntervalArray[] observed) where T : IInterval { Assert.Equal(expected.Length, observed.Length); for (var refIndex = 0; refIndex < expected.Length; refIndex++) { var expectedIntervalArray = expected[refIndex]; var observedIntervalArray = observed[refIndex]; if (expectedIntervalArray == null && observedIntervalArray == null) continue; Assert.NotNull(expectedIntervalArray); Assert.NotNull(observedIntervalArray); Assert.Equal(expectedIntervalArray.Array.Length, observedIntervalArray.Array.Length); for (var i = 0; i < expectedIntervalArray.Array.Length; i++) { var expectedInterval = expectedIntervalArray.Array[i]; var observedInterval = observedIntervalArray.Array[i]; Assert.Equal(expectedInterval.Begin, observedInterval.Begin); Assert.Equal(expectedInterval.End, observedInterval.End); } } } private static void CheckChromosomeIntervals(IEnumerable expected, IEnumerable observed) { var expectedList = expected.ToList(); var observedList = observed.ToList(); Assert.Equal(expectedList.Count, observedList.Count); for (var i = 0; i < expectedList.Count; i++) { var expectedEntry = expectedList[i]; var observedEntry = observedList[i]; Assert.Equal(expectedEntry.Chromosome.EnsemblName, observedEntry.Chromosome.EnsemblName); Assert.Equal(expectedEntry.Start, observedEntry.Start); Assert.Equal(expectedEntry.End, observedEntry.End); } } private static void CheckIntervals(IEnumerable expected, IEnumerable observed) { var expectedList = expected.ToList(); var observedList = observed.ToList(); Assert.Equal(expectedList.Count, observedList.Count); for (var i = 0; i < expectedList.Count; i++) { var expectedEntry = expectedList[i]; var observedEntry = observedList[i]; Assert.Equal(expectedEntry.Start, observedEntry.Start); Assert.Equal(expectedEntry.End, observedEntry.End); } } [Fact] public void ReadItems_EndToEnd() { var expectedStrings = new[] { "Huey", "Duey", "Louie" }; string[] observedStrings; using (var ms = new MemoryStream()) { // ReSharper disable AccessToDisposedClosure using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) { TranscriptCacheWriter.WriteItems(writer, expectedStrings, x => writer.WriteOptAscii(x)); } ms.Position = 0; using (var reader = new BufferedBinaryReader(ms)) { observedStrings = TranscriptCacheReader.ReadItems(reader, () => reader.ReadAsciiString()); } // ReSharper restore AccessToDisposedClosure } Assert.NotNull(observedStrings); Assert.Equal(expectedStrings, observedStrings); } [Fact] public void CheckGuard_InvalidGuard() { Assert.Throws(delegate { using (var ms = new MemoryStream()) { using (var writer = new ExtendedBinaryWriter(ms, Encoding.UTF8, true)) writer.Write(7); ms.Position = 0; using (var reader = new BufferedBinaryReader(ms)) TranscriptCacheReader.CheckGuard(reader); } }); } private static ITranscript[] GetTranscripts(Chromosome chromosome, IGene[] genes, ITranscriptRegion[] regions, IInterval[] mirnas) { return new ITranscript[] { new Transcript(chromosome, 120, 180, CompactId.Convert("789"), null, BioType.IG_D_gene, genes[0], 0, 0, false, regions, 0, mirnas, -1, -1, Source.None, false, false, null, null) }; } } } ================================================ FILE: UnitTests/VariantAnnotation/IO/JsonObjectTests.cs ================================================ using System.Globalization; using System.Text; using System.Threading; using System.Threading.Tasks; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.IO; using Xunit; namespace UnitTests.VariantAnnotation.IO { public sealed class JsonObjectTests { [Fact] public void ProcessBoolValue_True_TwoTimes() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddBoolValue("test1", true); json.AddBoolValue("test2", true); const string expectedResult = "\"test1\":true,\"test2\":true"; var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(expectedResult, observedResult); } [Fact] public void AddBoolValue_True_TwoTimes() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddBoolValue("test1", true); json.AddBoolValue("test2", true); const string expectedResult = "\"test1\":true,\"test2\":true"; var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(expectedResult, observedResult); } [Fact] public void AddIntValue_TwoTimes() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddIntValue("test1", 5); json.AddIntValue("test2", 7); const string expectedResult = "\"test1\":5,\"test2\":7"; var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(expectedResult, observedResult); } [Fact] public void AddIntValue_NullInt() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddIntValue("test1", null); var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(string.Empty, observedResult); } [Fact] public void AddDoubleValue_TwoTimes() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddDoubleValue("test1", 5.7); json.AddDoubleValue("test2", 7.9); const string expectedResult = "\"test1\":5.7,\"test2\":7.9"; var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(expectedResult, observedResult); } public static string GetJsonDoubleString() { var defaultCulture = Thread.CurrentThread.CurrentCulture; var newCulture = CultureInfo.CreateSpecificCulture("fr-FR"); Thread.CurrentThread.CurrentCulture = newCulture; var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddDoubleValue("test1", 5.7); json.AddDoubleValue("test2", 7.9); var result = StringBuilderPool.GetStringAndReturn(sb); Thread.CurrentThread.CurrentCulture = defaultCulture; return result; } [Fact] public void AddDoubleValue_InvariantCulture() { var task = Task.Factory.StartNew(GetJsonDoubleString); var observedResult = task.Result; const string expectedResult = "\"test1\":5.7,\"test2\":7.9"; Assert.Equal(expectedResult, observedResult); } [Fact] public void AddDoubleValue_NullInt() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddDoubleValue("test1", null); var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(string.Empty, observedResult); } [Fact] public void AddStringValue_TwoTimes() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddStringValue("test1", "bob"); json.AddStringValue("test2", "jane", false); const string expectedResult = "\"test1\":\"bob\",\"test2\":jane"; var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(expectedResult, observedResult); } [Fact] public void AddStringValue_NullInt() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddStringValue("test1", null); var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(string.Empty, observedResult); } [Fact] public void AddStringValues_TwoTimes() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); var strings = new[] { "A", "B", "C" }; var strings2 = new[] { "D", "E", "F" }; json.AddStringValues("test1", strings); json.AddStringValues("test2", strings2, false); const string expectedResult = "\"test1\":[\"A\",\"B\",\"C\"],\"test2\":[D,E,F]"; var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(expectedResult, observedResult); } [Fact] public void AddStringValues_NullArray() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddStringValues("test1", (string[])null); var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(string.Empty, observedResult); } [Fact] public void AddIntValues_TwoTimes() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); var ints = new[] { 1, 2, 3 }; var ints2 = new[] { 4, 5, 6 }; json.AddIntValues("test1", ints); json.AddIntValues("test2", ints2); const string expectedResult = "\"test1\":[1,2,3],\"test2\":[4,5,6]"; var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(expectedResult, observedResult); } [Fact] public void AddIntValues_NullArray() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddIntValues("test1", null); var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(string.Empty, observedResult); } [Fact] public void AddObjectValues_TwoTimes() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); var points = new Point[2]; points[0] = new Point(1, 2); points[1] = new Point(3, 4); var points2 = new Point[1]; points2[0] = new Point(5, 6); json.AddObjectValues("test1", points); json.AddObjectValues("test2", points2); const string expectedResult = "\"test1\":[{\"X\":1,\"Y\":2},{\"X\":3,\"Y\":4}],\"test2\":[{\"X\":5,\"Y\":6}]"; var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(expectedResult, observedResult); } [Fact] public void AddObjectValues_NullArray() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddObjectValues("test1", null as Point[]); var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(string.Empty, observedResult); } [Fact] public void AddStringValues_EmptyArray() { var sb = StringBuilderPool.Get(); var json = new JsonObject(sb); json.AddStringValues("test1", new string[0]); var observedResult = StringBuilderPool.GetStringAndReturn(sb); Assert.Equal(string.Empty, observedResult); } private sealed class Point : IJsonSerializer { private readonly int _x; private readonly int _y; public Point(int x, int y) { _x = x; _y = y; } public void SerializeJson(StringBuilder sb) { var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddIntValue("X", _x); jsonObject.AddIntValue("Y", _y); sb.Append(JsonObject.CloseBrace); } } } } ================================================ FILE: UnitTests/VariantAnnotation/IO/JsonWriterTests.cs ================================================ using System.Collections.Generic; using System.IO; using System.Text; using Moq; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.IO; using VariantAnnotation.Providers; using Xunit; namespace UnitTests.VariantAnnotation.IO { public sealed class JsonWriterTests { [Fact] public void WriteJsonEntry_Nominal() { var dataSourceVersions = new List { new DataSourceVersion("nirvana", "2.0", 100) }; var sampleNames = new[] { "NA12878" }; var position1 = new Mock(); position1.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); position1.SetupGet(x => x.Start).Returns(100); position1.SetupGet(x => x.End).Returns(100); var position2 = new Mock(); position2.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); position2.SetupGet(x => x.Start).Returns(101); position2.SetupGet(x => x.End).Returns(101); string observedResult; using (var ms = new MemoryStream()) { using (var streamWriter = new StreamWriter(ms, Encoding.ASCII, 1024, true)) using (var writer = new JsonWriter(streamWriter, null, "nirvana", "time", "vep", dataSourceVersions, "hg19", sampleNames, false)) { writer.WritePosition(position1.Object, "{\"test\":\"good\"}"); writer.WritePosition(position2.Object, "{\"crash\":\"bad\"}"); writer.WritePosition(null, (string)null); } observedResult = Encoding.UTF8.GetString(ms.ToArray()); } const string expectedResult = "{\"header\":{\"annotator\":\"nirvana\",\"creationTime\":\"time\",\"genomeAssembly\":\"hg19\",\"schemaVersion\":6,\"dataVersion\":\"vep\",\"dataSources\":[{\"name\":\"nirvana\",\"version\":\"2.0\",\"releaseDate\":\"0001-01-01\"}],\"samples\":[\"NA12878\"]},\"positions\":[\n{\"test\":\"good\"},\n{\"crash\":\"bad\"}\n]}\n"; Assert.Equal(expectedResult, observedResult); } } } ================================================ FILE: UnitTests/VariantAnnotation/IO/SampleExtensionsTests.cs ================================================ using VariantAnnotation.IO; using Vcf.Sample; using Xunit; namespace UnitTests.VariantAnnotation.IO { public sealed class SampleExtensionsTests { [Fact] public void GetJsonString_Nominal() { var sample = new Sample(new[] {23, 34}, 12.345f, 3, new[] {"-", "+"}, true, "1/2", 98, true, 98.3, 56.67f, new[] {8, 14}, new[] {7, 4}, new[] {10, 15}, 34, new[] {0.34, 0.56}, 1, 2.3, null, new []{"12.34", "null"}, 1234); string observedResult = sample.GetJsonString(); Assert.Contains("\"alleleDepths\":[23,34]", observedResult); Assert.Contains("\"artifactAdjustedQualityScore\":12.3", observedResult); Assert.Contains("\"copyNumber\":3", observedResult); Assert.Contains("\"diseaseAffectedStatuses\":[\"-\",\"+\"]", observedResult); Assert.Contains("\"failedFilter\":true", observedResult); Assert.Contains("\"genotype\":\"1/2\"", observedResult); Assert.Contains("\"genotypeQuality\":98", observedResult); Assert.Contains("\"isDeNovo\":true", observedResult); Assert.Contains("\"deNovoQuality\":98.3", observedResult); Assert.Contains("\"likelihoodRatioQualityScore\":56.7", observedResult); Assert.Contains("\"pairedEndReadCounts\":[8,14]", observedResult); Assert.Contains("\"repeatUnitCounts\":[7,4]", observedResult); Assert.Contains("\"splitReadCounts\":[10,15]", observedResult); Assert.Contains("\"totalDepth\":34", observedResult); Assert.Contains("\"variantFrequencies\":[0.34,0.56]", observedResult); Assert.Contains("\"minorHaplotypeCopyNumber\":1", observedResult); Assert.Contains("\"somaticQuality\":2.3", observedResult); Assert.Contains("\"heteroplasmyPercentile\":[12.34,null]", observedResult); Assert.Contains("\"binCount\":1234", observedResult); } } } ================================================ FILE: UnitTests/VariantAnnotation/NSA/NsaIndexTests.cs ================================================ using System; using System.IO; using Genome; using IO; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Xunit; namespace UnitTests.VariantAnnotation.NSA { public sealed class NsaIndexTests { [Fact] public void Query_chunks_in_same_chrom() { var stream = new MemoryStream(); var writer = new ExtendedBinaryWriter(stream); var version = new DataSourceVersion("dbsnp", "150", DateTime.Now.Ticks, "dbsnp ids"); var index = new NsaIndex(writer, GenomeAssembly.GRCh37, version, "dbsnp", true, true, SaCommon.SchemaVersion, false); index.Add(0, 100, 2000, 23457, 89320); index.Add(0, 2100, 4000, 112778, 58746); index.Add(0, 4100, 7000, 171525, 658794); (long start, int chunkCount) = index.GetFileRange(0, 150, 2120); Assert.Equal(23457, start); Assert.Equal(2, chunkCount); (start, chunkCount) = index.GetFileRange(0, 50, 98); Assert.Equal(-1, start); Assert.Equal(0, chunkCount); (start, chunkCount) = index.GetFileRange(0, 150, 2010); Assert.Equal(23457, start); Assert.Equal(1, chunkCount); (start, chunkCount) = index.GetFileRange(0, 2010, 4050); Assert.Equal(112778, start); Assert.Equal(1, chunkCount); (start, chunkCount) = index.GetFileRange(0, 4010, 4050); Assert.Equal(-1, start); Assert.Equal(0, chunkCount); (start, chunkCount) = index.GetFileRange(0, 7010, 7050); Assert.Equal(-1, start); Assert.Equal(0, chunkCount); } [Fact] public void Query_chunks_in_different_chrom() { var stream = new MemoryStream(); var writer = new ExtendedBinaryWriter(stream); var version = new DataSourceVersion("dbsnp", "150", DateTime.Now.Ticks, "dbsnp ids"); var index = new NsaIndex(writer, GenomeAssembly.GRCh37, version, "dbsnp", true, true, SaCommon.SchemaVersion, false); index.Add(0, 100, 2000, 23457, 89320); index.Add(0, 2100, 4000, 112778, 58746); index.Add(0, 4100, 7000, 171525, 658794); index.Add(1, 100, 2000, 23457, 89320); index.Add(1, 2100, 4000, 112778, 58746); index.Add(1, 4100, 7000, 171525, 658794); (long start, int chunkCount) = index.GetFileRange(0, 150, 2120); Assert.Equal(23457, start); Assert.Equal(2, chunkCount); (start, chunkCount) = index.GetFileRange(0, 50, 98); Assert.Equal(-1, start); Assert.Equal(0, chunkCount); (start, chunkCount) = index.GetFileRange(0, 150, 2010); Assert.Equal(23457, start); Assert.Equal(1, chunkCount); (start, chunkCount) = index.GetFileRange(0, 2010, 4050); Assert.Equal(112778, start); Assert.Equal(1, chunkCount); (start, chunkCount) = index.GetFileRange(0, 4010, 4050); Assert.Equal(-1, start); Assert.Equal(0, chunkCount); (start, chunkCount) = index.GetFileRange(0, 7010, 7050); Assert.Equal(-1, start); Assert.Equal(0, chunkCount); //chr2 (start, chunkCount) = index.GetFileRange(0, 150, 2120); Assert.Equal(23457, start); Assert.Equal(2, chunkCount); (start, chunkCount) = index.GetFileRange(0, 50, 98); Assert.Equal(-1, start); Assert.Equal(0, chunkCount); (start, chunkCount) = index.GetFileRange(0, 150, 2010); Assert.Equal(23457, start); Assert.Equal(1, chunkCount); (start, chunkCount) = index.GetFileRange(0, 2010, 4050); Assert.Equal(112778, start); Assert.Equal(1, chunkCount); (start, chunkCount) = index.GetFileRange(0, 4010, 4050); Assert.Equal(-1, start); Assert.Equal(0, chunkCount); (start, chunkCount) = index.GetFileRange(0, 7010, 7050); Assert.Equal(-1, start); Assert.Equal(0, chunkCount); } } } ================================================ FILE: UnitTests/VariantAnnotation/NSA/RefMinorIndexTests.cs ================================================ using System; using System.IO; using Genome; using IO; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Xunit; namespace UnitTests.VariantAnnotation.NSA { public sealed class RefMinorIndexTests { [Fact] public void CreateAndQuery_one_chromosome() { using (var stream = new MemoryStream()) using(var writer = new ExtendedBinaryWriter(stream)) { var index = new RefMinorIndex(writer, GenomeAssembly.GRCh37, new DataSourceVersion("name", "1", DateTime.Now.Ticks), SaCommon.SchemaVersion ); index.Add(0, 100); index.Add(0, 105); index.Add(0, 110); index.Add(0, 115); index.Write(120); (long location, int byteCount, int count) = index.GetFileRange(0); Assert.Equal(100, location); Assert.Equal(20, byteCount); Assert.Equal(4, count); } } [Fact] public void CreateAndQuery_multiple_chromosomes() { using (var stream = new MemoryStream()) using (var writer = new ExtendedBinaryWriter(stream)) { var index = new RefMinorIndex(writer, GenomeAssembly.GRCh37, new DataSourceVersion("name", "1", DateTime.Now.Ticks), SaCommon.SchemaVersion); index.Add(0, 100); index.Add(0, 105); index.Add(0, 110); index.Add(0, 115); index.Add(1, 200); index.Add(1, 205); index.Add(1, 210); index.Add(2, 315); index.Write(320); (long location, int byteCount, int count) = index.GetFileRange(0); Assert.Equal(100, location); Assert.Equal(100, byteCount); Assert.Equal(4, count); (location, byteCount, count) = index.GetFileRange(1); Assert.Equal(200, location); Assert.Equal(115, byteCount); Assert.Equal(3, count); (location, byteCount, count) = index.GetFileRange(2); Assert.Equal(315, location); Assert.Equal(5, byteCount); Assert.Equal(1, count); } } [Fact] public void ReadBack() { var stream = new MemoryStream(); using (var writer = new ExtendedBinaryWriter(stream)) { var index = new RefMinorIndex(writer, GenomeAssembly.GRCh37, new DataSourceVersion("name", "1", DateTime.Now.Ticks), SaCommon.SchemaVersion); index.Add(0, 100); index.Add(0, 105); index.Add(0, 110); index.Add(0, 115); index.Add(1, 200); index.Add(1, 205); index.Add(1, 210); index.Add(2, 315); index.Write(320); } var readStream = new MemoryStream(stream.ToArray()) { Position = 0 }; using (var reader = new ExtendedBinaryReader(readStream)) { var index = new RefMinorIndex(reader); (long location, int byteCount, int count) = index.GetFileRange(0); Assert.Equal(100, location); Assert.Equal(100, byteCount); Assert.Equal(4, count); (location, byteCount, count) = index.GetFileRange(1); Assert.Equal(200, location); Assert.Equal(115, byteCount); Assert.Equal(3, count); (location, byteCount, count) = index.GetFileRange(2); Assert.Equal(315, location); Assert.Equal(5, byteCount); Assert.Equal(1, count); } } } } ================================================ FILE: UnitTests/VariantAnnotation/NSA/SuppAnnotationsOutputTests.cs ================================================ using System.Text; using ErrorHandling.Exceptions; using VariantAnnotation.NSA; using Xunit; namespace UnitTests.VariantAnnotation.NSA { public sealed class SuppAnnotationsOutputTests { [Fact] public void Output_positional_not_array() { var sa = new SupplementaryAnnotation("Anno", false, true, "pathogenic", null); var sb = new StringBuilder(); sa.SerializeJson(sb); Assert.Equal("pathogenic", sb.ToString()); } [Fact] public void Output_not_positional_not_array() { var sa = new SupplementaryAnnotation("alleleFreq", false, false, "pathogenic", null); var sb = new StringBuilder(); sa.SerializeJson(sb); Assert.Equal("{pathogenic}", sb.ToString()); } [Fact] public void Output_not_positional_array() { //e.g. clinvar var sa = new SupplementaryAnnotation("spliceAi", true, false, null, new []{"likely pathogenic", "unknown pathogenicity"}); var sb = new StringBuilder(); sa.SerializeJson(sb); Assert.Equal("[{likely pathogenic},{unknown pathogenicity}]", sb.ToString()); } [Fact] public void Output_emptyJsonStrings_array() { Assert.Throws(()=>new SupplementaryAnnotation("svAnno", true, true, "pathogenic", null)); } [Fact] public void Output_emptyJsonString_not_array() { Assert.Throws(() => new SupplementaryAnnotation("svAnno", false, true, null, new []{"pathogenic"})); } } } ================================================ FILE: UnitTests/VariantAnnotation/NSA/SuppIntervalUtilitiesTests.cs ================================================ using Genome; using UnitTests.TestUtilities; using VariantAnnotation.NSA; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.NSA { public sealed class SuppIntervalUtilitiesTests { [Theory] [InlineData(1, 100, 51, 200, 0.33333, 0.33333)] [InlineData(1, 300, 51, 200, 0.5, 1)] [InlineData(101, 300, 51, 200, 0.5, 0.66667)] [InlineData(1, 100, 100, 299, 0.005, 0.005)] public void GetOverlapFractions_NotNull_AsExpected(int varStart, int varEnd, int saStart, int saEnd, double expectedReciprocalOverlap, double expecedAnnotationOverlap) { var saInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, saStart, saEnd); var variant = new SimpleVariant(ChromosomeUtilities.Chr1, varStart, varEnd, null, null, VariantType.deletion); var (reciprocalOverlap, annotationOverlap) = SuppIntervalUtilities.GetOverlapFractions(saInterval, variant); Assert.NotNull(reciprocalOverlap); Assert.NotNull(annotationOverlap); Assert.Equal(expectedReciprocalOverlap, reciprocalOverlap.Value, 5); Assert.Equal(expecedAnnotationOverlap, annotationOverlap.Value, 5); } [Fact] public void GetOverlapFractions_ReturnNulls_DifferentChroms() { var saInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 1, 2); var variant = new SimpleVariant(ChromosomeUtilities.Chr2, 1, 2, null, null, VariantType.deletion); var (reciprocalOverlap, annotationOverlap) = SuppIntervalUtilities.GetOverlapFractions(saInterval, variant); Assert.Null(reciprocalOverlap); Assert.Null(annotationOverlap); } [Fact] public void GetOverlapFractions_ReturnNulls_Insertion() { var saInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 1, 2); var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1, 2, null, null, VariantType.insertion); var (reciprocalOverlap, annotationOverlap) = SuppIntervalUtilities.GetOverlapFractions(saInterval, variant); Assert.Null(reciprocalOverlap); Assert.Null(annotationOverlap); } [Fact] public void GetOverlapFractions_ReturnNulls_SaInsertion() { var saInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 2, 1); var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1, 2, null, null, VariantType.deletion); var (reciprocalOverlap, annotationOverlap) = SuppIntervalUtilities.GetOverlapFractions(saInterval, variant); Assert.Null(reciprocalOverlap); Assert.Null(annotationOverlap); } [Fact] public void GetOverlapFractions_ReturnNulls_BreakEnd() { var saInterval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 2, 1); var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 1, 2, null, null, VariantType.translocation_breakend); var (reciprocalOverlap, annotationOverlap) = SuppIntervalUtilities.GetOverlapFractions(saInterval, variant); Assert.Null(reciprocalOverlap); Assert.Null(annotationOverlap); } } } ================================================ FILE: UnitTests/VariantAnnotation/ProviderTests/GsaProviderTests.cs ================================================ using System; using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using Genome; using Moq; using OptimizedCore; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestUtilities; using UnitTests.VariantAnnotation.ScoreFile; using VariantAnnotation.GenericScore; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Pools; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Variants; using Vcf.VariantCreator; using Xunit; namespace UnitTests.VariantAnnotation.ProviderTests; public sealed class GsaProviderTests { private static (ScoreReader, Dictionary>>) GetScoreReaderWithData() { var testSetup = new Dictionary>> { { ChromosomeUtilities.Chr1, new List> { new Dictionary { {"startPosition", 10_001}, {"endPosition", 23_000}, } } }, { ChromosomeUtilities.Chr2, new List> { new Dictionary { {"startPosition", 11_001}, {"endPosition", 23_500}, } } }, }; return (TestDataGenerator.GetScoreReaderWithRandomData(testSetup), testSetup); } private static (ScoreProvider provider, Dictionary>> providerTestData) GetScoreProvider() { (ScoreReader scoreReader, Dictionary>> testData) = GetScoreReaderWithData(); var provider = new ScoreProvider(new[] {scoreReader}); return (provider, testData); } private static IAnnotatedPosition GetPosition(Chromosome chrom, int start, string refAllele, string[] altAlleles) { var position = new Mock(); var annotatedVariants = new List(); foreach (string altAllele in altAlleles) { VariantType type = SmallVariantCreator.GetVariantType(refAllele, altAllele); int end = start + altAllele.Length - 1; var variant = VariantPool.Get(chrom, start, end, refAllele, altAllele, type, null, false, false, false, null, AnnotationBehavior.SmallVariants, false); annotatedVariants.Add(AnnotatedVariantPool.Get(variant)); } position.SetupGet(x => x.AnnotatedVariants).Returns(annotatedVariants.ToArray); return position.Object; } [Fact] public void TestAnnotateUsingScoreProvider() { (IAnnotationProvider provider, Dictionary>> testSetup) = GetScoreProvider(); foreach ((Chromosome chromosome, List> chromosomeTests) in testSetup) { foreach (Dictionary chromosomeTest in chromosomeTests) { var expectedScores = (List) chromosomeTest["expectedScores"]; var startPosition = (int) chromosomeTest["startPosition"]; for (var i = 0; i < expectedScores.Count; i++) { IAnnotatedPosition position = GetPosition(chromosome, startPosition + i, "T", new[] {"A"}); provider.Annotate(position); var sb = position.AnnotatedVariants[0].GetJsonStringBuilder(chromosome.UcscName); var jsonString = sb.ToString(); StringBuilderPool.Return(sb); var expectedScore = $"{Math.Round(expectedScores[i], 2):0.##}"; var expectedString = "{\"chromosome\":\"" + $"{chromosome.UcscName}\"," + "\"begin\":" + $"{startPosition + i}," + "\"end\":" + $"{startPosition + i}," + "\"refAllele\":" + "\"T\"," + "\"altAllele\":" + "\"A\"," + "\"variantType\":" + "\"SNV\"," + "\"TestKey\":" + $"{expectedScore}" + "}"; Assert.Equal(expectedString, jsonString); } } } } [Fact] public void TestSNVTypeAnnotationOnly() { (IAnnotationProvider provider, Dictionary>> testSetup) = GetScoreProvider(); var position = new Mock(); var annotatedVariants = new List(); var type = VariantType.insertion; Variant variant = VariantPool.Get(ChromosomeUtilities.Chr1, 15_000, 15_001, "-", "G", type, null, false, false, false, null, AnnotationBehavior.SmallVariants, false); annotatedVariants.Add(AnnotatedVariantPool.Get(variant)); position.SetupGet(x => x.AnnotatedVariants).Returns(annotatedVariants.ToArray); IAnnotatedPosition annotatedPosition = position.Object; provider.Annotate(annotatedPosition); Assert.Empty(annotatedPosition.AnnotatedVariants[0].SaList); } [Fact] private void TestUnknownPosition() { (IAnnotationProvider provider, Dictionary>> testSetup) = GetScoreProvider(); IAnnotatedPosition position = GetPosition(ChromosomeUtilities.Chr1, 5_000, "T", new[] {"A"}); provider.Annotate(position); Assert.Empty(position.AnnotatedVariants[0].SaList); // Unknown Chromosome position = GetPosition(ChromosomeUtilities.Chr7, 5_000, "T", new[] {"A"}); provider.Annotate(position); Assert.Empty(position.AnnotatedVariants[0].SaList); } [Fact] private void TestUnknownAssembly() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); string[] nucleotides = {"A", "C", "G", "T"}; var writerSettings = new WriterSettings( 10_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); var position = 10_010; using (var dataStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new ScoreFileWriter( writerSettings, dataStream, indexStream, version, GenericScoreTests.GetAllASequenceProvider(GenomeAssembly.Unknown), SaCommon.SchemaVersion, skipIncorrectRefEntries: false, leaveOpen: true )) { IEnumerable items = new List { new(ChromosomeUtilities.Chr1, position, "A", "C", 0.5), }; saWriter.Write(items); } dataStream.Position = 0; indexStream.Position = 0; ScoreReader scoreReader = ScoreReader.Read(dataStream, indexStream); Assert.Throws(() => new ScoreProvider(new[] {scoreReader})); } } } ================================================ FILE: UnitTests/VariantAnnotation/ProviderTests/LcrProviderTests.cs ================================================ using System; using System.IO; using Genome; using Moq; using SAUtils.gnomAD; using UnitTests.TestUtilities; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Xunit; namespace UnitTests.VariantAnnotation.ProviderTests { public class LcrProviderTests { private Stream GetNsiStream() { var stream = new MemoryStream(); var version = new DataSourceVersion("test", "June_2020", DateTime.Now.Ticks, "dummy"); using (var writer = new NsiWriter(stream, version, GenomeAssembly.GRCh37, SaCommon.LowComplexityRegionTag, ReportFor.AllVariants, SaCommon.NsiSchemaVersion, true)) { writer.Write(new [] { new LcrInterval(ChromosomeUtilities.Chr1, 100, 150), new LcrInterval(ChromosomeUtilities.Chr1, 300, 450), new LcrInterval(ChromosomeUtilities.Chr1, 600, 650), new LcrInterval(ChromosomeUtilities.Chr2, 100, 150), new LcrInterval(ChromosomeUtilities.Chr2, 300, 450), new LcrInterval(ChromosomeUtilities.Chr2, 600, 650) }); } stream.Position = 0; return stream; } private IAnnotatedVariant GetAnnotatedVariant(Chromosome chromosome, int start, int end) { var annoVariant = new Mock(); annoVariant.SetupGet(x => x.Variant.Chromosome).Returns(chromosome); annoVariant.SetupGet(x => x.Variant.Start).Returns(start); annoVariant.SetupGet(x => x.Variant.End).Returns(end); annoVariant.SetupProperty(x => x.InLowComplexityRegion); return annoVariant.Object; } private IAnnotatedPosition GetAnnotatedPosition(Chromosome chromosome, int start, int end) { var annoPosition = new Mock(); annoPosition.SetupGet(x => x.AnnotatedVariants).Returns( new [] { GetAnnotatedVariant(chromosome, start, end) } ); return annoPosition.Object; } [Fact] public void AddAnnotationsTest() { using (var provider = new LcrProvider(GetNsiStream())) { var position = GetAnnotatedPosition(ChromosomeUtilities.Chr1, 50, 70); provider.Annotate(position); Assert.False(position.AnnotatedVariants[0].InLowComplexityRegion); position = GetAnnotatedPosition(ChromosomeUtilities.Chr1, 110, 160); provider.Annotate(position); Assert.True(position.AnnotatedVariants[0].InLowComplexityRegion); position = GetAnnotatedPosition(ChromosomeUtilities.Chr2, 110, 160); provider.Annotate(position); Assert.True(position.AnnotatedVariants[0].InLowComplexityRegion); } } } } ================================================ FILE: UnitTests/VariantAnnotation/ProviderTests/NsaProviderTests.cs ================================================ using System; using System.Collections.Generic; using Genome; using Moq; using OptimizedCore; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.Pools; using VariantAnnotation.Providers; using Variants; using Vcf.VariantCreator; using Xunit; using VariantAnnotation.SA; namespace UnitTests.VariantAnnotation.ProviderTests { public sealed class NsaProviderTests { private static IAnnotationProvider GetDbSnpProvider() { var chrom1Pos100Annotations = new List<(string refAllele, string altAllele, string annotation)> { ("A", "T", "\"rs100\""), ("A", "C", "\"rs101\"") }; var dbsnpReader = new Mock(); dbsnpReader.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); dbsnpReader.SetupGet(x => x.MatchByAllele).Returns(true); dbsnpReader.SetupGet(x => x.IsArray).Returns(true); dbsnpReader.SetupGet(x => x.JsonKey).Returns("dbSnp"); dbsnpReader.SetupGet(x => x.Version) .Returns(new DataSourceVersion("dbsnp", "v1", DateTime.Now.Ticks, "dummy db snp")); //dbsnpReader.SetupSequence(x => x.GetAnnotation(100)).Returns(chrom1Pos100Annotations); //List<(string refAllele, string altAllele, string annotation)> annotations=null; dbsnpReader.Setup(x => x.GetAnnotation(It.IsAny(), It.IsAny>() )) .Callback((int position, List<(string refAllele, string altAllele, string annotation)> annotations) => { annotations.Clear(); annotations.AddRange(chrom1Pos100Annotations); }); var provider = new NsaProvider(new[] {dbsnpReader.Object}, null, null); return provider; } private static IAnnotationProvider GetClinVarProvider() { var chrom1Pos100Annotations = new List<(string refAllele, string altAllele, string annotation)> { ("A", "T", "RCV00001"), ("A", "C", "RCV00002") }; var clinvarReader = new Mock(); clinvarReader.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); clinvarReader.SetupGet(x => x.MatchByAllele).Returns(false); clinvarReader.SetupGet(x => x.IsArray).Returns(true); clinvarReader.SetupGet(x => x.JsonKey).Returns("clinvar"); clinvarReader.SetupGet(x => x.Version) .Returns(new DataSourceVersion("clinvar", "v1", DateTime.Now.Ticks, "dummy clinvar data")); clinvarReader.Setup(x => x.GetAnnotation(It.IsAny(), It.IsAny>() )) .Callback((int position, List<(string refAllele, string altAllele, string annotation)> annotations) => { annotations.Clear(); annotations.AddRange(chrom1Pos100Annotations); }); var provider = new NsaProvider(new[] {clinvarReader.Object}, null, null); return provider; } private static IAnnotationProvider GetGmeProvider() { var chrom1Post69134Annotations = new List<(string refAllele, string altAllele, string annotation)> { ("A", "G", "\"allAc\":10,\"allAn\":202,\"allAf\":0.0495,\"failedFilter\":true") }; var gmeReader = new Mock(); gmeReader.SetupGet(x => x.Assembly).Returns(GenomeAssembly.GRCh37); gmeReader.SetupGet(x => x.MatchByAllele).Returns(true); gmeReader.SetupGet(x => x.IsArray).Returns(false); gmeReader.SetupGet(x => x.JsonKey).Returns(SaCommon.GmeTag); gmeReader.SetupGet(x => x.Version) .Returns(new DataSourceVersion(SaCommon.GmeTag, "v1", DateTime.Now.Ticks, "dummy gme data")); //dbsnpReader.SetupSequence(x => x.GetAnnotation(100)).Returns(chrom1Pos100Annotations); //List<(string refAllele, string altAllele, string annotation)> annotations=null; gmeReader.Setup(x => x.GetAnnotation(It.IsAny(), It.IsAny>() )) .Callback((int position, List<(string refAllele, string altAllele, string annotation)> annotations) => { annotations.Clear(); annotations.AddRange(chrom1Post69134Annotations); }); var provider = new NsaProvider(new[] {gmeReader.Object}, null, null); return provider; } private static IAnnotatedPosition GetPosition(Chromosome chrom, int start, string refAllele, string[] altAlleles) { var position = new Mock(); var annotatedVariants = new List(); foreach (string altAllele in altAlleles) { VariantType type = SmallVariantCreator.GetVariantType(refAllele, altAllele); int end = start + altAllele.Length - 1; var variant = VariantPool.Get(chrom, start, end, refAllele, altAllele, type, null, false, false, false, null, AnnotationBehavior.SmallVariants, false); annotatedVariants.Add(AnnotatedVariantPool.Get(variant)); } position.SetupGet(x => x.AnnotatedVariants).Returns(annotatedVariants.ToArray); return position.Object; } [Fact] public void Annotate_alleleSpecific() { var provider = GetDbSnpProvider(); var position = GetPosition(ChromosomeUtilities.Chr1, 100, "A", new []{"T"}); provider.Annotate(position); var sb = position.AnnotatedVariants[0].GetJsonStringBuilder("chr1"); var jsonString = sb.ToString(); StringBuilderPool.Return(sb); Assert.Equal("{\"chromosome\":\"chr1\",\"begin\":100,\"end\":100,\"refAllele\":\"A\",\"altAllele\":\"T\",\"variantType\":\"SNV\",\"dbSnp\":[\"rs100\"]}", jsonString); VariantPool.Return((Variant)position.AnnotatedVariants[0].Variant); AnnotatedVariantPool.Return((AnnotatedVariant) position.AnnotatedVariants[0]); } [Fact] public void Annotate_gme() { var provider = GetGmeProvider(); var position = GetPosition(ChromosomeUtilities.Chr1, 69134, "A", new []{"G"}); provider.Annotate(position); var sb = position.AnnotatedVariants[0].GetJsonStringBuilder("chr1"); var jsonString = sb.ToString(); StringBuilderPool.Return(sb); Assert.Equal("{\"chromosome\":\"chr1\",\"begin\":69134,\"end\":69134,\"refAllele\":\"A\",\"altAllele\":\"G\",\"variantType\":\"SNV\",\"gmeVariome\":{\"allAc\":10,\"allAn\":202,\"allAf\":0.0495,\"failedFilter\":true}}", jsonString); VariantPool.Return((Variant)position.AnnotatedVariants[0].Variant); AnnotatedVariantPool.Return((AnnotatedVariant) position.AnnotatedVariants[0]); } [Fact] public void Annotate_notAlleleSpecific_isArray() { var provider = GetClinVarProvider(); var position = GetPosition(ChromosomeUtilities.Chr1, 100, "A", new[] { "T" }); provider.Annotate(position); var sb = position.AnnotatedVariants[0].GetJsonStringBuilder("chr1"); var jsonString = sb.ToString(); StringBuilderPool.Return(sb); Assert.Equal("{\"chromosome\":\"chr1\",\"begin\":100,\"end\":100,\"refAllele\":\"A\",\"altAllele\":\"T\",\"variantType\":\"SNV\",\"clinvar\":[{RCV00001,\"isAlleleSpecific\":true},{RCV00002}]}", jsonString); VariantPool.Return((Variant)position.AnnotatedVariants[0].Variant); AnnotatedVariantPool.Return((AnnotatedVariant) position.AnnotatedVariants[0]); } } } ================================================ FILE: UnitTests/VariantAnnotation/ScoreFile/GenericScoreEncoderTests.cs ================================================ using System.Collections.Generic; using System.IO; using IO; using VariantAnnotation.GenericScore; using Xunit; namespace UnitTests.VariantAnnotation.ScoreFile; public sealed class GenericScoreEncoderTests { [Fact] public void TestEncoderDecoder() { var testData = new List<(double inputNumber, double expectedResult)> { (0.246, 0.246), (0.2461, 0.2461), (0.999, 0.999), (0.127, 0.127), (0.128, 0.128), (0.129, 0.129), // Duplicate of above 3 data points to check if the generic score only stores the codes uniquely (0.127, 0.127), (0.128, 0.128), (0.129, 0.129), (0.254, 0.254), (0.255, 0.255), (0.256, 0.256), (0.1271, 0.1271), (0.1281, 0.1281), (0.1291, 0.1291), (0.2541, 0.2541), (0.2551, 0.2551), (0.2561, 0.2561), (0.1266, 0.1266), (0.1276, 0.1276), (0.0, 0.0), (1.0, 1.0), (-1.0, -1.0), (double.NaN, double.NaN) }; var scoreEncoder = new GenericScoreEncoder(); foreach ((double input, _) in testData) { scoreEncoder.AddScore(input); } using var stream = new MemoryStream(); using var writer = new ExtendedBinaryWriter(stream, System.Text.Encoding.Default); scoreEncoder.Write(writer); stream.Position = 0; var reader = new ExtendedBinaryReader(stream); GenericScoreEncoder deserializedScoreEncoder = GenericScoreEncoder.Read(reader); stream.Close(); foreach ((double inputNumber, double expectedOutput)in testData) { Assert.Equal(expectedOutput, EncodeDecode(deserializedScoreEncoder, inputNumber)); } } private static double EncodeDecode(GenericScoreEncoder encoder, double number) { return encoder.DecodeFromBytes(encoder.EncodeToBytes(number)); } } ================================================ FILE: UnitTests/VariantAnnotation/ScoreFile/GenericScoreEndToEndTests.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestUtilities; using VariantAnnotation.GenericScore; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Xunit; namespace UnitTests.VariantAnnotation.ScoreFile { public sealed class GenericScoreEndToEndTests { [Fact] public void ScoreWriterTestRandomData() { const int blockLength = 10_000; const int places = 2; double tol = Math.Pow(10, -places); string[] nucleotides = {"A", "C", "G", "T"}; var testSetup = new Dictionary>> { // Normal Chromosome { ChromosomeUtilities.Chr1, new List> { new Dictionary { {"startPosition", 10_001}, {"endPosition", 23_000}, } } }, // Chromosome with large gaps { ChromosomeUtilities.Chr2, new List> { new Dictionary { {"startPosition", 24_001}, {"endPosition", 100_000}, }, // 5 Block gap new Dictionary { {"startPosition", 154_001}, {"endPosition", 200_000}, }, } }, // Next chromosome starting at immediately next position to last chromosome ending position { ChromosomeUtilities.Chr3, new List> { new Dictionary { {"startPosition", 200_001}, {"endPosition", 210_000}, }, new Dictionary { {"startPosition", 210_001}, {"endPosition", 214_000}, }, // Short gap but still within the same block new Dictionary { {"startPosition", 215_001}, {"endPosition", 216_000}, }, // Larger gap to go to next block new Dictionary { {"startPosition", 221_001}, {"endPosition", 235_000}, }, } }, // New chromosome with positions that preceed others { ChromosomeUtilities.Chr4, new List> { new Dictionary { {"startPosition", 10_001}, {"endPosition", 21_000}, } } }, }; var writeStream = new MemoryStream(); var indexStream = new MemoryStream(); var saItems = new List(); var version = new DataSourceVersion("Test", "1", DateTime.Parse(DateTime.Now.ToString("yyyy-MM-dd")).Ticks, "No description"); var writerSettings = new WriterSettings( blockLength, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); // Scoring function to fill random scores TestDataGenerator.GenerateRandomScoreData(testSetup, saItems, TestDataGenerator.GetSequenceProvider()); using (var scoreFileWriter = new ScoreFileWriter( writerSettings, writeStream, indexStream, version, TestDataGenerator.GetSequenceProvider(), SaCommon.SchemaVersion, leaveOpen: true )) { // Write saItems to stream scoreFileWriter.Write(saItems); // Reset streams in preparation for reading them indexStream.Position = 0; writeStream.Position = 0; // Read the scores ScoreReader scoreReader = ScoreReader.Read(writeStream, indexStream); // Assert scores are equal to what was set in test data AssertTestData(testSetup, scoreReader, blockLength, places, tol); // Scores in the gap Assert.Equal(double.NaN, scoreReader.GetScore(2, 100_001, "A")); // Scores for unspecified Allele Assert.Equal(double.NaN, scoreReader.GetScore(2, 100_001, "C")); } } [Fact] public void ScoreWriterTestDeterministicData() { const int blockLength = 10_000; const int places = 2; double tol = Math.Pow(10, -places); string[] nucleotides = {"A", "C", "G", "T"}; var testSetup = new Dictionary>> { { ChromosomeUtilities.Chr1, new List> { new Dictionary { {"startPosition", 10_001}, {"endPosition", 23_000}, } } }, { ChromosomeUtilities.Chr2, new List> { new Dictionary { {"startPosition", 24_001}, {"endPosition", 100_000}, } } }, }; var saItems = new List(); var writeStream = new MemoryStream(); var indexStream = new MemoryStream(); var version = new DataSourceVersion("Test", "1", DateTime.Parse(DateTime.Now.ToString("yyyy-MM-dd")).Ticks, "No description"); var writerSettings = new WriterSettings( 10_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); // Scoring function to fill scores from position double ScoreFunction(int i, int endPosition) => (double) i / endPosition; TestDataGenerator.GenerateTestData(testSetup, saItems, ScoreFunction, TestDataGenerator.GetSequenceProvider()); using (var scoreFileWriter = new ScoreFileWriter( writerSettings, writeStream, indexStream, version, TestDataGenerator.GetSequenceProvider(), SaCommon.SchemaVersion, leaveOpen: true )) { // Write saItems to stream scoreFileWriter.Write(saItems); // Reset streams in preparation for reading them indexStream.Position = 0; writeStream.Position = 0; // Read the scores var scoreReader = ScoreReader.Read(writeStream, indexStream); // Assert scores are equal to what was set in test data AssertTestData(testSetup, scoreReader, blockLength, places, tol); } } private static void AssertTestData(Dictionary>> testSetup, ScoreReader scoreReader, int blockLength, int places, double tol) { foreach ((Chromosome chromosome, List> chromosomeTests) in testSetup) { foreach (Dictionary chromosomeTest in chromosomeTests) { var expectedScores = (List) chromosomeTest["expectedScores"]; var startPosition = (int) chromosomeTest["startPosition"]; for (var i = 0; i < expectedScores.Count; i++) { // Read score at position double score = scoreReader.GetScore(chromosome.Index, startPosition + i, "A"); Assert.True(Math.Round(Math.Abs(expectedScores[i] - score), places) <= tol); } } var chromosomeStartPosition = (int) chromosomeTests[0]["startPosition"]; var chromosomeEndPosition = (int) chromosomeTests[^1]["endPosition"]; Assert.Equal(double.NaN, scoreReader.GetScore(chromosome.Index, chromosomeStartPosition - 1, "A")); Assert.Equal(double.NaN, scoreReader.GetScore(chromosome.Index, chromosomeEndPosition + 1, "A")); } } } } ================================================ FILE: UnitTests/VariantAnnotation/ScoreFile/GenericScoreTests.cs ================================================ using System; using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using Genome; using IO; using IO.v2; using Moq; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.GenericScore; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.ScoreFile { public sealed class GenericScoreTests { [Fact] public void TestScoreReader() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); string[] nucleotides = {"A", "C", "G", "T"}; var writerSettings = new WriterSettings( 10_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); using (var saStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new ScoreFileWriter( writerSettings, saStream, indexStream, version, GetAllASequenceProvider(), SaCommon.SchemaVersion, skipIncorrectRefEntries: false, leaveOpen: true )) { var items = GetSaItems(1000); saWriter.Write(items); } saStream.Position = 0; indexStream.Position = 0; var saReader = ScoreReader.Read(saStream, indexStream); // before any SA existed Assert.True(double.IsNaN(saReader.GetScore(0, 90, "C"))); // first entry of first block Assert.False(double.IsNaN(saReader.GetScore(0, 100, "C"))); // last query of first block Assert.False(double.IsNaN(saReader.GetScore(0, 480, "C"))); // between first and second block Assert.True(double.IsNaN(saReader.GetScore(0, 488, "C"))); // first entry of second block Assert.False(double.IsNaN(saReader.GetScore(0, 490, "C"))); // unknown allele Assert.True(double.IsNaN(saReader.GetScore(0, 490, "K"))); } } [Fact] public void TestParRegion() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); string[] nucleotides = {"A", "C", "G", "T"}; var writerSettings = new WriterSettings( 10_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); var count = 1000; using (var saStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new ScoreFileWriter( writerSettings, saStream, indexStream, version, GetAllASequenceProvider(), SaCommon.SchemaVersion, skipIncorrectRefEntries: false, leaveOpen: true )) { IEnumerable items = GetParRegionItems(count); saWriter.Write(items); } saStream.Position = 0; indexStream.Position = 0; var saReader = ScoreReader.Read(saStream, indexStream); var position = 10_010; for (int i = 0; i < count; i++, position += 2) { Assert.False(double.IsNaN(saReader.GetScore(23, position, "C"))); } } } [Fact] public void TestWriteUnknownAllele() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); string[] nucleotides = {"A", "C", "G", "T"}; var writerSettings = new WriterSettings( 10_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); var position = 10_010; using (var saStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new ScoreFileWriter( writerSettings, saStream, indexStream, version, GetAllASequenceProvider(), SaCommon.SchemaVersion, skipIncorrectRefEntries: false, leaveOpen: true )) { IEnumerable items = new List { new(ChromosomeUtilities.Chr1, position, "A", "K", 0.5), }; saWriter.Write(items); saStream.Position = 0; indexStream.Position = 0; var saReader = ScoreReader.Read(saStream, indexStream); Assert.True(double.IsNaN(saReader.GetScore(ChromosomeUtilities.Chr1.Index, position, "A"))); Assert.True(double.IsNaN(saReader.GetScore(ChromosomeUtilities.Chr1.Index, position, "C"))); Assert.True(double.IsNaN(saReader.GetScore(ChromosomeUtilities.Chr1.Index, position, "G"))); Assert.True(double.IsNaN(saReader.GetScore(ChromosomeUtilities.Chr1.Index, position, "T"))); } } } [Fact] public void TestOutOfOrderWriting() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); string[] nucleotides = {"A", "C", "G", "T"}; var writerSettings = new WriterSettings( 10_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); var position = 10_010; using (var saStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new ScoreFileWriter( writerSettings, saStream, indexStream, version, GetAllASequenceProvider(), SaCommon.SchemaVersion, skipIncorrectRefEntries: false, leaveOpen: true )) { IEnumerable items = new List { new(ChromosomeUtilities.Chr1, position, "A", "C", 0.5), new(ChromosomeUtilities.Chr1, position - 1, "A", "G", 0.5), }; Assert.Throws(() => saWriter.Write(items)); } } } [Fact] public void TestParRegion2() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); string[] nucleotides = {"A", "C", "G", "T"}; var writerSettings = new WriterSettings( 10_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); var position = 10_010; using (var dataStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new ScoreFileWriter( writerSettings, dataStream, indexStream, version, GetAllASequenceProvider(), SaCommon.SchemaVersion, skipIncorrectRefEntries: false, leaveOpen: true )) { IEnumerable items = new List { new(ChromosomeUtilities.ChrY, position, "N", "C", 0.5), }; saWriter.Write(items); } dataStream.Position = 0; indexStream.Position = 0; var saReader = ScoreReader.Read(dataStream, indexStream); Assert.Equal(0.5, saReader.GetScore(23, position, "C")); } } [Fact] public void SchemaVersionTest() { var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); string[] nucleotides = {"A", "C", "G", "T"}; var writerSettings = new WriterSettings( 10_000, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, true) ); var position = 10_010; using (var dataStream = new MemoryStream()) using (var indexStream = new MemoryStream()) { using (var saWriter = new ScoreFileWriter( writerSettings, dataStream, indexStream, version, GetAllASequenceProvider(), SaCommon.SchemaVersion + SaCommon.SchemaVersion, skipIncorrectRefEntries: false, leaveOpen: true )) { IEnumerable items = new List { new(ChromosomeUtilities.Chr1, position, "A", "C", 0.5), }; saWriter.Write(items); } dataStream.Position = 0; indexStream.Position = 0; Assert.Throws(() => ScoreReader.Read(dataStream, indexStream)); } } [Fact] public void TestHeader() { var testData = new List<(FileType GsaIndex, uint GuardInt, ushort)> { (FileType.GsaIndex, SaCommon.GuardInt, 1), // Incorrect File Type (FileType.GsaWriter, SaCommon.GuardInt, 2), // Incorrect File Format Version (FileType.GsaWriter, 2, 1) // Incorrect Guard Int }; foreach ((FileType fileType, uint guardInt, ushort fileFormatVersion) in testData) { var writerStream = PrepareHeaderTestData(fileType, guardInt, fileFormatVersion); Assert.Throws(() => ScoreReader.Read(writerStream, null)); } } private MemoryStream PrepareHeaderTestData(FileType fileType, uint guardInt, ushort fileFormatVersion) { var writerStream = new MemoryStream(); var writer = new ExtendedBinaryWriter(writerStream, System.Text.Encoding.Default); var header = new Header(fileType, fileFormatVersion); header.Write(writer); writer.WriteOpt(1); // FilePairId writer.Write(guardInt); writerStream.Position = 0; return writerStream; } // [Fact] // TODO Understand what this test is doing // public void RemoveConflictingItems() // { // const int blockLength = 10_000; // string[] nucleotides = {"A", "C", "G", "T"}; // var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); // // using (var saStream = new MemoryStream()) // using (var indexStream = new MemoryStream()) // using (var saWriter = new ScoreFileWriter(saStream, indexStream, version, GetAllASequenceProvider(), "dbsnp", // SaCommon.SchemaVersion, nucleotides, blockLength, GenomeAssembly.GRCh37, 1, false, true, false)) // { // Assert.Equal(0, saWriter.Write(GetConflictingGnomadItems())); // } // } private static IEnumerable GetSaItems(int count) { var items = new List(); var position = 100; var random = new Random(); for (int i = 0; i < count; i++, position += 5) { double score = Math.Round(random.NextDouble(), 2); items.Add(new GenericScoreItem(ChromosomeUtilities.Chr1, position, "A", "C", score)); } return items; } private static IEnumerable GetParRegionItems(int count) { var items = new List(); var position = 10_010; var random = new Random(); for (int i = 0; i < count; i++, position += 2) { double score = Math.Round(random.NextDouble(), 2); items.Add(new GenericScoreItem(ChromosomeUtilities.ChrY, position, "A", "C", score)); } return items; } [Fact] public void WrongRefAllele_ThrowUserException() { var saItem = new GenericScoreItem(ChromosomeUtilities.Chr1, 100, "C", "T", 0.9); Assert.Throws(() => WriteCustomSaItem(saItem, false)); WriteCustomSaItem(saItem, true); } private static void WriteCustomSaItem(GenericScoreItem customItem, bool skipIncorrectRefEntries) { const int blockLength = 10_000; string[] nucleotides = {"A", "C", "G", "T"}; var version = new DataSourceVersion("source1", "v1", DateTime.Now.Ticks, "description"); var writerSettings = new WriterSettings( blockLength, nucleotides, false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1.0), new ScoreJsonEncoder("TestKey", "TestSubKey"), new SaItemValidator(true, !skipIncorrectRefEntries) ); using (var writeStream = new MemoryStream()) using (var indexStream = new MemoryStream()) using (var scoreFileWriter = new ScoreFileWriter( writerSettings, writeStream, indexStream, version, GetAllASequenceProvider(), SaCommon.SchemaVersion, skipIncorrectRefEntries, true )) { scoreFileWriter.Write(new[] {customItem}); } } private static Stream GetChr22_17467787_17467799_genome() { var stream = new MemoryStream(); var writer = new StreamWriter(stream); writer.WriteLine("##gnomAD"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); writer.WriteLine( "22\t17467787\trs1013532764\tAAAAG\tA\t5607.38\tPASS\tAC=9;AN=7342;AF=0.00122582;rf_tp_probability=0.526938;FS=1.835;InbreedingCoeff=-0.0586;MQ=60.31;MQRankSum=-0.363;QD=12.01;ReadPosRankSum=0.416;SOR=0.869;BaseQRankSum=0.067;ClippingRankSum=0.263;DP=659925;VQSLOD=-0.9495;VQSR_culprit=FS;variant_type=indel;allele_type=del;n_alt_alleles=1;pab_max=0.864166;gq_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|17;gq_hist_all_bin_freq=2625|6415|2399|2552|894|245|475|590|299|567|573|228|560|58|171|68|135|8|78|194;dp_hist_alt_bin_freq=0|0|0|2|4|6|2|2|0|1|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=4|18|221|1132|2818|4248|4392|3451|2107|976|414|186|95|56|40|33|32|20|18|17;dp_hist_all_n_larger=32;ab_hist_alt_bin_freq=0|0|0|0|0|0|2|1|4|1|2|5|2|0|0|0|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=38;AF_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=1;controls_AN_afr_male=132;controls_AF_afr_male=0.00757576;controls_nhomalt_afr_male=0;non_topmed_AC_amr=1;non_topmed_AN_amr=168;non_topmed_AF_amr=0.00595238;non_topmed_nhomalt_amr=0;AC_raw=9;AN_raw=29502;AF_raw=0.000305064;nhomalt_raw=0;AC_fin_female=0;AN_fin_female=598;AF_fin_female=0;nhomalt_fin_female=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=12;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=1;non_neuro_AN_afr_male=154;non_neuro_AF_afr_male=0.00649351;non_neuro_nhomalt_afr_male=0;AC_afr_male=1;AN_afr_male=446;AF_afr_male=0.00224215;nhomalt_afr_male=0;AC_afr=2;AN_afr=756;AF_afr=0.0026455;nhomalt_afr=0;non_neuro_AC_afr_female=1;non_neuro_AN_afr_female=164;non_neuro_AF_afr_female=0.00609756;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=1;non_topmed_AN_amr_female=72;non_topmed_AF_amr_female=0.0138889;non_topmed_nhomalt_amr_female=0;non_topmed_AC_oth_female=2;non_topmed_AN_oth_female=110;non_topmed_AF_oth_female=0.0181818;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=12;AF_eas_female=0;nhomalt_eas_female=0;AC_afr_female=1;AN_afr_female=310;AF_afr_female=0.00322581;nhomalt_afr_female=0;non_neuro_AC_female=2;non_neuro_AN_female=2324;non_neuro_AF_female=0.000860585;non_neuro_nhomalt_female=0;controls_AC_afr=1;controls_AN_afr=228;controls_AF_afr=0.00438596;controls_nhomalt_afr=0;AC_nfe_onf=1;AN_nfe_onf=628;AF_nfe_onf=0.00159236;nhomalt_nfe_onf=0;controls_AC_fin_male=0;controls_AN_fin_male=200;controls_AF_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=2;non_neuro_AN_nfe_nwe=2582;non_neuro_AF_nfe_nwe=0.000774593;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=526;AF_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=0;AN_nfe_female=2104;AF_nfe_female=0;nhomalt_nfe_female=0;AC_amr=1;AN_amr=178;AF_amr=0.00561798;nhomalt_amr=0;non_topmed_AC_nfe_male=3;non_topmed_AN_nfe_male=1778;non_topmed_AF_nfe_male=0.00168729;non_topmed_nhomalt_nfe_male=0;AC_eas=0;AN_eas=48;AF_eas=0;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=0;non_neuro_AN_nfe_female=1840;non_neuro_AF_nfe_female=0;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=2;non_neuro_AN_afr=318;non_neuro_AF_afr=0.00628931;non_neuro_nhomalt_afr=0;controls_AC_raw=2;controls_AN_raw=10110;controls_AF_raw=0.000197824;controls_nhomalt_raw=0;controls_AC_male=2;controls_AN_male=1340;controls_AF_male=0.00149254;controls_nhomalt_male=0;non_topmed_AC_male=5;non_topmed_AN_male=3004;non_topmed_AF_male=0.00166445;non_topmed_nhomalt_male=0;controls_AC_nfe_female=0;controls_AN_nfe_female=740;controls_AF_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=114;non_neuro_AF_amr=0;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=12;non_neuro_AF_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=1;AN_asj_male=50;AF_asj_male=0.02;nhomalt_asj_male=0;controls_AC_nfe_male=1;controls_AN_nfe_male=908;controls_AF_nfe_male=0.00110132;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=378;non_neuro_AF_fin=0;non_neuro_nhomalt_fin=0;AC_oth_female=2;AN_oth_female=112;AF_oth_female=0.0178571;nhomalt_oth_female=0;controls_AC_nfe=1;controls_AN_nfe=1648;controls_AF_nfe=0.000606796;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=48;controls_AF_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=8;controls_AF_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=58;non_neuro_AF_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=1;controls_AN_nfe_nwe=308;controls_AF_nfe_nwe=0.00324675;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=2;AN_nfe_nwe=2906;AF_nfe_nwe=0.000688231;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=16;controls_AF_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=56;non_neuro_AF_amr_female=0;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=1;non_neuro_AN_nfe_onf=464;non_neuro_AF_nfe_onf=0.00215517;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=34;non_topmed_AF_eas_male=0;non_topmed_nhomalt_eas_male=0;controls_AC_amr_female=0;controls_AN_amr_female=16;controls_AF_amr_female=0;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=200;non_neuro_AF_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=4;AN_female=3236;AF_female=0.00123609;nhomalt_female=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=84;non_neuro_AF_oth_male=0;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=1352;non_topmed_AF_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=2;non_topmed_AN_nfe_nwe=1632;non_topmed_AF_nfe_nwe=0.00122549;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=96;non_topmed_AF_amr_male=0;non_topmed_nhomalt_amr_male=0;non_topmed_AC_nfe_onf=1;non_topmed_AN_nfe_onf=448;non_topmed_AF_nfe_onf=0.00223214;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=0;controls_AN_eas_male=16;controls_AF_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=52;controls_AF_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=9;non_topmed_AN=5806;non_topmed_AF=0.00155012;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=378;controls_AF_fin=0;controls_nhomalt_fin=0;non_neuro_AC_nfe=3;non_neuro_AN_nfe=4272;non_neuro_AF_nfe=0.000702247;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=178;non_neuro_AF_fin_female=0;non_neuro_nhomalt_fin_female=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=38;non_topmed_AF_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=12;controls_AF_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=1;non_topmed_AN_asj=38;non_topmed_AF_asj=0.0263158;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=124;controls_AF_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=7;non_neuro_AN=5332;non_neuro_AF=0.00131283;non_neuro_nhomalt=0;non_topmed_AC_nfe=3;non_topmed_AN_nfe=3470;non_topmed_AF_nfe=0.000864553;non_topmed_nhomalt_nfe=0;non_topmed_AC_raw=9;non_topmed_AN_raw=24832;non_topmed_AF_raw=0.000362436;non_topmed_nhomalt_raw=0;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=1212;non_neuro_AF_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=114;non_topmed_AF_oth_male=0;non_topmed_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=1356;AF_nfe_est=0;nhomalt_nfe_est=0;non_topmed_AC_afr_male=1;non_topmed_AN_afr_male=434;non_topmed_AF_afr_male=0.00230415;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=36;AF_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=28;controls_AF_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=36;non_neuro_AF_eas_male=0;non_neuro_nhomalt_eas_male=0;non_neuro_AC_asj_male=1;non_neuro_AN_asj_male=44;non_neuro_AF_asj_male=0.0227273;non_neuro_nhomalt_asj_male=0;controls_AC_oth=0;controls_AN_oth=100;controls_AF_oth=0;controls_nhomalt_oth=0;AC_nfe=3;AN_nfe=4928;AF_nfe=0.000608766;nhomalt_nfe=0;non_topmed_AC_female=4;non_topmed_AN_female=2802;non_topmed_AF_female=0.00142755;non_topmed_nhomalt_female=0;non_neuro_AC_asj=1;non_neuro_AN_asj=56;non_neuro_AF_asj=0.0178571;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=10;non_topmed_AF_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=7;non_neuro_AN_raw=20066;non_neuro_AF_raw=0.000348849;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=0;non_topmed_AN_eas=44;non_topmed_AF_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=526;non_topmed_AF_fin_male=0;non_topmed_nhomalt_fin_male=0;AC_fin=0;AN_fin=1124;AF_fin=0;nhomalt_fin=0;AC_nfe_male=3;AN_nfe_male=2824;AF_nfe_male=0.00106232;nhomalt_nfe_male=0;controls_AC_amr_male=0;controls_AN_amr_male=30;controls_AF_amr_male=0;controls_nhomalt_amr_male=0;controls_AC_afr_female=0;controls_AN_afr_female=96;controls_AF_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=46;controls_AF_amr=0;controls_nhomalt_amr=0;AC_asj_female=0;AN_asj_female=22;AF_asj_female=0;nhomalt_asj_female=0;non_neuro_AC_eas=0;non_neuro_AN_eas=48;non_neuro_AF_eas=0;non_neuro_nhomalt_eas=0;non_neuro_AC_male=5;non_neuro_AN_male=3008;non_neuro_AF_male=0.00166223;non_neuro_nhomalt_male=0;AC_asj=1;AN_asj=72;AF_asj=0.0138889;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=1200;controls_AF_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=16;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=2;non_topmed_AN_oth=224;non_topmed_AF_oth=0.00892857;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=598;non_topmed_AF_fin_female=0;non_topmed_nhomalt_fin_female=0;AC_oth=2;AN_oth=236;AF_oth=0.00847458;nhomalt_oth=0;non_neuro_AC_nfe_male=3;non_neuro_AN_nfe_male=2432;non_neuro_AF_nfe_male=0.00123355;non_neuro_nhomalt_nfe_male=0;controls_AC_female=0;controls_AN_female=1096;controls_AF_female=0;controls_nhomalt_female=0;non_topmed_AC_fin=0;non_topmed_AN_fin=1124;non_topmed_AF_fin=0;non_topmed_nhomalt_fin=0;non_topmed_AC_nfe_female=0;non_topmed_AN_nfe_female=1692;non_topmed_AF_nfe_female=0;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=2;controls_AF_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=1;non_topmed_AN_asj_male=22;non_topmed_AF_asj_male=0.0454545;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=1;non_neuro_AN_oth=146;non_neuro_AF_oth=0.00684932;non_neuro_nhomalt_oth=0;AC_male=5;AN_male=4106;AF_male=0.00121773;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=178;controls_AF_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_asj_female=0;controls_AN_asj_female=6;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=100;AF_amr_male=0;nhomalt_amr_male=0;AC_amr_female=1;AN_amr_female=78;AF_amr_female=0.0128205;nhomalt_amr_female=0;AC_oth_male=0;AN_oth_male=124;AF_oth_male=0;nhomalt_oth_male=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=14;non_neuro_AF_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=1;non_topmed_AN_afr_female=304;non_topmed_AF_afr_female=0.00328947;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=2;non_topmed_AN_afr=738;non_topmed_AF_afr=0.00271003;non_topmed_nhomalt_afr=0;controls_AC=2;controls_AN=2436;controls_AF=0.000821018;controls_nhomalt=0;non_neuro_AC_oth_female=1;non_neuro_AN_oth_female=62;non_neuro_AF_oth_female=0.016129;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0.000305;non_topmed_faf99_amr=0.000305;faf95_afr=0.00047001;faf99_afr=0.00046996;controls_faf95_afr=0.000224;controls_faf99_afr=0.000224;faf95_amr=0.000288;faf99_amr=0.000288;faf95_eas=0;faf99_eas=0;faf95=0.00063865;faf99=0.0006395;non_neuro_faf95_afr=0.00111728;non_neuro_faf99_afr=0.00111671;non_neuro_faf95_amr=0;non_neuro_faf99_amr=0;controls_faf95_nfe=3.1e-05;controls_faf99_nfe=3.1e-05;non_topmed_faf95=0.00080814;non_topmed_faf99=0.00080791;non_neuro_faf95_nfe=0.000191;non_neuro_faf99_nfe=0.00019047;non_neuro_faf95=0.00061599;non_neuro_faf99=0.00061588;non_topmed_faf95_nfe=0.0002353;non_topmed_faf99_nfe=0.00023558;controls_faf95_eas=0;controls_faf99_eas=0;faf95_nfe=0.0001658;faf99_nfe=0.00016511;non_topmed_faf95_eas=0;non_topmed_faf99_eas=0;controls_faf95_amr=0;controls_faf99_amr=0;non_neuro_faf95_eas=0;non_neuro_faf99_eas=0;non_topmed_faf95_afr=0.00048118;non_topmed_faf99_afr=0.00048064;controls_faf95=0.00014568;controls_faf99=0.00014565;controls_popmax=afr;controls_AC_popmax=1;controls_AN_popmax=228;controls_AF_popmax=0.00438596;controls_nhomalt_popmax=0;popmax=amr;AC_popmax=1;AN_popmax=178;AF_popmax=0.00561798;nhomalt_popmax=0;age_hist_het_bin_freq=1|0|1|1|0|2|0|0|0|0;age_hist_het_n_smaller=1;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=afr;non_neuro_AC_popmax=2;non_neuro_AN_popmax=318;non_neuro_AF_popmax=0.00628931;non_neuro_nhomalt_popmax=0;non_topmed_popmax=amr;non_topmed_AC_popmax=1;non_topmed_AN_popmax=168;non_topmed_AF_popmax=0.00595238;non_topmed_nhomalt_popmax=0"); writer.WriteLine( "22\t17467793\trs200526150\tAAGAA\tA\t2.96178e+06\tPASS\tAC=25;AN=13820;AF=0.00180897;rf_tp_probability=0.6944;FS=0;InbreedingCoeff=-0.0226;MQ=61.07;MQRankSum=0.061;QD=19.6;ReadPosRankSum=0.177;SOR=0.694;BaseQRankSum=-0.031;ClippingRankSum=-0.053;DP=657153;VQSLOD=5.11;VQSR_culprit=FS;variant_type=multi-indel;allele_type=del;n_alt_alleles=2;pab_max=1;gq_hist_alt_bin_freq=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|36;gq_hist_all_bin_freq=2892|4902|1140|827|277|141|343|478|268|556|481|207|525|87|178|89|169|40|119|5100;dp_hist_alt_bin_freq=0|0|0|1|5|8|10|5|4|1|0|0|1|1|0|0|0|0|1|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=3|25|286|1366|3137|4439|4355|3211|1821|851|331|175|79|53|32|42|22|27|18|12;dp_hist_all_n_larger=25;ab_hist_alt_bin_freq=0|0|0|0|0|0|2|2|6|8|3|6|7|2|0|0|0|0|0|0;AC_nfe_seu=0;AN_nfe_seu=60;AF_nfe_seu=0;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=654;controls_AF_afr_male=0;controls_nhomalt_afr_male=0;non_topmed_AC_amr=17;non_topmed_AN_amr=272;non_topmed_AF_amr=0.0625;non_topmed_nhomalt_amr=1;AC_raw=25;AN_raw=28996;AF_raw=0.000862188;nhomalt_raw=1;AC_fin_female=0;AN_fin_female=834;AF_fin_female=0;nhomalt_fin_female=0;non_neuro_AC_asj_female=0;non_neuro_AN_asj_female=38;non_neuro_AF_asj_female=0;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=0;non_neuro_AN_afr_male=730;non_neuro_AF_afr_male=0;non_neuro_nhomalt_afr_male=0;AC_afr_male=2;AN_afr_male=2172;AF_afr_male=0.00092081;nhomalt_afr_male=0;AC_afr=2;AN_afr=3678;AF_afr=0.000543774;nhomalt_afr=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=754;non_neuro_AF_afr_female=0;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=9;non_topmed_AN_amr_female=132;non_topmed_AF_amr_female=0.0681818;non_topmed_nhomalt_amr_female=1;non_topmed_AC_oth_female=2;non_topmed_AN_oth_female=190;non_topmed_AF_oth_female=0.0105263;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=248;AF_eas_female=0;nhomalt_eas_female=0;AC_afr_female=0;AN_afr_female=1506;AF_afr_female=0;nhomalt_afr_female=0;non_neuro_AC_female=7;non_neuro_AN_female=4262;non_neuro_AF_female=0.00164242;non_neuro_nhomalt_female=0;controls_AC_afr=0;controls_AN_afr=1120;controls_AF_afr=0;controls_nhomalt_afr=0;AC_nfe_onf=0;AN_nfe_onf=904;AF_nfe_onf=0;nhomalt_nfe_onf=0;controls_AC_fin_male=0;controls_AN_fin_male=276;controls_AF_fin_male=0;controls_nhomalt_fin_male=0;non_neuro_AC_nfe_nwe=1;non_neuro_AN_nfe_nwe=3534;non_neuro_AF_nfe_nwe=0.000282965;non_neuro_nhomalt_nfe_nwe=0;AC_fin_male=0;AN_fin_male=708;AF_fin_male=0;nhomalt_fin_male=0;AC_nfe_female=1;AN_nfe_female=3128;AF_nfe_female=0.000319693;nhomalt_nfe_female=0;AC_amr=18;AN_amr=286;AF_amr=0.0629371;nhomalt_amr=1;non_topmed_AC_nfe_male=1;non_topmed_AN_nfe_male=2566;non_topmed_AF_nfe_male=0.000389712;non_topmed_nhomalt_nfe_male=0;AC_eas=0;AN_eas=656;AF_eas=0;nhomalt_eas=0;nhomalt=1;non_neuro_AC_nfe_female=1;non_neuro_AN_nfe_female=2732;non_neuro_AF_nfe_female=0.000366032;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=0;non_neuro_AN_afr=1484;non_neuro_AF_afr=0;non_neuro_nhomalt_afr=0;controls_AC_raw=4;controls_AN_raw=9932;controls_AF_raw=0.000402739;controls_nhomalt_raw=0;controls_AC_male=3;controls_AN_male=2680;controls_AF_male=0.0011194;controls_nhomalt_male=0;non_topmed_AC_male=11;non_topmed_AN_male=6164;non_topmed_AF_male=0.00178456;non_topmed_nhomalt_male=0;controls_AC_nfe_female=0;controls_AN_nfe_female=1186;controls_AF_nfe_female=0;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=9;non_neuro_AN_amr=184;non_neuro_AF_amr=0.048913;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=248;non_neuro_AF_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=92;AF_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=1378;controls_AF_nfe_male=0;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=532;non_neuro_AF_fin=0;non_neuro_nhomalt_fin=0;AC_oth_female=2;AN_oth_female=194;AF_oth_female=0.0103093;nhomalt_oth_female=0;controls_AC_nfe=0;controls_AN_nfe=2564;controls_AF_nfe=0;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=76;controls_AF_oth_female=0;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=20;controls_AF_asj=0;controls_nhomalt_asj=0;non_neuro_AC_amr_male=4;non_neuro_AN_amr_male=74;non_neuro_AF_amr_male=0.0540541;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=426;controls_AF_nfe_nwe=0;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=2;AN_nfe_nwe=3958;AF_nfe_nwe=0.000505306;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=26;controls_AF_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=5;non_neuro_AN_amr_female=110;non_neuro_AF_amr_female=0.0454545;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=0;non_neuro_AN_nfe_onf=704;non_neuro_AF_nfe_onf=0;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=400;non_topmed_AF_eas_male=0;non_topmed_nhomalt_eas_male=0;controls_AC_amr_female=1;controls_AN_amr_female=46;controls_AF_amr_female=0.0217391;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=276;non_neuro_AF_fin_male=0;non_neuro_nhomalt_fin_male=0;AC_female=13;AN_female=6098;AF_female=0.00213185;nhomalt_female=1;non_neuro_AC_oth_male=1;non_neuro_AN_oth_male=156;non_neuro_AF_oth_male=0.00641026;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=2184;non_topmed_AF_nfe_est=0;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=2;non_topmed_AN_nfe_nwe=2250;non_topmed_AF_nfe_nwe=0.000888889;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=8;non_topmed_AN_amr_male=140;non_topmed_AF_amr_male=0.0571429;non_topmed_nhomalt_amr_male=0;non_topmed_AC_nfe_onf=0;non_topmed_AN_nfe_onf=646;non_topmed_AF_nfe_onf=0;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=0;controls_AN_eas_male=244;controls_AF_eas_male=0;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=84;controls_AF_oth_male=0;controls_nhomalt_oth_male=0;non_topmed_AC=23;non_topmed_AN=11642;non_topmed_AF=0.00197561;non_topmed_nhomalt=1;controls_AC_fin=0;controls_AN_fin=532;controls_AF_fin=0;controls_nhomalt_fin=0;non_neuro_AC_nfe=1;non_neuro_AN_nfe=6226;non_neuro_AF_nfe=0.000160617;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=256;non_neuro_AF_fin_female=0;non_neuro_nhomalt_fin_female=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=60;non_topmed_AF_nfe_seu=0;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=172;controls_AF_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=68;non_topmed_AF_asj=0;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=168;controls_AF_nfe_onf=0;controls_nhomalt_nfe_onf=0;non_neuro_AC=12;non_neuro_AN=9480;non_neuro_AF=0.00126582;non_neuro_nhomalt=0;non_topmed_AC_nfe=2;non_topmed_AN_nfe=5140;non_topmed_AF_nfe=0.000389105;non_topmed_nhomalt_nfe=0;non_topmed_AC_raw=23;non_topmed_AN_raw=24482;non_topmed_AF_raw=0.000939466;non_topmed_nhomalt_raw=1;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=1962;non_neuro_AF_nfe_est=0;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=1;non_topmed_AN_oth_male=184;non_topmed_AF_oth_male=0.00543478;non_topmed_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=2192;AF_nfe_est=0;nhomalt_nfe_est=0;non_topmed_AC_afr_male=1;non_topmed_AN_afr_male=2132;non_topmed_AF_afr_male=0.000469043;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=408;AF_eas_male=0;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=416;controls_AF_eas=0;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=408;non_neuro_AF_eas_male=0;non_neuro_nhomalt_eas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=80;non_neuro_AF_asj_male=0;non_neuro_nhomalt_asj_male=0;controls_AC_oth=0;controls_AN_oth=160;controls_AF_oth=0;controls_nhomalt_oth=0;AC_nfe=2;AN_nfe=7114;AF_nfe=0.000281136;nhomalt_nfe=0;non_topmed_AC_female=12;non_topmed_AN_female=5478;non_topmed_AF_female=0.00219058;non_topmed_nhomalt_female=1;non_neuro_AC_asj=0;non_neuro_AN_asj=118;non_neuro_AF_asj=0;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=240;non_topmed_AF_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=12;non_neuro_AN_raw=19660;non_neuro_AF_raw=0.000610376;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=0;non_topmed_AN_eas=640;non_topmed_AF_eas=0;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=708;non_topmed_AF_fin_male=0;non_topmed_nhomalt_fin_male=0;AC_fin=0;AN_fin=1542;AF_fin=0;nhomalt_fin=0;AC_nfe_male=1;AN_nfe_male=3986;AF_nfe_male=0.000250878;nhomalt_nfe_male=0;controls_AC_amr_male=3;controls_AN_amr_male=38;controls_AF_amr_male=0.0789474;controls_nhomalt_amr_male=0;controls_AC_afr_female=0;controls_AN_afr_female=466;controls_AF_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=4;controls_AN_amr=84;controls_AF_amr=0.047619;controls_nhomalt_amr=0;AC_asj_female=0;AN_asj_female=46;AF_asj_female=0;nhomalt_asj_female=0;non_neuro_AC_eas=0;non_neuro_AN_eas=656;non_neuro_AF_eas=0;non_neuro_nhomalt_eas=0;non_neuro_AC_male=5;non_neuro_AN_male=5218;non_neuro_AF_male=0.000958222;non_neuro_nhomalt_male=0;AC_asj=0;AN_asj=138;AF_asj=0;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=1944;controls_AF_nfe_est=0;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=34;non_topmed_AF_asj_female=0;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=3;non_topmed_AN_oth=374;non_topmed_AF_oth=0.00802139;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=834;non_topmed_AF_fin_female=0;non_topmed_nhomalt_fin_female=0;AC_oth=3;AN_oth=406;AF_oth=0.00738916;nhomalt_oth=0;non_neuro_AC_nfe_male=0;non_neuro_AN_nfe_male=3494;non_neuro_AF_nfe_male=0;non_neuro_nhomalt_nfe_male=0;controls_AC_female=1;controls_AN_female=2216;controls_AF_female=0.000451264;controls_nhomalt_female=0;non_topmed_AC_fin=0;non_topmed_AN_fin=1542;non_topmed_AF_fin=0;non_topmed_nhomalt_fin=0;non_topmed_AC_nfe_female=1;non_topmed_AN_nfe_female=2574;non_topmed_AF_nfe_female=0.0003885;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=6;controls_AF_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=34;non_topmed_AF_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=2;non_neuro_AN_oth=280;non_neuro_AF_oth=0.00714286;non_neuro_nhomalt_oth=0;AC_male=12;AN_male=7722;AF_male=0.001554;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=256;controls_AF_fin_female=0;controls_nhomalt_fin_female=0;controls_AC_asj_female=0;controls_AN_asj_female=14;controls_AF_asj_female=0;controls_nhomalt_asj_female=0;AC_amr_male=8;AN_amr_male=144;AF_amr_male=0.0555556;nhomalt_amr_male=0;AC_amr_female=10;AN_amr_female=142;AF_amr_female=0.0704225;nhomalt_amr_female=1;AC_oth_male=1;AN_oth_male=212;AF_oth_male=0.00471698;nhomalt_oth_male=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=26;non_neuro_AF_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=1474;non_topmed_AF_afr_female=0;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=1;non_topmed_AN_afr=3606;non_topmed_AF_afr=0.000277316;non_topmed_nhomalt_afr=0;controls_AC=4;controls_AN=4896;controls_AF=0.000816993;controls_nhomalt=0;non_neuro_AC_oth_female=1;non_neuro_AN_oth_female=124;non_neuro_AF_oth_female=0.00806452;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=0.0398231;non_topmed_faf99_amr=0.0398236;faf95_afr=9.592e-05;faf99_afr=9.609e-05;controls_faf95_afr=0;controls_faf99_afr=0;faf95_amr=0.0406793;faf99_amr=0.0406792;faf95_eas=0;faf99_eas=0;faf95=0.00125772;faf99=0.00125736;non_neuro_faf95_afr=0;non_neuro_faf99_afr=0;non_neuro_faf95_amr=0.0255171;non_neuro_faf99_amr=0.0255167;controls_faf95_nfe=0;controls_faf99_nfe=0;non_topmed_faf95=0.00134988;non_topmed_faf99=0.00134945;non_neuro_faf95_nfe=8e-06;non_neuro_faf99_nfe=8e-06;non_neuro_faf95=0.00072973;non_neuro_faf99=0.00073008;non_topmed_faf95_nfe=6.881e-05;non_topmed_faf99_nfe=6.877e-05;controls_faf95_eas=0;controls_faf99_eas=0;faf95_nfe=4.922e-05;faf99_nfe=4.923e-05;non_topmed_faf95_eas=0;non_topmed_faf99_eas=0;controls_faf95_amr=0.0162655;controls_faf99_amr=0.0162653;non_neuro_faf95_eas=0;non_neuro_faf99_eas=0;non_topmed_faf95_afr=1.4e-05;non_topmed_faf99_afr=1.4e-05;controls_faf95=0.00027835;controls_faf99=0.00027827;controls_popmax=amr;controls_AC_popmax=4;controls_AN_popmax=84;controls_AF_popmax=0.047619;controls_nhomalt_popmax=0;popmax=amr;AC_popmax=18;AN_popmax=286;AF_popmax=0.0629371;nhomalt_popmax=1;age_hist_het_bin_freq=0|0|2|1|1|1|0|0|0|0;age_hist_het_n_smaller=4;age_hist_het_n_larger=0;age_hist_hom_bin_freq=0|0|0|0|0|0|0|0|0|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=amr;non_neuro_AC_popmax=9;non_neuro_AN_popmax=184;non_neuro_AF_popmax=0.048913;non_neuro_nhomalt_popmax=0;non_topmed_popmax=amr;non_topmed_AC_popmax=17;non_topmed_AN_popmax=272;non_topmed_AF_popmax=0.0625;non_topmed_nhomalt_popmax=1"); writer.WriteLine( "22\t17467793\trs200526150\tAAGAA\tA\t2.96178e+06\tPASS\tAC=4501;AN=13820;AF=0.325687;rf_tp_probability=0.6944;FS=0;InbreedingCoeff=-0.0226;MQ=61.07;MQRankSum=0.061;QD=19.6;ReadPosRankSum=0.177;SOR=0.694;BaseQRankSum=-0.031;ClippingRankSum=-0.053;DP=657153;VQSLOD=5.11;VQSR_culprit=FS;variant_type=multi-indel;allele_type=del;n_alt_alleles=2;pab_max=1;gq_hist_alt_bin_freq=3|3|4|4|5|3|4|6|8|10|21|14|36|33|27|47|34|35|43|4884;gq_hist_all_bin_freq=2897|4907|1144|830|282|143|344|482|273|559|484|208|528|92|176|87|149|45|119|5070;dp_hist_alt_bin_freq=0|6|126|551|1133|1285|1033|600|260|102|40|27|13|13|3|11|1|6|7|2;dp_hist_alt_n_larger=5;dp_hist_all_bin_freq=3|25|286|1366|3137|4439|4355|3211|1821|851|331|175|79|53|32|42|22|27|18|12;dp_hist_all_n_larger=25;ab_hist_alt_bin_freq=0|7|1|7|36|124|277|456|835|741|1055|616|404|155|42|25|5|6|5|0;AC_nfe_seu=19;AN_nfe_seu=60;AF_nfe_seu=0.316667;nhomalt_nfe_seu=1;controls_AC_afr_male=325;controls_AN_afr_male=654;controls_AF_afr_male=0.496942;controls_nhomalt_afr_male=35;non_topmed_AC_amr=77;non_topmed_AN_amr=272;non_topmed_AF_amr=0.283088;non_topmed_nhomalt_amr=2;AC_raw=4527;AN_raw=28996;AF_raw=0.156125;nhomalt_raw=356;AC_fin_female=187;AN_fin_female=834;AF_fin_female=0.224221;nhomalt_fin_female=6;non_neuro_AC_asj_female=15;non_neuro_AN_asj_female=38;non_neuro_AF_asj_female=0.394737;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=358;non_neuro_AN_afr_male=730;non_neuro_AF_afr_male=0.490411;non_neuro_nhomalt_afr_male=37;AC_afr_male=1071;AN_afr_male=2172;AF_afr_male=0.493094;nhomalt_afr_male=113;AC_afr=1825;AN_afr=3678;AF_afr=0.496194;nhomalt_afr=196;non_neuro_AC_afr_female=376;non_neuro_AN_afr_female=754;non_neuro_AF_afr_female=0.498674;non_neuro_nhomalt_afr_female=42;non_topmed_AC_amr_female=35;non_topmed_AN_amr_female=132;non_topmed_AF_amr_female=0.265152;non_topmed_nhomalt_amr_female=0;non_topmed_AC_oth_female=58;non_topmed_AN_oth_female=190;non_topmed_AF_oth_female=0.305263;non_topmed_nhomalt_oth_female=6;AC_eas_female=135;AN_eas_female=248;AF_eas_female=0.544355;nhomalt_eas_female=14;AC_afr_female=754;AN_afr_female=1506;AF_afr_female=0.500664;nhomalt_afr_female=83;non_neuro_AC_female=1325;non_neuro_AN_female=4262;non_neuro_AF_female=0.310887;non_neuro_nhomalt_female=93;controls_AC_afr=566;controls_AN_afr=1120;controls_AF_afr=0.505357;controls_nhomalt_afr=67;AC_nfe_onf=233;AN_nfe_onf=904;AF_nfe_onf=0.257743;nhomalt_nfe_onf=13;controls_AC_fin_male=58;controls_AN_fin_male=276;controls_AF_fin_male=0.210145;controls_nhomalt_fin_male=2;non_neuro_AC_nfe_nwe=797;non_neuro_AN_nfe_nwe=3534;non_neuro_AF_nfe_nwe=0.225523;non_neuro_nhomalt_nfe_nwe=38;AC_fin_male=146;AN_fin_male=708;AF_fin_male=0.206215;nhomalt_fin_male=4;AC_nfe_female=774;AN_nfe_female=3128;AF_nfe_female=0.247442;nhomalt_nfe_female=42;AC_amr=79;AN_amr=286;AF_amr=0.276224;nhomalt_amr=2;non_topmed_AC_nfe_male=636;non_topmed_AN_nfe_male=2566;non_topmed_AF_nfe_male=0.247857;non_topmed_nhomalt_nfe_male=33;AC_eas=359;AN_eas=656;AF_eas=0.547256;nhomalt_eas=35;nhomalt=352;non_neuro_AC_nfe_female=666;non_neuro_AN_nfe_female=2732;non_neuro_AF_nfe_female=0.243777;non_neuro_nhomalt_nfe_female=30;non_neuro_AC_afr=734;non_neuro_AN_afr=1484;non_neuro_AF_afr=0.494609;non_neuro_nhomalt_afr=79;controls_AC_raw=1673;controls_AN_raw=9932;controls_AF_raw=0.168445;controls_nhomalt_raw=138;controls_AC_male=920;controls_AN_male=2680;controls_AF_male=0.343284;controls_nhomalt_male=78;non_topmed_AC_male=2163;non_topmed_AN_male=6164;non_topmed_AF_male=0.350909;non_topmed_nhomalt_male=179;controls_AC_nfe_female=300;controls_AN_nfe_female=1186;controls_AF_nfe_female=0.252951;controls_nhomalt_nfe_female=11;non_neuro_AC_amr=55;non_neuro_AN_amr=184;non_neuro_AF_amr=0.298913;non_neuro_nhomalt_amr=1;non_neuro_AC_eas_female=135;non_neuro_AN_eas_female=248;non_neuro_AF_eas_female=0.544355;non_neuro_nhomalt_eas_female=14;AC_asj_male=34;AN_asj_male=92;AF_asj_male=0.369565;nhomalt_asj_male=5;controls_AC_nfe_male=360;controls_AN_nfe_male=1378;controls_AF_nfe_male=0.261248;controls_nhomalt_nfe_male=21;non_neuro_AC_fin=118;non_neuro_AN_fin=532;non_neuro_AF_fin=0.221805;non_neuro_nhomalt_fin=3;AC_oth_female=60;AN_oth_female=194;AF_oth_female=0.309278;nhomalt_oth_female=7;controls_AC_nfe=660;controls_AN_nfe=2564;controls_AF_nfe=0.25741;controls_nhomalt_nfe=32;controls_AC_oth_female=19;controls_AN_oth_female=76;controls_AF_oth_female=0.25;controls_nhomalt_oth_female=1;controls_AC_asj=9;controls_AN_asj=20;controls_AF_asj=0.45;controls_nhomalt_asj=1;non_neuro_AC_amr_male=24;non_neuro_AN_amr_male=74;non_neuro_AF_amr_male=0.324324;non_neuro_nhomalt_amr_male=1;controls_AC_nfe_nwe=99;controls_AN_nfe_nwe=426;controls_AF_nfe_nwe=0.232394;controls_nhomalt_nfe_nwe=5;AC_nfe_nwe=894;AN_nfe_nwe=3958;AF_nfe_nwe=0.225872;nhomalt_nfe_nwe=44;controls_AC_nfe_seu=10;controls_AN_nfe_seu=26;controls_AF_nfe_seu=0.384615;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=31;non_neuro_AN_amr_female=110;non_neuro_AF_amr_female=0.281818;non_neuro_nhomalt_amr_female=0;non_neuro_AC_nfe_onf=190;non_neuro_AN_nfe_onf=704;non_neuro_AF_nfe_onf=0.269886;non_neuro_nhomalt_nfe_onf=12;non_topmed_AC_eas_male=219;non_topmed_AN_eas_male=400;non_topmed_AF_eas_male=0.5475;non_topmed_nhomalt_eas_male=20;controls_AC_amr_female=18;controls_AN_amr_female=46;controls_AF_amr_female=0.391304;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=58;non_neuro_AN_fin_male=276;non_neuro_AF_fin_male=0.210145;non_neuro_nhomalt_fin_male=2;AC_female=1965;AN_female=6098;AF_female=0.322237;nhomalt_female=152;non_neuro_AC_oth_male=49;non_neuro_AN_oth_male=156;non_neuro_AF_oth_male=0.314103;non_neuro_nhomalt_oth_male=5;non_topmed_AC_nfe_est=577;non_topmed_AN_nfe_est=2184;non_topmed_AF_nfe_est=0.264194;non_topmed_nhomalt_nfe_est=32;non_topmed_AC_nfe_nwe=515;non_topmed_AN_nfe_nwe=2250;non_topmed_AF_nfe_nwe=0.228889;non_topmed_nhomalt_nfe_nwe=28;non_topmed_AC_amr_male=42;non_topmed_AN_amr_male=140;non_topmed_AF_amr_male=0.3;non_topmed_nhomalt_amr_male=2;non_topmed_AC_nfe_onf=169;non_topmed_AN_nfe_onf=646;non_topmed_AF_nfe_onf=0.26161;non_topmed_nhomalt_nfe_onf=8;controls_AC_eas_male=136;controls_AN_eas_male=244;controls_AF_eas_male=0.557377;controls_nhomalt_eas_male=15;controls_AC_oth_male=25;controls_AN_oth_male=84;controls_AF_oth_male=0.297619;controls_nhomalt_oth_male=4;non_topmed_AC=3972;non_topmed_AN=11642;non_topmed_AF=0.341178;non_topmed_nhomalt=324;controls_AC_fin=118;controls_AN_fin=532;controls_AF_fin=0.221805;controls_nhomalt_fin=3;non_neuro_AC_nfe=1506;non_neuro_AN_nfe=6226;non_neuro_AF_nfe=0.241889;non_neuro_nhomalt_nfe=73;non_neuro_AC_fin_female=60;non_neuro_AN_fin_female=256;non_neuro_AF_fin_female=0.234375;non_neuro_nhomalt_fin_female=1;non_topmed_AC_nfe_seu=19;non_topmed_AN_nfe_seu=60;non_topmed_AF_nfe_seu=0.316667;non_topmed_nhomalt_nfe_seu=1;controls_AC_eas_female=95;controls_AN_eas_female=172;controls_AF_eas_female=0.552326;controls_nhomalt_eas_female=12;non_topmed_AC_asj=24;non_topmed_AN_asj=68;non_topmed_AF_asj=0.352941;non_topmed_nhomalt_asj=1;controls_AC_nfe_onf=46;controls_AN_nfe_onf=168;controls_AF_nfe_onf=0.27381;controls_nhomalt_nfe_onf=4;non_neuro_AC=2909;non_neuro_AN=9480;non_neuro_AF=0.306857;non_neuro_nhomalt=207;non_topmed_AC_nfe=1280;non_topmed_AN_nfe=5140;non_topmed_AF_nfe=0.249027;non_topmed_nhomalt_nfe=69;non_topmed_AC_raw=3996;non_topmed_AN_raw=24482;non_topmed_AF_raw=0.163222;non_topmed_nhomalt_raw=327;non_neuro_AC_nfe_est=509;non_neuro_AN_nfe_est=1962;non_neuro_AF_nfe_est=0.259429;non_neuro_nhomalt_nfe_est=23;non_topmed_AC_oth_male=56;non_topmed_AN_oth_male=184;non_topmed_AF_oth_male=0.304348;non_topmed_nhomalt_oth_male=6;AC_nfe_est=579;AN_nfe_est=2192;AF_nfe_est=0.264142;nhomalt_nfe_est=32;non_topmed_AC_afr_male=1054;non_topmed_AN_afr_male=2132;non_topmed_AF_afr_male=0.494371;non_topmed_nhomalt_afr_male=113;AC_eas_male=224;AN_eas_male=408;AF_eas_male=0.54902;nhomalt_eas_male=21;controls_AC_eas=231;controls_AN_eas=416;controls_AF_eas=0.555288;controls_nhomalt_eas=27;non_neuro_AC_eas_male=224;non_neuro_AN_eas_male=408;non_neuro_AF_eas_male=0.54902;non_neuro_nhomalt_eas_male=21;non_neuro_AC_asj_male=31;non_neuro_AN_asj_male=80;non_neuro_AF_asj_male=0.3875;non_neuro_nhomalt_asj_male=5;controls_AC_oth=44;controls_AN_oth=160;controls_AF_oth=0.275;controls_nhomalt_oth=5;AC_nfe=1725;AN_nfe=7114;AF_nfe=0.24248;nhomalt_nfe=90;non_topmed_AC_female=1809;non_topmed_AN_female=5478;non_topmed_AF_female=0.33023;non_topmed_nhomalt_female=145;non_neuro_AC_asj=46;non_neuro_AN_asj=118;non_neuro_AF_asj=0.389831;non_neuro_nhomalt_asj=5;non_topmed_AC_eas_female=132;non_topmed_AN_eas_female=240;non_topmed_AF_eas_female=0.55;non_topmed_nhomalt_eas_female=14;non_neuro_AC_raw=2928;non_neuro_AN_raw=19660;non_neuro_AF_raw=0.148932;non_neuro_nhomalt_raw=211;non_topmed_AC_eas=351;non_topmed_AN_eas=640;non_topmed_AF_eas=0.548438;non_topmed_nhomalt_eas=34;non_topmed_AC_fin_male=146;non_topmed_AN_fin_male=708;non_topmed_AF_fin_male=0.206215;non_topmed_nhomalt_fin_male=4;AC_fin=333;AN_fin=1542;AF_fin=0.215953;nhomalt_fin=10;AC_nfe_male=951;AN_nfe_male=3986;AF_nfe_male=0.238585;nhomalt_nfe_male=48;controls_AC_amr_male=12;controls_AN_amr_male=38;controls_AF_amr_male=0.315789;controls_nhomalt_amr_male=0;controls_AC_afr_female=241;controls_AN_afr_female=466;controls_AF_afr_female=0.517167;controls_nhomalt_afr_female=32;controls_AC_amr=30;controls_AN_amr=84;controls_AF_amr=0.357143;controls_nhomalt_amr=0;AC_asj_female=18;AN_asj_female=46;AF_asj_female=0.391304;nhomalt_asj_female=0;non_neuro_AC_eas=359;non_neuro_AN_eas=656;non_neuro_AF_eas=0.547256;non_neuro_nhomalt_eas=35;non_neuro_AC_male=1584;non_neuro_AN_male=5218;non_neuro_AF_male=0.303565;non_neuro_nhomalt_male=114;AC_asj=52;AN_asj=138;AF_asj=0.376812;nhomalt_asj=5;controls_AC_nfe_est=505;controls_AN_nfe_est=1944;controls_AF_nfe_est=0.259774;controls_nhomalt_nfe_est=23;non_topmed_AC_asj_female=14;non_topmed_AN_asj_female=34;non_topmed_AF_asj_female=0.411765;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=114;non_topmed_AN_oth=374;non_topmed_AF_oth=0.304813;non_topmed_nhomalt_oth=12;non_topmed_AC_fin_female=187;non_topmed_AN_fin_female=834;non_topmed_AF_fin_female=0.224221;non_topmed_nhomalt_fin_female=6;AC_oth=128;AN_oth=406;AF_oth=0.315271;nhomalt_oth=14;non_neuro_AC_nfe_male=840;non_neuro_AN_nfe_male=3494;non_neuro_AF_nfe_male=0.240412;non_neuro_nhomalt_nfe_male=43;controls_AC_female=738;controls_AN_female=2216;controls_AF_female=0.333032;controls_nhomalt_female=57;non_topmed_AC_fin=333;non_topmed_AN_fin=1542;non_topmed_AF_fin=0.215953;non_topmed_nhomalt_fin=10;non_topmed_AC_nfe_female=644;non_topmed_AN_nfe_female=2574;non_topmed_AF_nfe_female=0.250194;non_topmed_nhomalt_nfe_female=36;controls_AC_asj_male=4;controls_AN_asj_male=6;controls_AF_asj_male=0.666667;controls_nhomalt_asj_male=1;non_topmed_AC_asj_male=10;non_topmed_AN_asj_male=34;non_topmed_AF_asj_male=0.294118;non_topmed_nhomalt_asj_male=1;non_neuro_AC_oth=91;non_neuro_AN_oth=280;non_neuro_AF_oth=0.325;non_neuro_nhomalt_oth=11;AC_male=2536;AN_male=7722;AF_male=0.328412;nhomalt_male=200;controls_AC_fin_female=60;controls_AN_fin_female=256;controls_AF_fin_female=0.234375;controls_nhomalt_fin_female=1;controls_AC_asj_female=5;controls_AN_asj_female=14;controls_AF_asj_female=0.357143;controls_nhomalt_asj_female=0;AC_amr_male=42;AN_amr_male=144;AF_amr_male=0.291667;nhomalt_amr_male=2;AC_amr_female=37;AN_amr_female=142;AF_amr_female=0.260563;nhomalt_amr_female=0;AC_oth_male=68;AN_oth_male=212;AF_oth_male=0.320755;nhomalt_oth_male=7;non_neuro_AC_nfe_seu=10;non_neuro_AN_nfe_seu=26;non_neuro_AF_nfe_seu=0.384615;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=739;non_topmed_AN_afr_female=1474;non_topmed_AF_afr_female=0.501357;non_topmed_nhomalt_afr_female=83;non_topmed_AC_afr=1793;non_topmed_AN_afr=3606;non_topmed_AF_afr=0.497227;non_topmed_nhomalt_afr=196;controls_AC=1658;controls_AN=4896;controls_AF=0.338644;controls_nhomalt=135;non_neuro_AC_oth_female=42;non_neuro_AN_oth_female=124;non_neuro_AF_oth_female=0.33871;non_neuro_nhomalt_oth_female=6;non_topmed_faf95_amr=0.232194;non_topmed_faf99_amr=0.232194;faf95_afr=0.477244;faf99_afr=0.477244;controls_faf95_afr=0.470932;controls_faf99_afr=0.470932;faf95_amr=0.227168;faf99_amr=0.227169;faf95_eas=0.500629;faf99_eas=0.500629;faf95=0.317744;faf99=0.317744;non_neuro_faf95_afr=0.464967;non_neuro_faf99_afr=0.464967;non_neuro_faf95_amr=0.235846;non_neuro_faf99_amr=0.235846;controls_faf95_nfe=0.241154;controls_faf99_nfe=0.241154;non_topmed_faf95=0.332322;non_topmed_faf99=0.332323;non_neuro_faf95_nfe=0.231727;non_neuro_faf99_nfe=0.231728;non_neuro_faf95=0.297558;non_neuro_faf99=0.297559;non_topmed_faf95_nfe=0.237689;non_topmed_faf99_nfe=0.23769;controls_faf95_eas=0.49659;controls_faf99_eas=0.49659;faf95_nfe=0.232957;faf99_nfe=0.232956;non_topmed_faf95_eas=0.501191;non_topmed_faf99_eas=0.501191;controls_faf95_amr=0.257071;controls_faf99_amr=0.257071;non_neuro_faf95_eas=0.500629;non_neuro_faf99_eas=0.500629;non_topmed_faf95_afr=0.47807;non_topmed_faf99_afr=0.47807;controls_faf95=0.32508;controls_faf99=0.325081;controls_popmax=eas;controls_AC_popmax=231;controls_AN_popmax=416;controls_AF_popmax=0.555288;controls_nhomalt_popmax=27;popmax=eas;AC_popmax=359;AN_popmax=656;AF_popmax=0.547256;nhomalt_popmax=35;age_hist_het_bin_freq=128|162|214|283|349|260|234|152|93|46;age_hist_het_n_smaller=717;age_hist_het_n_larger=23;age_hist_hom_bin_freq=9|11|18|24|26|15|20|8|12|6;age_hist_hom_n_smaller=82;age_hist_hom_n_larger=4;non_neuro_popmax=eas;non_neuro_AC_popmax=359;non_neuro_AN_popmax=656;non_neuro_AF_popmax=0.547256;non_neuro_nhomalt_popmax=35;non_topmed_popmax=eas;non_topmed_AC_popmax=351;non_topmed_AN_popmax=640;non_topmed_AF_popmax=0.548438;non_topmed_nhomalt_popmax=34"); writer.Flush(); stream.Position = 0; return stream; } private static IEnumerable GetConflictingGnomadItems() { var sequence = new SimpleSequence( new string('T', VariantUtils.MaxUpstreamLength) + "AAAGAAAGAAAG", 17467787 - 1 - VariantUtils.MaxUpstreamLength ); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); var parserSettings = new ParserSettings( new ColumnIndex(0, 2, 3, 4, 5, null), new[] {"A", "C", "G", "T"}, GenericScoreParser.MaxRepresentativeScores ); var gnomadReader = new GenericScoreParser(parserSettings, new StreamReader(GetChr22_17467787_17467799_genome()), null); return gnomadReader.GetItems(); } public static ISequenceProvider GetAllASequenceProvider(GenomeAssembly assembly = GenomeAssembly.GRCh37) { var seqProvider = new Mock(); seqProvider.SetupGet(x => x.Assembly).Returns(assembly); seqProvider.Setup(x => x.Sequence.Substring(It.IsAny(), 1)).Returns("A"); return seqProvider.Object; } } } ================================================ FILE: UnitTests/VariantAnnotation/ScoreFile/HeaderTests.cs ================================================ using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using SAUtils; using SAUtils.GenericScore; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestUtilities; using VariantAnnotation.GenericScore; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Xunit; namespace UnitTests.VariantAnnotation.ScoreFile { public sealed class HeaderTests { [Fact] public void TestFilePairId() { ( List saItems1, WriterSettings writerSettings1, MemoryStream indexStream1, MemoryStream writeStream1, DataSourceVersion version1, _ ) = TestDataGenerator.GetRandomSingleChromosomeData(ChromosomeUtilities.Chr1, 10_001, 15_001); ( List saItems2, WriterSettings writerSettings2, MemoryStream indexStream2, MemoryStream writeStream2, DataSourceVersion version2, _ ) = TestDataGenerator.GetRandomSingleChromosomeData(ChromosomeUtilities.Chr1, 10_001, 15_001); using (var scoreFileWriter1 = new ScoreFileWriter( writerSettings1, writeStream1, indexStream1, version1, TestDataGenerator.GetSequenceProvider(), SaCommon.SchemaVersion )) using (var scoreFileWriter2 = new ScoreFileWriter( writerSettings2, writeStream2, indexStream2, version2, TestDataGenerator.GetSequenceProvider(), SaCommon.SchemaVersion )) { // Write saItems to stream scoreFileWriter1.Write(saItems1); scoreFileWriter2.Write(saItems2); // Reset streams in preparation for reading them indexStream1.Position = 0; indexStream2.Position = 0; writeStream1.Position = 0; writeStream2.Position = 0; // Mixing indexes with different data files must throw exception Assert.Throws(() => ScoreReader.Read(writeStream2, indexStream1)); Assert.Throws(() => ScoreReader.Read(writeStream1, indexStream2)); indexStream1.Position = 0; indexStream2.Position = 0; writeStream1.Position = 0; writeStream2.Position = 0; // Shoud not throw any exception ScoreReader.Read(writeStream1, indexStream1); ScoreReader.Read(writeStream2, indexStream2); } } [Fact] public void TestFileType() { ( List saItems1, WriterSettings writerSettings1, MemoryStream indexStream1, MemoryStream writeStream1, DataSourceVersion version1, _ ) = TestDataGenerator.GetRandomSingleChromosomeData(ChromosomeUtilities.Chr1, 10_001, 15_001); ( List saItems2, _, MemoryStream indexStream2, MemoryStream writeStream2, DataSourceVersion version2, _ ) = TestDataGenerator.GetRandomSingleChromosomeData(ChromosomeUtilities.Chr1, 10_001, 15_001); using (var scoreFileWriter1 = new ScoreFileWriter( writerSettings1, writeStream1, indexStream1, version1, TestDataGenerator.GetSequenceProvider(), SaCommon.SchemaVersion )) using (var nsaWriter = new NsaWriter( writeStream2, indexStream2, version2, TestDataGenerator.GetSequenceProvider(), "TestNsa", true, false, SaCommon.SchemaVersion, false )) { scoreFileWriter1.Write(saItems1); nsaWriter.Write(saItems2); // Reset streams in preparation for reading them indexStream1.Position = 0; indexStream2.Position = 0; writeStream1.Position = 0; writeStream2.Position = 0; // Attempting to read NSA file with this score reader must throw exception Assert.Throws(() => ScoreReader.Read(writeStream2, indexStream1)); Assert.Throws(() => ScoreReader.Read(writeStream1, indexStream2)); indexStream1.Position = 0; indexStream2.Position = 0; writeStream1.Position = 0; writeStream2.Position = 0; // Shoud not throw any exception ScoreReader.Read(writeStream1, indexStream1); } } } } ================================================ FILE: UnitTests/VariantAnnotation/ScoreFile/ReaderSettingsTests.cs ================================================ using System; using System.IO; using IO; using VariantAnnotation.GenericScore; using Xunit; namespace UnitTests.VariantAnnotation.ScoreFile; public sealed class ReaderSettingsTests { [Fact] public void TestReadWriteZeroToOne() { var stream = new MemoryStream(); var writer = new ExtendedBinaryWriter(stream, System.Text.Encoding.Default); var nucleotides = new[] {"A", "C", "G", "T"}; var blockLength = 25; var encoderType = EncoderType.ZeroToOne; var readerSettings = GetReaderSettings(encoderType, nucleotides, blockLength); AssertData(writer, readerSettings, nucleotides, blockLength); } [Fact] public void TestReadWriteGenericScoreEncoder() { var stream = new MemoryStream(); var writer = new ExtendedBinaryWriter(stream, System.Text.Encoding.Default); var nucleotides = new[] {"N"}; var blockLength = 25; var encoderType = EncoderType.Generic; var readerSettings = GetReaderSettings(encoderType, nucleotides, blockLength); AssertData(writer, readerSettings, nucleotides, blockLength); } [Fact] public void TestReadUnknownEncoder() { var writer = new ExtendedBinaryWriter(new MemoryStream(), System.Text.Encoding.Default); var nucleotides = new[] {"N"}; var blockLength = 25; EncoderType encoderType = EncoderType.Generic; var readerSettings = GetReaderSettings(encoderType, nucleotides, blockLength); using (writer) { readerSettings.Write(writer); writer.BaseStream.Position = 1; // Changing EncoderType in base stream to unknown writer.Write(255); writer.BaseStream.Position = 0; Assert.Throws(() => ReaderSettings.Read(new ExtendedBinaryReader(writer.BaseStream))); } } private void AssertData(ExtendedBinaryWriter writer, ReaderSettings readerSettings, string[] nucleotides, int blockLength) { using (writer) { readerSettings.Write(writer); writer.BaseStream.Position = 0; var reader = new ExtendedBinaryReader(writer.BaseStream); ReaderSettings deserializedReader = ReaderSettings.Read(reader); Assert.Equal(nucleotides, deserializedReader.Nucleotides); Assert.Equal(blockLength, deserializedReader.BlockLength); } } private ReaderSettings GetReaderSettings(EncoderType encoderType, string[] nucleotides, int blockLength) { IScoreEncoder scoreEncoder = encoderType switch { EncoderType.Generic => new GenericScoreEncoder(), EncoderType.ZeroToOne => new ZeroToOneScoreEncoder(2, 1), _ => null }; return new ReaderSettings( false, encoderType, scoreEncoder, new ScoreJsonEncoder("TestKey", "TestSubKey"), nucleotides, blockLength ); } } ================================================ FILE: UnitTests/VariantAnnotation/ScoreFile/SaItemValidatorTests.cs ================================================ using System.IO; using Genome; using SAUtils.GenericScore.GenericScoreParser; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.VariantAnnotation.ScoreFile; public sealed class SaItemValidatorTests { [Fact] public void TestParRegion() { var saItemValidator = new SaItemValidator(true, true); var sequence = new SimpleSequence(new string('A', 15_000)); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.ChrY, 10_011, "A", "C", 0.5), sequenceProvider )); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.ChrY, 10_011, "N", "C", 0.5), sequenceProvider )); Assert.Throws(() => saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.ChrY, 10_011, "C", "C", 0.5), sequenceProvider )); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.ChrY, 10_011, "N", "N", 0.5), sequenceProvider )); saItemValidator = new SaItemValidator(true, false); Assert.False(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.ChrY, 10_011, "C", "N", 0.5), sequenceProvider )); } [Fact] public void TestIncorrectReference() { var sequence = new SimpleSequence(new string('A', 99)); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); // Strict Checking throws exceptions var saItemValidator = new SaItemValidator(true, true); Assert.Throws(() => saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "C", "G", 0.5), sequenceProvider )); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "A", "G", 0.5), sequenceProvider )); // Will not throw exceptions saItemValidator = new SaItemValidator(true, false); Assert.False(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "C", "A", 0.5), sequenceProvider )); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "A", "G", 0.5), sequenceProvider )); // Ref checking disabled saItemValidator = new SaItemValidator(true, null); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "C", "A", 0.5), sequenceProvider )); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "A", "G", 0.5), sequenceProvider )); } [Fact] public void TestCheckSnv() { var sequence = new SimpleSequence(new string('A', 99)); var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, ChromosomeUtilities.RefNameToChromosome); // Strict checking throws exceptions on invalid items var saItemValidator = new SaItemValidator(true, true); Assert.Throws(() => saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "AA", "C", 0.5), sequenceProvider )); Assert.Throws(() => saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "A", "CG", 0.5), sequenceProvider )); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "A", "G", 0.5), sequenceProvider )); // SnvCheck will not throw exceptions saItemValidator = new SaItemValidator(false, true); Assert.False(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "AA", "C", 0.5), sequenceProvider )); Assert.False(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "A", "CG", 0.5), sequenceProvider )); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "A", "G", 0.5), sequenceProvider )); // SnvCheck disabled saItemValidator = new SaItemValidator(null, true); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "AA", "C", 0.5), sequenceProvider )); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "A", "CG", 0.5), sequenceProvider )); Assert.True(saItemValidator.Validate( new GenericScoreItem(ChromosomeUtilities.Chr1, 11, "A", "G", 0.5), sequenceProvider )); } } ================================================ FILE: UnitTests/VariantAnnotation/ScoreFile/ScoreEncoderTests.cs ================================================ using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using IO; using VariantAnnotation.GenericScore; using Xunit; namespace UnitTests.VariantAnnotation.ScoreFile { public sealed class ScoreEncoderTests { [Fact] public void TestEncoderDecoder() { const int numberOfDigits = 3; const double maxScore = 1.0; var scoreEncoder = new ZeroToOneScoreEncoder(numberOfDigits, maxScore); var stream = new MemoryStream(); var writer = new ExtendedBinaryWriter(stream, System.Text.Encoding.Default); scoreEncoder.Write(writer); stream.Position = 0; var reader = new ExtendedBinaryReader(stream); var deserializedScoreEncoder = ZeroToOneScoreEncoder.Read(reader); stream.Close(); var testData = new List<(double inputNumber, double expectedResult)> { (0.246, 0.246), (0.2461, 0.246), (0.2466, 0.247), (0.800, 0.800), (0.999, 0.999), (0.9999, 1.000), (0.127, 0.127), (0.128, 0.128), (0.129, 0.129), (0.254, 0.254), (0.255, 0.255), (0.256, 0.256), (0.1271, 0.127), (0.1281, 0.128), (0.1291, 0.129), (0.2541, 0.254), (0.2551, 0.255), (0.2561, 0.256), (0.1266, 0.127), (0.1276, 0.128), (0.1286, 0.129), (0.1296, 0.130), (0.2536, 0.254), (0.2546, 0.255), (0.2556, 0.256), (0.2566, 0.257), (0.0, 0.0), (1.0, 1.0), (double.NaN, double.NaN) }; // Test encoder and its deserialized version foreach (ZeroToOneScoreEncoder encoder in new[] {scoreEncoder, deserializedScoreEncoder}) { foreach ((double inputNumber, double expectedOutput)in testData) { Assert.Equal(expectedOutput, EncodeDecode(encoder, inputNumber)); } Assert.Throws(() => encoder.EncodeToBytes(2.1)); } } [Fact] public void TestByteRequired() { var testData = new List<(int numberOfDigits, double maxScore, int expectedBytesRequired)> { (2, 1.0, 1), (2, 10.0, 1), (3, 1.0, 2), (4, 1.0, 2), (5, 1.0, 3), (6, 1.0, 3), (7, 1.0, 3), (5, 1000, 3) }; foreach ((int numberOfDigits, double maxScore, int expectedBytesRequired) in testData) { var scoreEncoder = new ZeroToOneScoreEncoder(numberOfDigits, maxScore); Assert.Equal(expectedBytesRequired, scoreEncoder.BytesRequired); } } private static double EncodeDecode(ZeroToOneScoreEncoder encoder, double number) { return encoder.DecodeFromBytes(encoder.EncodeToBytes(number)); } } } ================================================ FILE: UnitTests/VariantAnnotation/ScoreFile/ScoreIndexTests.cs ================================================ using System; using System.IO; using ErrorHandling.Exceptions; using Genome; using IO; using IO.v2; using VariantAnnotation.GenericScore; using VariantAnnotation.Providers; using Xunit; namespace UnitTests.VariantAnnotation.ScoreFile { public sealed class ScoreIndexTests { [Fact] public void ScoreIndexTest() { (Stream indexStream, ScoreIndex scoreIndex) = GetScoreIndex(); // Add chromosome blocks scoreIndex.AddChromosomeBlock(1, 10); scoreIndex.Add(1, 0, 1, 1); scoreIndex.AddChromosomeBlock(2, 80); scoreIndex.Add(2, 1, 2, 3); scoreIndex.Add(2, 3, 2, 3); scoreIndex.Add(2, 5, 2, 3); scoreIndex.AddChromosomeBlock(3, 70); scoreIndex.Add(3, 7, 20, 30); scoreIndex.Add(3, 27, 30, 30); scoreIndex.Add(3, 57, 20, 30); // Serialization and deserialization scoreIndex.Write(); indexStream.Position = 0; ScoreIndex scoreIndexDeserialized = ScoreIndex.Read(indexStream, 1); indexStream.Close(); Assert.Equal(scoreIndex.GetBlockNumber(1, 10), scoreIndexDeserialized.GetBlockNumber(1, 10)); Assert.Equal(scoreIndex.GetBlockNumber(2, 104), scoreIndexDeserialized.GetBlockNumber(2, 104)); Assert.Equal(scoreIndex.GetBlockLength(), scoreIndexDeserialized.GetBlockLength()); Assert.Equal(scoreIndex.GetNucleotideCount(), scoreIndexDeserialized.GetNucleotideCount()); // LastBlockNumber Assert.Equal(0, scoreIndexDeserialized.GetLastBlockNumber(1)); Assert.Equal(2, scoreIndexDeserialized.GetLastBlockNumber(2)); Assert.Equal(2, scoreIndexDeserialized.GetLastBlockNumber(3)); // BlockNumber Assert.Equal(-1, scoreIndexDeserialized.GetBlockNumber(1, 9)); Assert.Equal(0, scoreIndexDeserialized.GetBlockNumber(1, 10)); Assert.Equal(0, scoreIndexDeserialized.GetBlockNumber(1, 34)); Assert.Equal(-1, scoreIndexDeserialized.GetBlockNumber(1, 35)); Assert.Equal(-1, scoreIndexDeserialized.GetBlockNumber(2, 70)); Assert.Equal(-1, scoreIndexDeserialized.GetBlockNumber(2, 75)); Assert.Equal(-1, scoreIndexDeserialized.GetBlockNumber(2, 79)); Assert.Equal(0, scoreIndexDeserialized.GetBlockNumber(2, 80)); Assert.Equal(0, scoreIndexDeserialized.GetBlockNumber(2, 104)); Assert.Equal(1, scoreIndexDeserialized.GetBlockNumber(2, 105)); Assert.Equal(1, scoreIndexDeserialized.GetBlockNumber(2, 129)); Assert.Equal(2, scoreIndexDeserialized.GetBlockNumber(2, 130)); Assert.Equal(2, scoreIndexDeserialized.GetBlockNumber(2, 154)); Assert.Equal(-1, scoreIndexDeserialized.GetBlockNumber(2, 155)); Assert.Equal(-1, scoreIndexDeserialized.GetBlockNumber(3, 68)); Assert.Equal(0, scoreIndexDeserialized.GetBlockNumber(3, 70)); Assert.Equal(0, scoreIndexDeserialized.GetBlockNumber(3, 80)); Assert.Equal(0, scoreIndexDeserialized.GetBlockNumber(3, 94)); Assert.Equal(1, scoreIndexDeserialized.GetBlockNumber(3, 95)); Assert.Equal(1, scoreIndexDeserialized.GetBlockNumber(3, 119)); Assert.Equal(2, scoreIndexDeserialized.GetBlockNumber(3, 120)); Assert.Equal(2, scoreIndexDeserialized.GetBlockNumber(3, 144)); Assert.Equal(-1, scoreIndexDeserialized.GetBlockNumber(3, 145)); // Position before chromosome starts Assert.Equal((-1, -1), scoreIndex.PositionToBlockLocation((ushort) 3, 67)); Assert.Equal((-1, -1), scoreIndex.PositionToBlockLocation((ushort) 3, 67)); // Chromosome not added Assert.Equal(-1, scoreIndex.GetBlockNumber(4, 67)); Assert.Equal(-1, scoreIndex.GetFilePosition(4, 67)); Assert.Equal((-1, -1), scoreIndex.PositionToBlockLocation((ushort) 4, 1)); } [Fact] public void PositionToBlockIndexTest() { (Stream indexStream, ScoreIndex scoreIndex) = GetScoreIndex(); // Position to block location tests var testData = new[] { // Start psotion, postiion, expected Block number, expected block index (10, 11, 0, 4), (10, 26, 0, 64), (10, 34, 0, 96), (10, 35, 1, 0), (10, 40, 1, 20), }; foreach ((int startingPosition, int position, int expectedBlockNumber, int expectedBlockIndex) in testData) { Assert.Equal((expectedBlockNumber, expectedBlockIndex), scoreIndex.PositionToBlockLocation(position, startingPosition)); } } [Fact] public void AddGetChromosomeBlocksTest() { (_, ScoreIndex scoreIndex) = GetScoreIndex(); // Add and get chromosome blocks scoreIndex.AddChromosomeBlock(1, 10); scoreIndex.Add(1, 0, 1, 1); Assert.Single(scoreIndex.GetChromosomeBlocks()); Assert.Equal(1, scoreIndex.GetChromosomeBlocks()[1].BlockCount); scoreIndex.AddChromosomeBlock(2, 80); scoreIndex.Add(2, 1, 2, 3); scoreIndex.Add(2, 3, 2, 3); scoreIndex.Add(2, 5, 2, 3); Assert.Equal(2, scoreIndex.GetChromosomeBlocks().Count); Assert.Equal(1, scoreIndex.GetChromosomeBlocks()[1].BlockCount); Assert.Equal(3, scoreIndex.GetChromosomeBlocks()[2].BlockCount); } [Fact] public void TestGetNucleotidePosition() { (_, ScoreIndex scoreIndex) = GetScoreIndex(); // Add and get chromosome blocks scoreIndex.AddChromosomeBlock(1, 10); scoreIndex.Add(1, 0, 1, 1); Assert.Null(scoreIndex.GetNucleotidePosition("F")); Assert.Equal(0, (short) scoreIndex.GetNucleotidePosition("A")); Assert.Equal(1, (short) scoreIndex.GetNucleotidePosition("C")); Assert.Equal(2, (short) scoreIndex.GetNucleotidePosition("G")); Assert.Equal(3, (short) scoreIndex.GetNucleotidePosition("T")); } private static (Stream stream, ScoreIndex scoreIndex) GetScoreIndex() { var indexStream = new MemoryStream(); var indexWriter = new ExtendedBinaryWriter(indexStream, System.Text.Encoding.Default); var version = new DataSourceVersion("Test", "1", DateTime.Parse(DateTime.Now.ToString("yyyy-MM-dd")).Ticks, "No description"); var header = new Header(FileType.GsaIndex, 1); var readerSettings = new ReaderSettings( false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1), new ScoreJsonEncoder("TestKey", "TestSubKey"), new[] {"A", "C", "G", "T"}, 25 ); var scoreIndex = new ScoreIndex( indexWriter, readerSettings, GenomeAssembly.Unknown, version, 0, header, 1 ); return (indexStream, scoreIndex); } [Fact] public void TestHeader() { var indexStream = new MemoryStream(); var indexWriter = new ExtendedBinaryWriter(indexStream, System.Text.Encoding.Default); var version = new DataSourceVersion("Test", "1", DateTime.Parse(DateTime.Now.ToString("yyyy-MM-dd")).Ticks, "No description"); var header = new Header(FileType.GsaWriter, 1); var readerSettings = new ReaderSettings( false, EncoderType.ZeroToOne, new ZeroToOneScoreEncoder(2, 1), new ScoreJsonEncoder("TestKey", "TestSubKey"), new[] {"A", "C", "G", "T"}, 25 ); var scoreIndex = new ScoreIndex( indexWriter, readerSettings, GenomeAssembly.Unknown, version, 0, header, 1 ); scoreIndex.Write(); indexStream.Position = 0; Assert.Throws(() => ScoreIndex.Read(indexStream, 1)); } } } ================================================ FILE: UnitTests/VariantAnnotation/ScoreFile/ScoreJsonEncoderTests.cs ================================================ using VariantAnnotation.GenericScore; using Xunit; namespace UnitTests.VariantAnnotation.ScoreFile; public sealed class ScoreJsonEncoderTests { [Fact] public void TestJsonRepresentation() { var scoreJsonEncoder = new ScoreJsonEncoder("Test", "TestSubKey"); Assert.Equal( "\"TestSubKey\":1", new ScoreJsonEncoder("Test", "TestSubKey").JsonRepresentation(1)); Assert.Equal( "1", new ScoreJsonEncoder("Test", null).JsonRepresentation(1)); } } ================================================ FILE: UnitTests/VariantAnnotation/Sequence/CompressedSequenceReaderTests.cs ================================================ using Genome; using ReferenceSequence.IO; using UnitTests.TestUtilities; using Xunit; namespace UnitTests.VariantAnnotation.Sequence { public sealed class CompressedSequenceReaderTests { [Fact] public void GetCompressedSequence() { using (var reader = new CompressedSequenceReader(ResourceUtilities.GetReadStream(Resources.TopPath("TestSeq_reference.dat")))) { Assert.Equal(GenomeAssembly.GRCh37, reader.Assembly); var sequence = reader.Sequence; var chromosome = new Chromosome("chrBob", "Bob", null, null, 1, 1); reader.GetCompressedSequence(chromosome); Assert.Null(sequence.CytogeneticBands); Assert.Equal(0, sequence.Length); chromosome = new Chromosome("chrTestSeq", "TestSeq", null, null, 1, 0); reader.GetCompressedSequence(chromosome); var bases = sequence.Substring(0, 100); Assert.NotNull(sequence.CytogeneticBands); Assert.Equal(53, sequence.Length); Assert.Equal("NNATGTTTCCACTTTCTCCTCATTAGANNNTAACGAATGGGTGATTTCCCTAN", bases); } } } } ================================================ FILE: UnitTests/VariantAnnotation/Sequence/CompressedSequenceTests.cs ================================================ using Genome; using Intervals; using ReferenceSequence.Common; using Xunit; namespace UnitTests.VariantAnnotation.Sequence { public sealed class CompressedSequenceTests { private readonly ReferenceSequence.Common.Sequence _sequence; private const int NumBases = 53; public CompressedSequenceTests() { _sequence = new ReferenceSequence.Common.Sequence { Assembly = GenomeAssembly.hg19 }; // create the following sequence: NNATGTTTCCACTTTCTCCTCATTAGANNNTAACGAATGGGTGATTTCCCTAN var twoBitBuffer = new byte[] { 14, 42, 93, 169, 150, 122, 204, 11, 211, 224, 35, 169, 91, 0 }; var maskedIntervals = new Interval[3]; maskedIntervals[0] = new Interval(0, 1, new MaskedEntry(0, 1)); maskedIntervals[1] = new Interval(27, 29, new MaskedEntry(27, 29)); maskedIntervals[2] = new Interval(52, 52, new MaskedEntry(52, 52)); var maskedIntervalArray = new IntervalArray(maskedIntervals); _sequence.Set(NumBases, 0, twoBitBuffer, maskedIntervalArray, null); } [Fact] public void Assembly_hg19() { Assert.Equal(GenomeAssembly.hg19, _sequence.Assembly); } [Fact] public void GetNumBufferBytes() { const int expectedNumBufferBytes = 25; var observedNumBufferBytes = ReferenceSequence.Common.Sequence.GetNumBufferBytes(97); Assert.Equal(expectedNumBufferBytes, observedNumBufferBytes); } [Theory] [InlineData(23, 5, "TAGAN")] [InlineData(0, 5, "NNATG")] [InlineData(-1, 5, null)] [InlineData(48, 5, "CCTAN")] [InlineData(49, 5, "CTAN")] [InlineData(53, 5, null)] [InlineData(23, 0, null)] public void Substring(int offset, int length, string expectedSubstring) { var observedSubstring = _sequence.Substring(offset, length); Assert.Equal(expectedSubstring, observedSubstring); } } } ================================================ FILE: UnitTests/VariantAnnotation/TranscriptAnnotation/BreakEndUtilitiesTests.cs ================================================ using System.IO; using Genome; using UnitTests.TestUtilities; using VariantAnnotation.GeneFusions.Calling; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.TranscriptAnnotation { public sealed class BreakEndUtilitiesTests { [Theory] [InlineData(28722335, "T", "[3:115024109[T", true, "3", 115024109, false)] [InlineData(31410878, "C", "]6:42248252]C", true, "6", 42248252, true)] [InlineData(31561816, "C", "CGATCTCAT[6:41297838[", false, "6", 41297838, false)] [InlineData(84461562, "A", "A]8:100990100]", false, "8", 100990100, true)] [InlineData(32518102, "C", "C]HLA-DRB1*10:01:01:12922]", false, "HLA-DRB1*10:01:01", 12922, true)] public void CreateFromTranslocation_Nominal(int position, string refAllele, string altAllele, bool expectedOnReverseStrand, string expectedPartnerChr, int expectedPartnerPosition, bool expectedPartnerOnReverseStrand) { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, position, position, refAllele, altAllele, VariantType.translocation_breakend); BreakEndAdjacency[] adjacencies = BreakEndAdjacencyFactory.CreateAdjacencies(variant, ChromosomeUtilities.RefNameToChromosome, false, false); Assert.NotNull(adjacencies); Assert.Single(adjacencies); BreakEndAdjacency actual = adjacencies[0]; Assert.Equal(expectedOnReverseStrand, actual.Origin.OnReverseStrand); Assert.Equal(expectedPartnerChr, actual.Partner.Chromosome.EnsemblName); Assert.Equal(expectedPartnerPosition, actual.Partner.Position); Assert.Equal(expectedPartnerOnReverseStrand, actual.Partner.OnReverseStrand); } [Fact] public void CreateFromTranslocation_InvalidAltAllele_ThrowException() { Assert.Throws(delegate { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 100, 100, "A", "A{3:115024109{T", VariantType.translocation_breakend); // ReSharper disable once UnusedVariable BreakEndAdjacency[] adjacencies = BreakEndAdjacencyFactory.CreateFromTranslocation(variant, ChromosomeUtilities.RefNameToChromosome); }); } [Fact] public void CreateFromSymbolicAllele_Deletion() { var interval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 1594584, 1660503); BreakEndAdjacency[] adjacencies = BreakEndAdjacencyFactory.CreateFromSymbolicAllele(interval, VariantType.deletion, false, false); Assert.NotNull(adjacencies); Assert.Equal(2, adjacencies.Length); BreakEndAdjacency actual = adjacencies[0]; Assert.Equal(ChromosomeUtilities.Chr1.EnsemblName, actual.Origin.Chromosome.EnsemblName); Assert.Equal(1594583, actual.Origin.Position); Assert.False(actual.Origin.OnReverseStrand); Assert.Equal(ChromosomeUtilities.Chr1.EnsemblName, actual.Partner.Chromosome.EnsemblName); Assert.Equal(1660504, actual.Partner.Position); Assert.False(actual.Partner.OnReverseStrand); BreakEndAdjacency actual2 = adjacencies[1]; Assert.Equal(ChromosomeUtilities.Chr1.EnsemblName, actual2.Origin.Chromosome.EnsemblName); Assert.Equal(1660504, actual2.Origin.Position); Assert.True(actual2.Origin.OnReverseStrand); Assert.Equal(ChromosomeUtilities.Chr1.EnsemblName, actual2.Partner.Chromosome.EnsemblName); Assert.Equal(1594583, actual2.Partner.Position); Assert.True(actual2.Partner.OnReverseStrand); } [Fact] public void CreateFromSymbolicAllele_Duplication() { var interval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 37820921, 38404543); BreakEndAdjacency[] adjacencies = BreakEndAdjacencyFactory.CreateFromSymbolicAllele(interval, VariantType.tandem_duplication, false, false); Assert.NotNull(adjacencies); Assert.Equal(2, adjacencies.Length); BreakEndAdjacency actual = adjacencies[0]; Assert.Equal(ChromosomeUtilities.Chr1.EnsemblName, actual.Origin.Chromosome.EnsemblName); Assert.Equal(38404543, actual.Origin.Position); Assert.False(actual.Origin.OnReverseStrand); Assert.Equal(ChromosomeUtilities.Chr1.EnsemblName, actual.Partner.Chromosome.EnsemblName); Assert.Equal(37820920, actual.Partner.Position); Assert.False(actual.Partner.OnReverseStrand); BreakEndAdjacency actual2 = adjacencies[1]; Assert.Equal(ChromosomeUtilities.Chr1.EnsemblName, actual2.Origin.Chromosome.EnsemblName); Assert.Equal(37820920, actual2.Origin.Position); Assert.True(actual2.Origin.OnReverseStrand); Assert.Equal(ChromosomeUtilities.Chr1.EnsemblName, actual2.Partner.Chromosome.EnsemblName); Assert.Equal(38404543, actual2.Partner.Position); Assert.True(actual2.Partner.OnReverseStrand); } [Fact] public void CreateFromSymbolicAllele_Inversion() { var expectedAdjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 63989115, false), // 63989116 + (+1 offset) new BreakPoint(ChromosomeUtilities.Chr1, 64291267, true)); // 64291267 - (0 offset) var expectedAdjacency2 = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 64291268, true), // 64291268 - (0 offset) new BreakPoint(ChromosomeUtilities.Chr1, 63989116, false)); // 63989117 + (+1 offset) var interval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 63989116, 64291267); BreakEndAdjacency[] adjacencies = BreakEndAdjacencyFactory.CreateFromSymbolicAllele(interval, VariantType.inversion, false, false); Assert.NotNull(adjacencies); Assert.Equal(2, adjacencies.Length); Assert.Equal(expectedAdjacency, adjacencies[0]); Assert.Equal(expectedAdjacency2, adjacencies[1]); } [Fact] public void CreateFromSymbolicAllele_Inversion_INV3() { var expectedAdjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 63989115, false), // GOOD new BreakPoint(ChromosomeUtilities.Chr1, 64291267, true)); // GOOD var expectedAdjacency2 = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 64291267, false), new BreakPoint(ChromosomeUtilities.Chr1, 63989115, true)); var interval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 63989116, 64291267); BreakEndAdjacency[] adjacencies = BreakEndAdjacencyFactory.CreateFromSymbolicAllele(interval, VariantType.inversion, true, false); Assert.NotNull(adjacencies); Assert.Equal(2, adjacencies.Length); Assert.Equal(expectedAdjacency, adjacencies[0]); Assert.Equal(expectedAdjacency2, adjacencies[1]); } [Fact] public void CreateFromSymbolicAllele_Inversion_INV5() { var expectedAdjacency = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 63989116, true), new BreakPoint(ChromosomeUtilities.Chr1, 64291268, false)); var expectedAdjacency2 = new BreakEndAdjacency( new BreakPoint(ChromosomeUtilities.Chr1, 64291268, true), // GOOD new BreakPoint(ChromosomeUtilities.Chr1, 63989116, false)); // GOOD var interval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 63989116, 64291267); BreakEndAdjacency[] adjacencies = BreakEndAdjacencyFactory.CreateFromSymbolicAllele(interval, VariantType.inversion, false, true); Assert.NotNull(adjacencies); Assert.Equal(2, adjacencies.Length); Assert.Equal(expectedAdjacency, adjacencies[0]); Assert.Equal(expectedAdjacency2, adjacencies[1]); } [Fact] public void CreateFromSymbolicAllele_UnhandledVariantType_ReturnNull() { var interval = new ChromosomeInterval(ChromosomeUtilities.Chr1, 63989116, 64291267); BreakEndAdjacency[] adjacencies = BreakEndAdjacencyFactory.CreateFromSymbolicAllele(interval, VariantType.complex_structural_alteration, false, false); Assert.Null(adjacencies); } } } ================================================ FILE: UnitTests/VariantAnnotation/TranscriptAnnotation/FullTranscriptAnnotatorTests.cs ================================================ using VariantAnnotation.TranscriptAnnotation; using Xunit; namespace UnitTests.VariantAnnotation.TranscriptAnnotation { public sealed class FullTranscriptAnnotatorTests { [Theory] [InlineData("S", "S", 60, 60, "S", "S", 60, 60)] [InlineData("S", "T", 60, 60, "S", "T", 60, 60)] [InlineData("ELC", "DVR", 632, 634, "ELC", "DVR", 632, 634)] [InlineData("LL", "LI", 213, 214, "L", "I", 214, 214)] [InlineData("K", "KLX", 523, 523, "K", "KLX", 523, 523 )] [InlineData("C", "CC", 46, 46, "C", "CC", 46, 46)] [InlineData("R", "KR", 22955, 22955, "R", "KR", 22955, 22955)] [InlineData("PPPPPQQQQ", "", 65, 73, "PPPPPQQQQ", "", 65, 73)] [InlineData("DMEIHA", "D", 370, 375, "MEIHA", "", 371, 375)] [InlineData("VV", "V", 690, 691, "V", "", 691, 691)] [InlineData("NARCN", "N", 243, 247, "ARCN", "", 244, 247)] [InlineData("QQQQP", "P", 52, 56, "QQQQ", "", 52, 55)] [InlineData("RV", "X", 1172, 1173, "RV", "X", 1172, 1173)] [InlineData("GA", "GX", 112, 113, "A", "X", 113, 113)] [InlineData("SPDGHE", "R", 566, 571, "SPDGHE", "R", 566, 571)] [InlineData("Q", "*VRX", 96, 96, "Q", "*VRX", 96, 96)] public void TryTrimAminoAcidsAndUpdateProteinPositions_AsExpected(string reference, string alt, int start, int end, string newReference, string newAlt, int newStart, int newEnd) { var trimmedAa = FullTranscriptAnnotator.TryTrimAminoAcidsAndUpdateProteinPositions(new SequenceChange(reference, alt), start, end); Assert.Equal(newReference, trimmedAa.AaChange.Reference); Assert.Equal(newAlt, trimmedAa.AaChange.Alternate); Assert.Equal(newStart, trimmedAa.ProteinStart); Assert.Equal(newEnd, trimmedAa.ProteinEnd); } } } ================================================ FILE: UnitTests/VariantAnnotation/TranscriptAnnotation/TranscriptAnnotationFactoryTests.cs ================================================ using System.Collections.Generic; using Genome; using Intervals; using Moq; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.TranscriptAnnotation; using Variants; using Xunit; namespace UnitTests.VariantAnnotation.TranscriptAnnotation { public sealed class TranscriptAnnotationFactoryTests { [Fact] public void DecideAnnotationStatus_NoOverlap_ReturnNoAnnotation() { var observedStatus = TranscriptAnnotationFactory.DecideAnnotationStatus(new Interval(100, 101), new Interval(5102, 6100), AnnotationBehavior.SmallVariants, Chromosome.ShortFlankingLength); Assert.Equal(TranscriptAnnotationFactory.Status.NoAnnotation, observedStatus); } [Fact] public void DecideAnnotationStatus_Flanking_ReturnFlankingAnnotation() { var observedStatus = TranscriptAnnotationFactory.DecideAnnotationStatus(new Interval(100, 100), new Interval(102, 305), AnnotationBehavior.SmallVariants, Chromosome.ShortFlankingLength); Assert.Equal(TranscriptAnnotationFactory.Status.FlankingAnnotation, observedStatus); } [Fact] public void DecideAnnotationStatus_Reduced_TranscriptPartialOverlap_ReturnReducedAnnotation() { var observedStatus = TranscriptAnnotationFactory.DecideAnnotationStatus(new Interval(100, 200), new Interval(102, 305), AnnotationBehavior.StructuralVariants, Chromosome.ShortFlankingLength); Assert.Equal(TranscriptAnnotationFactory.Status.ReducedAnnotation, observedStatus); } [Fact] public void DecideAnnotationStatus_Full_PartialOverlap_ReturnFullAnnotation() { var observedStatus = TranscriptAnnotationFactory.DecideAnnotationStatus(new Interval(100, 105), new Interval(102, 305), AnnotationBehavior.SmallVariants, Chromosome.ShortFlankingLength); Assert.Equal(TranscriptAnnotationFactory.Status.FullAnnotation, observedStatus); } [Fact] public void DecideAnnotationStatus_Full_CompleteOverlap_ReturnFullAnnotation() { var observedStatus = TranscriptAnnotationFactory.DecideAnnotationStatus(new Interval(100, 500), new Interval(102, 305), AnnotationBehavior.SmallVariants, Chromosome.ShortFlankingLength); Assert.Equal(TranscriptAnnotationFactory.Status.FullAnnotation, observedStatus); } [Fact] public void DecideAnnotationStatus_ROH_Return_RohAnnotation() { var observedStatus = TranscriptAnnotationFactory.DecideAnnotationStatus(new Interval(100, 500), new Interval(102, 305), AnnotationBehavior.RunsOfHomozygosity, Chromosome.ShortFlankingLength); Assert.Equal(TranscriptAnnotationFactory.Status.RohAnnotation, observedStatus); } [Fact] public void GetAnnotatedTranscripts_ReturnEmptyList() { var variant = new Mock(); var transcript1 = new Mock(); var transcript2 = new Mock(); ITranscript[] transcripts = { transcript1.Object, transcript2.Object }; var chromosome = ChromosomeUtilities.Chr1; variant.SetupGet(x => x.Behavior).Returns(AnnotationBehavior.SmallVariants); variant.SetupGet(x => x.Chromosome).Returns(chromosome); //variant.SetupGet(x => x.Chromosome.FlankingLength).Returns(Chromosome.ShortFlankingLength); variant.SetupGet(x => x.Start).Returns(123456); variant.SetupGet(x => x.End).Returns(123456); transcript1.SetupGet(x => x.Id).Returns(CompactId.Convert("NR_046018.2")); transcript1.SetupGet(x => x.Start).Returns(108455); transcript1.SetupGet(x => x.End).Returns(118455); transcript1.SetupGet(x => x.Gene.Start).Returns(108455); transcript1.SetupGet(x => x.Gene.End).Returns(118455); transcript2.SetupGet(x => x.Id).Returns(CompactId.Convert("NR_106918.1")); transcript2.SetupGet(x => x.Start).Returns(128460); transcript2.SetupGet(x => x.End).Returns(129489); transcript2.SetupGet(x => x.Gene.Start).Returns(128460); transcript2.SetupGet(x => x.Gene.End).Returns(129489); var compressedSequence = new Mock(); IList observedAnnotatedTranscripts = TranscriptAnnotationFactory.GetAnnotatedTranscripts(variant.Object, transcripts, compressedSequence.Object, null, null); Assert.Empty(observedAnnotatedTranscripts); } [Fact] public void GetAnnotatedTranscripts_RohAnnotation_ReturnsCanonicalOnly() { var variant = new Mock(); var transcript1 = new Mock(); var transcript2 = new Mock(); ITranscript[] transcripts = { transcript1.Object, transcript2.Object }; variant.SetupGet(x => x.Chromosome).Returns(ChromosomeUtilities.Chr1); variant.SetupGet(x => x.Behavior).Returns(AnnotationBehavior.RunsOfHomozygosity); variant.SetupGet(x => x.Start).Returns(10000); variant.SetupGet(x => x.End).Returns(20000); transcript1.SetupGet(x => x.Id).Returns(CompactId.Convert("NM_123.1")); transcript1.SetupGet(x => x.Start).Returns(11000); transcript1.SetupGet(x => x.End).Returns(15000); transcript1.SetupGet(x => x.IsCanonical).Returns(true); transcript2.SetupGet(x => x.Id).Returns(CompactId.Convert("NM_456.2")); transcript2.SetupGet(x => x.Start).Returns(11000); transcript2.SetupGet(x => x.End).Returns(15000); transcript2.SetupGet(x => x.IsCanonical).Returns(false); IList observedAnnotatedTranscripts = TranscriptAnnotationFactory.GetAnnotatedTranscripts(variant.Object, transcripts, null, null, null); Assert.Single(observedAnnotatedTranscripts); Assert.Equal("NM_123", observedAnnotatedTranscripts[0].Transcript.Id.WithVersion); } } } ================================================ FILE: UnitTests/VariantAnnotation/Utilities/DateTests.cs ================================================ using System; using System.Text.RegularExpressions; using VariantAnnotation.Utilities; using Xunit; namespace UnitTests.VariantAnnotation.Utilities { public sealed class DateTests { [Fact] public void GetTimeStamp_CheckFormat() { var timeStamp = Date.CurrentTimeStamp; var regex = new Regex(@"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"); Assert.True(regex.Match(timeStamp).Success); } [Fact] public void GetDate() { long numTicks = new DateTime(2017, 6, 23).Ticks; const string expectedDate = "2017-06-23"; var observedDate = Date.GetDate(numTicks); Assert.Equal(expectedDate, observedDate); } } } ================================================ FILE: UnitTests/VariantAnnotation/Utilities/FormatUtilitiesTests.cs ================================================ using VariantAnnotation.Utilities; using Xunit; namespace UnitTests.VariantAnnotation.Utilities { public sealed class FormatUtilitiesTests { [Fact] public void SplitVersion_ReturnNull_WithNullInput() { var result = FormatUtilities.SplitVersion(null); Assert.Null(result.Id); Assert.Equal(0, result.Version); } [Theory] [InlineData("ENSG00000141510.7", "ENSG00000141510", 7)] [InlineData("ENSG00000141510", "ENSG00000141510", 0)] public void SplitVersion(string combinedId, string expectedId, byte expectedVersion) { var result = FormatUtilities.SplitVersion(combinedId); Assert.Equal(expectedId, result.Id); Assert.Equal(expectedVersion, result.Version); } } } ================================================ FILE: UnitTests/VariantAnnotation/Utilities/GeneComparerTests.cs ================================================ using System.Collections.Generic; using UnitTests.TestUtilities; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Utilities; using Xunit; namespace UnitTests.VariantAnnotation.Utilities { public sealed class GeneComparerTests { private readonly IGene _geneA; private readonly IGene _geneB; private readonly IGene _geneC; private readonly GeneComparer _geneComparer; public GeneComparerTests() { _geneA = new Gene(ChromosomeUtilities.Chr1, 100, 200, false, "PAX", 123, CompactId.Convert("NM_123"), CompactId.Convert("ENST0000123")); _geneB = new Gene(ChromosomeUtilities.Chr1, 100, 200, false, "PAX", 123, CompactId.Convert("NM_123"), CompactId.Convert("ENST0000123")); _geneC = new Gene(ChromosomeUtilities.Chr1, 101, 200, false, "PAX", 123, CompactId.Convert("NM_123"), CompactId.Convert("ENST0000123")); _geneComparer = new GeneComparer(); } [Fact] public void Equals_AsExpected() { Assert.Equal(_geneA, _geneB, _geneComparer); Assert.NotEqual(_geneA, _geneC, _geneComparer); } [Fact] public void GetHashCode_AsExpected() { IGene geneD = new Gene(_geneA.Chromosome, 100, 200, false, "PAX", 123, CompactId.Convert("NM_123", 2), CompactId.Convert("ENST0000123")); var hashCodes = new HashSet { _geneComparer.GetHashCode(_geneA), _geneComparer.GetHashCode(_geneB), _geneComparer.GetHashCode(_geneC), _geneComparer.GetHashCode(geneD) }; Assert.Equal(3, hashCodes.Count); } } } ================================================ FILE: UnitTests/Variants/BiDirectionalTrimmerTests.cs ================================================ using Variants; using Xunit; namespace UnitTests.Variants { public sealed class BiDirectionalTrimmerTests { [Theory] [InlineData(100, "A", "C", 100, "A", "C")] [InlineData(100, "A", "A", 100, "A", "A")] [InlineData(100, "AT", null, 100, "AT", "")] [InlineData(100, null, "CG", 100, "", "CG")] [InlineData(100, "ATTT", "AT", 102, "TT", "")] [InlineData(100, "CGGG", "TGGG", 100, "C", "T")] public void Trim(int start, string refAllele, string altAllele, int expectedStart, string expectedRef, string expectedAlt) { (int observedStart, string observedRef, string observedAlt) = BiDirectionalTrimmer.Trim(start, refAllele, altAllele); Assert.Equal(expectedStart, observedStart); Assert.Equal(expectedRef, observedRef); Assert.Equal(expectedAlt, observedAlt); } } } ================================================ FILE: UnitTests/Variants/SimpleVariantTests.cs ================================================ using UnitTests.TestUtilities; using Variants; using Xunit; namespace UnitTests.Variants { public sealed class SimpleVariantTests { [Fact] public void SimpleVariant_Set() { const int expectedStart = 100; const int expectedEnd = 102; const string expectedRef = "AT"; const string expectedAlt = ""; const VariantType expectedType = VariantType.deletion; var variant = new SimpleVariant(ChromosomeUtilities.Chr1, expectedStart, expectedEnd, expectedRef, expectedAlt, expectedType); Assert.Equal(ChromosomeUtilities.Chr1, variant.Chromosome); Assert.Equal(expectedStart, variant.Start); Assert.Equal(expectedEnd, variant.End); Assert.Equal(expectedRef, variant.RefAllele); Assert.Equal(expectedAlt, variant.AltAllele); Assert.Equal(expectedType, variant.Type); } } } ================================================ FILE: UnitTests/Variants/VariantRotatorTests.cs ================================================ using Genome; using Moq; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; using Xunit; namespace UnitTests.Variants { public sealed class VariantRotatorTests { private readonly ISequence _refSequence = new SimpleSequence( new string('A', VariantRotator.MaxDownstreamLength) + "ATGTGTGTGTGCAGT" + new string('A', VariantRotator.MaxDownstreamLength), 965891); [Fact] public void Right_Deletion_ForwardStrand() { // chr1 966391 . ATG A 2694.00 PASS . var variant = GetDeletion(); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(966300); transcript.SetupGet(x => x.End).Returns(966405); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); var rotatedVariant = VariantRotator.Right(variant, transcript.Object, _refSequence, transcript.Object.Gene.OnReverseStrand); Assert.False(ReferenceEquals(variant, rotatedVariant)); Assert.Equal(966400, rotatedVariant.Start); Assert.Equal("TG", rotatedVariant.RefAllele); } [Fact] public void Right_Deletion_ReverseStrand() { var variant = new SimpleVariant(ChromosomeUtilities.Chr1, 966399, 966401, "TG", "", VariantType.deletion); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(966300); transcript.SetupGet(x => x.End).Returns(966405); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(true); var rotatedVariant = VariantRotator.Right(variant, transcript.Object, _refSequence, transcript.Object.Gene.OnReverseStrand); Assert.False(ReferenceEquals(variant, rotatedVariant)); Assert.Equal(966393, rotatedVariant.Start); Assert.Equal("TG", rotatedVariant.RefAllele); } [Fact] public void Right_Insertion() { var variant = GetInsertion(); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(966300); transcript.SetupGet(x => x.End).Returns(966405); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); var rotated = VariantRotator.Right(variant, transcript.Object, _refSequence, transcript.Object.Gene.OnReverseStrand); Assert.False(ReferenceEquals(variant, rotated)); Assert.Equal(966403, rotated.Start); Assert.Equal("TG", rotated.AltAllele); } [Fact] public void Right_Identity_WhenRefSequenceNull() { var originalVariant = GetDeletion(); var rotatedVariant = VariantRotator.Right(originalVariant, null, null, false); Assert.True(ReferenceEquals(originalVariant, rotatedVariant)); } [Fact] public void Right_Identity_WhenNotInsertionOrDeletion() { var originalVariant = new SimpleVariant(ChromosomeUtilities.Chr1, 966392, 966392, "T", "A", VariantType.SNV); var rotated = VariantRotator.Right(originalVariant, null, _refSequence, false); Assert.True(ReferenceEquals(originalVariant, rotated)); } [Fact] public void Right_Identity_VariantBeforeTranscript_ForwardStrand() { var originalVariant = GetDeletion(); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(966397); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); var rotated = VariantRotator.Right(originalVariant, transcript.Object, _refSequence, transcript.Object.Gene.OnReverseStrand); Assert.True(ReferenceEquals(originalVariant, rotated)); } [Fact] public void Right_Identity_VariantBeforeTranscript_ReverseStrand() { var originalVariant = GetDeletion(); var transcript = new Mock(); transcript.SetupGet(x => x.End).Returns(966390); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(true); var rotated = VariantRotator.Right(originalVariant, transcript.Object, _refSequence, transcript.Object.Gene.OnReverseStrand); Assert.True(ReferenceEquals(originalVariant, rotated)); } [Fact] public void Right_Identity_InsertionVariantBeforeTranscript_ForwardStrand() { var originalVariant = GetInsertion(); var transcript = new Mock(); transcript.SetupGet(x => x.End).Returns(966392); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); var rotated = VariantRotator.Right(originalVariant, transcript.Object, _refSequence, transcript.Object.Gene.OnReverseStrand); Assert.True(ReferenceEquals(originalVariant, rotated)); } [Fact] public void Right_Identity_WithNoRotation() { var originalVariant = GetDeletion(); ISequence refSequence = new SimpleSequence( new string('A', VariantRotator.MaxDownstreamLength) + "GAGAGTTAGGTA" + new string('A', VariantRotator.MaxDownstreamLength), 965891); var transcript = new Mock(); transcript.SetupGet(x => x.Start).Returns(966300); transcript.SetupGet(x => x.End).Returns(966405); transcript.SetupGet(x => x.Gene.OnReverseStrand).Returns(false); var rotated = VariantRotator.Right(originalVariant, transcript.Object, refSequence, transcript.Object.Gene.OnReverseStrand); Assert.True(ReferenceEquals(originalVariant, rotated)); } private static ISimpleVariant GetDeletion() => new SimpleVariant(ChromosomeUtilities.Chr1, 966392, 966394, "TG", "", VariantType.deletion); private static ISimpleVariant GetInsertion() => new SimpleVariant(ChromosomeUtilities.Chr1, 966397, 966396, "", "TG", VariantType.insertion); [Theory] [InlineData(519, "TG", 515, "TG")] [InlineData(511, "ATT", 509, "TTA")] [InlineData(508, "GTT", 504, "TGT")] public void Left_align_deletions(int position, string refAllele, int rotatedPos, string rotatedRef) { var reference = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "ATGTGTTGTTATTCTGTGTGCAT"); var rotatedVariant = VariantUtils.TrimAndLeftAlign(position, refAllele, "", reference); Assert.Equal(rotatedPos, rotatedVariant.start); Assert.Equal(rotatedRef, rotatedVariant.refAllele); } [Theory] [InlineData(519, "TG", 515, "TG")] [InlineData(511, "ATT", 509, "TTA")] [InlineData(508, "GTT", 504, "TGT")] public void Left_align_insertion(int position, string altAllele, int rotatedPos, string rotatedAlt) { var reference = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "ATGTGTTGTTATTCTGTGTGCAT"); var rotatedVariant = VariantUtils.TrimAndLeftAlign(position, "", altAllele, reference); Assert.Equal(rotatedPos, rotatedVariant.start); Assert.Equal(rotatedAlt, rotatedVariant.altAllele); } [Fact] public void Left_align_multiple_padding_bases() { var reference = new SimpleSequence(new string('A', VariantUtils.MaxUpstreamLength) + "ATGTGTTGTTATTCTGTGTGCAT"); var rotatedVariant = VariantUtils.TrimAndLeftAlign(501, "AT", "ATT", reference); Assert.Equal(502, rotatedVariant.start); Assert.Equal("T", rotatedVariant.altAllele); } [Theory] [InlineData("TC", "T", false)] [InlineData("T", "TC", false)] [InlineData("T", "TCT", true)] [InlineData("TCT", "T", true)] [InlineData("TCT", "TA", true)] // no conclusion for indels [InlineData("TC", "AT", true)]//no conclusion for mnvs [InlineData("T", "A", false)] [InlineData("T", "T", false)] public void CanNotLeftRotate(string refAllele, string altAllele, bool result) { Assert.Equal(result, VariantUtils.IsLeftShiftPossible(refAllele, altAllele)); } } } ================================================ FILE: UnitTests/Variants/VariantTests.cs ================================================ using UnitTests.TestUtilities; using VariantAnnotation.Pools; using Variants; using Xunit; namespace UnitTests.Variants { public sealed class VariantTests { [Fact] public void Variant_Set() { const int expectedStart = 100; const int expectedEnd = 102; const string expectedRef = "AT"; const string expectedAlt = ""; const VariantType expectedType = VariantType.deletion; const string expectedVid = "1:100:A:C"; const bool expectedRefMinor = true; const bool expectedDecomposed = false; const bool expectedRecomposed = true; var expectedLinkedVids = new[] { "1:102:T:G" }; var expectedBehavior = AnnotationBehavior.SmallVariants; var variant = VariantPool.Get(ChromosomeUtilities.Chr1, expectedStart, expectedEnd, expectedRef, expectedAlt, expectedType, expectedVid, expectedRefMinor, expectedDecomposed, expectedRecomposed, expectedLinkedVids, expectedBehavior, false); Assert.Equal(ChromosomeUtilities.Chr1, variant.Chromosome); Assert.Equal(expectedStart, variant.Start); Assert.Equal(expectedEnd, variant.End); Assert.Equal(expectedRef, variant.RefAllele); Assert.Equal(expectedAlt, variant.AltAllele); Assert.Equal(expectedType, variant.Type); Assert.Equal(expectedVid, variant.VariantId); Assert.Equal(expectedRefMinor, variant.IsRefMinor); Assert.Equal(expectedDecomposed, variant.IsDecomposed); Assert.Equal(expectedRecomposed, variant.IsRecomposed); Assert.Equal(expectedLinkedVids, variant.LinkedVids); Assert.Equal(expectedBehavior, variant.Behavior); VariantPool.Return(variant); } } } ================================================ FILE: UnitTests/Vcf/Samples/BooleanExtensionsTests.cs ================================================ using Vcf.Sample; using Xunit; namespace UnitTests.Vcf.Samples { public sealed class BooleanExtensionsTests { [Theory] [InlineData("PASS", false)] [InlineData("LowGQX", true)] [InlineData(null, false)] public void GetFailedFilter(string filter, bool? expectedFailedFilter) { bool observedFailedFilter = filter.GetFailedFilter(); Assert.Equal(expectedFailedFilter, observedFailedFilter); } } } ================================================ FILE: UnitTests/Vcf/Samples/FormatIndicesTests.cs ================================================ using Vcf.Sample; using Xunit; namespace UnitTests.Vcf.Samples { public sealed class FormatIndicesTests { [Fact] public void FormatIndicesTest() { const string formatColumn = "AD:AQ:BOB:CN:DN:DP:DST:FT:GQ:GT:LQ:PR:SR:VF"; var formatIndices = new FormatIndices(); formatIndices.Set(formatColumn); Assert.Equal(0, formatIndices.AD); Assert.Equal(1, formatIndices.AQ); Assert.Equal(3, formatIndices.CN); Assert.Equal(4, formatIndices.DN); Assert.Equal(5, formatIndices.DP); Assert.Equal(6, formatIndices.DST); Assert.Equal(7, formatIndices.FT); Assert.Equal(8, formatIndices.GQ); Assert.Equal(9, formatIndices.GT); Assert.Equal(10, formatIndices.LQ); Assert.Equal(11, formatIndices.PR); Assert.Equal(12, formatIndices.SR); Assert.Equal(13, formatIndices.VF); formatIndices.Set(null); Assert.False(formatIndices.AD.HasValue); Assert.False(formatIndices.AQ.HasValue); Assert.False(formatIndices.CN.HasValue); Assert.False(formatIndices.DN.HasValue); Assert.False(formatIndices.DP.HasValue); Assert.False(formatIndices.DST.HasValue); Assert.False(formatIndices.FT.HasValue); Assert.False(formatIndices.GQ.HasValue); Assert.False(formatIndices.GT.HasValue); Assert.False(formatIndices.LQ.HasValue); Assert.False(formatIndices.PR.HasValue); Assert.False(formatIndices.SR.HasValue); Assert.False(formatIndices.VF.HasValue); formatIndices.Set("TEMP:DP:BOB"); Assert.Equal(1, formatIndices.DP); } } } ================================================ FILE: UnitTests/Vcf/Samples/Legacy/LegacySampleFieldExtractorTests.cs ================================================ using System.Linq; using Vcf.Sample; using Xunit; using static UnitTests.Vcf.Samples.TestUtilities; namespace UnitTests.Vcf.Samples.Legacy { public sealed class LegacySampleFieldExtractorTests { [Fact] public void FormatIndicesTest() { const string formatColumn = "AU:GU:TAR:FT:GQ:DP:VF:CU:TU:TIR:GT:GQX:BOB:DPI:NV:NR:CHC:DST:PCH:DCS:DID:PLG:PCN:MAD:SCH:AQ:LQ"; var formatIndicies = new FormatIndices(); formatIndicies.Set(formatColumn); Assert.Equal(0, formatIndicies.AU); Assert.Equal(7, formatIndicies.CU); Assert.Equal(1, formatIndicies.GU); Assert.Equal(8, formatIndicies.TU); Assert.Equal(2, formatIndicies.TAR); Assert.Equal(9, formatIndicies.TIR); Assert.Equal(3, formatIndicies.FT); Assert.Equal(10, formatIndicies.GT); Assert.Equal(4, formatIndicies.GQ); Assert.Equal(11, formatIndicies.GQX); Assert.Equal(5, formatIndicies.DP); Assert.Equal(6, formatIndicies.VF); Assert.Equal(13, formatIndicies.DPI); Assert.Equal(17, formatIndicies.DST); Assert.Equal(25, formatIndicies.AQ); Assert.Equal(26, formatIndicies.LQ); formatIndicies.Set(null); Assert.Null(formatIndicies.TIR); Assert.Null(formatIndicies.AU); formatIndicies.Set("TEMP:DPI:BOB"); Assert.Equal(1, formatIndicies.DPI); Assert.Null(formatIndicies.AU); } [Theory] [InlineData("GT:TIR:TAR", "1/1:18,19:37,38", new[] { 37, 18 })] [InlineData("GT:AU:CU:GU:TU:AD", "1/1:10,11:20,21:30,31:40,41:11,13", new[] { 20, 40 })] [InlineData("GT:AD", "1/1:11,13", new[] { 11, 13 })] [InlineData("GT:AU:CU:GU:TU:AD", "1/1:.:20,21:30,31:40,41:11,13", new[] { 11, 13 })] [InlineData("GT:AU:CU:GU:TU:AD", "1/1:.", null)]//null when all fields are dropped after GT [InlineData("GT:AU:CU:GU:TU:AD", "1/1", null)]//null when all fields are dropped after GT [InlineData("AD", ".", null)] [InlineData("AD", "", null)] public void AlleleDepths(string formatCol, string sampleCol, int[] expectedAlleleDepths) { string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null, null); Assert.Single(samples); var sample = samples[0]; var observedAlleleDepths = sample?.AlleleDepths; Assert.Equal(expectedAlleleDepths, observedAlleleDepths); } [Theory] [InlineData("GT:TIR:TAR", "1/1:18,19:37,38", null)] [InlineData("GT:TIR:TAR:AD", "1/1:.:37,38:11,13,17", new[] { 11, 13, 17 })] [InlineData("GT:TIR:TAR:AD", "1/1:.:37,38", null)]//null when no values for AD for multi-allelic site public void AlleleDepthsMultiAllelic(string formatCol, string sampleCol, int[] expectedAlleleDepths) { string vcfLine = $"chr1\t5592503\t.\tC\tT,A\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(2), null, null); Assert.Single(samples); var sample = samples[0]; var observedAlleleDepths = sample?.AlleleDepths; Assert.Equal(expectedAlleleDepths, observedAlleleDepths); } [Theory] [InlineData("1/1:208:47:70:3:F", true)] [InlineData("1/1:208:47:70:3:.", false)] [InlineData(".", false)] [InlineData("", false)] public void FailedFilter(string sampleCol, bool? expectedFailedFilter) { string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\tGT:GQ:GQX:DP:DPF:FT\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(2), null, null); Assert.Single(samples); var sample = samples[0]; var observedFailedFilter = sample?.FailedFilter; Assert.Equal(expectedFailedFilter, observedFailedFilter); } [Theory] [InlineData("1/1:208:47:70:3:0,70", "1/1")] [InlineData(".:208:47:70:3:0,70", null)] [InlineData(".", null)] [InlineData("", null)] [InlineData("./.", "./.")] public void Genotype(string sampleCol, string expectedGenotype) { string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\tGT:GQ:GQX:DP:DPF:AD\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null, null); Assert.Single(samples); var sample = samples[0]; var observedGenotype = sample?.Genotype; Assert.Equal(expectedGenotype, observedGenotype); } [Theory] [InlineData("GT:GQ:GQX:DP:DPF:AD", "1/1:208:47:70:3:0,70", 47)] [InlineData("GT:GQ:DP:DPF:AD", "1/1:208:70:3:0,70", 208)] [InlineData("GT:GQ:DP:DPF:AD", "1/1:.:70:3:0,70", null)] [InlineData("GT:DP:DPF:AD:GQ", "1/1:70:3", null)]//dropped fields without '.' [InlineData("GQ", ".", null)] [InlineData("GQX", "", null)] [InlineData("GQX", "./.", null)] public void GenotypeQuality(string formatCol, string sampleCol, int? expectedGenotypeQuality) { string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null, null); Assert.Single(samples); var sample = samples[0]; var observedGenotypeQuality = sample?.GenotypeQuality; Assert.Equal(expectedGenotypeQuality, observedGenotypeQuality); } [Theory] [InlineData("GT:TIR:TAR:DP:DPF:AD", "1/1:22,22:3,4:70:3:0,70", 25)] [InlineData("GT:AU:CU:GU:TU:DP:DPF:AD", "1/1:10,11:20,21:30,31:40,41:70:3:0,70", 100)] [InlineData("GT:DPI:DP:DPF:AD", "1/1:17:70:3:0,70", 17)] [InlineData("GT:DP:DPF:AD", "1/1:70:3:0,70", 70)] [InlineData("GT:AU:CU:GU:TU:DPF:AD", "1/1:.:20,21:30,31:40,41:3:0,70", null)] [InlineData("GT:AU:CU:GU:TU:DPF:AD", "1/1:.:20,21:30,31:40,41:3", null)]//dropping AD completely [InlineData("GT:DP:DPF:AD", "1/1:.:3:0,70", null)] [InlineData("DP", ".", null)] [InlineData("DPI", "", null)] public void TotalDepth(string formatCol, string sampleCol, int? expectedTotalDepth) { string vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null, null); Assert.Single(samples); var sample = samples[0]; var observedTotalDepth = sample?.TotalDepth; Assert.Equal(expectedTotalDepth, observedTotalDepth); } [Fact] public void PiscesTotalDepth() { const string vcfLine = "chr1\t115251293\t.\tGA\tG\t100\tSB;LowVariantFreq\tDP=7882\tGT:GQ:AD:VF:NL:SB:GQX\t0/1:100:7588,294:0:20:-100.0000:100"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null, null); var sample = samples[0]; var observedTotalDepth = sample.TotalDepth; const int expectedTotalDepth = 7882; Assert.Equal(expectedTotalDepth, observedTotalDepth); } [Theory] [InlineData("T", "GT:GQ:GQX:DP:DPF:AD:VF", "1/1:208:47:70:3:0,70:0.75", "0.75")] // VF [InlineData("T", "GT:TIR:TAR", "1/1:10,11:20,21", "0.3333")] // TAR/TIR [InlineData("A", "GT:AU:CU:GU:TU", "1/1:10,11:20,21:30,31:40,41", "0.1")] // allele counts (A) [InlineData("C", "GT:AU:CU:GU:TU", "1/1:10,11:20,21:30,31:40,41", "0.2")] // allele counts (C) [InlineData("G", "GT:AU:CU:GU:TU", "1/1:10,11:20,21:30,31:40,41", "0.3")] // allele counts (G) [InlineData("T", "GT:AU:CU:GU:TU", "1/1:10,11:20,21:30,31:40,41", "0.4")] // allele counts (T) [InlineData("T", "GT:AD", "1/1:3,70", "0.9589")] // allele depths [InlineData("T", "GT:AU:CU:GU:TU:AD", "1/1:.:20,21:30,31:40,41:7,11", "0.6111")] // missing allele count [InlineData("T", "GT:AD:DP:VF", "0/1:317,200:517:0.38685", "0.3869")] // VF (rounding issue) public void VariantFrequency_Nominal(string altAllele, string formatCol, string sampleCol, string expectedResults) { string vcfLine = $"chr1\t5592503\t.\tC\t{altAllele}\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null, null); Assert.Single(samples); var sample = samples[0]; Assert.NotNull(sample?.VariantFrequencies); var observedResults = string.Join(',', sample.VariantFrequencies.Select(x => x.ToString("0.####"))); Assert.Equal(expectedResults, observedResults); } [Theory] [InlineData("C", "T", "GT:AD", "1/1:.")] // missing AD [InlineData("C", "T", "VF", ".")] // missing VF [InlineData("C", "T", "AD", "")] // missing AD [InlineData("C", "T,A", "GT:GQ:GQX:DP:DPF:AD:VF", "1/1:208:47:70:3:0,70:0.75")] // multiple alleles (VF) [InlineData("CG", "T", "GT:AU:CU:GU:TU", "1/1:10,11:20,21:30,31:40,41")] // multiple ref bases (AC) [InlineData("CG", "T", "GT:AU:CU:GU:TU", "1/1")] // dropping all fields after GT [InlineData("C", ".", "DP:AU:CU:GU:TU", "19:0,0:14,14:0,0:5,6")] // ref minor (AC) [InlineData("C", ".", "DP:AU:CU:GU:TU", "75:0,0:72,77:0,0:0,2")] // ref minor (AC) public void VariantFrequency_ReturnNull(string refAllele, string altAllele, string formatCol, string sampleCol) { var vcfLine = $"chr1\t5592503\t.\t{refAllele}\t{altAllele}\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(altAllele.Split(',').Length), null, null); Assert.Single(samples); var sample = samples[0]; Assert.Null(sample.VariantFrequencies); } [Theory] [InlineData("GT:GQ:GQX:DP:DPF:AD:VF", "1/1:208:47:70:3:70", "VF")] [InlineData("GT:GQ:GQX:DP:DPF:AD:VF", "1/1:208:47:70:3", "AD")] [InlineData("GT:DP:DPF:AD:VF:GQ:GQX", "1/1:70:3:208:47", "GQ")] [InlineData("GT:DP:DPF:AD:VF:GQ:GQX", "1/1:70:3:208:47", "GQX")] [InlineData("GT:AD:VF:FT", "1/1:47:70", "FT")] public void Leftout_fields_return_null(string formatCol, string sampleCol, string missingField) { var vcfLine = $"chr1\t5592503\t.\tA\tC\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null, null); Assert.Single(samples); var sample = samples[0]; switch (missingField) { case "VF": Assert.Null(sample.VariantFrequencies); break; case "AD": Assert.Null(sample.AlleleDepths); break; case "FT": Assert.False(sample.FailedFilter); break; case "GQ": case "GQX": Assert.Null(sample.GenotypeQuality); break; } } [Fact] public void MajorChromosomeCopyTest() { // data from NIR-1095 // for NIR-1218 const string vcfLine = "1 9314202 Canvas:GAIN:1:9314202:9404148 N 36 PASS SVTYPE=CNV;END=9404148;ensembl_gene_id=ENSG00000049239,ENSG00000252841,ENSG00000171621 RC:BC:CN:MCC . 151:108:6:4"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1),null, null); Assert.Equal(2, samples.Length); var sample = samples[1]; var observedMcc = sample?.IsLossOfHeterozygosity; Assert.False(observedMcc); } [Fact] public void EmptySamples() { // for NIR-1306 const string vcfLine = "chrX 2735147 . G A 38.25 VQSRTrancheSNP99.90to100.00 AC=3;AF=0.500;AN=6;BaseQRankSum=-0.602;DP=56;Dels=0.00;FS=30.019;HaplotypeScore=7.7259;MLEAC=3;MLEAF=0.500;MQ=41.18;MQ0=0;MQRankSum=0.098;QD=1.06;ReadPosRankSum=0.266;SB=-8.681e-03;VQSLOD=-6.0901;culprit=QD GT:AD:DP:GQ:PL 0:7,0:7:3:0,3,39 ./. 0/1:14,3:17:35:35,0,35 1/1:9,10:19:3:41,3,0"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal(4, samples.Length); var sample = samples[1]; var observedGenotype = sample.Genotype; var observedVariantFrequency = sample.VariantFrequencies; Assert.Equal("./.", observedGenotype); Assert.Null(observedVariantFrequency); } [Theory] [InlineData("GT:TIR:TAR", "1/1:0,11:0,21", "0")] [InlineData("GT:AU:CU:GU:TU", "1/1:0,11:0,21:0,31:0,41", "0")] [InlineData("GT:AD", "1/1:0,0", "0")] [InlineData("GT:AU:CU:GU:TU:AD", "1/1:.:20,21:30,31:40,41:0,0", "0")] [InlineData("GT:AD", "1/1:.", null)] [InlineData("VF", ".", null)] [InlineData("AD", "", null)] public void VariantFrequencyNan(string formatCol, string sampleCol, string expectedResults) { // NIR-1338 var vcfLine = $"chr1\t5592503\t.\tC\tT\t900.00\tPASS\t.\t{formatCol}\t{sampleCol}"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null, null); Assert.Single(samples); var sample = samples[0]; if (expectedResults == null) { Assert.Null(sample?.VariantFrequencies); return; } Assert.NotNull(sample?.VariantFrequencies); var observedResults = string.Join(',', sample.VariantFrequencies.Select(x => x.ToString("0.####"))); Assert.Equal(expectedResults, observedResults); } [Fact] public void SplitReadCounts() { const string vcfLine = "chr7 127717248 MantaINV:267944:0:1:2:0:0 T . PASS END=140789466;SVTYPE=INV;SVLEN=13072218;INV5 PR:SR 78,0:65,0 157,42:252,63"; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal(2, samples.Length); var sample1 = samples[0]; Assert.Equal(new[] { 78, 0 }, sample1.PairedEndReadCounts); Assert.Equal(new[] { 65, 0 }, sample1.SplitReadCounts); var sample2 = samples[1]; Assert.Equal(new[] { 157, 42 }, sample2.PairedEndReadCounts); Assert.Equal(new[] { 252, 63 }, sample2.SplitReadCounts); } [Fact] public void EmptySample() { const string vcfLine = "chr7 127717248 MantaINV:267944:0:1:2:0:0 T . PASS END=140789466;SVTYPE=INV;SVLEN=13072218;INV5 PR:SR ."; var vcfColumns = vcfLine.Split('\t'); var samples = vcfColumns.ToSamples(new FormatIndices(), GetSimplePositionUsingAlleleNum(1), null, null); Assert.Single(samples); var sample = samples[0]; Assert.True(sample.IsEmpty); } } } ================================================ FILE: UnitTests/Vcf/Samples/SampleFieldExtractorTests.cs ================================================ using System.Collections.Generic; using MitoHeteroplasmy; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Pools; using Variants; using Vcf; using Vcf.Sample; using Xunit; using static UnitTests.Vcf.Samples.TestUtilities; namespace UnitTests.Vcf.Samples { public sealed class SampleFieldExtractorTests { [Fact] public void NormalizeNulls() { const string periwinkle = "periwinkle"; string[] cols = { periwinkle, "", ".", null }; cols.NormalizeNulls(); Assert.Equal(periwinkle, cols[0]); Assert.Null(cols[1]); Assert.Null(cols[2]); Assert.Null(cols[3]); } [Fact] public void ExtractSample_PEPE() { var formatIndices = new FormatIndices(); formatIndices.Set("GT:GQ:AD:DP:VF:NL:SB:NC:US:AQ:LQ"); var sample = SampleFieldExtractor.ExtractSample("0/1:5:338,1:339:0.00295:30:-7.3191:0.0314:0,0,0,1,0,0,17,1,129,21,148,22:3.366:0.000", formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal("0/1", sample.Genotype); Assert.Equal(5, sample.GenotypeQuality); Assert.Equal(new[] { 338, 1 }, sample.AlleleDepths); Assert.Equal(339, sample.TotalDepth); Assert.Equal(new[] { 0.00295 }, sample.VariantFrequencies); Assert.Equal(3.366f, sample.ArtifactAdjustedQualityScore); Assert.Equal(0.000f, sample.LikelihoodRatioQualityScore); } [Fact] public void ExtractSample_DragenSomatic_AsExpected() { var formatIndices = new FormatIndices(); formatIndices.Set("GT:SQ:AD:AF:F1R2:F2R1:DP:SB:MB:PS"); var sample = SampleFieldExtractor.ExtractSample("0|1:3.96:33,8:0.195:13,6:20,2:41:17,16,4,4:13,20,4,4:534234", formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal("0|1", sample.Genotype); Assert.Equal(3.96, sample.SomaticQuality); Assert.Equal(new[] { 33, 8 }, sample.AlleleDepths); Assert.Equal(41, sample.TotalDepth); Assert.Equal(new[] { 8 / 41.0 }, sample.VariantFrequencies); } [Fact] public void ExtractSample_DragenCNV_AsExpected() { var formatIndices = new FormatIndices(); formatIndices.Set("GT:CN:MCN"); var sample = SampleFieldExtractor.ExtractSample("0|1:3:1", formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal("0|1", sample.Genotype); Assert.Equal(3, sample.CopyNumber); Assert.Equal(1, sample.MinorHaplotypeCopyNumber); } [Fact] public void ExtractSample_Custom_format() { var formatIndices = new FormatIndices(new HashSet(){"CUST"}); formatIndices.Set("GT:CN:MCN:CUST"); var sample = SampleFieldExtractor.ExtractSample("0|1:3:1:4.5", formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal("0|1", sample.Genotype); Assert.Equal(3, sample.CopyNumber); Assert.Equal(1, sample.MinorHaplotypeCopyNumber); Assert.NotNull(sample.CustomFields); Assert.Contains("\"CUST\":\"4.5\"", sample.CustomFields.ToString()); } [Fact] public void ExtractSample_Custom_format_empty() { var formatIndices = new FormatIndices(new HashSet(){"CUST"}); formatIndices.Set("GT:CN:MCN"); var sample = SampleFieldExtractor.ExtractSample("0|1:3:1", formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal("0|1", sample.Genotype); Assert.Equal(3, sample.CopyNumber); Assert.Equal(1, sample.MinorHaplotypeCopyNumber); Assert.NotNull(sample.CustomFields); Assert.True(sample.CustomFields.IsEmpty()); } [Theory] [InlineData("GT:CN:MCN:CNQ:MCNQ:CNF:MCNF:SD:MAF:BC:AS", "1/2:2:0:1000:1000:2.03102:0.000203:248.8:0.0001:1493:1137", 1493)] [InlineData("GT:CN:MCN:CNQ:MCNQ:CNF:MCNF:SD:MAF:BC:AS", "1/2:3:0:1000:1000:3.02612:0.000303:370.7:0.0001:8765:9070", 8765)] public void ExtractSample_DragenCNV_MCN_LOH_BC(string formatField, string sampleField, int binCount) { var formatIndices = new FormatIndices(); formatIndices.Set(formatField); var sample = SampleFieldExtractor.ExtractSample(sampleField, formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.True(sample.IsLossOfHeterozygosity); Assert.Equal(binCount, sample.BinCount); } [Fact] public void ExtractSample_ExpansionHunter() { var formatIndices = new FormatIndices(); formatIndices.Set("GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC"); var sample = SampleFieldExtractor.ExtractSample("1/1:SPANNING/SPANNING:15/15:15-15/15-15:22/22:23/23:0/0:38.270270", formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal("1/1", sample.Genotype); Assert.Equal(new[] { 15, 15 }, sample.RepeatUnitCounts); } [Fact] public void ExtractSample_EmptySampleColumn_ReturnEmptySample() { var formatIndices = new FormatIndices(); var sample = SampleFieldExtractor.ExtractSample(null, formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.True(sample.IsEmpty); } [Fact] public void ExtractSample_DotInSampleColumn_ReturnEmptySample() { var formatIndices = new FormatIndices(); var sample = SampleFieldExtractor.ExtractSample(".", formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.True(sample.IsEmpty); } [Fact] public void ToSamples_SMN1_CNV() { // GT:AD:DST:RPL:LC // 0/1:30,20:-:35.8981:45.810811 // GT:SM:CN:BC:QS:FT:DN // ./1:1.24763:3:4:5:cnvLength:. // ./.:1.17879:2:4:8:cnvLength:. // ./1:1.26335:3:4:6:cnvLength:Inherited var formatIndices = new FormatIndices(); string[] cols = { "chr1", "125068769", "DRAGEN:GAIN:125068770-125075279", "N", "", ".", "SampleFT", "SVTYPE=CNV;END=125075279;REFLEN=6510", "GT:AD:DST:RPL:LC:SM:CN:BC:QS:FT:DN", "0/1:30,20:-:35.8981:45.810811", "./1:.:.:.:.:1.24763:3:4:5:cnvLength:.", "./.:.:.:.:.:1.17879:2:4:8:cnvLength:.", "./1:.:.:.:.:1.26335:3:4:6:cnvLength:Inherited" }; ISample[] samples = cols.ToSamples(formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal(4, samples.Length); Assert.Equal("0/1", samples[0].Genotype); Assert.Equal(new[] { 30, 20 }, samples[0].AlleleDepths); Assert.Equal(new[] { "-" }, samples[0].DiseaseAffectedStatuses); Assert.Equal("./1", samples[1].Genotype); Assert.Equal(3, samples[1].CopyNumber); Assert.True(samples[1].FailedFilter); Assert.Equal("./.", samples[2].Genotype); Assert.Equal(2, samples[2].CopyNumber); Assert.True(samples[2].FailedFilter); Assert.Equal("./1", samples[3].Genotype); Assert.Equal(3, samples[3].CopyNumber); Assert.True(samples[3].FailedFilter); } [Fact] public void ToSamples_Custom() { var formatIndices = new FormatIndices(new HashSet(){"CF"}); string[] cols = { "chr1", "125068769", "DRAGEN:GAIN:125068770-125075279", "N", "", ".", "SampleFT", "SVTYPE=CNV;END=125075279;REFLEN=6510", "GT:AD:DST:RPL:LC:SM:CN:BC:QS:CF:FT:DN", "0/1:30,20:-:35.8981:45.810811:.:.:.:.:4.5", "./1:.:.:.:.:1.24763:3:4:5:1.2:cnvLength:.", "./.:.:.:.:.:1.17879:2:4:8:2.3:cnvLength:.", "./1:.:.:.:.:1.26335:3:4:6:3.4:cnvLength:Inherited" }; ISample[] samples = cols.ToSamples(formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.Equal(4, samples.Length); Assert.NotNull(samples[0].CustomFields); Assert.Contains("\"CF\":\"4.5\"", samples[0].CustomFields.ToString()); Assert.NotNull(samples[1].CustomFields); Assert.Contains("\"CF\":\"1.2\"", samples[1].CustomFields.ToString()); Assert.NotNull(samples[2].CustomFields); Assert.Contains("\"CF\":\"2.3\"", samples[2].CustomFields.ToString()); Assert.NotNull(samples[3].CustomFields); Assert.Contains("\"CF\":\"3.4\"", samples[3].CustomFields.ToString()); } [Fact] public void ToSamples_TooFewVcfColumns_ReturnNull() { var formatIndices = new FormatIndices(); string[] cols = { "chr1", "125068769", "DRAGEN:GAIN:125068770-125075279", "N", "", ".", "SampleFT", "SVTYPE=CNV;END=125075279;REFLEN=6510" }; ISample[] samples = cols.ToSamples(formatIndices, GetSimplePositionUsingAlleleNum(1), null, null); Assert.Null(samples); } [Fact] public void ExtractSample_MitoHeteroplasmy_AsExpected() { var position = 1; var provider = new MitoHeteroplasmyProvider(); provider.Add(position, "C", new[] { 0.123, 0.200, 0.301 }, new[] { 1, 2, 4 }); provider.Add(position, "G", new[] { 0.101, 0.201 }, new[] { 1, 2 }); var simplePosition = new SimplePosition(ChromosomeUtilities.ChrM, 1, "A", new[] { "C", "T"}); IVariant[] variants = { VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "A", "C", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false), VariantPool.Get(ChromosomeUtilities.ChrM, position, position, "A", "T", VariantType.SNV, null, false, false, false, null, AnnotationBehavior.SmallVariants, false) }; var formatIndices = new FormatIndices(); formatIndices.Set("GT:SQ:AD:AF:F1R2:F2R1:DP:SB:MB:PS"); var sample = SampleFieldExtractor.ExtractSample("1|2:3.96:0,15,85:0.195:13,6:20,2:100:17,16,4,4:13,20,4,4:534234", formatIndices, simplePosition,variants, provider); Assert.Equal(new[] { 15 / 100.0, 85 / 100.0 }, sample.VariantFrequencies); Assert.Equal(new[] { "14.29", "null" }, sample.HeteroplasmyPercentile); VariantPool.Return((Variant)variants[0]); VariantPool.Return((Variant)variants[1]); } } } ================================================ FILE: UnitTests/Vcf/Samples/SampleParsingExtensionsTests.cs ================================================ using Vcf.Sample; using Xunit; namespace UnitTests.Vcf.Samples { public sealed class SampleParsingExtensionsTests { [Fact] public void GetString() { var cols = new[] { "knatte", "fnatte", "tjatte" }; string observedResult = cols.GetString(2); Assert.Equal(cols[2], observedResult); } [Fact] public void GetString_NullIndex_ReturnNull() { var cols = new[] { "temp" }; string observedResult = cols.GetString(null); Assert.Null(observedResult); } [Fact] public void GetFloat() { var observedResult = "1.23".GetFloat(); Assert.Equal(1.23f, observedResult); } [Fact] public void GetFloat_NotFloat_ReturnNull() { float? observedResult = "test".GetFloat(); Assert.Null(observedResult); } [Fact] public void GetFloat_NullString_ReturnNull() { string s = null; float? observedResult = s.GetFloat(); Assert.Null(observedResult); } [Fact] public void GetDouble() { double? observedResult = "1.23".GetDouble(); Assert.Equal(1.23, observedResult); } [Fact] public void GetDouble_NotDouble_ReturnNull() { double? observedResult = "test".GetDouble(); Assert.Null(observedResult); } [Fact] public void GetDouble_NullString_ReturnNull() { string s = null; double? observedResult = s.GetDouble(); Assert.Null(observedResult); } [Fact] public void GetInteger() { int? observedResult = "17".GetInteger(); Assert.Equal(17, observedResult); } [Fact] public void GetInteger_NotInteger_ReturnNull() { int? observedResult = "test".GetInteger(); Assert.Null(observedResult); } [Fact] public void GetInteger_NullString_ReturnNull() { string s = null; int? observedResult = s.GetInteger(); Assert.Null(observedResult); } [Fact] public void GetStrings() { string[] observedResult = "17,test,13".GetStrings(); Assert.Equal(new[] { "17", "test", "13" }, observedResult); } [Fact] public void GetStrings_NullString_ReturnNull() { string s = null; string[] observedResult = s.GetStrings(); Assert.Null(observedResult); } [Fact] public void GetIntegers() { int[] observedResult = "17,13,11".GetIntegers(); Assert.Equal(new[] { 17, 13, 11 }, observedResult); } [Fact] public void GetIntegers_NotInteger_ReturnNull() { int[] observedResult = "10,13,bobby".GetIntegers(); Assert.Null(observedResult); } [Fact] public void GetIntegers_NullString_ReturnNull() { string s = null; int[] observedResult = s.GetIntegers(); Assert.Null(observedResult); } } } ================================================ FILE: UnitTests/Vcf/Samples/SampleTests.cs ================================================ using Vcf.Sample; using Xunit; namespace UnitTests.Vcf.Samples { public sealed class SampleTests { [Fact] public void Sample_ReturnEmpty() { var emptySample = new Sample(null, null, null, null, false, null, null, false, null, null, null, null, null, null, null, null, null, null, null, null); Assert.True(emptySample.IsEmpty); Assert.True(Sample.EmptySample.IsEmpty); } } } ================================================ FILE: UnitTests/Vcf/Samples/TestUtilities.cs ================================================ using Moq; using VariantAnnotation.Interface.Positions; namespace UnitTests.Vcf.Samples { public static class TestUtilities { public static ISimplePosition GetSimplePositionUsingAlleleNum(int numAlleles) { var mock = new Mock(); mock.SetupGet(x => x.AltAlleles).Returns(new string[numAlleles]); mock.SetupGet(x => x.Start).Returns(-1); return mock.Object; } } } ================================================ FILE: UnitTests/Vcf/Samples/VariantFrequencyTests.cs ================================================ using Vcf.Sample; using Xunit; namespace UnitTests.Vcf.Samples { public sealed class VariantFrequencyTests { [Fact] public void GetVariantFrequencies_VF_OneAltAllele() { double[] expectedResults = { 0.75 }; double[] observedResults = VariantFrequency.GetVariantFrequencies(0.75, null, 1); Evaluate(expectedResults, observedResults); } [Fact] public void GetVariantFrequencies_VF_MultipleAltAlleles_ReturnNull() { double[] observedResults = VariantFrequency.GetVariantFrequencies(0.75, null, 2); Assert.Null(observedResults); } [Fact] public void GetVariantFrequencies_OverrideAD_UseVF() { double[] expectedResults = { 0.75 }; double[] observedResults = VariantFrequency.GetVariantFrequencies(0.75, new[] { 10, 20 }, 1); Evaluate(expectedResults, observedResults); } [Fact] public void GetVariantFrequencies_AD_WrongAlleleCount_ReturnNull() { double[] observedResults = VariantFrequency.GetVariantFrequencies(null, new[] { 10, 20 }, 3); Assert.Null(observedResults); } [Fact] public void GetVariantFrequencies_AD() { double[] expectedResults = { 0.35, 0.4 }; double[] observedResults = VariantFrequency.GetVariantFrequencies(null, new[] { 5, 7, 8 }, 2); Evaluate(expectedResults, observedResults); } [Fact] public void GetVariantFrequencies_AD_ZeroSumAlleleCount_ReturnZeros() { double[] expectedResults = { 0.0, 0.0 }; double[] observedResults = VariantFrequency.GetVariantFrequencies(null, new[] { 0, 0, 0 }, 2); Evaluate(expectedResults, observedResults); } private static void Evaluate(double[] expectedResults, double[] observedResults) { if (expectedResults == null || observedResults == null) { Assert.Equal(expectedResults, observedResults); return; } Assert.Equal(expectedResults.Length, observedResults.Length); for (int i = 0; i < expectedResults.Length; i++) { Assert.Equal(expectedResults[i], observedResults[i], 10); } } } } ================================================ FILE: UnitTests/Vcf/StringExtensionsTests.cs ================================================ using Vcf; using Xunit; namespace UnitTests.Vcf { public sealed class StringExtensionsTests { [Theory] [InlineData("12",12)] [InlineData("12.0", null)] public void GetNullableValue_int(string input, int? exp) { var observe = input.GetNullableValue(int.TryParse); Assert.Equal(exp,observe); } [Theory] [InlineData("12", 12)] [InlineData("12.0", 12.0)] [InlineData("a.8",null)] public void GetNullableValue_double(string input, double? exp) { var observe = input.GetNullableValue(double.TryParse); Assert.Equal(exp, observe); } [Theory] [InlineData("12", new[]{12})] [InlineData("12,13", new[]{12,13})] [InlineData("12,13.0", null)] public void SplitToArray_int(string input, int[] exp) { var observe = input.SplitToArray(); Assert.Equal(exp, observe); } //[Theory] //[InlineData("12", new double[] { 12 })] //[InlineData("12,13", new double[] { 12, 13 })] //[InlineData("12,13.0", new[] { 12, 13.0})] //[InlineData("12.a,13.0", null)] //public void SplitToArray_double(string input, double[] exp) //{ // var observe = input.SplitToArray(',', double.TryParse); // Assert.Equal(exp, observe); //} } } ================================================ FILE: UnitTests/Vcf/VariantCreator/CnvCreatorTests.cs ================================================ using UnitTests.TestUtilities; using Variants; using Vcf.Info; using Vcf.VariantCreator; using Xunit; namespace UnitTests.Vcf.VariantCreator { public sealed class CnvCreatorTests { [Fact] public void Create_Dragen_3_3_DEL() { // chr1 907965 DRAGEN:LOSS:907966-909406 N . SampleFT SVTYPE=CNV;END=909406;REFLEN=1441 GT:SM:CN:BC:QS:FT:DN 0/1:0.516574:1:1:24:cnvLength:. 0/1:0.409726:1:1:26:cnvLength:. 0/1:0.496663:1:1:23:cnvLength:Inherited var builder = new InfoDataBuilder {SvType = "CNV", End = 909406}; InfoData infoData = builder.Create(); IVariant observedResults = CnvCreator.Create(ChromosomeUtilities.Chr1, 907965, infoData.End.Value, "N", "", null); Assert.Equal(ChromosomeUtilities.Chr1, observedResults.Chromosome); Assert.Equal(907966, observedResults.Start); Assert.Equal(909406, observedResults.End); Assert.Equal("N", observedResults.RefAllele); Assert.Equal("", observedResults.AltAllele); Assert.Equal(VariantType.copy_number_loss, observedResults.Type); } [Fact] public void Create_Dragen_3_3_DUP() { // chr1 1715898 DRAGEN:GAIN:1715899-1750149 N . PASS SVTYPE=CNV;END=1750149;REFLEN=34251 GT:SM:CN:BC:QS:FT:DN ./.:1.07189:2:6:33:PASS:. ./1:1.53631:3:6:49:PASS:. ./.:1.012:2:6:38:PASS:Inherited var builder = new InfoDataBuilder {SvType = "CNV", End = 1750149}; InfoData infoData = builder.Create(); IVariant observedResults = CnvCreator.Create(ChromosomeUtilities.Chr1, 1715898, infoData.End.Value, "N", "", null); Assert.Equal(ChromosomeUtilities.Chr1, observedResults.Chromosome); Assert.Equal(1715899, observedResults.Start); Assert.Equal(1750149, observedResults.End); Assert.Equal("N", observedResults.RefAllele); Assert.Equal("", observedResults.AltAllele); Assert.Equal(VariantType.copy_number_gain, observedResults.Type); } [Fact] public void Create_Canvas_TotalCopyNumber() { // 1 723707 Canvas:GAIN:1:723708:2581225 N 41 PASS SVTYPE=CNV;END=2581225 RC:BC:CN:MCC . 129:3123:3:2 var builder = new InfoDataBuilder {SvType = "CNV", End = 2581225}; InfoData infoData = builder.Create(); IVariant observedResults = CnvCreator.Create(ChromosomeUtilities.Chr1, 723707, infoData.End.Value, "N", "", null); Assert.Equal(ChromosomeUtilities.Chr1, observedResults.Chromosome); Assert.Equal(723708, observedResults.Start); Assert.Equal(2581225, observedResults.End); Assert.Equal("N", observedResults.RefAllele); Assert.Equal("", observedResults.AltAllele); Assert.Equal(VariantType.copy_number_variation, observedResults.Type); } [Fact] public void Create_Canvas_AlleleSpecificCopyNumber() { //chr1 854895 Canvas:COMPLEXCNV:chr1:854896-861879 N , . PASS SVTYPE=CNV;END=861879;CNVLEN=6984;CIPOS=-291,291;CIEND=-291,291 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/1:59.45:12:1:1:.:25.34:PASS:. 0/1:59.45:12:1:1:.:25.34:PASS:. 1/2:165.40:12:3:3:16.80:16.71:PASS:. var builder = new InfoDataBuilder {SvType = "CNV", End = 861879, CiPos = new[] {-291, 291}, CiEnd = new[] {-291, 291}}; InfoData infoData = builder.Create(); IVariant observedResults = CnvCreator.Create(ChromosomeUtilities.Chr1, 854895, infoData.End.Value, "N", "", null); Assert.Equal(ChromosomeUtilities.Chr1, observedResults.Chromosome); Assert.Equal(854896, observedResults.Start); Assert.Equal(861879, observedResults.End); Assert.Equal("N", observedResults.RefAllele); Assert.Equal("", observedResults.AltAllele); Assert.Equal(VariantType.copy_number_variation, observedResults.Type); } } } ================================================ FILE: UnitTests/Vcf/VariantCreator/LegacyVariantIdTests.cs ================================================ using System; using UnitTests.TestUtilities; using VariantAnnotation.Interface; using Variants; using Vcf.VariantCreator; using Xunit; namespace UnitTests.Vcf.VariantCreator { public sealed class LegacyVariantIdTests { private readonly LegacyVariantId _vidCreator = new(ChromosomeUtilities.RefNameToChromosome); [Theory] [InlineData(66507, 66507, "T", "A", "1:66507:A")] [InlineData(66522, 66521, "", "ATATA", "1:66522:66521:ATATA")] [InlineData(66573, 66574, "TA", "", "1:66573:66574")] [InlineData(66573, 66572, "", "TACTATATATTA", "1:66573:66572:TACTATATATTA")] [InlineData(100, 104, "TAGGT", "ACTTA", "1:100:104:ACTTA")] [InlineData(100, 104, "TAGGT", "", "1:100:104")] [InlineData(101, 100, "", "CGA", "1:101:100:CGA")] [InlineData(100, 100, "T", "A", "1:100:A")] [InlineData(100, 104, "TAGGT", "CGA", "1:100:104:CGA")] [InlineData(100, 99, "", "ACTGACGTACGAAGTTGCCGTACGTACTTGTCC", "1:100:99:3bd631d37e62d5db0f6d5d6db3cdcb60")] [InlineData(66366, 66378, "ATATAATATATAA", "TATATATATTATTATATAATATAATATATATTATATAATATATTTTATTATATAATATAATATATATTATATAATATAATATATTTTATTATATAAATATATATTATATTATATAATATAATATATATTAATATAAATATATATTAT", "1:66366:66378:17b72647da13e3c186348467b29b0492")] [InlineData(100, 300, "", "", "1:100:*")] public void Create_SmallVariants_ReturnVid(int start, int end, string refAllele, string altAllele, string expectedVid) { string observedVid = _vidCreator.Create(null, VariantCategory.SmallVariant, null, ChromosomeUtilities.Chr1, start, end, refAllele, altAllele, null); Assert.Equal(expectedVid, observedVid); } [Theory] [InlineData(66507, 66507, "T", ".", "1:66507:66507:T")] [InlineData(100, 100, "T", "T", "1:100:100:T")] [InlineData(100, 100, "T", ".", "1:100:100:T")] public void Create_Reference_ReturnVid(int start, int end, string refAllele, string altAllele, string expectedVid) { string observedVid = _vidCreator.Create(null, VariantCategory.Reference, null, ChromosomeUtilities.Chr1, start, end, refAllele, altAllele, null); Assert.Equal(expectedVid, observedVid); } [Theory] [InlineData(2617277, "A", "AAAAAAAAAAAAAAAAAATTAGTCAGGCAC[chr3:153444911[", "2:2617277:+:3:153444911:+")] [InlineData(32973490, "T", "T]chr9:74198768]", "2:32973490:+:9:74198768:-")] [InlineData(321681, "G", "G[13:123460[", "2:321681:+:13:123460:+")] [InlineData(32527769, "C", "[HLA-DRB1*13:02:01:3117[C", "2:32527769:-:HLA-DRB1*13:02:01:3117:+")] public void Create_TranslocationBreakend_ReturnVid(int position, string refAllele, string altAllele, string expectedVid) { string observedVid = _vidCreator.Create(null, VariantCategory.SV, "BND", ChromosomeUtilities.Chr2, position, position, refAllele, altAllele, null); Assert.Equal(expectedVid, observedVid); } [Theory] [InlineData(1000, 3001000, "", null, "ROH", VariantCategory.ROH, "1:1001:3001000:ROH")] [InlineData(1350082, 1351320, "", null, "DEL", VariantCategory.SV, "1:1350083:1351320")] [InlineData(999, 2015, "", null, "DUP", VariantCategory.SV, "1:1000:2015:DUP")] [InlineData(1477854, 1477984, "", null, "DUP", VariantCategory.SV, "1:1477855:1477984:TDUP")] [InlineData(1477968, 1477968, "", null, "INS", VariantCategory.SV, "1:1477969:1477968:INS")] [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CNV")] [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CN3")] [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CDUP")] [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CDEL")] [InlineData(2000, 5000, "", null, "ALU", VariantCategory.SV, "1:2001:5000:MEI")] [InlineData(2000, 5000, "", null, "LINE1", VariantCategory.SV, "1:2001:5000:MEI")] [InlineData(2000, 5000, "", null, "SVA", VariantCategory.SV, "1:2001:5000:MEI")] [InlineData(2000, 5000, "", null, "BOB", VariantCategory.SV, "1:2001:5000")] [InlineData(1715898, 1750149, "", null, "CNV", VariantCategory.CNV, "1:1715899:1750149:CDUP")] [InlineData(2650426, 2653074, "", null, "CNV", VariantCategory.CNV, "1:2650427:2653074:CDEL")] [InlineData(321682, 421681, "", null, "INV", VariantCategory.SV, "1:321683:421681:Inverse")] [InlineData(199, 202, "", "TTG", "", VariantCategory.RepeatExpansion, "1:200:202:TTG:5")] public void Create_StructuralVariants_ReturnVid(int start, int end, string altAllele, string repeatUnit, string svType, VariantCategory category, string expectedVid) { string observedVid = _vidCreator.Create(null, category, svType, ChromosomeUtilities.Chr1, start, end, "", altAllele, repeatUnit); Assert.Equal(expectedVid, observedVid); } [Fact] public void Create_LOH_ReturnsCnvVid() { const string altAllele = ""; const string svType = "LOH"; VariantCategory variantCategory = VariantFactory.GetVariantCategory(altAllele, svType); string observedVid = _vidCreator.Create(null, variantCategory, svType, ChromosomeUtilities.Chr1, 787923, 887923, "N", altAllele, null); Assert.Equal("1:787924:887923:CNV", observedVid); } [Fact] public void GetSmallVariantVid_UnknownVariantType_ThrowsException() { Assert.Throws(delegate { // ReSharper disable once UnusedVariable string vid = LegacyVariantId.GetSmallVariantVid(ChromosomeUtilities.Chr1, 100, 200, "A", VariantType.complex_structural_alteration); }); } } } ================================================ FILE: UnitTests/Vcf/VariantCreator/ReferenceVariantCreatorTests.cs ================================================ using CacheUtils.TranscriptCache; using Genome; using UnitTests.TestUtilities; using Variants; using Vcf.VariantCreator; using Xunit; namespace UnitTests.Vcf.VariantCreator { public sealed class ReferenceVariantCreatorTests { private static readonly ISequence Sequence = new NSequence(); private readonly VariantId _vidCreator = new VariantId(); [Fact] public void Create_SinglePosition_NoGlobalMajorAllele_ReturnNull() { IVariant[] variants = ReferenceVariantCreator.Create(_vidCreator, Sequence, ChromosomeUtilities.Chr1, 100, 100, "A", ".", null); Assert.Null(variants); } [Fact] public void Create_SinglePosition_HasGlobalMajorAllele_ReturnVariant() { var variant = GetVariant(100, 100, "A", ".", "T"); Assert.True(variant.IsRefMinor); } [Fact] public void Create_MultiplePositions_NoGlobalMajorAllele_ReturnNull() { IVariant[] variants = ReferenceVariantCreator.Create(_vidCreator, Sequence, ChromosomeUtilities.Chr1, 100, 101, "A", ".", null); Assert.Null(variants); } [Fact] public void Create_MultiplePositions_HasGlobalMajorAllele_ReturnNull() { IVariant[] variants = ReferenceVariantCreator.Create(_vidCreator, Sequence, ChromosomeUtilities.Chr1, 100, 101, "A", ".", "T"); Assert.Null(variants); } private IVariant GetVariant(int start, int end, string refAllele, string altAllele, string globalMajorAllele) { IVariant[] variants = ReferenceVariantCreator.Create(_vidCreator, Sequence, ChromosomeUtilities.Chr1, start, end, refAllele, altAllele, globalMajorAllele); Assert.Single(variants); return variants[0]; } } } ================================================ FILE: UnitTests/Vcf/VariantCreator/SmallVariantCreatorTests.cs ================================================ using UnitTests.TestUtilities; using Variants; using Vcf.VariantCreator; using Xunit; namespace UnitTests.Vcf.VariantCreator { public sealed class SmallVariantCreatorTests { [Fact] public void Create_Insertion_ReturnVariant() { var variant = SmallVariantCreator.Create(ChromosomeUtilities.Chr1, 101, 100, "", "CG", false, false, null, null, false); Assert.False(variant.IsRefMinor); Assert.Equal(AnnotationBehavior.SmallVariants, variant.Behavior); Assert.Equal("1", variant.Chromosome.EnsemblName); Assert.Equal(101, variant.Start); Assert.Equal(100, variant.End); Assert.Equal("", variant.RefAllele); Assert.Equal("CG", variant.AltAllele); Assert.Equal(VariantType.insertion, variant.Type); Assert.False(variant.IsDecomposed); Assert.False(variant.IsRecomposed); Assert.Null(variant.LinkedVids); } } } ================================================ FILE: UnitTests/Vcf/VariantCreator/VariantFactoryTests.cs ================================================ using System.Collections.Generic; using System.IO; using CacheUtils.TranscriptCache; using ErrorHandling.Exceptions; using Genome; using Moq; using OptimizedCore; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using Variants; using Vcf; using Vcf.Info; using Vcf.VariantCreator; using Xunit; namespace UnitTests.Vcf.VariantCreator { public sealed class VariantFactoryTests { private static readonly ISequence Sequence = new NSequence(); private readonly ISequence _chr12Seq = new SimpleSequence( "TCCCCATGCTGCTCTTTTTTGCAAACACCAACACAATTTGGGCTCCATTTATAAGGCATCTGCTGCACCAACCCTCTTTCTTGGTGCTTACTGGACCTGCTCAGGGTTAATTTCTAACTCAAAGAACCTAACTTGGAGTAACTCCGTACCACCAGCAAAGCGACTGGCTTTGGGGAATGACATTTACAATGTATCCACTGTTATTTGGTCACCCAGCAAACTGTCATTTTTCAGAAACCAGGGCTGTCTCACAAACTGGCTTTCAATAAGGTGGGTTGCTTAGCAACTGCCAAGGAATTAAGAAGACAGAATAAGGTATCCGCCAGAGATATTTTATGACCAAAATGAGCTGCACTCATGTGTCTGGTTGTGTTCAAGGTAACCAAGTAAGAGATAACACCCGACTATTTTTGCATCATGAGGAAAAATACTTGGCTTCTGCCCAGAAGGGCAATTATCTCAAAGTCTTGGCAGGCCCCATGGTATGAGAAATGGTAACTGATATGGGGGTTAAAAAAAA", 106499648); private readonly VariantId _vidCreator = new(); private readonly LegacyVariantId _legacyVidCreator = new(null); private readonly Mock _sequenceMock = new(); private readonly VariantFactory _variantFactory; private readonly ISequenceProvider _sequenceProvider; public VariantFactoryTests() { // GRCh38 _sequenceMock.Setup(x => x.Substring(1037629, 1)).Returns("G"); _sequenceMock.Setup(x => x.Substring(787922, 1)).Returns("A"); _sequenceMock.Setup(x => x.Substring(110541588, 1)).Returns("T"); _sequenceMock.Setup(x => x.Substring(100955983, 1)).Returns("C"); _sequenceMock.Setup(x => x.Substring(11071438, 1)).Returns("G"); _sequenceMock.Setup(x => x.Substring(934063, 1)).Returns("A"); _sequenceMock.Setup(x => x.Substring(36690135, 1)).Returns("C"); _sequenceMock.Setup(x => x.Substring(20093, 1)).Returns("T"); _sequenceMock.Setup(x => x.Substring(15902, 1)).Returns("G"); // GRCh37 (for multi-allelic deletion with left alignment) _sequenceMock.Setup(x => x.Substring(106500157, 1)).Returns("G"); _sequenceMock.Setup(x => x.Substring(106500158, 1)).Returns("T"); _sequenceMock.Setup(x => x.Substring(106500159, 1)).Returns("T"); _sequenceMock.Setup(x => x.Substring(106500159, 2)).Returns("TA"); _sequenceMock.Setup(x => x.Substring(106500159-50, 50)).Returns( "AAAGTCTTGGCAGGCCCCATGGTATGAGAAATGGTAACTGATATGGGGGT"); _sequenceMock.Setup(x => x.Substring(23102861, 63)).Returns( "GGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGC"); _sequenceMock.Setup(x => x.Substring(23102861 -50, 50)).Returns( "GCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGC"); _sequenceMock.Setup(x => x.Substring(23102861 -50, 63)).Returns( "GCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGG"); _sequenceMock.Setup(x => x.Substring(23102861 -100, 50)).Returns( "CAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGG"); _sequenceMock.Setup(x => x.Substring(106499659, 500)).Returns( "CTCTTTTTTGCAAACACCAACACAATTTGGGCTCCATTTATAAGGCATCTGCTGCACCAACCCTCTTTCTTGGTGCTTACTGGACCTGCTCAGGGTTAATTTCTAACTCAAAGAACCTAACTTGGAGTAACTCCGTACCACCAGCAAAGCGACTGGCTTTGGGGAATGACATTTACAATGTATCCACTGTTATTTGGTCACCCAGCAAACTGTCATTTTTCAGAAACCAGGGCTGTCTCACAAACTGGCTTTCAATAAGGTGGGTTGCTTAGCAACTGCCAAGGAATTAAGAAGACAGAATAAGGTATCCGCCAGAGATATTTTATGACCAAAATGAGCTGCACTCATGTGTCTGGTTGTGTTCAAGGTAACCAAGTAAGAGATAACACCCGACTATTTTTGCATCATGAGGAAAAATACTTGGCTTCTGCCCAGAAGGGCAATTATCTCAAAGTCTTGGCAGGCCCCATGGTATGAGAAATGGTAACTGATATGGGGGT"); _sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, _sequenceMock.Object, ChromosomeUtilities.RefNameToChromosome); _variantFactory = new VariantFactory(_sequenceMock.Object, _vidCreator, new HashSet(){"CF"}); } private IPosition ParseVcfLine(string vcfLine) { string[] vcfFields = vcfLine.OptimizedSplit('\t'); Chromosome chromosome = ReferenceNameUtilities.GetChromosome(ChromosomeUtilities.RefNameToChromosome, vcfFields[VcfCommon.ChromIndex]); (int start, bool foundError) = vcfFields[VcfCommon.PosIndex].OptimizedParseInt32(); if (foundError) throw new InvalidDataException($"Unable to convert the VCF position to an integer: {vcfFields[VcfCommon.PosIndex]}"); var simplePosition = SimplePosition.GetSimplePosition(chromosome, start, vcfFields, new NullVcfFilter()); return Position.ToPosition(simplePosition, null, _sequenceProvider, null, _variantFactory); } // chr1 69391 . A . . SVTYPE=DEL;END=138730 . . [Fact] public void CreateVariants_svDel() { var builder = new InfoDataBuilder {SvType = "DEL", End = 138730}; InfoData infoData = builder.Create(); var variantFactory = new VariantFactory(Sequence, _vidCreator); IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr1, 69391, 138730, "A", new[] {""}, infoData, new[] {false}, false, null, null); Assert.NotNull(variants); } // 1 723707 Canvas:GAIN:1:723708:2581225 N 41 PASS SVTYPE=CNV;END=2581225 RC:BC:CN:MCC . 129:3123:3:2 [Fact] public void CreateVariants_canvas_cnv() { var builder = new InfoDataBuilder {SvType = "CNV", End = 2581225}; InfoData infoData = builder.Create(); var variantFactory = new VariantFactory(Sequence, _vidCreator); IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr1, 723707, 2581225, "N", new[] {""}, infoData, new[] {false}, false, null, null); Assert.NotNull(variants); Assert.Equal("1-723707-2581225-N--CNV", variants[0].VariantId); Assert.Equal(VariantType.copy_number_variation, variants[0].Type); } // chr1 854895 Canvas:COMPLEXCNV:chr1:854896-861879 N , . PASS SVTYPE=CNV;END=861879;CNVLEN=6984;CIPOS=-291,291;CIEND=-291,291 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/1:59.45:12:1:1:.:25.34:PASS:. 0/1:59.45:12:1:1:.:25.34:PASS:. 1/2:165.40:12:3:3:16.80:16.71:PASS:. [Fact] public void CreateVariants_canvas_cnx() { var builder = new InfoDataBuilder {SvType = "CNV", End = 861879, CiPos = new[] {-291, 291}, CiEnd = new[] {-291, 291}}; InfoData infoData = builder.Create(); var variantFactory = new VariantFactory(Sequence, _vidCreator); IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr1, 854895, 861879, "N", new[] {"", ""}, infoData, new[] {false, false}, false, null, null); Assert.NotNull(variants); Assert.Equal(2, variants.Length); Assert.Equal("1-854895-861879-N--CNV", variants[0].VariantId); Assert.Equal(VariantType.copy_number_variation, variants[0].Type); Assert.Equal("1-854895-861879-N--CNV", variants[1].VariantId); Assert.Equal(VariantType.copy_number_variation, variants[1].Type); } // chr1 1463185 Canvas:COMPLEXCNV:chr1:1463186-1476229 N , . PASS SVTYPE=CNV;END=1476229;CNVLEN=13044;CIPOS=-415,415;CIEND=-291,291 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/0:109.56:15:2:.:.:20.04:PASS:. 1/1:0.00:15:0:.:.:64.59:PASS:. ./2:167.45:15:3:.:.:17.87:PASS:. [Fact] public void CreateVariants_canvas_cnv_dup() { var builder = new InfoDataBuilder {SvType = "CNV", End = 1476229, CiPos = new[] {-415, 415}, CiEnd = new[] {-291, 291}}; InfoData infoData = builder.Create(); var variantFactory = new VariantFactory(Sequence, _vidCreator); IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr1, 1463185, 1476229, "N", new[] {"", ""}, infoData, new[] {false, false}, false, null, null); Assert.NotNull(variants); Assert.Equal(2, variants.Length); Assert.Equal("1-1463185-1476229-N--CNV", variants[0].VariantId); Assert.Equal(VariantType.copy_number_variation, variants[0].Type); Assert.Equal("1-1463185-1476229-N--CNV", variants[1].VariantId); Assert.Equal(VariantType.copy_number_gain, variants[1].Type); // s are copy number gains } // chr1 1463185 . N . PASS SVTYPE=DUP;END=1476229;SVLEN=13044;CIPOS=-415,415;CIEND=-291,291 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/0:109.56:15:2:.:.:20.04:PASS:. 1/1:0.00:15:0:.:.:64.59:PASS:. ./1:167.45:15:3:.:.:17.87:PASS:. [Fact] public void CreateVariants_dup() { var builder = new InfoDataBuilder {SvType = "DUP", End = 1476229, CiPos = new[] {-415, 415}, CiEnd = new[] {-291, 291}}; InfoData infoData = builder.Create(); var variantFactory = new VariantFactory(Sequence, _vidCreator); IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr1, 1463185, 1476229, "N", new[] {""}, infoData, new[] {false}, false, null, null); Assert.NotNull(variants); Assert.Single(variants); Assert.Equal("1-1463185-1476229-N--DUP", variants[0].VariantId); Assert.Equal(VariantType.duplication, variants[0].Type); } // 1 37820921 MantaDUP:TANDEM:5515:0:1:0:0:0 G . MGE10kb END=38404543;SVTYPE=DUP;SVLEN=583622;CIPOS=0,1;CIEND=0,1;HOMLEN=1;HOMSEQ=A;SOMATIC;SOMATICSCORE=63;ColocalizedCanvas PR:SR 39,0:44,0 202,26:192,32 [Fact] public void CreateVariants_tandem_duplication() { var builder = new InfoDataBuilder {SvType = "DUP", End = 38404543, SvLength = 583622, CiPos = new[] {0, 1}, CiEnd = new[] {0, 1}}; InfoData infoData = builder.Create(); var variantFactory = new VariantFactory(Sequence, _vidCreator); IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr1, 723707, 2581225, "N", new[] {""}, infoData, new[] {false}, false, null, null); Assert.NotNull(variants); Assert.Equal(VariantType.tandem_duplication, variants[0].Type); } // 1 4000000 . N . ROHLC SVTYPE=ROH;END=4001000 GT . . 1 [Fact] public void CreateVariants_ROH() { var builder = new InfoDataBuilder {SvType = "ROH", End = 4001000}; InfoData infoData = builder.Create(); var variantFactory = new VariantFactory(Sequence, _vidCreator); IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr1, 400_0000, 400_1000, "N", new[] {""}, infoData, new[] {false}, false, null, null); Assert.Equal(AnnotationBehavior.RunsOfHomozygosity, variants[0].Behavior); Assert.Equal(VariantType.run_of_homozygosity, variants[0].Type); } // chr12 106500158 . GTTA GTA,GT . . . [Fact] public void CreateVariants_LegacyVid_DisableLeftAlignment_MultiAllelic_Deletions() { InfoData infoData = new InfoDataBuilder().Create(); var variantFactory = new VariantFactory(_chr12Seq, _legacyVidCreator); IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr12, 106500158, 106500161, "GTTA", new[] {"GTA", "GT"}, infoData, new[] {false, false}, false, null, null); Assert.Equal(2, variants.Length); Assert.Equal("12:106500160:106500160", variants[0].VariantId); Assert.Equal("12:106500160:106500161", variants[1].VariantId); } // chr12 106500158 . GTTA GTA,GT . . . [Fact] public void CreateVariants_NormalVid_EnableLeftAlignment_MultiAllelic_Deletions() { InfoData infoData = new InfoDataBuilder().Create(); var variantFactory = new VariantFactory(_chr12Seq, _vidCreator); IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr12, 106500158, 106500161, "GTTA", new[] {"GTA", "GT"}, infoData, new[] {false, false}, false, null, null); Assert.Equal(2, variants.Length); Assert.Equal("12-106500158-GT-G", variants[0].VariantId); Assert.Equal("12-106500159-TTA-T", variants[1].VariantId); } [Fact] public void ToPosition_SNV() { IPosition position = ParseVcfLine("chr1 15274 SNV A T . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-15274-A-T", variant.VariantId); Assert.Equal(VariantType.SNV, variant.Type); Assert.Equal(15274, variant.Start); Assert.Equal(15274, variant.End); } [Fact] public void ToPosition_insertion() { IPosition position = ParseVcfLine("chr1 15903 INS G GC . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-15903-G-GC", variant.VariantId); Assert.Equal(VariantType.insertion, variant.Type); Assert.Equal(15904, variant.Start); Assert.Equal(15903, variant.End); } [Fact] public void ToPosition_deletion() { IPosition position = ParseVcfLine("chr1 20094 DEL TAA T . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-20094-TAA-T", variant.VariantId); Assert.Equal(VariantType.deletion, variant.Type); Assert.Equal(20095, variant.Start); Assert.Equal(20096, variant.End); } [Fact] public void ToPosition_CANVAS_LOH() { IPosition position = ParseVcfLine("chr1 787923 CNV_CANVAS_LOH N 40 . SVTYPE=LOH;END=887923 RC:BC:CN:MCC 106.52:12642:2:2"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-787923-887923-A--LOH", variant.VariantId); Assert.Equal(VariantType.copy_number_variation, variant.Type); Assert.Equal(787924, variant.Start); Assert.Equal(887923, variant.End); } [Fact] public void ToPosition_Manta_SmallDeletion() { IPosition position = ParseVcfLine( "chr1 934064 SV_SNV AGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCG A . . END=934904;SVTYPE=DEL . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal( "1-934064-AGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCG-A", variant.VariantId); Assert.Equal(VariantType.deletion, variant.Type); Assert.Equal(934065, variant.Start); Assert.Equal(934904, variant.End); } [Fact] public void ToPosition_CANVAS_CNnum() { IPosition position = ParseVcfLine( "chr1 1037630 CNV_CN# N . . SVTYPE=CNV;END=1045024 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/1:60.76:8:1:.:.:22.51:PASS:."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-1037630-1045024-G--CNV", variant.VariantId); Assert.Equal(VariantType.copy_number_variation, variant.Type); Assert.Equal(1037631, variant.Start); Assert.Equal(1045024, variant.End); } [Fact] public void ToPosition_SV_DUP() { IPosition position = ParseVcfLine("chr1 1477854 SV_DUP C . . END=1477984;SVTYPE=DUP . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-1477854-1477984-C--DUP", variant.VariantId); Assert.Equal(VariantType.tandem_duplication, variant.Type); Assert.Equal(1477855, variant.Start); Assert.Equal(1477984, variant.End); } [Fact] public void ToPosition_SV_INS() { IPosition position = ParseVcfLine("chr1 1565683 SV_INS G . . END=1565684;SVTYPE=INS . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-1565683-1565684-G--INS", variant.VariantId); Assert.Equal(VariantType.insertion, variant.Type); Assert.Equal(1565684, variant.Start); Assert.Equal(1565684, variant.End); } [Fact] public void ToPosition_SV_INV() { IPosition position = ParseVcfLine("chr1 6558910 SV_INV G . . END=6559723;SVTYPE=INV . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-6558910-6559723-G--INV", variant.VariantId); Assert.Equal(VariantType.inversion, variant.Type); Assert.Equal(6558911, variant.Start); Assert.Equal(6559723, variant.End); } [Fact] public void ToPosition_SV_Translocation() { IPosition position = ParseVcfLine("chr1 9061384 SV_BND C C]chr14:93246833] . . SVTYPE=BND . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-9061384-C-C]chr14:93246833]", variant.VariantId); Assert.Equal(VariantType.translocation_breakend, variant.Type); Assert.Equal(9061384, variant.Start); Assert.Equal(9061384, variant.End); } [Fact] public void ToPosition_DRAGEN_LOH() { IPosition position = ParseVcfLine( "chr1 11071439 CNV_DRAGEN_LOH N , . . SVTYPE=CNV;END=12859473;REFLEN=1788034 GT:CN:MCN:CNQ:MCNQ:CNF:MCNF:SD:MAF:BC:AS 1/2:2:0:1000:1000:2.03102:0.000203:248.8:0.0001:1493:1137"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1-11071439-12859473-G--CNV", variant.VariantId); Assert.Equal(VariantType.copy_number_loss, variant.Type); Assert.Equal(11071440, variant.Start); Assert.Equal(12859473, variant.End); variant = variants[1]; Assert.Equal("1-11071439-12859473-G--CNV", variant.VariantId); Assert.Equal(VariantType.copy_number_gain, variant.Type); Assert.Equal(11071440, variant.Start); Assert.Equal(12859473, variant.End); } [Fact] public void ToPosition_STR() { IPosition position = ParseVcfLine( "chr3 63912684 STR G . PASS END=63912714;REF=10;RL=30;RU=GCA;VARID=ATXN7;REPID=ATXN7 GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC 0/1:SPANNING/SPANNING:10/12:10-10/12-12:9/3:8/11:0/0:26.270270"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("3-63912684-63912714-G--STR", variant.VariantId); Assert.Equal(VariantType.short_tandem_repeat_variation, variant.Type); Assert.Equal(63912685, variant.Start); Assert.Equal(63912714, variant.End); } [Fact] public void STR_without_num_throws_user_error() { var vcfLine = "chr3 63912684 STR G . PASS END=63912714;REF=10;RL=30;RU=GCA;VARID=ATXN7;REPID=ATXN7 GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC 0/1:SPANNING/SPANNING:10/12:10-10/12-12:9/3:8/11:0/0:26.270270"; Assert.Throws(()=>ParseVcfLine(vcfLine)); } [Fact] public void ToPosition_indel() { IPosition position = ParseVcfLine("chr4 46758265 INDEL GAGGTATAGAG GTT . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("4-46758266-AGGTATAGAG-TT", variant.VariantId); Assert.Equal(VariantType.indel, variant.Type); Assert.Equal(46758266, variant.Start); Assert.Equal(46758275, variant.End); } [Fact] public void ToPosition_MNV() { IPosition position = ParseVcfLine("chr4 67754304 MNV TGA TTT . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("4-67754305-GA-TT", variant.VariantId); Assert.Equal(VariantType.MNV, variant.Type); Assert.Equal(67754305, variant.Start); Assert.Equal(67754306, variant.End); } [Fact] public void ToPosition_CNV_DUP() { IPosition position = ParseVcfLine( "chr7 100955984 CNV_DUP N 37 PASS SVTYPE=CNV;END=100969873;REFLEN=13889 GT:SM:CN:BC:PE ./1:1.6625:3:12:48,81"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("7-100955984-100969873-C--CNV", variant.VariantId); Assert.Equal(VariantType.copy_number_gain, variant.Type); Assert.Equal(100955985, variant.Start); Assert.Equal(100969873, variant.End); } [Fact] public void ToPosition_CNV_DEL() { IPosition position = ParseVcfLine( "chr7 110541589 CNV_DEL N 27 cnvLength SVTYPE=CNV;END=110548681;REFLEN=7092 GT:SM:CN:BC:PE 0/1:0.443182:1:7:19,17"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("7-110541589-110548681-T--CNV", variant.VariantId); Assert.Equal(VariantType.copy_number_loss, variant.Type); Assert.Equal(110541590, variant.Start); Assert.Equal(110548681, variant.End); } [Fact] public void ToPosition_CustomSampleInfoFields() { IPosition position = ParseVcfLine( "chr7 110541589 CNV_DEL N 27 cnvLength SVTYPE=CNV;END=110548681;REFLEN=7092 GT:SM:CN:BC:PE:CF 0/1:0.443182:1:7:19,17:0.1,1.2"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("7-110541589-110548681-T--CNV", variant.VariantId); Assert.Equal(VariantType.copy_number_loss, variant.Type); Assert.Equal(110541590, variant.Start); Assert.Equal(110548681, variant.End); Assert.NotNull(position.Samples); var sample = position.Samples[0]; Assert.Contains("{\"CF\":\"0.1,1.2\"}", sample.CustomFields.ToString()!); } [Fact] public void ToPosition_ROH() { IPosition position = ParseVcfLine("chr22 36690136 ROH N . . END=36788158;SVTYPE=ROH . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("22-36690136-36788158-C--ROH", variant.VariantId); Assert.Equal(VariantType.run_of_homozygosity, variant.Type); Assert.Equal(36690137, variant.Start); Assert.Equal(36788158, variant.End); } // this is actually on GRCh37 [Fact] public void ToPosition_MultiAllelic_Deletions() { IPosition position = ParseVcfLine("chr12 106500158 . GTTA GTA,GT . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("12-106500158-GT-G", variant.VariantId); Assert.Equal(VariantType.deletion, variant.Type); Assert.Equal(106500159, variant.Start); Assert.Equal(106500159, variant.End); variant = variants[1]; Assert.Equal("12-106500159-TTA-T", variant.VariantId); Assert.Equal(VariantType.deletion, variant.Type); Assert.Equal(106500160, variant.Start); Assert.Equal(106500161, variant.End); } [Fact] public void ToPosition_Giant_dbsnp155_variant() { IPosition position = ParseVcfLine("15\t23102333\trs1894384199\tATGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGC\tATGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGCGGGAGCAGGAGGGGCAGGTGC\t.\t.\tR5;VC=INDEL;GNO;INT;FREQ=GnomAD:.,1,1.068e-05;RS=1894384199;SSR=0;GENEINFO=LOC283683:283683|LOC729900:729900"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; //this variant shifts more than what is shown here. Due to mocking limitations, we limit it to 2 iterations of // left rotation of 50bp each Assert.Equal(VariantType.deletion, variant.Type); Assert.Equal(23102762, variant.Start); Assert.Equal(23102824, variant.End); } } } ================================================ FILE: UnitTests/Vcf/VariantCreator/VariantFactoryTestsWithLegacyVids.cs ================================================ using System.IO; using Genome; using Moq; using OptimizedCore; using UnitTests.TestDataStructures; using UnitTests.TestUtilities; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using Variants; using Vcf; using Vcf.VariantCreator; using Xunit; namespace UnitTests.Vcf.VariantCreator { public sealed class VariantFactoryTestsWithLegacyVids { private readonly Mock _sequenceMock = new(); private readonly ISequenceProvider _sequenceProvider; private readonly VariantFactory _variantFactory; public VariantFactoryTestsWithLegacyVids() { _sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, _sequenceMock.Object, ChromosomeUtilities.RefNameToChromosome); var vidCreator = new LegacyVariantId(ChromosomeUtilities.RefNameToChromosome); _variantFactory = new VariantFactory(_sequenceMock.Object, vidCreator); } private IPosition ParseVcfLine(string vcfLine) { string[] vcfFields = vcfLine.OptimizedSplit('\t'); Chromosome chromosome = ReferenceNameUtilities.GetChromosome(ChromosomeUtilities.RefNameToChromosome, vcfFields[VcfCommon.ChromIndex]); (int start, bool foundError) = vcfFields[VcfCommon.PosIndex].OptimizedParseInt32(); if (foundError) throw new InvalidDataException($"Unable to convert the VCF position to an integer: {vcfFields[VcfCommon.PosIndex]}"); var simplePosition = SimplePosition.GetSimplePosition(chromosome, start, vcfFields, new NullVcfFilter()); return Position.ToPosition(simplePosition, null, _sequenceProvider, null, _variantFactory); } [Fact] public void ToPosition_SNV() { IPosition position = ParseVcfLine("chr1 15274 SNV A T . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:15274:T", variant.VariantId); Assert.Equal(VariantType.SNV, variant.Type); Assert.Equal(15274, variant.Start); Assert.Equal(15274, variant.End); } [Fact] public void ToPosition_insertion() { IPosition position = ParseVcfLine("chr1 15903 INS G GC . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:15904:15903:C", variant.VariantId); Assert.Equal(VariantType.insertion, variant.Type); Assert.Equal(15904, variant.Start); Assert.Equal(15903, variant.End); } [Fact] public void ToPosition_deletion() { IPosition position = ParseVcfLine("chr1 20094 DEL TAA T . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:20095:20096", variant.VariantId); Assert.Equal(VariantType.deletion, variant.Type); Assert.Equal(20095, variant.Start); Assert.Equal(20096, variant.End); } [Fact] public void ToPosition_CANVAS_LOH() { IPosition position = ParseVcfLine("chr1 787923 CNV_CANVAS_LOH N 40 . SVTYPE=LOH;END=887923 RC:BC:CN:MCC 106.52:12642:2:2"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:787924:887923:CNV", variant.VariantId); Assert.Equal(VariantType.copy_number_variation, variant.Type); Assert.Equal(787924, variant.Start); Assert.Equal(887923, variant.End); } [Fact] public void ToPosition_Manta_SmallDeletion() { IPosition position = ParseVcfLine( "chr1 934064 SV_SNV AGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCG A . . END=934904;SVTYPE=DEL . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal( "1:934065:934904", variant.VariantId); Assert.Equal(VariantType.deletion, variant.Type); Assert.Equal(934065, variant.Start); Assert.Equal(934904, variant.End); } [Fact] public void ToPosition_CANVAS_CNnum() { IPosition position = ParseVcfLine("chr1 1037630 CNV_CN# N . . SVTYPE=CNV;END=1045024 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/1:60.76:8:1:.:.:22.51:PASS:."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:1037631:1045024:CN0", variant.VariantId); Assert.Equal(VariantType.copy_number_variation, variant.Type); Assert.Equal(1037631, variant.Start); Assert.Equal(1045024, variant.End); } [Fact] public void ToPosition_SV_DUP() { IPosition position = ParseVcfLine("chr1 1477854 SV_DUP C . . END=1477984;SVTYPE=DUP . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:1477855:1477984:TDUP", variant.VariantId); Assert.Equal(VariantType.tandem_duplication, variant.Type); Assert.Equal(1477855, variant.Start); Assert.Equal(1477984, variant.End); } [Fact] public void ToPosition_SV_INS() { IPosition position = ParseVcfLine("chr1 1565683 SV_INS G . . END=1565684;SVTYPE=INS . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:1565684:1565684:INS", variant.VariantId); Assert.Equal(VariantType.insertion, variant.Type); Assert.Equal(1565684, variant.Start); Assert.Equal(1565684, variant.End); } [Fact] public void ToPosition_SV_INV() { IPosition position = ParseVcfLine("chr1 6558910 SV_INV G . . END=6559723;SVTYPE=INV . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:6558911:6559723:Inverse", variant.VariantId); Assert.Equal(VariantType.inversion, variant.Type); Assert.Equal(6558911, variant.Start); Assert.Equal(6559723, variant.End); } [Fact] public void ToPosition_SV_Translocation() { IPosition position = ParseVcfLine("chr1 9061384 SV_BND C C]chr14:93246833] . . SVTYPE=BND . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:9061384:+:14:93246833:-", variant.VariantId); Assert.Equal(VariantType.translocation_breakend, variant.Type); Assert.Equal(9061384, variant.Start); Assert.Equal(9061384, variant.End); } [Fact] public void ToPosition_DRAGEN_LOH() { IPosition position = ParseVcfLine( "chr1 11071439 CNV_DRAGEN_LOH N , . . SVTYPE=CNV;END=12859473;REFLEN=1788034 GT:CN:MCN:CNQ:MCNQ:CNF:MCNF:SD:MAF:BC:AS 1/2:2:0:1000:1000:2.03102:0.000203:248.8:0.0001:1493:1137"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("1:11071440:12859473:CDEL", variant.VariantId); Assert.Equal(VariantType.copy_number_loss, variant.Type); Assert.Equal(11071440, variant.Start); Assert.Equal(12859473, variant.End); variant = variants[1]; Assert.Equal("1:11071440:12859473:CDUP", variant.VariantId); Assert.Equal(VariantType.copy_number_gain, variant.Type); Assert.Equal(11071440, variant.Start); Assert.Equal(12859473, variant.End); } [Fact] public void ToPosition_STR() { IPosition position = ParseVcfLine( "chr3 63912684 STR G . PASS END=63912714;REF=10;RL=30;RU=GCA;VARID=ATXN7;REPID=ATXN7 GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC 0/1:SPANNING/SPANNING:10/12:10-10/12-12:9/3:8/11:0/0:26.270270"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("3:63912685:63912714:GCA:12", variant.VariantId); Assert.Equal(VariantType.short_tandem_repeat_variation, variant.Type); Assert.Equal(63912685, variant.Start); Assert.Equal(63912714, variant.End); } [Fact] public void ToPosition_indel() { IPosition position = ParseVcfLine("chr4 46758265 INDEL GAGGTATAGAG GTT . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("4:46758266:46758275:TT", variant.VariantId); Assert.Equal(VariantType.indel, variant.Type); Assert.Equal(46758266, variant.Start); Assert.Equal(46758275, variant.End); } [Fact] public void ToPosition_MNV() { IPosition position = ParseVcfLine("chr4 67754304 MNV TGA TTT . . . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("4:67754305:67754306:TT", variant.VariantId); Assert.Equal(VariantType.MNV, variant.Type); Assert.Equal(67754305, variant.Start); Assert.Equal(67754306, variant.End); } [Fact] public void ToPosition_CNV_DUP() { IPosition position = ParseVcfLine("chr7 100955984 CNV_DUP N 37 PASS SVTYPE=CNV;END=100969873;REFLEN=13889 GT:SM:CN:BC:PE ./1:1.6625:3:12:48,81"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("7:100955985:100969873:CDUP", variant.VariantId); Assert.Equal(VariantType.copy_number_gain, variant.Type); Assert.Equal(100955985, variant.Start); Assert.Equal(100969873, variant.End); } [Fact] public void ToPosition_CNV_DEL() { IPosition position = ParseVcfLine( "chr7 110541589 CNV_DEL N 27 cnvLength SVTYPE=CNV;END=110548681;REFLEN=7092 GT:SM:CN:BC:PE 0/1:0.443182:1:7:19,17"); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("7:110541590:110548681:CDEL", variant.VariantId); Assert.Equal(VariantType.copy_number_loss, variant.Type); Assert.Equal(110541590, variant.Start); Assert.Equal(110548681, variant.End); } [Fact] public void ToPosition_ROH() { IPosition position = ParseVcfLine("chr22 36690136 ROH N . . END=36788158;SVTYPE=ROH . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("22:36690137:36788158:ROH", variant.VariantId); Assert.Equal(VariantType.run_of_homozygosity, variant.Type); Assert.Equal(36690137, variant.Start); Assert.Equal(36788158, variant.End); } // this is actually on GRCh37 [Fact] public void ToPosition_MultiAllelic_Deletions() { IPosition position = ParseVcfLine("chr12 106500158 . GTTA GTA,GT . . ."); IVariant[] variants = position.Variants; Assert.NotNull(variants); IVariant variant = variants[0]; Assert.Equal("12:106500160:106500160", variant.VariantId); Assert.Equal(VariantType.deletion, variant.Type); Assert.Equal(106500160, variant.Start); Assert.Equal(106500160, variant.End); variant = variants[1]; Assert.Equal("12:106500160:106500161", variant.VariantId); Assert.Equal(VariantType.deletion, variant.Type); Assert.Equal(106500160, variant.Start); Assert.Equal(106500161, variant.End); } } } ================================================ FILE: UnitTests/Vcf/VariantCreator/VariantIdTests.cs ================================================ using Genome; using Moq; using UnitTests.TestUtilities; using VariantAnnotation.Interface; using Vcf.VariantCreator; using Xunit; namespace UnitTests.Vcf.VariantCreator { public sealed class VariantIdTests { private readonly ISequence _sequence; private readonly VariantId _vidCreator = new VariantId(); public VariantIdTests() { var sequenceMock = new Mock(); sequenceMock.Setup(x => x.Substring(999, 1)).Returns("N"); sequenceMock.Setup(x => x.Substring(66520, 1)).Returns("T"); sequenceMock.Setup(x => x.Substring(66571, 1)).Returns("G"); sequenceMock.Setup(x => x.Substring(321681, 1)).Returns("G"); sequenceMock.Setup(x => x.Substring(477967, 1)).Returns("A"); sequenceMock.Setup(x => x.Substring(1350081, 1)).Returns("C"); sequenceMock.Setup(x => x.Substring(1477853, 1)).Returns("A"); sequenceMock.Setup(x => x.Substring(1477967, 1)).Returns("A"); sequenceMock.Setup(x => x.Substring(1715897, 1)).Returns("A"); sequenceMock.Setup(x => x.Substring(2633402, 1)).Returns("G"); sequenceMock.Setup(x => x.Substring(2633403, 1)).Returns("G"); sequenceMock.Setup(x => x.Substring(2650425, 1)).Returns("N"); _sequence = sequenceMock.Object; } [Theory] [InlineData(66507, "T", ".", "1-66507-T-T")] [InlineData(66507, "T", "A", "1-66507-T-A")] [InlineData(66522, "", "ATATA", "1-66521-T-TATATA")] [InlineData(66573, "TA", "", "1-66572-GTA-G")] [InlineData(66573, "", "TACTATATATTA", "1-66572-G-GTACTATATATTA")] public void Create_SmallVariants_ReturnShortVid(int position, string refAllele, string altAllele, string expectedVid) { string observedVid = _vidCreator.Create(_sequence, VariantCategory.SmallVariant, null, ChromosomeUtilities.Chr1, position, position, refAllele, altAllele, null); Assert.Equal(expectedVid, observedVid); } [Fact] public void Create_TranslocationBreakend_ReturnShortVid() { string observedVid = _vidCreator.Create(_sequence, VariantCategory.SV, "BND", ChromosomeUtilities.Chr1, 2617277, 2617277, "A", "AAAAAAAAAAAAAAAAAATTAGTCAGGCAC[chr3:153444911[", null); Assert.Equal("1-2617277-A-AAAAAAAAAAAAAAAAAATTAGTCAGGCAC[chr3:153444911[", observedVid); } [Theory] [InlineData(1000, 3001000, "N", "", "ROH", VariantCategory.ROH, "1-1000-3001000-N--ROH")] [InlineData(1350082, 1351320, "N", "", "DEL", VariantCategory.SV, "1-1350082-1351320-C--DEL")] [InlineData(1477854, 1477984, "N", "", "DUP", VariantCategory.SV, "1-1477854-1477984-A--DUP")] [InlineData(1477968, 1477968, "N", "", "INS", VariantCategory.SV, "1-1477968-1477968-A--INS")] [InlineData(1715898, 1750149, "N", "", "CNV", VariantCategory.CNV, "1-1715898-1750149-A--CNV")] [InlineData(2650426, 2653074, "N", "", "CNV", VariantCategory.CNV, "1-2650426-2653074-N--CNV")] [InlineData(321682, 421681, "N", "", "INV", VariantCategory.SV, "1-321682-421681-G--INV")] [InlineData(2633403, 2633421, "N", "", "", VariantCategory.RepeatExpansion, "1-2633403-2633421-G--STR")] public void Create_StructuralVariants_RecoverRefAllele_ReturnLongVid(int position, int endPosition, string refAllele, string altAllele, string svType, VariantCategory category, string expectedVid) { string observedVid = _vidCreator.Create(_sequence, category, svType, ChromosomeUtilities.Chr1, position, endPosition, refAllele, altAllele, null); Assert.Equal(expectedVid, observedVid); } } } ================================================ FILE: UnitTests/Vcf/VcfFilterTests.cs ================================================ using System.IO; using System.Text; using Genome; using UnitTests.TestUtilities; using Vcf; using Xunit; namespace UnitTests.Vcf { public sealed class VcfFilterTests { [Fact] public void FastForward_UcscNamingStyle_ChangeReaderStateCorrectly() { var annotationRange = new GenomicRange(new GenomicPosition(ChromosomeUtilities.Chr1, 100), new GenomicPosition(ChromosomeUtilities.Chr1, 200) ); var vcfFilter = new VcfFilter(annotationRange); const string firstLineInRange = "chr1\t100\t.\tC\tT\t165.00\tPASS\tSNVSB=-12.5;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:119:35:25:0:8,17"; using (var ms = new MemoryStream()) { using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { writer.WriteLine("#Header line 1"); writer.WriteLine("#Header line 2"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tMother"); writer.WriteLine("chr2\t150\t.\tG\tA\t5.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:3:1:1:0:0,1"); writer.WriteLine("chr1\t90\t.\tT\tC\t1.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:23:9:3:0:2,1"); writer.WriteLine("chr1\t95\t.\tA\tT\t2.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:23:9:3:0:2,1"); writer.WriteLine(firstLineInRange); writer.WriteLine("chr1\t102\t.\tC\tA\t3.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=5\tGT:GQ:GQX:DP:DPF:AD\t0/1:29:2:2:0:1,1"); } ms.Position = 0; using (var reader = new StreamReader(ms)) { vcfFilter.FastForward(reader); Assert.Equal(firstLineInRange, vcfFilter.BufferedLine); } } } [Fact] public void FastForward_EnsemblNamingStyle_ChangeReaderStateCorrectly() { var annotationRange = new GenomicRange(new GenomicPosition(ChromosomeUtilities.Chr1, 100), new GenomicPosition(ChromosomeUtilities.Chr1, 200)); var vcfFilter = new VcfFilter(annotationRange); const string firstLineInRange = "1\t100\t.\tC\tT\t165.00\tPASS\tSNVSB=-12.5;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:119:35:25:0:8,17"; using (var ms = new MemoryStream()) { using (var writer = new StreamWriter(ms, Encoding.UTF8, 1024, true)) { writer.WriteLine("#Header line 1"); writer.WriteLine("#Header line 2"); writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tMother"); writer.WriteLine("2\t150\t.\tG\tA\t5.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:3:1:1:0:0,1"); writer.WriteLine("1\t90\t.\tT\tC\t1.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:23:9:3:0:2,1"); writer.WriteLine("1\t95\t.\tA\tT\t2.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:23:9:3:0:2,1"); writer.WriteLine(firstLineInRange); writer.WriteLine("1\t102\t.\tC\tA\t3.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=5\tGT:GQ:GQX:DP:DPF:AD\t0/1:29:2:2:0:1,1"); } ms.Position = 0; using (var reader = new StreamReader(ms)) { vcfFilter.FastForward(reader); Assert.Equal(firstLineInRange, vcfFilter.BufferedLine); } } } [Fact] public void GetNextLine_NoBufferedLine_ReadNextLine() { var vcfFilter = new VcfFilter(null); using (var ms = new MemoryStream(Encoding.UTF8.GetBytes("first line\nsecond line\n"))) using (var reader = new StreamReader(ms)) { string nextLine = vcfFilter.GetNextLine(reader); Assert.Equal("first line", nextLine); } } [Fact] public void GetNextLine_ReturnBufferedLine() { const string bufferedLine = "I am buffered"; var vcfFilter = new VcfFilter(null) {BufferedLine = bufferedLine}; string nextLine = vcfFilter.GetNextLine(null); Assert.Equal(bufferedLine, nextLine); } [Fact] public void PassedTheEnd_AsExpected() { var annotationRange = new GenomicRange(new GenomicPosition(ChromosomeUtilities.Chr1, 100), new GenomicPosition(ChromosomeUtilities.Chr1, 200)); var vcfFilter = new VcfFilter(annotationRange); Assert.False(vcfFilter.PassedTheEnd(ChromosomeUtilities.Chr1, 150)); Assert.False(vcfFilter.PassedTheEnd(ChromosomeUtilities.Chr1, 200)); Assert.True(vcfFilter.PassedTheEnd(ChromosomeUtilities.Chr1, 201)); Assert.True(vcfFilter.PassedTheEnd(ChromosomeUtilities.Chr2, 150)); } } } ================================================ FILE: UnitTests/Vcf/VcfInfoParserTests.cs ================================================ using VariantAnnotation.Interface.Positions; using Vcf.Info; using Xunit; namespace UnitTests.Vcf { public sealed class VcfInfoParserTests { [Fact] public void Parse_Somatic_Manta() { IInfoData info = VcfInfoParser.Parse( "END=1660503;SVTYPE=DEL;SVLEN=-65919;IMPRECISE;CIPOS=-285,285;CIEND=-205,205;SOMATIC;SOMATICSCORE=36;ColocalizedCanvas"); Assert.Equal(65919, info.SvLength); Assert.Equal(1660503, info.End); Assert.Equal(36, info.JointSomaticNormalQuality); Assert.Equal(new[] {-285, 285}, info.CiPos); Assert.Equal(new[] {-205, 205}, info.CiEnd); Assert.True(info.IsImprecise); } [Fact] public void Parse_Somatic_Strelka() { var info = VcfInfoParser.Parse("SOMATIC;QSS=2;TQSS=1;NT=het;QSS_NT=2;TQSS_NT=1;SGT=CG->CG;DP=183;MQ=46.57;MQ0=15;ALTPOS=35;ALTMAP=24;ReadPosRankSum=-1.23;SNVSB=0.00;PNOISE=0.00;PNOISE2=0.00;VQSR=1.23"); Assert.Equal(1.23, info.RecalibratedQuality); Assert.Equal(2, info.JointSomaticNormalQuality); } [Fact] public void Parse_GATK() { var info = VcfInfoParser.Parse("AC=2;AF=0.250;AN=8;BaseQRankSum=1.719;DB;DP=106;Dels=0.00;FS=20.202;HaplotypeScore=0.0000;MLEAC=2;MLEAF=0.250;MQ=43.50;MQ0=52;MQRankSum=2.955;QD=4.73;ReadPosRankSum=1.024;SB=-1.368e+02;VQSLOD=-0.3503;culprit=MQ;PLF"); Assert.Equal(-136.8, info.StrandBias); Assert.Equal(20.202, info.FisherStrandBias); Assert.Equal(43.50, info.MappingQuality); } [Fact] public void Parse_Breakend_Event_Id() { var info = VcfInfoParser.Parse("SVTYPE=BND;MATEID=MantaBND:2312:0:1:1:0:0:0;IMPRECISE;CIPOS=-344,344;EVENT=MantaBND:2312:0:1:0:0:0:0;JUNCTION_QUAL=204;BND_DEPTH=38;MATE_BND_DEPTH=46"); Assert.Equal("MantaBND:2312:0:1:0:0:0:0", info.BreakendEventId); } [Fact] public void EmptyInfoField() { Assert.Null(VcfInfoParser.Parse("")); } } } ================================================ FILE: UnitTests/Vcf/VcfReaderTests.cs ================================================ using System.IO; using ErrorHandling.Exceptions; using Genome; using IO; using Moq; using UnitTests.SAUtils.InputFileParsers; using UnitTests.TestUtilities; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using Vcf; using Vcf.VariantCreator; using Xunit; namespace UnitTests.Vcf { public sealed class VcfReaderTests { private MemoryStream _ms; private StreamWriter _streamWriter; private readonly VariantId _vidCreator = new VariantId(); private void AddLines(string[] lines) { _ms = new MemoryStream(); _streamWriter = new StreamWriter(_ms); foreach (string headline in lines) { _streamWriter.WriteLine(headline); } _streamWriter.Flush(); _ms.Position = 0; } [Fact] public void ValidateVcfHeader_ExceptionThrown_NoFileFormat() { var headers = new[] { "##Some comments", "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NHL-16 NHL-17" }; AddLines(headers); var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); var reader = FileUtilities.GetStreamReader(_ms); Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, new NullVcfFilter(), _vidCreator, null)); } [Fact] public void ValidateVcfHeader_ExceptionThrown_NoChromHeaderLine() { var headers = new[] { "##fileformat=VCFv4.1", "##FILTER=", "##fileDate=20160920" }; AddLines(headers); var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); var reader = FileUtilities.GetStreamReader(_ms); Assert.Throws(() => VcfReader.Create(reader, reader, seqProvider, null, new NullVcfFilter(), _vidCreator, null)); } [Fact] public void Sample_names_are_reported() { var headers = new[] { "##fileformat=VCFv4.1", "##FILTER=", "##fileDate=20160920", "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NHL-16 NHL-17" }; AddLines(headers); string[] samples; var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, new NullVcfFilter(), _vidCreator, null)) { samples = vcfReader.GetSampleNames(); } Assert.Equal(new[] { "NHL-16", "NHL-17" }, samples); } [Fact] public void GetChromAndLengthInfo_ReturnEmptyArray_NoProperPrefix() { Assert.Empty(VcfReader.GetChromAndLengthInfo("##fileformat=VCFv")); } [Fact] public void GetChromAndLengthInfo_ReturnEmptyArray_NoChromInfo() { Assert.Empty(VcfReader.GetChromAndLengthInfo("##contig=")); } [Fact] public void GetChromAndLengthInfo_ReturnEmptyArray_NoLengthInfo() { Assert.Empty(VcfReader.GetChromAndLengthInfo("##contig=")); } [Theory] [InlineData("##contig=")] [InlineData("##contig=(() => VcfReader.Create(reader, reader, seqProvider, null, new NullVcfFilter(), _vidCreator, null)); } [Theory] [InlineData("##contig=")] [InlineData("##contig=")] public void CheckContigId_InferredAssemblyIsUnknown_GivenIrregularChrom(string contigLine) { var headers = new[] { "##fileformat=VCFv4.1", contigLine, "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT" }; AddLines(headers); var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, new NullVcfFilter(), _vidCreator, null)) { Assert.Equal(GenomeAssembly.Unknown, vcfReader.InferredGenomeAssembly); } } [Theory] [InlineData("##contig=")] [InlineData("##contig=")] public void CheckContigId_IsRcrsMitochondrionTrue_InferredAssemblyIsUnknown_GivenRcrsChrMLength(string contigLine) { var headers = new[] { "##fileformat=VCFv4.1", contigLine, "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT" }; AddLines(headers); var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, new NullVcfFilter(), _vidCreator, null)) { Assert.Equal(GenomeAssembly.Unknown, vcfReader.InferredGenomeAssembly); Assert.True(vcfReader.IsRcrsMitochondrion); } } [Theory] [InlineData("##contig=")] [InlineData("##contig=")] public void CheckContigId_IsRcrsMitochondrionFalse_InferredAssemblyIsUnknown_GivenNonRcrsChrMLength(string contigLine) { var headers = new[] { "##fileformat=VCFv4.1", contigLine, "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT" }; AddLines(headers); var seqProvider = ParserTestUtils.GetSequenceProvider(1000, "A", 'T', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, null, new NullVcfFilter(), _vidCreator, null)) { Assert.Equal(GenomeAssembly.Unknown, vcfReader.InferredGenomeAssembly); Assert.False(vcfReader.IsRcrsMitochondrion); } } [Theory] [InlineData("##contig=", new[] { "chr1", "248956422" })] [InlineData("##contig=", new[] { "2", "242193529" })] [InlineData("##contig=", new[] { "chrM", "16569" })] [InlineData("##contig=", new[] { "MT", "16569" })] public void GetChromAndLength_AsExpect(string line, string[] info) { Assert.Equal(info, VcfReader.GetChromAndLengthInfo(line)); } [Fact] public void GetNextPosition() { const string vcfLine = "chr1 13133 . T C 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; var lines = new[] { "##fileformat=VCFv4.1", "##FILTER=", "##fileDate=20160920", "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NHL-16", vcfLine }; AddLines(lines); var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(13133, "T", 'A', ChromosomeUtilities.RefNameToChromosome); IPosition observedResult; using (var reader = FileUtilities.GetStreamReader(_ms)) using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullVcfFilter(), _vidCreator, null)) { observedResult = vcfReader.GetNextPosition(); } var expectedResult = PositionPool.Get(ChromosomeUtilities.Chr1, 13133, 13133, "T", new[] { "C" }, 36, new[] { "PASS" }, null, null, null, vcfLine.Split('\t'), new[] { false }, false); Assert.NotNull(observedResult); Assert.Equal(expectedResult.End, observedResult.End); Assert.Equal(expectedResult.AltAlleles, observedResult.AltAlleles); Assert.Equal(expectedResult.Filters, observedResult.Filters); Assert.Equal(expectedResult.Quality, observedResult.Quality); Assert.Equal(expectedResult.VcfFields, observedResult.VcfFields); PositionPool.Return(expectedResult); } [Fact] public void CheckSampleConsistency_oneSample() { const string vcfLine1 = "chr1 13133 . T C 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; const string vcfLine2 = "chr1 13133 . T A 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD"; var lines = new[] { "##fileformat=VCFv4.1", "##FILTER=", "##fileDate=20160920", "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NHL-16", vcfLine1, vcfLine2 }; AddLines(lines); var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(13133, "T", 'A', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullVcfFilter(), _vidCreator, null)) { //first line is valid. So, no exception Assert.NotNull(vcfReader.GetNextPosition()); // second line has invalid number of sample fields, so it will throw exception Assert.Throws(()=>vcfReader.GetNextPosition()); } } [Fact] public void CheckSampleConsistency_noSample() { const string vcfLine1 = "chr1 13133 . T C 36.00 PASS SNVSB=0.0;SNVHPOL=4"; const string vcfLine2 = "chr1 13133 . T A 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; var lines = new[] { "##fileformat=VCFv4.1", "##FILTER=", "##fileDate=20160920", "#CHROM POS ID REF ALT QUAL FILTER INFO", vcfLine1, vcfLine2 }; AddLines(lines); var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(13133, "T", 'A', ChromosomeUtilities.RefNameToChromosome); using (var reader = FileUtilities.GetStreamReader(_ms)) using (var vcfReader = VcfReader.Create(reader, reader, seqProvider, refMinorProvider.Object, new NullVcfFilter(), _vidCreator, null)) { //first line is valid. So, no exception Assert.NotNull(vcfReader.GetNextPosition()); // second line has invalid number of sample fields, so it will throw exception Assert.Throws(() => vcfReader.GetNextPosition()); } } } } ================================================ FILE: UnitTests/Vcf/VcfReaderUtilsTests.cs ================================================ using System.Linq; using Moq; using UnitTests.SAUtils.InputFileParsers; using UnitTests.TestUtilities; using VariantAnnotation; using VariantAnnotation.Interface.Providers; using Vcf.VariantCreator; using Xunit; namespace UnitTests.Vcf { public sealed class VcfReaderUtilsTests { private readonly VariantId _vidCreator = new VariantId(); #if (NI_ALLELE) [Fact] public void ParseVcfLine_NonInformativeAlleles_Alone_NotFiltered() { const string vcfLine1 = "chr1 13133 . T <*> 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; const string vcfLine2 = "chr1 13133 . T * 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; const string vcfLine3 = "chr1 13133 . T 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(13133, "T", 'A', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider); var position1 = AnnotationUtilities.ParseVcfLine(vcfLine1, refMinorProvider.Object, variantFactory, seqProvider.RefNameToChromosome); var position2 = AnnotationUtilities.ParseVcfLine(vcfLine2, refMinorProvider.Object, variantFactory, seqProvider.RefNameToChromosome); var position3 = AnnotationUtilities.ParseVcfLine(vcfLine3, refMinorProvider.Object, variantFactory, seqProvider.RefNameToChromosome); var annotatedVariants1 = Annotator.GetAnnotatedVariants(position1.Variants); var annotatedVariants2 = Annotator.GetAnnotatedVariants(position2.Variants); var annotatedVariants3 = Annotator.GetAnnotatedVariants(position3.Variants); // SimplePositions unchanged Assert.Equal("<*>", position1.AltAlleles[0]); Assert.Equal("*", position2.AltAlleles[0]); Assert.Equal("", position3.AltAlleles[0]); // Variants not filtered Assert.Equal("<*>", annotatedVariants1[0].Variant.AltAllele); Assert.Equal("*", annotatedVariants2[0].Variant.AltAllele); Assert.Equal("", annotatedVariants3[0].Variant.AltAllele); } [Fact] public void ParseVcfLine_NonInformativeAlleles_WithNormalAllele_NotFiltered() { const string vcfLine1 = "chr1 13133 . T <*>,G 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; const string vcfLine2 = "chr1 13133 . T *,C 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; const string vcfLine3 = "chr1 13133 . T ,A 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; const string vcfLine4 = "chr1 13133 . T A, 36.00 PASS SNVSB=0.0;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD 0/1:62:20:7:1:3,4"; var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(13133, "T", 'A', ChromosomeUtilities.RefNameToChromosome); var refNameToChromosome = seqProvider.RefNameToChromosome; var variantFactory = new VariantFactory(seqProvider); var position1 = AnnotationUtilities.ParseVcfLine(vcfLine1, refMinorProvider.Object, variantFactory, refNameToChromosome); var position2 = AnnotationUtilities.ParseVcfLine(vcfLine2, refMinorProvider.Object, variantFactory, refNameToChromosome); var position3 = AnnotationUtilities.ParseVcfLine(vcfLine3, refMinorProvider.Object, variantFactory, refNameToChromosome); var position4 = AnnotationUtilities.ParseVcfLine(vcfLine4, refMinorProvider.Object, variantFactory, refNameToChromosome); var annotatedVariants1 = Annotator.GetAnnotatedVariants(position1.Variants); var annotatedVariants2 = Annotator.GetAnnotatedVariants(position2.Variants); var annotatedVariants3 = Annotator.GetAnnotatedVariants(position3.Variants); var annotatedVariants4 = Annotator.GetAnnotatedVariants(position4.Variants); // SimplePositions Assert.Equal(new[] { "<*>", "G" }, position1.AltAlleles); Assert.Equal(new[] { "*", "C" }, position2.AltAlleles); Assert.Equal(new[] { "", "A" }, position3.AltAlleles); Assert.Equal(new[] { "A", "" }, position4.AltAlleles); // Variants Assert.Equal(new[] { "<*>", "G" }, annotatedVariants1.Select(x => x.Variant.AltAllele).ToArray()); Assert.Equal(new[] { "*", "C" }, annotatedVariants2.Select(x => x.Variant.AltAllele).ToArray()); Assert.Equal(new[] { "", "A" }, annotatedVariants3.Select(x => x.Variant.AltAllele).ToArray()); Assert.Equal(new[] { "A", "" }, annotatedVariants4.Select(x => x.Variant.AltAllele).ToArray()); } #endif [Fact] public void Test_crash_caused_by_variant_trimming () { const string vcfLine1 = "chr1 8021910 rs373653682 GGTGCTGGACGGTGTCCCT G . . ."; var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(8021910, "GGTGCTGGACGGTGTCCCT", 'A', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, _vidCreator); var position1 = AnnotationUtilities.ParseVcfLine(vcfLine1, refMinorProvider.Object, seqProvider, null, variantFactory); var annotatedVariants1 = Annotator.GetAnnotatedVariants(position1.Variants); // SimplePositions Assert.Equal(new[] { "G"}, position1.AltAlleles); // Variants Assert.Equal(new[] { "" }, annotatedVariants1.Select(x => x.Variant.AltAllele).ToArray()); } [Fact] public void ParseVcfLine_line_with_only_NonRef_is_refMinor() { const string vcfLine = "1 10628385 . C . LowGQX;HighDPFRatio END=10628385;BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:24:9:18"; var refMinorProvider = new Mock(); refMinorProvider.Setup(x => x.GetGlobalMajorAllele(ChromosomeUtilities.Chr1, 10628385)).Returns("T"); var seqProvider = ParserTestUtils.GetSequenceProvider(10628385, "C", 'A', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, _vidCreator); var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory); var annotatedVariants = Annotator.GetAnnotatedVariants(position.Variants); Assert.Equal("C", position.RefAllele); Assert.Equal(new[] { "" }, position.AltAlleles); Assert.Equal("T", position.Variants[0].RefAllele); Assert.Equal("C", position.Variants[0].AltAllele); // Variants Assert.Equal(new[] { "C" }, annotatedVariants.Select(x => x.Variant.AltAllele).ToArray()); } [Fact] public void ParseVcfLine_line_with_only_NonRef_is_not_refMinor() { const string vcfLine = "1 10005 . C . LowGQX END=10034;BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:3:1:0"; var refMinorProvider = new Mock(); var seqProvider = ParserTestUtils.GetSequenceProvider(10005, "C", 'A', ChromosomeUtilities.RefNameToChromosome); var variantFactory = new VariantFactory(seqProvider.Sequence, _vidCreator); var position = AnnotationUtilities.ParseVcfLine(vcfLine, refMinorProvider.Object, seqProvider, null, variantFactory); var annotatedVariants = Annotator.GetAnnotatedVariants(position.Variants); Assert.Equal("C", position.RefAllele); Assert.Equal(new[] { "" }, position.AltAlleles); Assert.Null(position.Variants); Assert.Null(annotatedVariants); } } } ================================================ FILE: VariantAnnotation/Algorithms/Swap.cs ================================================ namespace VariantAnnotation.Algorithms { public static class Swap { /// /// swaps two integers /// public static void Int(ref int a, ref int b) { var temp = a; a = b; b = temp; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/AnnotatedPosition.cs ================================================ using System.Collections.Generic; using System.Linq; using System.Text; using OptimizedCore; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; namespace VariantAnnotation.AnnotatedPositions { public sealed class AnnotatedPosition : IAnnotatedPosition { public IPosition Position { get; private set; } public string CytogeneticBand { get; set; } public IAnnotatedVariant[] AnnotatedVariants { get; private set; } public IList SupplementaryIntervals { get; } = new List(); public void Initialize(IPosition position, IAnnotatedVariant[] annotatedVariants) { Position = position; AnnotatedVariants = annotatedVariants; SupplementaryIntervals.Clear(); } public StringBuilder GetJsonStringBuilder() { if (AnnotatedVariants == null || AnnotatedVariants.Length == 0) return null; var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); string originalChromName = Position.VcfFields[0]; jsonObject.AddStringValue("chromosome", originalChromName); jsonObject.AddIntValue("position", Position.Start); if (Position.HasShortTandemRepeat) { jsonObject.AddStringValue("repeatUnit", Position.InfoData?.RepeatUnit); jsonObject.AddIntValue("refRepeatCount", Position.InfoData?.RefRepeatCount); } if (Position.HasStructuralVariant) jsonObject.AddIntValue("svEnd", Position.InfoData?.End); jsonObject.AddStringValue("refAllele", Position.RefAllele); jsonObject.AddStringValues("altAlleles", Position.AltAlleles); jsonObject.AddDoubleValue("quality", Position.Quality); jsonObject.AddStringValues("filters", Position.Filters); jsonObject.AddIntValues("ciPos", Position.InfoData?.CiPos); jsonObject.AddIntValues("ciEnd", Position.InfoData?.CiEnd); jsonObject.AddIntValue("svLength", Position.InfoData?.SvLength); jsonObject.AddStringValue("breakendEventId", Position.InfoData?.BreakendEventId); jsonObject.AddDoubleValue("strandBias", Position.InfoData?.StrandBias,JsonCommon.FrequencyRoundingFormat); jsonObject.AddDoubleValue("fisherStrandBias", Position.InfoData?.FisherStrandBias,"0.###"); jsonObject.AddDoubleValue("mappingQuality", Position.InfoData?.MappingQuality,"0.##"); jsonObject.AddIntValue("jointSomaticNormalQuality", Position.InfoData?.JointSomaticNormalQuality); jsonObject.AddDoubleValue("recalibratedQuality", Position.InfoData?.RecalibratedQuality); jsonObject.AddStringValue("cytogeneticBand", CytogeneticBand); jsonObject.AddDoubleValue("logOddsRatio", Position.InfoData?.LogOddsRatio, "0.###"); //adding object of custom vcf info fields if (Position.InfoData!=null && !Position.InfoData.CustomKeyValues.IsEmpty()) { jsonObject.AddObjectValue("vcfInfo", Position.InfoData.CustomKeyValues); } if (Position.Samples != null && Position.Samples.Length > 0) jsonObject.AddStringValues("samples", Position.Samples.Select(s => s.GetJsonString()), false); if (SupplementaryIntervals != null && SupplementaryIntervals.Any()) { AddSuppIntervalToJsonObject(jsonObject); } var variantStringBuilders = AnnotatedVariants.Select(v => v.GetJsonStringBuilder(originalChromName)).ToArray(); jsonObject.AddStringValues("variants", variantStringBuilders , false); foreach (StringBuilder builder in variantStringBuilders) { StringBuilderPool.Return(builder); } sb.Append(JsonObject.CloseBrace); return sb; } private void AddSuppIntervalToJsonObject(JsonObject jsonObject) { foreach (var si in SupplementaryIntervals) jsonObject.AddObjectValue(si.JsonKey, si); } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/AnnotatedRegulatoryRegion.cs ================================================ using System.Collections.Generic; using System.Linq; using System.Text; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO; namespace VariantAnnotation.AnnotatedPositions { public sealed class AnnotatedRegulatoryRegion : IAnnotatedRegulatoryRegion { public IRegulatoryRegion RegulatoryRegion { get; } public IEnumerable Consequences { get; } public AnnotatedRegulatoryRegion(IRegulatoryRegion regulatoryRegion, List consequences) { RegulatoryRegion = regulatoryRegion; Consequences = consequences; } public void SerializeJson(StringBuilder sb) { var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("id", RegulatoryRegion.Id.WithVersion); jsonObject.AddStringValue("type", RegulatoryRegion.Type.ToString()); jsonObject.AddStringValues("consequence", Consequences?.Select(ConsequenceUtil.GetConsequence)); sb.Append(JsonObject.CloseBrace); } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/AnnotatedVariant.cs ================================================ using System.Collections.Generic; using System.Text; using OptimizedCore; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; using Variants; namespace VariantAnnotation.AnnotatedPositions { public sealed class AnnotatedVariant : IAnnotatedVariant { public IVariant Variant { get; private set; } public string HgvsgNotation { get; set; } public IList RegulatoryRegions { get; } = new List(); public IList Transcripts { get; } = new List(); public IList SaList { get; } = new List(); public ISupplementaryAnnotation RepeatExpansionPhenotypes { get; set; } public double? PhylopScore { get; set; } public double? GerpScore { get; set; } public bool InLowComplexityRegion { get; set; } public void Initialize(IVariant variant) { Variant = variant; HgvsgNotation = null; RegulatoryRegions.Clear(); Transcripts.Clear(); SaList.Clear(); RepeatExpansionPhenotypes = null; PhylopScore = null; GerpScore = null; } public StringBuilder GetJsonStringBuilder(string originalChromName) { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); // data section sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("vid", Variant.VariantId); jsonObject.AddStringValue("chromosome", originalChromName); jsonObject.AddIntValue("begin", Variant.Start); jsonObject.AddIntValue("end", Variant.End); jsonObject.AddBoolValue("isReferenceMinorAllele", Variant.IsRefMinor); jsonObject.AddBoolValue("isStructuralVariant", Variant.IsStructuralVariant); jsonObject.AddStringValue("refAllele", string.IsNullOrEmpty(Variant.RefAllele) ? "-" : Variant.RefAllele); jsonObject.AddStringValue("altAllele", string.IsNullOrEmpty(Variant.AltAllele) ? "-" : Variant.AltAllele); jsonObject.AddStringValue("variantType", Variant.Type.ToString()); jsonObject.AddBoolValue("isDecomposedVariant", Variant.IsDecomposed); jsonObject.AddBoolValue("isRecomposedVariant", Variant.IsRecomposed); jsonObject.AddStringValues("linkedVids", Variant.LinkedVids); jsonObject.AddStringValue("hgvsg", HgvsgNotation); jsonObject.AddDoubleValue("phylopScore", PhylopScore); jsonObject.AddDoubleValue("gerpScore", GerpScore); jsonObject.AddBoolValue("inLowComplexityRegion", InLowComplexityRegion); if (RegulatoryRegions?.Count > 0) jsonObject.AddObjectValues("regulatoryRegions", RegulatoryRegions); foreach (ISupplementaryAnnotation saItem in SaList) { jsonObject.AddObjectValue(saItem.JsonKey, saItem); } jsonObject.AddObjectValue(RepeatExpansionPhenotypes?.JsonKey, RepeatExpansionPhenotypes); if (Transcripts?.Count > 0) jsonObject.AddObjectValues("transcripts", Transcripts); sb.Append(JsonObject.CloseBrace); return sb; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Consequence/Consequences.cs ================================================ using System; using System.Collections.Generic; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.AnnotatedPositions.Consequence { public sealed class Consequences { private readonly VariantType _variantType; private readonly List _consequences; public List GetConsequences() => _consequences; private readonly IVariantEffect _variantEffect; private readonly IFeatureVariantEffects _featureEffect; private readonly (Func, ConsequenceTag)[] _tier3Consequences; private static readonly HashSet ConsequencesThatNeedTranscriptVariant = new HashSet { ConsequenceTag.feature_elongation, ConsequenceTag.feature_truncation, ConsequenceTag.short_tandem_repeat_change, ConsequenceTag.short_tandem_repeat_contraction, ConsequenceTag.short_tandem_repeat_expansion, ConsequenceTag.transcript_ablation }; public Consequences(VariantType variantType, IVariantEffect variantEffect, IFeatureVariantEffects featureEffect) { _variantType = variantType; _consequences = new List(); _variantEffect = variantEffect; _featureEffect = featureEffect; _tier3Consequences = new List<(Func, ConsequenceTag)> { (() => _variantEffect.IsSpliceDonorVariant(), ConsequenceTag.splice_donor_variant), (() => _variantEffect.IsSpliceAcceptorVariant(), ConsequenceTag.splice_acceptor_variant), (() => _variantEffect.IsStopGained(), ConsequenceTag.stop_gained), (() => _variantEffect.IsFrameshiftVariant(), ConsequenceTag.frameshift_variant), (() => _variantEffect.IsStopLost(), ConsequenceTag.stop_lost), (() => _variantEffect.IsStartLost(), ConsequenceTag.start_lost), (() => _variantEffect.IsInframeInsertion(), ConsequenceTag.inframe_insertion), (() => _variantEffect.IsInframeDeletion(), ConsequenceTag.inframe_deletion), (() => _variantEffect.IsMissenseVariant(), ConsequenceTag.missense_variant), (() => _variantEffect.IsProteinAlteringVariant(), ConsequenceTag.protein_altering_variant), (() => _variantEffect.IsSpliceRegionVariant(), ConsequenceTag.splice_region_variant), (() => _variantEffect.IsIncompleteTerminalCodonVariant(), ConsequenceTag.incomplete_terminal_codon_variant), (() => _variantEffect.IsStartRetained(), ConsequenceTag.start_retained_variant), (() => _variantEffect.IsStopRetained(), ConsequenceTag.stop_retained_variant), (() => _variantEffect.IsSynonymousVariant(), ConsequenceTag.synonymous_variant), (() => _variantEffect.IsCodingSequenceVariant(), ConsequenceTag.coding_sequence_variant), (() => _variantEffect.IsFivePrimeUtrVariant(), ConsequenceTag.five_prime_UTR_variant), (() => _variantEffect.IsThreePrimeUtrVariant(), ConsequenceTag.three_prime_UTR_variant), (() => _variantEffect.IsNonCodingTranscriptExonVariant(), ConsequenceTag.non_coding_transcript_exon_variant), (() => _variantEffect.IsWithinIntron(), ConsequenceTag.intron_variant), (() => _variantEffect.IsNonsenseMediatedDecayTranscriptVariant(), ConsequenceTag.NMD_transcript_variant), (() => _variantEffect.IsNonCodingTranscriptVariant(), ConsequenceTag.non_coding_transcript_variant), (() => _featureEffect.Elongation(), ConsequenceTag.feature_elongation), (() => _featureEffect.Truncation(), ConsequenceTag.feature_truncation) }.ToArray(); } public static List DetermineFlankingVariantEffects(bool isDownstreamVariant) => new List(1) {isDownstreamVariant ? ConsequenceTag.downstream_gene_variant : ConsequenceTag.upstream_gene_variant}; public void DetermineSmallVariantEffects() { GetTier1Types(); if (_consequences.Count == 0) GetTier2Types(); if (_consequences.Count == 0) GetTier3Types(); if (NeedsTranscriptVariant(_variantType, _consequences)) _consequences.Add(ConsequenceTag.transcript_variant); } internal static bool NeedsTranscriptVariant(VariantType variantType, List consequences) => consequences.Count == 0 ? NeedsTranscriptVariantByVariantType(variantType) : NeedsTranscriptVariantByConsequences(consequences); private static bool NeedsTranscriptVariantByConsequences(List consequences) { foreach (ConsequenceTag consequence in consequences) { if (!ConsequencesThatNeedTranscriptVariant.Contains(consequence)) return false; } return true; } private static bool NeedsTranscriptVariantByVariantType(VariantType variantType) { switch (variantType) { case VariantType.duplication: case VariantType.tandem_duplication: case VariantType.copy_number_variation: case VariantType.copy_number_loss: case VariantType.copy_number_gain: case VariantType.run_of_homozygosity: return false; default: return true; } } public void DetermineStructuralVariantEffect(IVariant variant) { GetTier1Types(); if (_consequences.Count == 0) GetStructuralTier2Types(); DetermineCopyNumberEffect(variant.Type); DetermineRepeatExpansionEffect(variant); if (NeedsTranscriptVariant(_variantType, _consequences)) _consequences.Add(ConsequenceTag.transcript_variant); } private void DetermineRepeatExpansionEffect(IVariant variant) { if (!(variant is RepeatExpansion repeatExpansion)) return; if (repeatExpansion.RefRepeatCount == null || repeatExpansion.RefRepeatCount == repeatExpansion.RepeatCount) { _consequences.Add(ConsequenceTag.short_tandem_repeat_change); } else if (repeatExpansion.RepeatCount > repeatExpansion.RefRepeatCount) { _consequences.Add(ConsequenceTag.short_tandem_repeat_expansion); } else { _consequences.Add(ConsequenceTag.short_tandem_repeat_contraction); } } private void DetermineCopyNumberEffect(VariantType variantType) { // ReSharper disable once SwitchStatementMissingSomeCases switch (variantType) { case VariantType.copy_number_gain: _consequences.Add(ConsequenceTag.copy_number_increase); break; case VariantType.copy_number_loss: _consequences.Add(ConsequenceTag.copy_number_decrease); break; case VariantType.copy_number_variation: _consequences.Add(ConsequenceTag.copy_number_change); break; } } private void GetStructuralTier2Types() { // FeatureElongation if (_featureEffect.Elongation()) _consequences.Add(ConsequenceTag.feature_elongation); // FeatureTruncation if (_featureEffect.Truncation()) _consequences.Add(ConsequenceTag.feature_truncation); // FivePrimeDuplicatedTranscript if (_featureEffect.FivePrimeDuplicatedTranscript()) _consequences.Add(ConsequenceTag.five_prime_duplicated_transcript); // ThreePrimeDuplicatedTranscript if (_featureEffect.ThreePrimeDuplicatedTranscript()) _consequences.Add(ConsequenceTag.three_prime_duplicated_transcript); } private void GetTier1Types() { // TranscriptAblation if (_featureEffect.Ablation()) _consequences.Add(ConsequenceTag.transcript_ablation); // TranscriptAmplification if (_featureEffect.Amplification()) _consequences.Add(ConsequenceTag.transcript_amplification); } private void GetTier2Types() { // MatureMirnaVariant if (_variantEffect.IsMatureMirnaVariant()) _consequences.Add(ConsequenceTag.mature_miRNA_variant); } private void GetTier3Types() { foreach ((Func consequenceTest, ConsequenceTag consequenceTag) in _tier3Consequences) { if (consequenceTest()) _consequences.Add(consequenceTag); } } public void DetermineRegulatoryVariantEffects() { // RegulatoryRegionAmplification if (_featureEffect.Amplification()) _consequences.Add(ConsequenceTag.regulatory_region_amplification); // RegulatoryRegionAblation if (_featureEffect.Ablation()) _consequences.Add(ConsequenceTag.regulatory_region_ablation); // RegulatoryRegionVariant _consequences.Add(ConsequenceTag.regulatory_region_variant); } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/HgvsCodingNomenclature.cs ================================================ using Genome; using Intervals; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.AnnotatedPositions { public static class HgvsCodingNomenclature { public static string GetHgvscAnnotation(ITranscript transcript, ISimpleVariant variant, ISequence refSequence, int regionStart, int regionEnd, string transcriptRef, string transcriptAlt ) { // sanity check: don't try to handle odd characters, make sure this is not a reference allele, // and make sure that we have protein coordinates if (variant.Type == VariantType.reference || SequenceUtilities.HasNonCanonicalBase(variant.AltAllele)) return null; // do not report HGVSc notation when variant lands inside gap region if (regionStart > -1 && regionEnd > -1) { var startRegion = transcript.TranscriptRegions[regionStart]; var endRegion = transcript.TranscriptRegions[regionEnd]; if (startRegion.Id == endRegion.Id && startRegion.Type == TranscriptRegionType.Gap && endRegion.Type == TranscriptRegionType.Gap) return null; } bool onReverseStrand = transcript.Gene.OnReverseStrand; string refAllele = string.IsNullOrEmpty(transcriptRef)? onReverseStrand ? SequenceUtilities.GetReverseComplement(variant.RefAllele) : variant.RefAllele : transcriptRef; string altAllele = string.IsNullOrEmpty(transcriptAlt) ? onReverseStrand ? SequenceUtilities.GetReverseComplement(variant.AltAllele) : variant.AltAllele : transcriptAlt; // decide event type from HGVS nomenclature var genomicChange = GetGenomicChange(transcript, onReverseStrand, refSequence, variant); int variantStart = variant.Start; int variantEnd = variant.End; if (genomicChange == GenomicChange.Duplication) { (variantStart, variantEnd, refAllele, regionStart, regionEnd) = transcript.TranscriptRegions.ShiftDuplication(variantStart, altAllele, onReverseStrand); } var startPositionOffset = HgvsUtilities.GetPositionOffset(transcript, variantStart, regionStart, true); var endPositionOffset = variantStart == variantEnd ? startPositionOffset : HgvsUtilities.GetPositionOffset(transcript, variantEnd, regionEnd, false); if (onReverseStrand) { PositionOffset tmp = startPositionOffset; startPositionOffset = endPositionOffset; endPositionOffset = tmp; } if (startPositionOffset == null && variant.Type == VariantType.insertion) { startPositionOffset = new PositionOffset(endPositionOffset.Position + 1, endPositionOffset.Offset, $"{endPositionOffset.Position + 1}"); } // sanity check: make sure we have coordinates if (startPositionOffset == null || endPositionOffset == null) return null; var hgvsNotation = new HgvscNotation(refAllele, altAllele, transcript.Id.WithVersion, genomicChange, startPositionOffset, endPositionOffset, transcript.Translation != null); // generic formatting return hgvsNotation.ToString(); } /// /// Adjust positions by alt allele length /// internal static (int Start, int End, string RefAllele, int RegionStart, int RegionEnd) ShiftDuplication( this ITranscriptRegion[] regions, int start, string altAllele, bool onReverseStrand) { int incrementLength = altAllele.Length; int dupStart = onReverseStrand ? start + incrementLength - 1 : start - incrementLength; int dupEnd = onReverseStrand ? dupStart - incrementLength + 1 : dupStart + incrementLength - 1; (int regionStart, _) = MappedPositionUtilities.FindRegion(regions, dupStart); (int regionEnd, _) = MappedPositionUtilities.FindRegion(regions, dupEnd); return (dupStart, dupEnd, altAllele, regionStart, regionEnd); } public static GenomicChange GetGenomicChange(IInterval interval, bool onReverseStrand, ISequence refSequence, ISimpleVariant variant) { // length of the reference allele. Negative lengths make no sense int refLength = variant.End - variant.Start + 1; if (refLength < 0) refLength = 0; // length of alternative allele int altLength = variant.AltAllele.Length; // sanity check: make sure that the alleles are different if (variant.RefAllele == variant.AltAllele) return GenomicChange.Reference; // deletion if (altLength == 0) return GenomicChange.Deletion; if (refLength == altLength) { // substitution if (refLength == 1) return GenomicChange.Substitution; // inversion string rcRefAllele = SequenceUtilities.GetReverseComplement(variant.RefAllele); return variant.AltAllele == rcRefAllele ? GenomicChange.Inversion : GenomicChange.DelIns; } // deletion/insertion if (refLength != 0) return GenomicChange.DelIns; // If this is an insertion, we should check if the preceding reference nucleotides // match the insertion. In that case it should be annotated as a multiplication. bool isGenomicDuplicate = HgvsUtilities.IsDuplicateWithinInterval(refSequence, variant, interval, onReverseStrand); return isGenomicDuplicate ? GenomicChange.Duplication : GenomicChange.Insertion; } } public enum GenomicChange { Unknown, Deletion, Duplication, DelIns, Insertion, Inversion, Substitution, Reference } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/HgvsProteinNomenclature.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Algorithms; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.AnnotatedPositions { public static class HgvsProteinNomenclature { public static string GetHgvsProteinAnnotation( ITranscript transcript, string refAminoAcids, string altAminoAcids, string transcriptAltAllele, IMappedPosition position, VariantEffect variantEffect, ISimpleVariant variant, ISequence refSequence, string hgvscNotation, bool isMitochondrial) { if (IsHgvspNull(transcriptAltAllele, position.CdsStart, position.CdsEnd, variant, hgvscNotation)) return null; var peptideSeq = transcript.Translation.PeptideSeq; // Amino acid seq should never go past the stop codon refAminoAcids = !refAminoAcids.EndsWith(AminoAcids.StopCodon) && refAminoAcids.Contains(AminoAcids.StopCodon) ? refAminoAcids.OptimizedSplit(AminoAcids.StopCodon[0])[0] + AminoAcids.StopCodon : refAminoAcids; int proteinStart = position.ProteinStart; HgvsUtilities.ShiftAndRotateAlleles(ref proteinStart, ref refAminoAcids, ref altAminoAcids, peptideSeq); var end = proteinStart + refAminoAcids.Length - 1; var refAbbreviation = AminoAcids.GetAbbreviations(refAminoAcids); var altAbbreviation = AminoAcids.GetAbbreviations(altAminoAcids); var proteinId = transcript.Translation.ProteinId.WithVersion; var proteinChange = GetProteinChange(proteinStart, refAminoAcids, altAminoAcids, peptideSeq, variantEffect); // ReSharper disable once SwitchStatementMissingSomeCases switch (proteinChange) { case ProteinChange.Substitution: return HgvspNotation.GetSubstitutionNotation(proteinId, proteinStart, refAbbreviation, altAbbreviation); case ProteinChange.Unknown: return HgvspNotation.GetUnknownNotation(proteinId, proteinStart, end, refAbbreviation, altAbbreviation); case ProteinChange.Deletion: return HgvspNotation.GetDeletionNotation(proteinId, proteinStart, end, refAbbreviation, variantEffect.IsStopGained()); case ProteinChange.Duplication: proteinStart -= altAminoAcids.Length; return HgvspNotation.GetDuplicationNotation(proteinId, proteinStart, end, altAbbreviation); case ProteinChange.Frameshift: return GetHgvsFrameshiftNotation(refSequence, position.CdsStart, position.CdsEnd, transcriptAltAllele, transcript, isMitochondrial, proteinId, proteinStart, end); case ProteinChange.None: return HgvspNotation.GetSilentNotation(hgvscNotation, proteinStart, refAbbreviation, variantEffect.IsStopRetained()); case ProteinChange.DelIns: return HgvspNotation.GetDelInsNotation(proteinId, proteinStart, end, refAbbreviation, altAbbreviation); case ProteinChange.Insertion: Swap.Int(ref proteinStart, ref end); return HgvspNotation.GetInsertionNotation(proteinId, proteinStart, end, altAbbreviation, peptideSeq); case ProteinChange.Extension: var altPeptideSequence = HgvsUtilities.GetAltPeptideSequence(refSequence, position.CdsStart, position.CdsEnd, transcriptAltAllele, transcript, isMitochondrial); altAbbreviation = proteinStart <= altPeptideSequence.Length ? AminoAcids.ConvertAminoAcidToAbbreviation(altPeptideSequence[proteinStart - 1]): "Ter"; var countToStop = HgvsUtilities.GetNumAminoAcidsUntilStopCodon(altPeptideSequence, peptideSeq, proteinStart - 1, false); return HgvspNotation.GetExtensionNotation(proteinId, proteinStart, refAbbreviation, altAbbreviation,countToStop); case ProteinChange.StartLost: return HgvspNotation.GetStartLostNotation(proteinId); } return null; } private static string GetHgvsFrameshiftNotation(ISequence refSequence, int cdsBegin, int cdsEnd, string transcriptAltAllele, ITranscript transcript, bool isMitochondrial, string proteinId, int start, int end) { var peptideSeq = transcript.Translation.PeptideSeq; var altPeptideSeq = HgvsUtilities.GetAltPeptideSequence(refSequence, cdsBegin, cdsEnd, transcriptAltAllele, transcript, isMitochondrial); if (start > end) Swap.Int(ref start, ref end); var frameshiftedParameters = HgvsUtilities.GetChangesAfterFrameshift(start, peptideSeq, altPeptideSeq); start = frameshiftedParameters.Item1; var refAminoAcid = frameshiftedParameters.Item2; var altAminoAcid = frameshiftedParameters.Item3; var refAbbreviation = AminoAcids.ConvertAminoAcidToAbbreviation(refAminoAcid); if (altAminoAcid == AminoAcids.StopCodonChar) return HgvspNotation.GetSubstitutionNotation(proteinId, start, refAbbreviation, "Ter"); var altAbbreviation = AminoAcids.ConvertAminoAcidToAbbreviation(altAminoAcid); var countToStop = HgvsUtilities.GetNumAminoAcidsUntilStopCodon(altPeptideSeq, peptideSeq, start - 1, true); return HgvspNotation.GetFrameshiftNotation(proteinId, start, refAbbreviation, altAbbreviation, countToStop); } private static bool IsHgvspNull(string transcriptAltAllele, int cdsStart, int cdsEnd, ISimpleVariant variant, string hgvscNotation) { return string.IsNullOrEmpty(hgvscNotation) || variant.Type == VariantType.reference || SequenceUtilities.HasNonCanonicalBase(transcriptAltAllele) || cdsStart == -1 || cdsEnd == -1; } internal static ProteinChange GetProteinChange(int start, string refAminoAcids, string altAminoAcids, string peptideSeq, IVariantEffect variantEffect) { var insertionBeforeTranscript = refAminoAcids.Length == 0 && start == 1; if (refAminoAcids == altAminoAcids || variantEffect.IsStopRetained() || insertionBeforeTranscript) return ProteinChange.None; if (variantEffect.IsStartLost()) return ProteinChange.StartLost; // according to var nom, only if the Stop codon is effected, we call it an extension if (variantEffect.IsStopLost() && refAminoAcids.OptimizedStartsWith(AminoAcids.StopCodonChar)) return ProteinChange.Extension; if (variantEffect.IsFrameshiftVariant()) return ProteinChange.Frameshift; if (altAminoAcids.Length > refAminoAcids.Length && HgvsUtilities.IsAminoAcidDuplicate(start, altAminoAcids, peptideSeq)) return ProteinChange.Duplication; if (refAminoAcids.Length == 0 && altAminoAcids.Length != 0) return ProteinChange.Insertion; if (refAminoAcids.Length != 0 && altAminoAcids.Length == 0) return ProteinChange.Deletion; if (refAminoAcids.Length == 1 && altAminoAcids.Length == 1) return ProteinChange.Substitution; // the only remaining possibility is deletions/insertions return ProteinChange.DelIns; } } public enum ProteinChange { Unknown, Deletion, Duplication, Frameshift, DelIns, Insertion, None, Extension, StartLost, Substitution } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/HgvsUtilities.cs ================================================ using System; using System.Text; using Genome; using Intervals; using OptimizedCore; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.AnnotatedPositions { public static class HgvsUtilities { public static void ShiftAndRotateAlleles(ref int start, ref string refAminoAcids, ref string altAminoAcids, string peptides) { (start, refAminoAcids, altAminoAcids) = BiDirectionalTrimmer.Trim(start, refAminoAcids, altAminoAcids); (start, refAminoAcids, altAminoAcids) = Rotate3Prime(refAminoAcids, altAminoAcids, start, peptides); } internal static (int Start, string RefAminoAcids, string AltAminoAcids) Rotate3Prime(string refAminoAcids, string altAminoAcids, int start, string peptides) { if (!(string.IsNullOrEmpty(refAminoAcids) || string.IsNullOrEmpty(altAminoAcids))) return (start, refAminoAcids, altAminoAcids); bool isInsertion = !string.IsNullOrEmpty(altAminoAcids); // ReSharper disable once PossibleNullReferenceException int end = start + refAminoAcids.Length - 1; // for insertion, the reference bases will be empty string. The shift should happen on the alternate allele string rotatingPeptides = isInsertion ? altAminoAcids : refAminoAcids; int numBases = rotatingPeptides.Length; string downstreamPeptides = peptides.Length >= end ? peptides.Substring(end) : null; string combinedSequence = rotatingPeptides + downstreamPeptides; int shiftStart, shiftEnd; var hasShifted = false; for (shiftStart = 0, shiftEnd = numBases; shiftEnd < combinedSequence.Length; shiftStart++, shiftEnd++) { if (combinedSequence[shiftStart] != combinedSequence[shiftEnd]) break; start++; hasShifted = true; } if (hasShifted) rotatingPeptides = combinedSequence.Substring(shiftStart, numBases); if (isInsertion) altAminoAcids = rotatingPeptides; else refAminoAcids = rotatingPeptides; return (start, refAminoAcids, altAminoAcids); } public static bool IsAminoAcidDuplicate(int start, string altAminoAcids, string transcriptPeptides) { if (altAminoAcids == null || transcriptPeptides == null) return false; int testAminoAcidPos = start - altAminoAcids.Length - 1; if (testAminoAcidPos < 0) return false; string precedingAminoAcids = testAminoAcidPos + altAminoAcids.Length <= transcriptPeptides.Length ? transcriptPeptides.Substring(testAminoAcidPos, altAminoAcids.Length) : ""; return testAminoAcidPos >= 0 && precedingAminoAcids == altAminoAcids; } public static int GetNumAminoAcidsUntilStopCodon(string altCds, string peptideSeq, int refVarPos, bool isFrameshift) { int numExtraAminoAcids = -1; int refLen = peptideSeq.Length; // find the number of residues that are translated until a termination codon is encountered int terPos = altCds.IndexOf('*'); if (terPos != -1) { numExtraAminoAcids = terPos + 1 - (isFrameshift ? refVarPos : refLen + 1); } // A special case is if the first aa is a stop codon => don't display the number of residues until the stop codon return numExtraAminoAcids > 0 ? numExtraAminoAcids : -1; } public static (int Start, char RefAminoAcid, char AltAminoAcid) GetChangesAfterFrameshift(int start, string peptideSeq, string altPeptideSeq) { start = Math.Min(start, peptideSeq.Length); // for deletions at the end of peptide sequence if (start > altPeptideSeq.Length) return (start, peptideSeq[start - 1], '?'); string refPeptideSeq = peptideSeq + "*"; char refAminoAcid = refPeptideSeq[start - 1]; char altAminoAcid = altPeptideSeq[start - 1]; while (start <= altPeptideSeq.Length && start <= refPeptideSeq.Length) { refAminoAcid = refPeptideSeq[start - 1]; altAminoAcid = altPeptideSeq[start - 1]; // variation at stop codon, but maintains stop codon - set to synonymous if (refAminoAcid == '*' && altAminoAcid == '*' || refAminoAcid != altAminoAcid) break; start++; } return (start, refAminoAcid, altAminoAcid); } public static string GetAltPeptideSequence(ISequence refSequence, int cdsBegin, int cdsEnd, string transcriptAltAllele, ITranscript transcript, bool isMitochondrial) { string altCds = TranscriptUtilities.GetAlternateCds(refSequence, cdsBegin, cdsEnd, transcriptAltAllele, transcript.TranscriptRegions, transcript.Gene.OnReverseStrand, transcript.StartExonPhase, transcript.Translation.CodingRegion.CdnaStart); var aminoAcids = new AminoAcids(isMitochondrial); return aminoAcids.TranslateBases(altCds, true); } public static PositionOffset GetPositionOffset(ITranscript transcript, int genomicPosition, int regionIndex, bool isRegionStart) { if (!transcript.Overlaps(genomicPosition, genomicPosition)) return null; ITranscriptRegion region = transcript.TranscriptRegions[regionIndex]; int codingRegionStart = transcript.Translation?.CodingRegion.CdnaStart ?? -1; int codingRegionEnd = transcript.Translation?.CodingRegion.CdnaEnd ?? -1; (int position, int offset) = GetPositionAndOffset(genomicPosition, region, transcript.Gene.OnReverseStrand, isRegionStart); if (position == -1) return null; string coordinate = GetCoordinate(position, codingRegionStart, codingRegionEnd); string offsetString = offset == 0 ? "" : offset.ToString("+0;-0;+0"); string value = coordinate + offsetString; return new PositionOffset(position, offset, value); } internal static (int Position, int Offset) GetPositionAndOffset(int position, ITranscriptRegion region, bool onReverseStrand, bool isRegionStart) { int cdsPos = -1; int offset = -1; switch (region.Type) { case TranscriptRegionType.Exon: cdsPos = region.CdnaStart + (onReverseStrand ? region.End - position : position - region.Start); offset = 0; break; case TranscriptRegionType.Gap: (cdsPos, offset) = GetGapPositionAndOffset(region, isRegionStart); break; case TranscriptRegionType.Intron: (cdsPos, offset) = GetIntronPositionAndOffset(position, region, onReverseStrand); break; } return (cdsPos, offset); } private static (int Position, int Offset) GetIntronPositionAndOffset(int position, ITranscriptRegion region, bool onReverseStrand) { int leftDist = position - region.Start + 1; int rightDist = region.End - position + 1; int offset = Math.Min(leftDist, rightDist); if (!onReverseStrand && rightDist < leftDist || onReverseStrand && rightDist > leftDist) offset = -offset; // cDNA position truth table // // forward reverse // ------------------------- // L < R | CdnaStart | CdnaEnd | // L = R | CdnaStart | CdnaStart | // L > R | CdnaEnd | CdnaStart | // ------------------------- int cdnaPosition = leftDist < rightDist && onReverseStrand || leftDist > rightDist && !onReverseStrand ? region.CdnaEnd : region.CdnaStart; return (cdnaPosition, offset); } private static (int Position, int Offset) GetGapPositionAndOffset(ITranscriptRegion region, bool isRegionStart) => isRegionStart ? (region.CdnaEnd, 0) : (region.CdnaStart, 0); private static string GetCoordinate(int position, int codingRegionStart, int codingRegionEnd) { if (codingRegionEnd != -1 && position > codingRegionEnd) return "*" + (position - codingRegionEnd); return codingRegionStart != -1 ? (position + (position >= codingRegionStart ? 1 : 0) - codingRegionStart).ToString() : position.ToString(); } public static string GetTranscriptAllele(string variantAllele, bool onReverseStrand) => onReverseStrand ? SequenceUtilities.GetReverseComplement(variantAllele) : variantAllele; public static string FormatDnaNotation(string start, string end, string referenceId, string referenceBases, string alternateBases, GenomicChange type, char notationType) { StringBuilder sb = StringBuilderPool.Get(); // all start with transcript name & numbering type sb.Append(referenceId + ':' + notationType + '.'); // handle single and multiple positions string coordinates = start == end ? start : start + '_' + end; // format rest of string according to type // note: inversion and multiple are never assigned as genomic changes // ReSharper disable once SwitchStatementMissingSomeCases switch (type) { case GenomicChange.Deletion: sb.Append(coordinates + "del"); break; case GenomicChange.Inversion: sb.Append(coordinates + "inv"); break; case GenomicChange.Duplication: sb.Append(coordinates + "dup"); break; case GenomicChange.Substitution: if (referenceBases == alternateBases) { sb.Append(start + '='); } else { sb.Append(start + referenceBases + '>' + alternateBases); } break; case GenomicChange.DelIns: // NOTE: change to delins, now use del--ins-- to reduce anavarin differences sb.Append(coordinates + "delins" + alternateBases); break; case GenomicChange.Insertion: sb.Append(coordinates + "ins" + alternateBases); break; case GenomicChange.Reference: sb.Append(coordinates + "="); break; default: throw new InvalidOperationException("Unhandled genomic change found: " + type); } return StringBuilderPool.GetStringAndReturn(sb); } public static bool IsDuplicateWithinInterval(ISequence refSequence, ISimpleVariant variant, IInterval interval, bool onReverseStrand) { if (variant.Type != VariantType.insertion) return false; int altAlleleLen = variant.AltAllele.Length; string compareRegion; if (onReverseStrand) { if (variant.End + altAlleleLen > interval.End) return false; compareRegion = refSequence.Substring(variant.Start - 1, altAlleleLen); } else { if (variant.Start - altAlleleLen < interval.Start) return false; compareRegion = refSequence.Substring(variant.End - altAlleleLen, altAlleleLen); } return compareRegion == variant.AltAllele; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/HgvscNotation.cs ================================================ namespace VariantAnnotation.AnnotatedPositions { public sealed class HgvscNotation { private readonly string _referenceBases; private readonly string _alternateBases; private PositionOffset _start; private PositionOffset _end; private readonly string _transcriptId; private readonly char _transcriptType; private readonly GenomicChange _type; private const char CodingType = 'c'; private const char NonCodingType = 'n'; public HgvscNotation(string referenceBases, string alternateBases, string transcriptId, GenomicChange changeType, PositionOffset start, PositionOffset end, bool isCoding) { _transcriptId = transcriptId; _start = start; _end = end; _type = changeType; SwapEndpoints(); _referenceBases = referenceBases ?? ""; _alternateBases = alternateBases ?? ""; _transcriptType = isCoding ? CodingType : NonCodingType; } /// /// HGVS aligns changes 3' /// e.g. given a ATG/- deletion in C[ATG]ATGT, we want to move to: CATG[ATG]T /// given a A/- deletion in TA[A]AAAA, we want to move to: TAAAAA[A] /// given a AA/- deletion in TA[AA]AAA, we want to move to: TAAAA[AA] /// private void SwapEndpoints() { if (_start.Position <= _end.Position && (_start.Position != _end.Position || _start.Offset <= _end.Offset)) return; PositionOffset temp = _start; _start = _end; _end = temp; } public override string ToString() => HgvsUtilities.FormatDnaNotation(_start.Value, _end.Value, _transcriptId, _referenceBases, _alternateBases, _type, _transcriptType); } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/HgvsgNotation.cs ================================================ using System; using Genome; using Intervals; using Variants; namespace VariantAnnotation.AnnotatedPositions { public static class HgvsgNotation { private const char NotationType = 'g'; private const char MitoNotationType = 'm'; public static string GetNotation(string refseqAccession, ISimpleVariant variant, ISequence refSequence, IInterval referenceInterval) { ISimpleVariant rotatedVariant = VariantRotator.Right(variant, referenceInterval, refSequence, false); int start = Math.Min(rotatedVariant.Start, rotatedVariant.End); int end = Math.Max(rotatedVariant.Start, rotatedVariant.End); string referenceBases = rotatedVariant.RefAllele; string alternateBases = rotatedVariant.AltAllele; GenomicChange type = HgvsCodingNomenclature.GetGenomicChange(referenceInterval, false, refSequence, rotatedVariant); if (type == GenomicChange.Duplication && variant.Type == VariantType.insertion) { referenceBases = alternateBases; end = start; start = end - referenceBases.Length + 1; } char notationType = variant.Chromosome.UcscName == "chrM" ? MitoNotationType : NotationType; return HgvsUtilities.FormatDnaNotation(start.ToString(), end.ToString(), refseqAccession, referenceBases, alternateBases, type, notationType); } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/HgvspNotation.cs ================================================ using VariantAnnotation.AnnotatedPositions.Transcript; namespace VariantAnnotation.AnnotatedPositions { public static class HgvspNotation { public static string GetDelInsNotation(string proteinId, int start, int end, string refAbbreviation, string altAbbreviation) { if (altAbbreviation.StartsWith("Ter")) return GetSubstitutionNotation(proteinId, start, refAbbreviation.Substring(0, 3), "Ter"); return start == end ? $"{proteinId}:p.({refAbbreviation}{start}delins{altAbbreviation})" : $"{proteinId}:p.({refAbbreviation.Substring(0, 3)}{start}_{refAbbreviation.Substring(refAbbreviation.Length - 3)}{end}delins{altAbbreviation})"; } public static string GetInsertionNotation(string proteinId, int start, int end, string altAbbreviation, string peptideSeq) { // insertion past the last AA if (end > peptideSeq.Length) return null; var leftFlankingAa = AminoAcids.ConvertAminoAcidToAbbreviation(peptideSeq[start - 1]); if (altAbbreviation.StartsWith("Ter")) { var refAminoAcid = AminoAcids.ConvertAminoAcidToAbbreviation(peptideSeq[start]); return $"{proteinId}:p.({refAminoAcid}{end}Ter)"; } var rightFlankingAa = end > peptideSeq.Length ? "Ter" : AminoAcids.ConvertAminoAcidToAbbreviation(peptideSeq[end - 1]); return $"{proteinId}:p.({leftFlankingAa}{start}_{rightFlankingAa}{end}ins{altAbbreviation})"; } public static string GetFrameshiftNotation(string proteinId, int start, string refAbbreviation, string altAbbreviation, int countToStop) { if (altAbbreviation.StartsWith("Ter")) return $"{proteinId}:p.({refAbbreviation}{start}Ter)"; return countToStop > 0 ? $"{proteinId}:p.({refAbbreviation}{start}{altAbbreviation}fsTer{countToStop})" : $"{proteinId}:p.({refAbbreviation}{start}{altAbbreviation}fsTer?)"; } public static string GetExtensionNotation(string proteinId, int start, string refAbbreviation, string altAbbreviation, int countToStop) { return countToStop > 0 ? $"{proteinId}:p.({refAbbreviation}{start}{altAbbreviation.Substring(0, 3)}extTer{countToStop})" : $"{proteinId}:p.({refAbbreviation}{start}{altAbbreviation.Substring(0, 3)}extTer?)"; } public static string GetDuplicationNotation(string proteinId, int start, int end, string altAbbreviation) { return start == end ? $"{proteinId}:p.({altAbbreviation}{start}dup)" : $"{proteinId}:p.({altAbbreviation.Substring(0, 3)}{start}_{altAbbreviation.Substring(altAbbreviation.Length - 3)}{end}dup)"; } public static string GetStartLostNotation(string proteinId) { // changing it according to https://varnomen.hgvs.org/recommendations/protein/variant/substitution/ return $"{proteinId}:p.?"; } public static string GetSilentNotation(string hgvscNotation, int start, string refAbbreviation, bool isStopRetained) { return isStopRetained ? $"{hgvscNotation}(p.(Ter{start}=))" : $"{hgvscNotation}(p.({refAbbreviation}{start}=))"; } internal static string GetSubstitutionNotation(string proteinId, int start, string refAbbreviation, string altAbbreviation) { // start lost if (start == 1 && refAbbreviation != altAbbreviation) return $"{proteinId}:p.({refAbbreviation}{start}?)"; return $"{proteinId}:p.({refAbbreviation}{start}{altAbbreviation})"; } internal static string GetUnknownNotation(string proteinId, int start, int end, string refAbbreviation, string altAbbreviation) { return start == end ? $"{proteinId}:p.({refAbbreviation}{start}{altAbbreviation})" : $"{proteinId}:p.({refAbbreviation}{start}_{altAbbreviation}{end})"; } internal static string GetDeletionNotation(string proteinId, int start, int end, string refAbbreviation, bool isStopGained) { if (isStopGained) return $"{proteinId}:p.({refAbbreviation}{start}Ter)"; return start == end ? $"{proteinId}:p.({refAbbreviation}{start}del)" : $"{proteinId}:p.({refAbbreviation.Substring(0, 3)}{start}_{refAbbreviation.Substring(refAbbreviation.Length - 3)}{end}del)"; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/PositionOffset.cs ================================================ namespace VariantAnnotation.AnnotatedPositions { public sealed record PositionOffset(int Position, int Offset, string Value); } ================================================ FILE: VariantAnnotation/AnnotatedPositions/RegulatoryRegionAnnotator.cs ================================================ using Intervals; using VariantAnnotation.AnnotatedPositions.Consequence; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.AnnotatedPositions { public static class RegulatoryRegionAnnotator { public static IAnnotatedRegulatoryRegion Annotate(IVariant variant, IRegulatoryRegion regulatoryRegion) { OverlapType overlapType = Intervals.Utilities.GetOverlapType(regulatoryRegion.Start, regulatoryRegion.End, variant.Start, variant.End); EndpointOverlapType endpointOverlapType = Intervals.Utilities.GetEndpointOverlapType(regulatoryRegion.Start, regulatoryRegion.End, variant.Start, variant.End); var featureEffect = new FeatureVariantEffects(overlapType, endpointOverlapType, false, variant.Type, variant.IsStructuralVariant); var consequence = new Consequences(VariantType.unknown, null, featureEffect); consequence.DetermineRegulatoryVariantEffects(); return new AnnotatedRegulatoryRegion(regulatoryRegion, consequence.GetConsequences()); } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/AminoAcids.cs ================================================ using System; using System.Collections.Generic; using OptimizedCore; using VariantAnnotation.TranscriptAnnotation; namespace VariantAnnotation.AnnotatedPositions.Transcript { public sealed class AminoAcids { public const string StopCodon = "*"; public const char StopCodonChar = '*'; private readonly CodonConversion _codonConversionScheme = CodonConversion.HumanChromosome; private readonly Dictionary _aminoAcidLookupTable; private readonly Dictionary _mitoDifferences; // converts single letter amino acid ambiguity codes to three // letter abbreviations private static readonly Dictionary SingleToThreeAminoAcids = new Dictionary { {'A', "Ala"}, {'B', "Asx"}, {'C', "Cys"}, {'D', "Asp"}, {'E', "Glu"}, {'F', "Phe"}, {'G', "Gly"}, {'H', "His"}, {'I', "Ile"}, {'K', "Lys"}, {'L', "Leu"}, {'M', "Met"}, {'N', "Asn"}, {'P', "Pro"}, {'Q', "Gln"}, {'R', "Arg"}, {'S', "Ser"}, {'T', "Thr"}, {'V', "Val"}, {'W', "Trp"}, {'Y', "Tyr"}, {'Z', "Glx"}, {'X', "Ter"}, // Ter now recommended in HGVS {'*', "Ter"}, {'U', "Sec"}, {'O', "Pyl"}, {'J', "Xle"}, {'?', "_?_"} //deletion at the end of incomplete transcript results in unknown change }; private enum CodonConversion : byte { HumanChromosome, HumanMitochondrion } public AminoAcids(bool isMitochondrial) { if (isMitochondrial) _codonConversionScheme = CodonConversion.HumanMitochondrion; _aminoAcidLookupTable = new Dictionary { // 2nd base: T {"TTT", 'F'}, {"TTC", 'F'}, {"TTA", 'L'}, {"TTG", 'L'}, {"CTT", 'L'}, {"CTC", 'L'}, {"CTA", 'L'}, {"CTG", 'L'}, {"ATT", 'I'}, {"ATC", 'I'}, {"ATA", 'I'}, {"ATG", 'M'}, {"GTT", 'V'}, {"GTC", 'V'}, {"GTA", 'V'}, {"GTG", 'V'}, // 2nd base: C {"TCT", 'S'}, {"TCC", 'S'}, {"TCA", 'S'}, {"TCG", 'S'}, {"CCT", 'P'}, {"CCC", 'P'}, {"CCA", 'P'}, {"CCG", 'P'}, {"ACT", 'T'}, {"ACC", 'T'}, {"ACA", 'T'}, {"ACG", 'T'}, {"GCT", 'A'}, {"GCC", 'A'}, {"GCA", 'A'}, {"GCG", 'A'}, // 2nd base: A {"TAT", 'Y'}, {"TAC", 'Y'}, {"TAA", '*'}, {"TAG", '*'}, {"CAT", 'H'}, {"CAC", 'H'}, {"CAA", 'Q'}, {"CAG", 'Q'}, {"AAT", 'N'}, {"AAC", 'N'}, {"AAA", 'K'}, {"AAG", 'K'}, {"GAT", 'D'}, {"GAC", 'D'}, {"GAA", 'E'}, {"GAG", 'E'}, // 2nd base: G {"TGT", 'C'}, {"TGC", 'C'}, {"TGA", '*'}, {"TGG", 'W'}, {"CGT", 'R'}, {"CGC", 'R'}, {"CGA", 'R'}, {"CGG", 'R'}, {"AGT", 'S'}, {"AGC", 'S'}, {"AGA", 'R'}, {"AGG", 'R'}, {"GGT", 'G'}, {"GGC", 'G'}, {"GGA", 'G'}, {"GGG", 'G'} }; _mitoDifferences = new Dictionary { {"ATA", 'M'}, {"TGA", 'W'}, {"AGA", '*'}, {"AGG", '*'} }; } internal static string AddUnknownAminoAcid(string aminoAcids) => aminoAcids == StopCodon ? aminoAcids : aminoAcids + 'X'; public SequenceChange Translate(string referenceCodons, string alternateCodons) { if (string.IsNullOrEmpty(referenceCodons) && string.IsNullOrEmpty(alternateCodons)) return new SequenceChange("", ""); if (referenceCodons != null && (referenceCodons.Contains("N") || alternateCodons.Contains("N"))) return new SequenceChange("", ""); var referenceAminoAcids = TranslateBases(referenceCodons, false); var alternateAminoAcids = TranslateBases(alternateCodons, false); return new SequenceChange(referenceAminoAcids, alternateAminoAcids); } /// /// converts a DNA triplet to the appropriate amino acid abbreviation /// public static string ConvertAminoAcidToAbbreviation(char aminoAcid) { if (!SingleToThreeAminoAcids.TryGetValue(aminoAcid, out var abbreviation)) { throw new NotSupportedException($"Unable to convert the following string to an amino acid abbreviation: {aminoAcid}"); } return abbreviation; } /// /// converts a DNA triplet to the appropriate amino acid abbreviation /// The default conversion is human chromosomes. The second parameter also allows the user to specify other codon conversions like mitochondria, etc. /// internal char ConvertTripletToAminoAcid(string triplet) { var upperTriplet = triplet.ToUpper(); // check our exceptions first if (_codonConversionScheme == CodonConversion.HumanMitochondrion && _mitoDifferences.TryGetValue(upperTriplet, out var mitoAminoAcid)) return mitoAminoAcid; // the default case return _aminoAcidLookupTable.TryGetValue(upperTriplet, out var aminoAcid) ? aminoAcid : 'X'; } /// /// given a string of 1-letter amino acid ambiguity codes, this function /// returns a string of 3-letter amino acid abbreviations up until the first /// stop codon. /// public static string GetAbbreviations(string aminoAcids) { if (string.IsNullOrEmpty(aminoAcids)) return ""; if (aminoAcids.Length == 1) return ConvertAminoAcidToAbbreviation(aminoAcids[0]); var sb = StringBuilderPool.Get(); foreach (var aminoAcid in aminoAcids) { sb.Append(ConvertAminoAcidToAbbreviation(aminoAcid)); } return StringBuilderPool.GetStringAndReturn(sb); } /// /// returns a string of single-letter amino acids translated from a string of bases. /// The bases must already be grouped by triplets (i.e. len must be a multiple of 3) /// public string TranslateBases(string bases, bool forceNonTriplet) { // sanity check: handle the empty case if (bases == null) return null; var numAminoAcids = bases.Length / 3; // check if we have a non triplet case var nonTriplet = !forceNonTriplet && numAminoAcids * 3 != bases.Length; // special case: single amino acid string aminoAcidString; if (numAminoAcids == 1) { aminoAcidString = ConvertTripletToAminoAcid(bases.Substring(0, 3 * numAminoAcids)) .ToString(); return nonTriplet ? AddUnknownAminoAcid(aminoAcidString) : aminoAcidString; } // multiple amino acid case var aminoAcids = new char[numAminoAcids]; for (var i = 0; i < numAminoAcids; i++) { aminoAcids[i] = ConvertTripletToAminoAcid(bases.Substring(i * 3, 3)); } aminoAcidString = new string(aminoAcids); return nonTriplet ? AddUnknownAminoAcid(aminoAcidString) : aminoAcidString; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/AnnotatedConservationScore.cs ================================================ using System.Collections.Generic; using System.Linq; using System.Text; using VariantAnnotation.Interface.IO; using VariantAnnotation.IO; namespace VariantAnnotation.AnnotatedPositions.Transcript { public sealed class AnnotatedConservationScore : IJsonSerializer { private readonly IEnumerable _scores; public AnnotatedConservationScore(IEnumerable scores) => _scores = scores; public void SerializeJson(StringBuilder sb) { var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValues("scores", _scores.Select(x => x.ToString("0.##")), false); sb.Append(JsonObject.CloseBrace); } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/AnnotatedGeneFusion.cs ================================================ using System.Text; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO; namespace VariantAnnotation.AnnotatedPositions.Transcript { // ReSharper disable InconsistentNaming public sealed record AnnotatedGeneFusion(ITranscript transcript, int? exon, int? intron, string hgvsr, bool isInFrame, ulong FusionKey, string FirstGeneSymbol, uint FirstGeneKey, string SecondGeneSymbol, uint SecondGeneKey) : IAnnotatedGeneFusion { // ReSharper restore InconsistentNaming public void SerializeJson(StringBuilder sb) { string geneId = transcript.Source == Source.Ensembl ? transcript.Gene.EnsemblId.ToString() : transcript.Gene.EntrezGeneId.ToString(); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("transcript", transcript.Id.WithVersion); jsonObject.AddStringValue("bioType", AnnotatedTranscript.GetBioType(transcript.BioType)); jsonObject.AddIntValue("exon", exon); jsonObject.AddIntValue("intron", intron); jsonObject.AddStringValue("geneId", geneId); jsonObject.AddStringValue("hgnc", transcript.Gene.Symbol); jsonObject.AddStringValue("hgvsr", hgvsr); jsonObject.AddBoolValue("inFrame", isInFrame); sb.Append(JsonObject.CloseBrace); } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/AnnotatedTranscript.cs ================================================ using System.Collections.Generic; using System.Globalization; using System.Linq; using System.Text; using VariantAnnotation.Algorithms; using VariantAnnotation.GeneFusions.SA; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO; namespace VariantAnnotation.AnnotatedPositions.Transcript { public sealed class AnnotatedTranscript : IAnnotatedTranscript { public ITranscript Transcript { get; private set; } public string ReferenceAminoAcids { get; private set;} public string AlternateAminoAcids { get; private set;} public string ReferenceCodons { get; private set;} public string AlternateCodons { get; private set;} public IMappedPosition MappedPosition { get; private set;} public string HgvsCoding { get; private set;} public string HgvsProtein { get; private set;} public PredictionScore Sift { get; private set;} public PredictionScore PolyPhen { get; private set;} public List Consequences { get; private set;} public bool? CompleteOverlap { get; private set;} public List ConservationScores { get; set; } private List _geneFusions; public void Initialize(ITranscript transcript, string referenceAminoAcids, string alternateAminoAcids, string referenceCodons, string alternateCodons, IMappedPosition mappedPosition, string hgvsCoding, string hgvsProtein, PredictionScore sift, PredictionScore polyphen, List consequences, bool? completeOverlap) { Transcript = transcript; ReferenceAminoAcids = referenceAminoAcids; AlternateAminoAcids = alternateAminoAcids; ReferenceCodons = referenceCodons; AlternateCodons = alternateCodons; MappedPosition = mappedPosition; HgvsCoding = hgvsCoding; HgvsProtein = hgvsProtein; Sift = sift; PolyPhen = polyphen; Consequences = consequences; CompleteOverlap = completeOverlap; _geneFusions = null; } public void SerializeJson(StringBuilder sb) { var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("transcript", Transcript.Id.WithVersion); jsonObject.AddStringValue("source", Transcript.Source.ToString()); if (CompleteOverlap.HasValue && !CompleteOverlap.Value) jsonObject.AddStringValue("bioType", GetBioType(Transcript.BioType)); jsonObject.AddStringValue("codons", GetCodonString(ReferenceCodons, AlternateCodons)); jsonObject.AddStringValue("aminoAcids", GetAminoAcidString(ReferenceAminoAcids, AlternateAminoAcids)); if (MappedPosition != null) { jsonObject.AddStringValue("cdnaPos", GetRangeString(MappedPosition.CoveredCdnaStart, MappedPosition.CoveredCdnaEnd)); jsonObject.AddStringValue("cdsPos", GetRangeString(MappedPosition.CoveredCdsStart, MappedPosition.CoveredCdsEnd)); jsonObject.AddStringValue("exons", GetFractionString(MappedPosition.ExonStart, MappedPosition.ExonEnd, Transcript.NumExons)); jsonObject.AddStringValue("introns", GetFractionString(MappedPosition.IntronStart, MappedPosition.IntronEnd, Transcript.NumExons - 1)); jsonObject.AddStringValue("proteinPos", GetRangeString(MappedPosition.CoveredProteinStart, MappedPosition.CoveredProteinEnd)); } string geneId = Transcript.Source == Source.Ensembl ? Transcript.Gene.EnsemblId.ToString() : Transcript.Gene.EntrezGeneId.ToString(); if (CompleteOverlap.HasValue &&!CompleteOverlap.Value) jsonObject.AddStringValue("geneId", geneId); jsonObject.AddStringValue("hgnc", Transcript.Gene.Symbol); if (Consequences != null) AddConsequences(jsonObject); jsonObject.AddStringValue("hgvsc", HgvsCoding); jsonObject.AddStringValue("hgvsp", HgvsProtein); if (_geneFusions != null) jsonObject.AddObjectValues("geneFusions", _geneFusions); jsonObject.AddBoolValue("isCanonical", Transcript.IsCanonical); jsonObject.AddDoubleValue("polyPhenScore", PolyPhen?.Score); jsonObject.AddStringValue("polyPhenPrediction", PolyPhen?.Prediction); if (CompleteOverlap.HasValue && !CompleteOverlap.Value && Transcript.Translation != null) jsonObject.AddStringValue("proteinId", Transcript.Translation.ProteinId.WithVersion); jsonObject.AddDoubleValue("siftScore", Sift?.Score); jsonObject.AddStringValue("siftPrediction", Sift?.Prediction); if (ConservationScores != null && ConservationScores.Count > 0) { jsonObject.AddObjectValue("aminoAcidConservation", new AnnotatedConservationScore(ConservationScores)); } if (CompleteOverlap.HasValue) jsonObject.AddBoolValue("completeOverlap", CompleteOverlap.Value); sb.Append(JsonObject.CloseBrace); } private void AddConsequences(JsonObject jsonObject) { jsonObject.AddStringValues("consequence", Consequences?.Select(ConsequenceUtil.GetConsequence)); } public static string GetBioType(BioType bioType) => bioType == BioType.three_prime_overlapping_ncRNA ? "3prime_overlapping_ncRNA" : bioType.ToString(); private static string GetAminoAcidString(string a, string b) { if (a == b) return a; a = string.IsNullOrEmpty(a) ? "-" : a; b = string.IsNullOrEmpty(b) ? "-" : b; return $"{a}/{b}"; } private static string GetCodonString(string a, string b) { if (a == b && string.IsNullOrEmpty(a)) return a; a = string.IsNullOrEmpty(a) ? "-" : a; b = string.IsNullOrEmpty(b) ? "-" : b; return $"{a}/{b}"; } private static string GetRangeString(int start, int end) { if (start == -1 && end == -1) return null; if (start == -1) return "?-" + end; if (end == -1) return start + "-?"; if (start > end) Swap.Int(ref start, ref end); return start == end ? start.ToString(CultureInfo.InvariantCulture) : start + "-" + end; } private static string GetFractionString(int start, int end, int total) { if (start == -1 && end == -1) return null; return GetRangeString(start, end) + "/" + total; } public void AddGeneFusions(IAnnotatedGeneFusion[] geneFusions) { _geneFusions ??= new List(); _geneFusions.AddRange(geneFusions); Consequences.Add(ConsequenceTag.unidirectional_gene_fusion); } public void AddGeneFusionPairs(HashSet fusionPairs) { if (_geneFusions == null) return; foreach (IAnnotatedGeneFusion gf in _geneFusions) fusionPairs.Add(new GeneFusionPair(gf.FusionKey, gf.FirstGeneSymbol, gf.FirstGeneKey, gf.SecondGeneSymbol, gf.SecondGeneKey)); } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/CdnaSequence.cs ================================================ using System.Text; using ErrorHandling.Exceptions; using Genome; using OptimizedCore; using VariantAnnotation.Caches.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.AnnotatedPositions.Transcript { public sealed class CdnaSequence : ISequence { private readonly ICodingRegion _codingRegion; private readonly ITranscriptRegion[] _regions; private readonly IRnaEdit[] _rnaEdits; private readonly bool _onReverseStrand; private readonly ISequence _compressedSequence; private string _sequence; public CdnaSequence(ISequence compressedSequence, ICodingRegion codingRegion, ITranscriptRegion[] regions, bool onReverseStrand, IRnaEdit[] rndEdits) { _codingRegion = codingRegion; _regions = regions; _rnaEdits = rndEdits; _onReverseStrand = onReverseStrand; _compressedSequence = compressedSequence; _sequence = GetCdnaSequence(); } public string GetCdnaSequence() { if (_sequence != null) return _sequence; var sb = StringBuilderPool.Get(); foreach (var region in _regions) { if (region.Type != TranscriptRegionType.Exon) continue; sb.Append(_compressedSequence.Substring(region.Start - 1, region.End - region.Start + 1)); } if (_onReverseStrand) { string reverseComplement = SequenceUtilities.GetReverseComplement(sb.ToString()); sb.Clear(); sb.Append(reverseComplement); } ApplyRnaEdits(sb); _sequence = StringBuilderPool.GetStringAndReturn(sb); return _sequence; } private void ApplyRnaEdits(StringBuilder sb) { if (_rnaEdits == null) return; var editOffset = 0; RnaEditUtilities.SetTypesAndSort(_rnaEdits); foreach (var rnaEdit in _rnaEdits) { int cdnaEditStart = rnaEdit.Start - 1 + editOffset; switch (rnaEdit.Type) { case VariantType.SNV: if(cdnaEditStart >= 0 ) sb[cdnaEditStart] = rnaEdit.Bases[0]; break; case VariantType.MNV: for (var i = 0; i < rnaEdit.Bases.Length && cdnaEditStart >= 0; i++) sb[cdnaEditStart + i] = rnaEdit.Bases[i]; break; case VariantType.insertion: if (cdnaEditStart >= 0) sb.Insert(cdnaEditStart, rnaEdit.Bases); editOffset += rnaEdit.Bases.Length; break; case VariantType.deletion: editOffset -= rnaEdit.End - rnaEdit.Start + 1; break; default: throw new UserErrorException("Encountered unknown rnaEdit type:" + rnaEdit.Type); } } } public int Length => _sequence?.Length ?? _codingRegion?.Length ?? 0; public Band[] CytogeneticBands => null; public string Substring(int offset, int length) { if (_sequence == null) _sequence = GetCdnaSequence(); return _sequence.Substring(offset, length); } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/CodingSequence.cs ================================================ using Genome; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.AnnotatedPositions.Transcript { public sealed class CodingSequence : ISequence { private readonly string _sequence; public CodingSequence(ISequence compressedSequence, ICodingRegion codingRegion, ITranscriptRegion[] regions, bool onReverseStrand, byte startExonPhase, IRnaEdit[] rnaEdits) { string cdnaSequence = new CdnaSequence(compressedSequence, codingRegion, regions, onReverseStrand, rnaEdits) .GetCdnaSequence(); int cdsLen = codingRegion.CdnaEnd - codingRegion.CdnaStart + 1; _sequence = new string('N', startExonPhase) + cdnaSequence.Substring(codingRegion.CdnaStart - 1, cdsLen); } public string GetCodingSequence() => _sequence; public int Length => _sequence.Length; public Band[] CytogeneticBands => null; public string Substring(int offset, int length) => _sequence.Substring(offset, length); } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/Codons.cs ================================================ using System; using Genome; using VariantAnnotation.TranscriptAnnotation; namespace VariantAnnotation.AnnotatedPositions.Transcript { public static class Codons { public static SequenceChange GetCodons(string transcriptAlternateAllele, int cdsStart, int cdsEnd, int proteinBegin, int proteinEnd, ISequence codingSequence) { if (cdsStart == -1 || cdsEnd == -1 || proteinBegin == -1 || proteinEnd == -1) return new SequenceChange("", ""); // current implementation of GetCoveredCdsAndProteinPositions may return negative cdsStart and cdsEnd beyond the CDS region if (cdsStart < 1) cdsStart = 1; if (cdsEnd > codingSequence.Length) cdsEnd = codingSequence.Length; int aminoAcidStart = Math.Max(proteinBegin * 3 - 2, 1); int aminoAcidEnd = Math.Min(proteinEnd * 3, codingSequence.Length); var transcriptReferenceAllele = cdsEnd >= cdsStart ? codingSequence.Substring(cdsStart - 1, cdsEnd - cdsStart + 1) : ""; int prefixStartIndex = aminoAcidStart - 1; int prefixLen = cdsStart - aminoAcidStart; int suffixStartIndex = cdsEnd; int suffixLen = aminoAcidEnd - cdsEnd; string prefix = prefixStartIndex + prefixLen < codingSequence.Length ? codingSequence.Substring(prefixStartIndex, prefixLen).ToLower() : "AAA"; string suffix = suffixLen > 0 ? codingSequence.Substring(suffixStartIndex, suffixLen).ToLower() : ""; var refCodons = GetCodon(transcriptReferenceAllele, prefix, suffix); var altCodons = GetCodon(transcriptAlternateAllele, prefix, suffix); return new SequenceChange(refCodons, altCodons); } /// /// returns the codon string consisting of the prefix and suffix bases flanking the allele bases /// public static string GetCodon(string allele, string prefix, string suffix) { if (prefix.Length == 0 && suffix.Length == 0) return allele; return $"{prefix}{allele}{suffix}"; } /// /// returns true if the length is a multiple of three, false otherwise /// public static bool IsTriplet(int len) => Math.Abs(len) % 3 == 0; } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/CompactId.cs ================================================ using System; using IO; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Utilities; namespace VariantAnnotation.AnnotatedPositions.Transcript { public struct CompactId : ICompactId { private readonly IdType _id; private readonly byte _version; private readonly uint _info; private const int NoInfo = int.MaxValue; private const byte NoVersion = byte.MaxValue; private const int NumShift = 4; private const int LengthMask = 0xf; private const int MaxNumber = 0xfffffff; internal static CompactId Empty => new CompactId(IdType.Unknown, NoVersion, NoInfo); public bool IsEmpty() => _id == IdType.Unknown; private CompactId(IdType id, byte version, uint info) { _id = id; _version = version; _info = info; } public override string ToString() => ConvertToString(true); public string WithVersion => ConvertToString(true); public string WithoutVersion => ConvertToString(false); public static CompactId Convert(string s, byte version = NoVersion) { if (string.IsNullOrEmpty(s)) return Empty; if (s.StartsWith("ENSG")) return GetCompactId(s, 4, IdType.EnsemblGene, version); if (s.StartsWith("ENST")) return GetCompactId(s, 4, IdType.EnsemblTranscript, version); if (s.StartsWith("ENSP")) return GetCompactId(s, 4, IdType.EnsemblProtein, version); if (s.StartsWith("ENSESTG")) return GetCompactId(s, 7, IdType.EnsemblEstGene, version); if (s.StartsWith("ENSESTP")) return GetCompactId(s, 7, IdType.EnsemblEstProtein, version); if (s.StartsWith("ENSR")) return GetCompactId(s, 4, IdType.EnsemblRegulatory, version); if (s.StartsWith("CCDS")) return GetCompactId(s, 4, IdType.Ccds, version); if (s.StartsWith("NR_")) return GetCompactId(s, 3, IdType.RefSeqNonCodingRNA, version); if (s.StartsWith("NM_")) return GetCompactId(s, 3, IdType.RefSeqMessengerRNA, version); if (s.StartsWith("NP_")) return GetCompactId(s, 3, IdType.RefSeqProtein, version); if (s.StartsWith("XR_")) return GetCompactId(s, 3, IdType.RefSeqPredictedNonCodingRNA, version); if (s.StartsWith("XM_")) return GetCompactId(s, 3, IdType.RefSeqPredictedMessengerRNA, version); if (s.StartsWith("XP_")) return GetCompactId(s, 3, IdType.RefSeqPredictedProtein, version); if (s.StartsWith("YP_")) return GetCompactId(s, 3, IdType.RefSeq_YP, version); if (int.TryParse(s, out int i)) return GetNumericalCompactId(i, s.Length); Console.WriteLine("Unknown ID: [{0}] ({1})", s, s.Length); return Empty; } private static uint ToInfo(int num, int len) => (uint)(num << 4 | (len & LengthMask)); private static CompactId GetCompactId(string s, int prefixLen, IdType idType, byte version) { var (id, _) = FormatUtilities.SplitVersion(s); int num = int.Parse(id.Substring(prefixLen)); return new CompactId(idType, version, ToInfo(num, id.Length - prefixLen)); } private static CompactId GetNumericalCompactId(int num, int paddedLength) { if (num > MaxNumber) throw new ArgumentOutOfRangeException($"Could not convert the number ({num}) to a CompactID. Max supported number is {MaxNumber}."); return new CompactId(IdType.OnlyNumbers, NoVersion, ToInfo(num, paddedLength)); } private string ConvertToString(bool showVersion) { if (_id == IdType.Unknown) return null; var prefix = GetPrefix(); var number = GetNumber(); var version = GetVersion(showVersion); return prefix + number + version; } private string GetVersion(bool showVersion) { if (!showVersion || _version == NoVersion) return null; return "." + _version; } private string GetNumber() { var num = _info >> NumShift; var length = _info & LengthMask; return num.ToString("D" + length); } private string GetPrefix() { // ReSharper disable once SwitchStatementMissingSomeCases switch (_id) { case IdType.EnsemblGene: return "ENSG"; case IdType.EnsemblTranscript: return "ENST"; case IdType.EnsemblProtein: return "ENSP"; case IdType.EnsemblEstGene: return "ENSESTG"; case IdType.EnsemblEstProtein: return "ENSESTP"; case IdType.EnsemblRegulatory: return "ENSR"; case IdType.Ccds: return "CCDS"; case IdType.RefSeqNonCodingRNA: return "NR_"; case IdType.RefSeqMessengerRNA: return "NM_"; case IdType.RefSeqProtein: return "NP_"; case IdType.RefSeqPredictedNonCodingRNA: return "XR_"; case IdType.RefSeqPredictedMessengerRNA: return "XM_"; case IdType.RefSeqPredictedProtein: return "XP_"; case IdType.RefSeq_YP: return "YP_"; } return null; } public void Write(IExtendedBinaryWriter writer) { writer.Write((byte)_id); writer.Write(_version); writer.Write(_info); } public static CompactId Read(IBufferedBinaryReader reader) { var id = (IdType)reader.ReadByte(); var version = reader.ReadByte(); var info = reader.ReadUInt32(); return new CompactId(id, version, info); } } public enum IdType : byte { // ReSharper disable InconsistentNaming Unknown, Ccds, EnsemblEstGene, EnsemblEstProtein, EnsemblGene, EnsemblProtein, EnsemblRegulatory, EnsemblTranscript, OnlyNumbers, RefSeqMessengerRNA, RefSeqNonCodingRNA, RefSeqPredictedMessengerRNA, RefSeqPredictedNonCodingRNA, RefSeqPredictedProtein, RefSeqProtein, RefSeq_YP // ReSharper restore InconsistentNaming } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/FeatureVariantEffects.cs ================================================ using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.AnnotatedPositions.Transcript { public sealed class FeatureVariantEffects : IFeatureVariantEffects { private readonly bool _isSv; private readonly bool _completelyOverlaps; private readonly bool _partialOverlap; private readonly bool _fivePrimeOverlap; private readonly bool _threePrimeOverlap; private readonly bool _completelyWithin; private readonly bool _lossOrDeletion; private readonly bool _gainOrDuplication; private readonly bool _isInsertionDeletion; private readonly bool _isInsertion; public FeatureVariantEffects(OverlapType overlapType, EndpointOverlapType endpointOverlapType, bool onReverseStrand, VariantType vt, bool isSv) { _isSv = isSv; _partialOverlap = overlapType != OverlapType.CompletelyOverlaps && overlapType != OverlapType.None; _completelyOverlaps = overlapType == OverlapType.CompletelyOverlaps; _completelyWithin = overlapType == OverlapType.CompletelyWithin; _fivePrimeOverlap = !onReverseStrand && endpointOverlapType == EndpointOverlapType.Start || onReverseStrand && endpointOverlapType == EndpointOverlapType.End; _threePrimeOverlap = !onReverseStrand && endpointOverlapType == EndpointOverlapType.End || onReverseStrand && endpointOverlapType == EndpointOverlapType.Start; _lossOrDeletion = vt == VariantType.copy_number_loss || vt == VariantType.deletion; _gainOrDuplication = vt == VariantType.copy_number_gain || vt == VariantType.duplication || vt == VariantType.tandem_duplication; _isInsertionDeletion = vt == VariantType.indel; _isInsertion = vt == VariantType.insertion; } public bool Ablation() => (_lossOrDeletion || _isInsertionDeletion) && _completelyOverlaps; public bool Amplification() => _gainOrDuplication && _completelyOverlaps; public bool Truncation() => _isSv && _lossOrDeletion && _partialOverlap; public bool Elongation() => _isSv && _completelyWithin && (_gainOrDuplication || _isInsertion); public bool FivePrimeDuplicatedTranscript() => _gainOrDuplication && _fivePrimeOverlap; public bool ThreePrimeDuplicatedTranscript() => _gainOrDuplication && _threePrimeOverlap; } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/MappedPosition.cs ================================================ using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.AnnotatedPositions.Transcript { public sealed class MappedPosition : IMappedPosition { public int CdnaStart { get; } public int CdnaEnd { get; } public int CdsStart { get; } public int CdsEnd { get; } public int ProteinStart { get; set; } public int ProteinEnd { get; set; } public int ExonStart { get; } public int ExonEnd { get; } public int IntronStart { get; } public int IntronEnd { get; } public int RegionStartIndex { get; } public int RegionEndIndex { get; } public int CoveredProteinStart { get; set; } = -1; public int CoveredProteinEnd { get; set; } = -1; public int CoveredCdsStart { get; set; } = -1; public int CoveredCdsEnd { get; set; } = -1; public int CoveredCdnaStart { get; set; } = -1; public int CoveredCdnaEnd { get; set; } = -1; public MappedPosition(int cdnaStart, int cdnaEnd, int cdsStart, int cdsEnd, int proteinStart, int proteinEnd, int exonStart, int exonEnd, int intronStart, int intronEnd, int regionStartIndex, int regionEndIndex) { CdnaStart = cdnaStart; CdnaEnd = cdnaEnd; CdsStart = cdsStart; CdsEnd = cdsEnd; ProteinStart = proteinStart; ProteinEnd = proteinEnd; ExonStart = exonStart; ExonEnd = exonEnd; IntronStart = intronStart; IntronEnd = intronEnd; RegionStartIndex = regionStartIndex; RegionEndIndex = regionEndIndex; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/MappedPositionUtilities.cs ================================================ using Intervals; using VariantAnnotation.Algorithms; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.AnnotatedPositions.Transcript { public static class MappedPositionUtilities { public static (int Index, ITranscriptRegion Region) FindRegion(ITranscriptRegion[] regions, int variantPosition) { int index = regions.BinarySearch(variantPosition); var region = index < 0 ? null : regions[index]; return (index, region); } public static (int CdnaStart, int CdnaEnd) GetCdnaPositions(ITranscriptRegion startRegion, ITranscriptRegion endRegion, IInterval variant, bool onReverseStrand, bool isInsertion) { int cdnaStart = GetCdnaPosition(startRegion, variant.Start, onReverseStrand); int cdnaEnd = GetCdnaPosition(endRegion, variant.End, onReverseStrand); if (FoundExonEndpointInsertion(isInsertion, cdnaStart, cdnaEnd, startRegion, endRegion)) { return FixExonEndpointInsertion(cdnaStart, cdnaEnd, onReverseStrand, startRegion, endRegion, variant); } return (cdnaStart, cdnaEnd); } private static int GetCdnaPosition(ITranscriptRegion region, int variantPosition, bool onReverseStrand) { if (region == null || region.Type != TranscriptRegionType.Exon) return -1; return onReverseStrand ? region.End - variantPosition + region.CdnaStart : variantPosition - region.Start + region.CdnaStart; } /// /// Assuming at least one cDNA coordinate overlaps with an exon, the covered cDNA coordinates represent /// the coordinates actually covered by the variant. /// public static (int Start, int End) GetCoveredCdnaPositions(this ITranscriptRegion[] regions, int cdnaStart, int startRegionIndex, int cdnaEnd, int endRegionIndex, bool onReverseStrand) { // exon case if (cdnaStart != -1 && cdnaEnd != -1) return (cdnaStart, cdnaEnd); if (onReverseStrand) Swap.Int(ref startRegionIndex, ref endRegionIndex); var startRegion = regions.GetCoveredRegion(startRegionIndex); var endRegion = regions.GetCoveredRegion(endRegionIndex); if (startRegion.Type != TranscriptRegionType.Exon && endRegion.Type != TranscriptRegionType.Exon) return (-1, -1); int codingEnd = onReverseStrand ? regions[0].CdnaEnd : regions[regions.Length - 1].CdnaEnd; cdnaStart = GetCoveredCdnaPosition(cdnaStart, startRegion, startRegionIndex, codingEnd, onReverseStrand, false); cdnaEnd = GetCoveredCdnaPosition(cdnaEnd, endRegion, endRegionIndex, codingEnd, onReverseStrand, true); return cdnaStart < cdnaEnd ? (cdnaStart, cdnaEnd) : (cdnaEnd, cdnaStart); } private static ITranscriptRegion GetCoveredRegion(this ITranscriptRegion[] regions, int regionIndex) { if (regionIndex == -1) return regions[0]; return regionIndex == ~regions.Length ? regions[regions.Length - 1] : regions[regionIndex]; } private static int GetCoveredCdnaPosition(int cdnaPosition, ITranscriptRegion region, int regionIndex, int codingEnd, bool onReserveStrand, bool isEndPosition) { if (cdnaPosition >= 0) return cdnaPosition; // genomic position on the left of the transcript if (regionIndex == -1) return onReserveStrand ? codingEnd : 1; // genomic position on the right of the transcript if (regionIndex < -1) return onReserveStrand ? 1 : codingEnd; // intron return isEndPosition ? region.CdnaStart : region.CdnaEnd; } public static (int CdsStart, int CdsEnd, int ProteinStart, int ProteinEnd) GetCoveredCdsAndProteinPositions(int coveredCdnaStart, int coveredCdnaEnd, byte startExonPhase, ICodingRegion codingRegion) { if (codingRegion == null || coveredCdnaEnd < codingRegion.CdnaStart || coveredCdnaStart > codingRegion.CdnaEnd || coveredCdnaStart == -1 && coveredCdnaEnd == -1) return (-1, -1, -1, -1); int beginOffset = startExonPhase - codingRegion.CdnaStart + 1; int start = coveredCdnaStart + beginOffset; int end = coveredCdnaEnd + beginOffset; return (start, end, GetProteinPosition(start), GetProteinPosition(end)); } public static int GetProteinPosition(int cdsPosition) { if (cdsPosition == -1) return -1; return (cdsPosition + 2) / 3; } public static (int CdsStart, int CdsEnd) GetCdsPositions(ICodingRegion codingRegion, int cdnaStart, int cdnaEnd, byte startExonPhase, bool isInsertion) { int cdsStart = GetCdsPosition(codingRegion, cdnaStart, startExonPhase); int cdsEnd = GetCdsPosition(codingRegion, cdnaEnd, startExonPhase); // silence CDS for insertions that occur just after the coding region if (isInsertion && codingRegion != null && (cdnaEnd == codingRegion.CdnaEnd || cdnaStart == codingRegion.CdnaStart)) { cdsStart = -1; cdsEnd = -1; } return (cdsStart, cdsEnd); } private static int GetCdsPosition(ICodingRegion codingRegion, int cdnaPosition, byte startExonPhase) { if (codingRegion == null || cdnaPosition < codingRegion.CdnaStart || cdnaPosition > codingRegion.CdnaEnd) return -1; return cdnaPosition - codingRegion.CdnaStart + startExonPhase + 1; } /// /// Fixes the missing cDNA coordinate for situations where an insertion occurs on either the first or last /// base of an exon /// internal static (int CdnaStart, int CdnaEnd) FixExonEndpointInsertion(int cdnaStart, int cdnaEnd, bool onReverseStrand, ITranscriptRegion startRegion, ITranscriptRegion endRegion, IInterval variant) { var (intron, exon) = startRegion.Type == TranscriptRegionType.Exon ? (endRegion, startRegion) : (startRegion, endRegion); bool matchExonStart = variant.Start == exon.Start; int cdnaPos = !onReverseStrand && matchExonStart || onReverseStrand && !matchExonStart ? intron.CdnaStart : intron.CdnaEnd; if (cdnaStart == -1) cdnaStart = cdnaPos; else cdnaEnd = cdnaPos; return (cdnaStart, cdnaEnd); } /// /// Identifies when an insertion on an exon boundary needs special attention. Here we're looking for one /// intron & one exon where one cDNA coordinate is defined, but the other isn't. /// internal static bool FoundExonEndpointInsertion(bool isInsertion, int cdnaStart, int cdnaEnd, ITranscriptRegion startRegion, ITranscriptRegion endRegion) { bool isCdnaStartUndef = cdnaStart == -1; bool isCdnaEndUndef = cdnaEnd == -1; bool isStartExon = startRegion?.Type == TranscriptRegionType.Exon; bool isEndExon = endRegion?.Type == TranscriptRegionType.Exon; return isInsertion && startRegion != null && endRegion != null && isStartExon ^ isEndExon && isCdnaStartUndef ^ isCdnaEndUndef; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/StringExtensions.cs ================================================ using System; namespace VariantAnnotation.AnnotatedPositions.Transcript { public static class StringExtensions { public static int CommonPrefixLength(this string a, string b) { if (a == null || b == null) return 0; var maxPrefixLength = Math.Min(a.Length, b.Length); var prefixLength = 0; while (prefixLength < maxPrefixLength && a[prefixLength] == b[prefixLength]) prefixLength++; return prefixLength; } public static int CommonSuffixLength(this string a, string b) { if (a == null || b == null) return 0; var maxSuffixLength = Math.Min(a.Length, b.Length); var suffixLength = 0; while (suffixLength < maxSuffixLength && a[a.Length - suffixLength - 1] == b[b.Length - suffixLength - 1]) suffixLength++; return suffixLength; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/TranscriptPositionalEffect.cs ================================================ using System.Linq; using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.AnnotatedPositions.Transcript { public sealed class TranscriptPositionalEffect { public bool IsEndSpliceSite; public bool IsStartSpliceSite; public bool IsWithinFrameshiftIntron; public bool IsWithinIntron; public bool IsWithinSpliceSiteRegion; public bool HasExonOverlap; public bool AfterCoding; public bool BeforeCoding; public bool WithinCdna; public bool WithinCds; public bool HasFrameShift; public bool IsCoding; public bool OverlapWithMicroRna; public void DetermineIntronicEffect(ITranscriptRegion[] regions, IInterval variant, VariantType variantType) { if (regions == null) return; var isInsertion = variantType == VariantType.insertion; foreach (var region in regions) { if (region.Type != TranscriptRegionType.Intron) continue; // skip this one if variant is out of range : the range is set to 3 instead of the original old: // all of the checking occured in the region between start-3 to end+3, if we set to 8, we can made mistakes when // checking IsWithinIntron when we have a small exon if (!variant.Overlaps(region.Start - 3, region.End + 3)) continue; // under various circumstances the genebuild process can introduce artificial // short (<= 12 nucleotide) introns into transcripts (e.g. to deal with errors // in the reference sequence etc.), we don't want to categorize variations that // fall in these introns as intronic, or as any kind of splice variant var isFrameshiftIntron = region.End - region.Start <= 12; if (isFrameshiftIntron && variant.Overlaps(region.Start, region.End)) { IsWithinFrameshiftIntron = true; continue; } CheckSpliceSiteOverlap(variant, region); CheckIntronOverlap(variant, isInsertion, region); // the definition of splice_region (SO:0001630) is "within 1-3 bases of the // exon or 3-8 bases of the intron." We also need to special case insertions // between the edge of an exon and a donor or acceptor site and between a donor // or acceptor site and the intron IsWithinSpliceSiteRegion = variant.Overlaps(region.Start + 2, region.Start + 7) || variant.Overlaps(region.End - 7, region.End - 2) || variant.Overlaps(region.Start - 3, region.Start - 1) || variant.Overlaps(region.End + 1, region.End + 3) || isInsertion && (variant.Start == region.Start || variant.End == region.End || variant.Start == region.Start + 2 || variant.End == region.End - 2); } } private void CheckSpliceSiteOverlap(IInterval variant, IInterval region) { if (variant.Overlaps(region.Start, region.Start + 1)) { IsStartSpliceSite = true; } if (variant.Overlaps(region.End - 1, region.End)) { IsEndSpliceSite = true; } } private void CheckIntronOverlap(IInterval variant, bool isInsertion, IInterval region) { // we need to special case insertions between the donor and acceptor sites // make sure the size of intron is larger than 4 if (region.Start <= region.End - 4 && (variant.Overlaps(region.Start + 2, region.End - 2) || isInsertion && (variant.Start == region.Start + 2 || variant.End == region.End - 2))) { IsWithinIntron = true; } } public void DetermineExonicEffect(ITranscript transcript, IInterval variant, IMappedPosition position, int coveredCdnaStart, int coveredCdnaEnd, int coveredCdsStart, int coveredCdsEnd, string altAllele, bool startCodonInsertionWithNoImpact) { HasExonOverlap = position.ExonStart != -1 || position.ExonEnd != -1; if (transcript.Translation != null) { var codingRegion = transcript.Translation.CodingRegion; AfterCoding = IsAfterCoding(variant.Start, variant.End, transcript.End, codingRegion.End); BeforeCoding = IsBeforeCoding(variant.Start, variant.End, transcript.Start, codingRegion.Start); WithinCds = IsWithinCds(coveredCdsStart, coveredCdsEnd, codingRegion, variant); IsCoding = !startCodonInsertionWithNoImpact && (position.CdsStart != -1 || position.CdsEnd != -1); } WithinCdna = IsWithinCdna(coveredCdnaStart, coveredCdnaEnd, transcript.TotalExonLength); if (coveredCdsStart != -1 && coveredCdsEnd != -1) { var varLen = coveredCdsEnd - coveredCdsStart + 1; var alleleLen = altAllele?.Length ?? 0; HasFrameShift = position.CdsStart != -1 && position.CdsEnd != -1 && !Codons.IsTriplet(alleleLen - varLen); } OverlapWithMicroRna = IsMatureMirnaVariant(position.CdnaStart, position.CdnaEnd, transcript.MicroRnas, transcript.BioType == BioType.miRNA); } internal static bool IsMatureMirnaVariant(int cdnaStart, int cdnaEnd, IInterval[] microRnas, bool isMiRna) { if (microRnas == null) return false; if (!isMiRna || cdnaStart == -1 || cdnaEnd == -1) return false; return microRnas.Any(microRna => microRna.Overlaps(cdnaStart, cdnaEnd)); } internal static bool IsAfterCoding(int variantRefBegin, int variantRefEnd, int transcriptEnd, int codingRegionEnd) { // special case to handle insertions after the CDS end if (variantRefBegin == variantRefEnd + 1 && variantRefEnd == codingRegionEnd) { return true; } var result = Intervals.Utilities.Overlaps(variantRefBegin, variantRefEnd, codingRegionEnd + 1, transcriptEnd); return result; } internal static bool IsBeforeCoding(int variantRefBegin, int variantRefEnd, int transcriptStart, int codingRegionStart) { // special case to handle insertions before the CDS start if (variantRefBegin == variantRefEnd + 1 && variantRefBegin == codingRegionStart) return true; bool result = Intervals.Utilities.Overlaps(variantRefBegin, variantRefEnd, transcriptStart, codingRegionStart - 1); return result; } internal static bool IsWithinCdna(int coveredCdnaStart, int coveredCdnaEnd, int totalExonLen) => coveredCdnaStart > 0 && coveredCdnaEnd <= totalExonLen; internal bool IsWithinCds(int coveredCdsBegin, int coveredCdsEnd, IInterval codingRegion, IInterval variant) { if (IsWithinFrameshiftIntron) return variant.Overlaps(codingRegion); return coveredCdsBegin != -1 && coveredCdsEnd != -1; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/TranscriptUtilities.cs ================================================ using Genome; using OptimizedCore; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.AnnotatedPositions.Transcript { public static class TranscriptUtilities { /// /// returns the alternate CDS given the reference sequence, the cds coordinates, and the alternate allele. /// public static string GetAlternateCds(ISequence refSequence, int cdsBegin, int cdsEnd, string alternateAllele, ITranscriptRegion[] regions, bool onReverseStrand, byte startExonPhase, int cdnaCodingStart) { var splicedSeq = GetSplicedSequence(refSequence, regions, onReverseStrand); int numPaddedBases = startExonPhase; int shift = cdnaCodingStart - 1; int upstreamLength = GetUpstreamLength(shift, cdsBegin - numPaddedBases - 1, splicedSeq.Length); int downstreamStart = cdsEnd - numPaddedBases + shift; string upstreamSeq = splicedSeq.Substring(shift, upstreamLength); string downstreamSeq = downstreamStart < splicedSeq.Length ? splicedSeq.Substring(downstreamStart) : ""; if (alternateAllele == null) alternateAllele = string.Empty; var paddedBases = numPaddedBases > 0 ? new string('N', numPaddedBases) : ""; return paddedBases + upstreamSeq + alternateAllele + downstreamSeq; } private static int GetUpstreamLength(int start, int length, int seqLength) { int desiredLength = start + length; int maxLength = seqLength - start; return desiredLength <= seqLength ? length : maxLength; } /// /// Retrieves all Exon sequences and concats them together. /// This includes 5' UTR + cDNA + 3' UTR [Transcript.pm:862 spliced_seq] /// private static string GetSplicedSequence(ISequence refSequence, ITranscriptRegion[] regions, bool onReverseStrand) { var sb = StringBuilderPool.Get(); foreach (var region in regions) { if (region.Type != TranscriptRegionType.Exon) continue; var exonLength = region.End - region.Start + 1; // sanity check: handle the situation where no reference has been provided if (refSequence == null) { sb.Append(new string('N', exonLength)); continue; } sb.Append(refSequence.Substring(region.Start - 1, exonLength)); } var results = StringBuilderPool.GetStringAndReturn(sb); return onReverseStrand ? SequenceUtilities.GetReverseComplement(results) : results; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/VariantEffect.cs ================================================ using System; using OptimizedCore; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.AnnotatedPositions.Transcript { /// /// This class performs all of the functional consequence testing. An additional caching layer /// has been added to prevent unneeded calculations. The caching layer is reset when each new /// variant has been read. /// public sealed class VariantEffect : IVariantEffect { private readonly TranscriptPositionalEffect _preCache; private readonly ITranscript _transcript; private readonly ISimpleVariant _variant; private readonly VariantEffectCache _cache; private readonly string _referenceAminoAcids; private readonly string _alternateAminoAcids; private readonly int _referenceAminoAcidsLen; private readonly int _alternateAminoAcidsLen; private readonly string _coveredReferenceAminoAcids; private readonly string _coveredAlternateAminoAcids; private readonly string _referenceCodons; private readonly string _alternateCodons; private readonly int _referenceCodonsLen; private readonly int _alternateCodonsLen; private readonly bool _isInsertion; private readonly bool _isDeletion; private readonly int _proteinBegin; public VariantEffect(TranscriptPositionalEffect transcriptEffect, ISimpleVariant variant, ITranscript transcript, string referenAminoAcids, string alternateAminoAcids, string referenceCodons, string alternateCodons, int? proteinBegin, string coveredReferenceAminoAcids, string coveredAlternateAminoAcids, VariantEffectCache cache = null) { _transcript = transcript; _variant = variant; _preCache = transcriptEffect; _cache = cache ?? new VariantEffectCache(); _referenceAminoAcids = referenAminoAcids; _alternateAminoAcids = alternateAminoAcids; _referenceAminoAcidsLen = _referenceAminoAcids?.Length ?? 0; _alternateAminoAcidsLen = _alternateAminoAcids?.Length ?? 0; _coveredReferenceAminoAcids = coveredReferenceAminoAcids; _coveredAlternateAminoAcids = coveredAlternateAminoAcids; _referenceCodons = referenceCodons; _alternateCodons = alternateCodons; _referenceCodonsLen = _referenceCodons?.Length ?? 0; _alternateCodonsLen = _alternateCodons?.Length ?? 0; _isInsertion = variant.AltAllele.Length > variant.RefAllele.Length; _isDeletion = variant.AltAllele.Length < variant.RefAllele.Length; _proteinBegin = proteinBegin ?? -1; } /// /// returns true if the variant is a splice acceptor variant [VariationEffect.pm:404 acceptor_splice_site] /// public bool IsSpliceAcceptorVariant() { const ConsequenceTag ct = ConsequenceTag.splice_acceptor_variant; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = _transcript.Gene.OnReverseStrand ? _preCache.IsStartSpliceSite : _preCache.IsEndSpliceSite; _cache.Add(ct, result); return result; } /// /// returns true if the variant is a splice donor variant [VariationEffect.pm:459 donor_splice_site] /// public bool IsSpliceDonorVariant() { const ConsequenceTag ct = ConsequenceTag.splice_donor_variant; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = _transcript.Gene.OnReverseStrand ? _preCache.IsEndSpliceSite : _preCache.IsStartSpliceSite; _cache.Add(ct, result); return result; } /// /// returns true if the variant is a 5' UTR variant (VariationEffect.pm:595 within_5_prime_utr) /// public bool IsFivePrimeUtrVariant() { const ConsequenceTag ct = ConsequenceTag.five_prime_UTR_variant; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = false; if (_transcript.Translation != null) { var isFivePrimeOfCoding = _transcript.Gene.OnReverseStrand ? _preCache.AfterCoding : _preCache.BeforeCoding; result = isFivePrimeOfCoding && _preCache.WithinCdna; } _cache.Add(ct, result); return result; } /// /// returns true if the variant is a frameshift variant [VariantEffect.pm:940 frameshift] /// public bool IsFrameshiftVariant() { const ConsequenceTag ct = ConsequenceTag.frameshift_variant; if (_cache.Contains(ct)) return _cache.Get(ct); // check the predicates if (!_preCache.IsCoding) { _cache.Add(ct, false); return false; } if (IsIncompleteTerminalCodonVariant()) { _cache.Add(ct, false); return false; } bool result = _preCache.HasFrameShift && !IsStopRetained() && !IsTruncatedByStop(); _cache.Add(ct, result); return result; } /// /// returns true if we have an incomplete terminal codon variant. [VariantEffect.pm:983 partial_codon] /// public bool IsIncompleteTerminalCodonVariant() { const ConsequenceTag ct = ConsequenceTag.incomplete_terminal_codon_variant; if (_cache.Contains(ct)) return _cache.Get(ct); if (_transcript.Translation == null) { _cache.Add(ct, false); return false; } int cdsLength = _transcript.Translation.CodingRegion.Length; int codonCdsStart = _proteinBegin * 3 - 2; int lastCodonLength = cdsLength - (codonCdsStart - 1); bool result = lastCodonLength < 3 && lastCodonLength > 0; _cache.Add(ct, result); return result; } /// /// returns true if the variant is an inframe deletion [VariantEffect.pm:825 inframe_deletion] /// public bool IsInframeDeletion() { const ConsequenceTag ct = ConsequenceTag.inframe_deletion; if (_cache.Contains(ct)) return _cache.Get(ct); // check the predicates if (!_preCache.IsCoding || !_isDeletion) { _cache.Add(ct, false); return false; } if (_referenceCodonsLen == 0 //|| (PreCache.ReferenceCodonLen < PreCache.AlternateCodonLen) || IsFrameshiftVariant() || IsIncompleteTerminalCodonVariant() || IsStopGained()) { _cache.Add(ct, false); return false; } // simple string match var referenceCodon = _referenceCodons.ToLower(); var alternateCodon = _alternateCodons.ToLower(); if (referenceCodon.StartsWith(alternateCodon) || referenceCodon.EndsWith(alternateCodon)) { _cache.Add(ct, true); return true; } // try a more complex string match var commonPrefixLength = _referenceCodons.CommonPrefixLength(_alternateCodons); var commonSuffixLength = _referenceCodons.CommonSuffixLength(_alternateCodons); bool result = _alternateCodonsLen - commonPrefixLength - commonSuffixLength == 0; _cache.Add(ct, result); return result; } /// /// returns true if the variant is an inframe insertion [VariantEffect.pm:780 inframe_insertion] /// public bool IsInframeInsertion() { const ConsequenceTag ct = ConsequenceTag.inframe_insertion; if (_cache.Contains(ct)) return _cache.Get(ct); // check the predicates if (!_preCache.IsCoding || !_isInsertion) { _cache.Add(ct, false); return false; } if (IsStopRetained() || IsFrameshiftVariant() || IsStartLost() || _alternateCodonsLen <= _referenceCodonsLen || IsIncompleteTerminalCodonVariant()) { _cache.Add(ct, false); return false; } bool result = !IsTruncatedByStop(); _cache.Add(ct, result); return result; } private bool IsTruncatedByStop() { if (_alternateAminoAcids != null && _alternateAminoAcids.Contains(AminoAcids.StopCodon)) { var stopPos = _alternateAminoAcids.IndexOf(AminoAcids.StopCodon, StringComparison.Ordinal); var altAminoAcidesBeforeStop = _alternateAminoAcids.Substring(0, stopPos); if (_alternateAminoAcids.OptimizedStartsWith(AminoAcids.StopCodonChar) || _referenceAminoAcids.StartsWith(altAminoAcidesBeforeStop)) return true; } return false; } /// /// returns true if at least one base of the first codon was changed in the transcript [VariantEffect.pm:722 affects_start_codon] /// public bool IsStartLost() { const ConsequenceTag ct = ConsequenceTag.start_lost; if (_cache.Contains(ct)) return _cache.Get(ct); // check the predicates if (!_preCache.IsCoding) { _cache.Add(ct, false); return false; } if (_proteinBegin != 1 || _referenceAminoAcidsLen == 0) { _cache.Add(ct, false); return false; } // insertion in start codon and do not change start codon if (_isInsertion && _proteinBegin == 1 && _alternateAminoAcids.EndsWith(_referenceAminoAcids)) { _cache.Add(ct, false); return false; } bool result = _alternateAminoAcidsLen == 0 || _alternateAminoAcids[0] != _referenceAminoAcids[0]; _cache.Add(ct, result); return result; } /// /// returns true if the variant is a missense variant [VariantEffect.pm:682 missense_variant] /// public bool IsMissenseVariant() { const ConsequenceTag ct = ConsequenceTag.missense_variant; if (_cache.Contains(ct)) return _cache.Get(ct); // check the predicates if (!_preCache.IsCoding) { _cache.Add(ct, false); return false; } if (IsStartLost() || IsStopLost() || IsStopGained() || IsIncompleteTerminalCodonVariant() || IsFrameshiftVariant() || IsInframeDeletion() || IsInframeInsertion()) { _cache.Add(ct, false); return false; } bool result = _referenceAminoAcids != _alternateAminoAcids && _referenceAminoAcidsLen == _alternateAminoAcidsLen; _cache.Add(ct, result); return result; } /// /// returns true if the variant is a non-coding transcript exon variant [VariationEffect.pm:405 non_coding_exon_variant] /// public bool IsNonCodingTranscriptExonVariant() { const ConsequenceTag ct = ConsequenceTag.non_coding_transcript_exon_variant; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = _preCache.HasExonOverlap && _transcript.Translation == null && !_preCache.OverlapWithMicroRna; _cache.Add(ct, result); return result; } /// /// returns true if the variant is a nonsense-mediated decay transcript variant [VariationEffect.pm:391 within_nmd_transcript] /// public bool IsNonsenseMediatedDecayTranscriptVariant() { const ConsequenceTag ct = ConsequenceTag.NMD_transcript_variant; if (_cache.Contains(ct)) return _cache.Get(ct); var result = _transcript.BioType == BioType.nonsense_mediated_decay; _cache.Add(ct, result); return result; } /// /// returns true if the variant is a protein altering variant [VariationEffect.pm:300 protein_altering_variant] /// public bool IsProteinAlteringVariant() { const ConsequenceTag ct = ConsequenceTag.protein_altering_variant; if (_cache.Contains(ct)) return _cache.Get(ct); var result = true; var sameLen = _referenceAminoAcidsLen == _alternateAminoAcidsLen; var startsWithTer = _referenceAminoAcids.OptimizedStartsWith('X') || _alternateAminoAcids.OptimizedStartsWith('X'); var isInframeDeletion = IsInframeDeletion(); // Note: sequence ontology says that stop retained should not be here (http://www.sequenceontology.org/browser/current_svn/term/SO:0001567) var isStopCodonVarinat = IsStopLost() || IsStopGained(); if (sameLen || startsWithTer || isInframeDeletion || isStopCodonVarinat || IsStartLost() || IsFrameshiftVariant() || IsInframeInsertion() || IsStopRetained() || !_preCache.IsCoding) { result = false; } _cache.Add(ct, result); return result; } /// /// returns true if the variant is a splice region variant [VariationEffect.pm:483 splice_region] /// public bool IsSpliceRegionVariant() { const ConsequenceTag ct = ConsequenceTag.splice_region_variant; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = false; if (IsSpliceDonorVariant() || IsSpliceAcceptorVariant()) { // false } else { result = _preCache.IsWithinSpliceSiteRegion; } _cache.Add(ct, result); return result; } /// /// returns true if the variant's amino acid changes to a stop codon [VariationEffect.pm:884 stop_gained] /// public bool IsStopGained() { const ConsequenceTag ct = ConsequenceTag.stop_gained; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = !IsStopRetained() && (string.IsNullOrEmpty(_referenceAminoAcids) || !_referenceAminoAcids.Contains(AminoAcids.StopCodon)) && !string.IsNullOrEmpty(_alternateAminoAcids) && _alternateAminoAcids.Contains(AminoAcids.StopCodon); _cache.Add(ct, result); return result; } /// /// returns true if the variant is a stop lost variant [VariationEffect.pm:898 stop_lost] /// public bool IsStopLost() { const ConsequenceTag ct = ConsequenceTag.stop_lost; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = false; if (!string.IsNullOrEmpty(_coveredReferenceAminoAcids) && _coveredAlternateAminoAcids != null) result = _coveredReferenceAminoAcids.Contains(AminoAcids.StopCodon) && !_coveredAlternateAminoAcids.Contains(AminoAcids.StopCodon); _cache.Add(ct, result); return result; } /// /// returns true if the variant is a stop retained variant [VariationEffect.pm:701 stop_lost] /// public bool IsStopRetained() { const ConsequenceTag ct = ConsequenceTag.stop_retained_variant; if (_cache.Contains(ct)) return _cache.Get(ct); var alternateAminoAcids = TrimPeptides(_alternateAminoAcids); bool result = !string.IsNullOrEmpty(_referenceAminoAcids) && alternateAminoAcids != null && _referenceAminoAcids == alternateAminoAcids && _referenceAminoAcids.Contains(AminoAcids.StopCodon) || string.IsNullOrEmpty(_referenceAminoAcids) && alternateAminoAcids != null && _proteinBegin == _transcript.Translation?.PeptideSeq.Length + 1 && alternateAminoAcids == AminoAcids.StopCodon; _cache.Add(ct, result); return result; } public bool IsStartRetained() { const ConsequenceTag ct = ConsequenceTag.start_retained_variant; if (_cache.Contains(ct)) return _cache.Get(ct); if (_proteinBegin != 1 || string.IsNullOrEmpty(_referenceAminoAcids)) { _cache.Add(ct, false); return false; } var startProtein = _referenceAminoAcids[0].ToString(); var alternateAminoAcids = TrimPeptides(_alternateAminoAcids); var result = alternateAminoAcids != null && alternateAminoAcids.Contains(startProtein); _cache.Add(ct, result); return result; } private static string TrimPeptides(string alternateAminoAcids) { if (string.IsNullOrEmpty(alternateAminoAcids)) return null; if (!alternateAminoAcids.Contains(AminoAcids.StopCodon)) return alternateAminoAcids; var pos = alternateAminoAcids.IndexOf(AminoAcids.StopCodon, StringComparison.Ordinal); return pos < 0 ? alternateAminoAcids : alternateAminoAcids.Substring(0, pos + 1); } /// /// returns true if the variant is a synonymous variant [VariationEffect.pm:755 synonymous_variant] /// public bool IsSynonymousVariant() { const ConsequenceTag ct = ConsequenceTag.synonymous_variant; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = !string.IsNullOrEmpty(_referenceAminoAcids) && (_variant.Type == VariantType.SNV || _variant.Type == VariantType.MNV) && _referenceAminoAcids == _alternateAminoAcids && !_referenceAminoAcids.Contains("X") && !_alternateAminoAcids.Contains("X") && !IsStopRetained(); _cache.Add(ct, result); return result; } /// /// returns true if the variant is a 3' UTR variant [VariationEffect.pm:609 within_3_prime_utr] /// public bool IsThreePrimeUtrVariant() { const ConsequenceTag ct = ConsequenceTag.three_prime_UTR_variant; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = false; if (_transcript.Translation != null) { var isThreePrimeOfCoding = _transcript.Gene.OnReverseStrand ? _preCache.BeforeCoding : _preCache.AfterCoding; result = isThreePrimeOfCoding && _preCache.WithinCdna; } _cache.Add(ct, result); return result; } /// /// returns true if the variant is within a non-coding gene [VariationEffect.pm:398 within_non_coding_gene] /// public bool IsNonCodingTranscriptVariant() { const ConsequenceTag ct = ConsequenceTag.non_coding_transcript_variant; if (_cache.Contains(ct)) return _cache.Get(ct); // NOTE: Isn't IsWithinTranscript always true? and not within mature miRNA is always true // For Ensembl transcript, miRNA may be a valid attribute. We have their location and we would like to check if the variant overlaps with the miRNA var result = !_preCache.HasExonOverlap && _transcript.Translation == null && !_preCache.OverlapWithMicroRna; _cache.Add(ct, result); return result; } /// /// returns true if it's a coding sequnce variant [VariationEffect.pm:998 coding_unknown] /// public bool IsCodingSequenceVariant() { const ConsequenceTag ct = ConsequenceTag.coding_sequence_variant; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = _preCache.WithinCds && (string.IsNullOrEmpty(_transcript.Translation.PeptideSeq) || string.IsNullOrEmpty(_alternateAminoAcids) || _alternateAminoAcids.Contains("X")) && !(IsFrameshiftVariant() || IsInframeDeletion() || IsIncompleteTerminalCodonVariant() || IsProteinAlteringVariant() || IsStopGained() || IsStopRetained() || IsStopLost() || IsStartRetained()); _cache.Add(ct, result); return result; } /// /// returns true if the variant occurs within an intron [VariationEffect.pm:494 within_intron] /// public bool IsWithinIntron() => _preCache.IsWithinIntron; /// /// returns true if the variant overlaps a mature MiRNA. [VariationEffect.pm:432 within_mature_miRNA] /// public bool IsMatureMirnaVariant() { const ConsequenceTag ct = ConsequenceTag.mature_miRNA_variant; if (_cache.Contains(ct)) return _cache.Get(ct); bool result = _preCache.OverlapWithMicroRna; _cache.Add(ct, result); return result; } } } ================================================ FILE: VariantAnnotation/AnnotatedPositions/Transcript/VariantEffectCache.cs ================================================ using System; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.AnnotatedPositions.Transcript { public sealed class VariantEffectCache { #region members private readonly bool[] _isCached; private readonly bool[] _cachedResults; #endregion // constructor public VariantEffectCache() { var numConsequences = Enum.GetNames(typeof(ConsequenceTag)).Length; _isCached = new bool[numConsequences]; _cachedResults = new bool[numConsequences]; } /// /// returns true if the corresponding value has been cached /// public void Add(ConsequenceTag consequence, bool result) { var index = (int)consequence; _isCached[index] = true; _cachedResults[index] = result; } /// /// returns the cached value for the corresponding result /// public bool Get(ConsequenceTag consequence) { return _cachedResults[(int)consequence]; } /// /// returns true if the corresponding value has been cached /// public bool Contains(ConsequenceTag consequence) { return _isCached[(int)consequence]; } } } ================================================ FILE: VariantAnnotation/Annotator.cs ================================================ using System.Collections.Generic; using System.Linq; using ErrorHandling.Exceptions; using Genome; using OptimizedCore; using RepeatExpansions; using VariantAnnotation.Interface; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.GeneAnnotation; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Pools; using Variants; namespace VariantAnnotation { public sealed class Annotator : IAnnotator { private readonly IAnnotationProvider _saProvider; private readonly IAnnotationProvider _gsaProvider; private readonly IAnnotationProvider _taProvider; private readonly IAnnotationProvider _lcrProvider; private readonly ISequenceProvider _sequenceProvider; private readonly IAnnotationProvider _conservationProvider; private readonly IGeneAnnotationProvider _geneAnnotationProvider; private readonly IRepeatExpansionProvider _repeatExpansionProvider; private readonly HashSet _affectedGenes; private bool _annotateMito; public GenomeAssembly Assembly { get; } public Annotator(IAnnotationProvider taProvider, ISequenceProvider sequenceProvider, IAnnotationProvider saProvider, IAnnotationProvider conservationProvider, IAnnotationProvider lcrProvider, IGeneAnnotationProvider geneAnnotationProvider, IRepeatExpansionProvider repeatExpansionProvider, IAnnotationProvider gsaProvider ) { _saProvider = saProvider; _gsaProvider = gsaProvider; _taProvider = taProvider; _sequenceProvider = sequenceProvider; _conservationProvider = conservationProvider; _lcrProvider = lcrProvider; _geneAnnotationProvider = geneAnnotationProvider; _repeatExpansionProvider = repeatExpansionProvider; _affectedGenes = new HashSet(); Assembly = GetAssembly(); } private GenomeAssembly GetAssembly() { var assemblies = new Dictionary>(); AddAssembly(assemblies, _taProvider); AddAssembly(assemblies, _saProvider); AddAssembly(assemblies, _gsaProvider); AddAssembly(assemblies, _sequenceProvider); AddAssembly(assemblies, _conservationProvider); if (assemblies.Count == 0) return GenomeAssembly.Unknown; if (assemblies.Count != 1) throw new UserErrorException(GetAssemblyErrorMessage(assemblies)); return assemblies.First().Key; } private static void AddAssembly(Dictionary> assemblies, IProvider provider) { if (provider == null) return; if (assemblies.TryGetValue(provider.Assembly, out List assemblyList)) assemblyList.Add(provider.Name); else assemblies[provider.Assembly] = new List {provider.Name}; } private static string GetAssemblyErrorMessage(Dictionary> assemblies) { var sb = StringBuilderPool.Get(); sb.AppendLine("Not all of the data sources have the same genome assembly:"); foreach ((GenomeAssembly genomeAssembly, List dataSources) in assemblies) sb.AppendLine($"- Using {genomeAssembly}: {string.Join(", ", dataSources)}"); return StringBuilderPool.GetStringAndReturn(sb); } public IAnnotatedPosition Annotate(IPosition position) { if (position == null) return null; IAnnotatedVariant[] annotatedVariants = GetAnnotatedVariants(position.Variants); //var annotatedPosition = new AnnotatedPosition(position, annotatedVariants); var annotatedPosition = AnnotatedPositionPool.Get(position, annotatedVariants); if (annotatedPosition.AnnotatedVariants == null || annotatedPosition.AnnotatedVariants.Length == 0 || position.Chromosome.UcscName == "chrM" && !_annotateMito ) return annotatedPosition; _sequenceProvider?.Annotate(annotatedPosition); _lcrProvider?.Annotate(annotatedPosition); _repeatExpansionProvider?.Annotate(annotatedPosition); _conservationProvider?.Annotate(annotatedPosition); _taProvider.Annotate(annotatedPosition); _saProvider?.Annotate(annotatedPosition); // needs to come after _taProvider for gene fusions _gsaProvider?.Annotate(annotatedPosition); TrackAffectedGenes(annotatedPosition); return annotatedPosition; } private void TrackAffectedGenes(IAnnotatedPosition annotatedPosition) { if (_geneAnnotationProvider == null) return; foreach (var variant in annotatedPosition.AnnotatedVariants) { AddGenesFromTranscripts(variant.Transcripts); } } private void AddGenesFromTranscripts(IList transcripts) { foreach (var transcript in transcripts) { if (IsFlankingTranscript(transcript)) continue; _affectedGenes.Add(transcript.Transcript.Gene.Symbol); } } private static bool IsFlankingTranscript(IAnnotatedTranscript transcript) { if (transcript.Consequences == null) return false; // ReSharper disable once LoopCanBeConvertedToQuery foreach (var consequence in transcript.Consequences) { if (consequence == ConsequenceTag.downstream_gene_variant || consequence == ConsequenceTag.upstream_gene_variant) return true; } return false; } internal static IAnnotatedVariant[] GetAnnotatedVariants(IVariant[] variants) { if (variants?[0].Behavior == null) return null; int numVariants = variants.Length; var annotatedVariants = new IAnnotatedVariant[numVariants]; for (var i = 0; i < numVariants; i++) annotatedVariants[i] = AnnotatedVariantPool.Get(variants[i]); return annotatedVariants; } public IEnumerable GetGeneAnnotations() { var geneAnnotations = new List(); foreach (string gene in _affectedGenes.OrderBy(x => x)) { string annotation = _geneAnnotationProvider.Annotate(gene); if (string.IsNullOrEmpty(annotation)) continue; geneAnnotations.Add(annotation); } return geneAnnotations.Count > 0 ? geneAnnotations : null; } public void EnableMitochondrialAnnotation() => _annotateMito = true; } } ================================================ FILE: VariantAnnotation/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: VariantAnnotation/Caches/DataStructures/CodingRegion.cs ================================================ using IO; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Caches.DataStructures { public sealed class CodingRegion : ICodingRegion { public int Start { get; } public int End { get; } public int CdnaStart { get; } public int CdnaEnd { get; } public int Length { get; } public CodingRegion(int start, int end, int cdnaStart, int cdnaEnd, int length) { Start = start; End = end; CdnaStart = cdnaStart; CdnaEnd = cdnaEnd; Length = length; } public static ICodingRegion Read(BufferedBinaryReader reader) { int genomicStart = reader.ReadOptInt32(); int genomicEnd = reader.ReadOptInt32(); int cdnaStart = reader.ReadOptInt32(); int cdnaEnd = reader.ReadOptInt32(); int length = reader.ReadOptInt32(); return new CodingRegion(genomicStart, genomicEnd, cdnaStart, cdnaEnd, length); } public void Write(IExtendedBinaryWriter writer) { writer.WriteOpt(Start); writer.WriteOpt(End); writer.WriteOpt(CdnaStart); writer.WriteOpt(CdnaEnd); writer.WriteOpt(Length); } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/EncodedTranscriptData.cs ================================================ using IO; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Caches.DataStructures { public sealed class EncodedTranscriptData { private readonly ushort _info; private readonly byte _contents; // contents // +====+====+====+====+====+====+====+====+ // |Tran|TReg|////|Mirn|Poly|Sift|StrExonPh| // +====+====+====+====+====+====+====+====+ private const int StartExonMask = 3; private const int SiftMask = 4; private const int PolyPhenMask = 8; private const int MirnasMask = 16; private const int TranscriptRegionsMask = 64; private const int TranslationMask = 128; public byte StartExonPhase => (byte)(_contents & StartExonMask); public bool HasSift => (_contents & SiftMask) != 0; public bool HasPolyPhen => (_contents & PolyPhenMask) != 0; public bool HasMirnas => (_contents & MirnasMask) != 0; public bool HasRnaEdits => (_info & RnaEditsMask) != 0; public bool HasSelenocysteines => (_info & SelenocysteinesMask) != 0; public bool HasTranscriptRegions => (_contents & TranscriptRegionsMask) != 0; public bool HasTranslation => (_contents & TranslationMask) != 0; // info // +====+====+====+====+====+====+====+====+====+====+====+====+====+====+====+====+ // |Cano| Source |\\\\|Sele|RnaE|CSNF|CENF| BioType | // +====+====+====+====+====+====+====+====+====+====+====+====+====+====+====+====+ private const int BioTypeMask = 0xff; private const int CdsStartNotFoundMask = 0x100; private const int CdsEndNotFoundMask = 0x200; private const int TranscriptSourceMask = 0x3; private const int CanonicalMask = 0x8000; private const int TranscriptSourceShift = 13; private const int RnaEditsMask = 1024; private const int SelenocysteinesMask = 2048; public BioType BioType => (BioType)(_info & BioTypeMask); public bool CdsStartNotFound => (_info & CdsStartNotFoundMask) != 0; public bool CdsEndNotFound => (_info & CdsEndNotFoundMask) != 0; public Source TranscriptSource => (Source)((_info >> TranscriptSourceShift) & TranscriptSourceMask); public bool IsCanonical => (_info & CanonicalMask) != 0; private EncodedTranscriptData(ushort info, byte contents) { _info = info; _contents = contents; } public static EncodedTranscriptData GetEncodedTranscriptData(BioType bioType, bool cdsStartNotFound, bool cdsEndNotFound, Source source, bool isCanonical, bool hasSift, bool hasPolyPhen, bool hasMicroRnas, bool hasRnaEdits, bool hasSelenocysteines, bool hasTranscriptRegions, bool hasTranslation, byte startExonPhase) { ushort info = (ushort)bioType; if (cdsStartNotFound) info |= CdsStartNotFoundMask; if (cdsEndNotFound) info |= CdsEndNotFoundMask; if (isCanonical) info |= CanonicalMask; if (hasRnaEdits) info |= RnaEditsMask; if (hasSelenocysteines) info |= SelenocysteinesMask; info |= (ushort)((ushort)source << TranscriptSourceShift); byte contents = startExonPhase; if (hasSift) contents |= SiftMask; if (hasPolyPhen) contents |= PolyPhenMask; if (hasMicroRnas) contents |= MirnasMask; if (hasTranscriptRegions) contents |= TranscriptRegionsMask; if (hasTranslation) contents |= TranslationMask; return new EncodedTranscriptData(info, contents); } public static EncodedTranscriptData Read(BufferedBinaryReader reader) { var info = reader.ReadUInt16(); var contents = reader.ReadByte(); return new EncodedTranscriptData(info, contents); } internal void Write(IExtendedBinaryWriter writer) { writer.Write(_info); writer.Write(_contents); } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/Gene.cs ================================================ using System.Collections.Generic; using Genome; using IO; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Caches.DataStructures { public sealed class Gene : IGene { public int Start { get; } public int End { get; } public Chromosome Chromosome { get; } public bool OnReverseStrand { get; } public string Symbol { get; } public ICompactId EntrezGeneId { get; } public ICompactId EnsemblId { get; } public int HgncId { get; } public Gene(Chromosome chromosome, int start, int end, bool onReverseStrand, string symbol, int hgncId, CompactId entrezGeneId, CompactId ensemblId) { OnReverseStrand = onReverseStrand; Symbol = symbol; HgncId = hgncId; EntrezGeneId = entrezGeneId; EnsemblId = ensemblId; Start = start; End = end; Chromosome = chromosome; } public static IGene Read(IBufferedBinaryReader reader, Dictionary indexToChromosome) { ushort referenceIndex = reader.ReadOptUInt16(); int start = reader.ReadOptInt32(); int end = reader.ReadOptInt32(); bool onReverseStrand = reader.ReadBoolean(); string symbol = reader.ReadAsciiString(); int hgncId = reader.ReadOptInt32(); var entrezId = CompactId.Read(reader); var ensemblId = CompactId.Read(reader); return new Gene(indexToChromosome[referenceIndex], start, end, onReverseStrand, symbol, hgncId, entrezId, ensemblId); } public void Write(IExtendedBinaryWriter writer) { writer.WriteOpt(Chromosome.Index); writer.WriteOpt(Start); writer.WriteOpt(End); writer.Write(OnReverseStrand); writer.WriteOptAscii(Symbol); writer.WriteOpt(HgncId); // ReSharper disable ImpureMethodCallOnReadonlyValueField EntrezGeneId.Write(writer); EnsemblId.Write(writer); } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/IndexEntry.cs ================================================ using System.IO; namespace VariantAnnotation.Caches.DataStructures { public struct IndexEntry { public long FileOffset; public int Count; public void Read(BinaryReader reader) { FileOffset = reader.ReadInt64(); Count = reader.ReadInt32(); } public void Write(BinaryWriter writer) { writer.Write(FileOffset); writer.Write(Count); } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/Prediction.cs ================================================ using System.IO; using IO; using VariantAnnotation.AnnotatedPositions.Transcript; namespace VariantAnnotation.Caches.DataStructures { public sealed class Prediction { private readonly byte[] _data; private readonly Entry[] _lut; // A X C D E F G H I X K L M N X P Q R S T X V W X Y X private static readonly int[] AminoAcidIndices = { 0, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11, -1, 12, 13, 14, 15, 16, -1, 17, 18, -1, 19, -1 }; private const int NumAminoAcids = 20; private const byte NullEntry = 0xff; public Prediction(byte[] data, Entry[] lut) { _data = data; _lut = lut; } public Entry GetPrediction(char newAminoAcid, int aaPosition) { // sanity check: skip stop codons if (newAminoAcid == AminoAcids.StopCodonChar || newAminoAcid == 'X') return null; int index = GetIndex(newAminoAcid, aaPosition); // sanity check: skip instances where the data isn't long enough if (index >= _data.Length) return null; byte entry = _data[index]; return entry == NullEntry ? null : _lut[entry]; } private static int GetIndex(char newAminoAcid, int aaPosition) { int asciiIndex = char.ToUpper(newAminoAcid) - 'A'; // sanity check: make sure the array index is within range if (asciiIndex < 0 || asciiIndex >= 26) { throw new InvalidDataException($"Expected an array index on the interval [0, 25], but observed the following: {asciiIndex} ({newAminoAcid})"); } int aaIndex = AminoAcidIndices[asciiIndex]; // sanity check: make sure the array index is within range if (aaIndex == -1) { throw new InvalidDataException($"An invalid amino acid was given: {newAminoAcid}"); } return NumAminoAcids * (aaPosition - 1) + aaIndex; } public void Write(BinaryWriter writer) { writer.Write(_data.Length); writer.Write(_data); } public static Prediction Read(ExtendedBinaryReader reader, Entry[] lut) { int numBytes = reader.ReadInt32(); var data = reader.ReadBytes(numBytes); return new Prediction(data, lut); } public sealed class Entry { public readonly double Score; public readonly byte EnumIndex; public Entry(double score, byte enumIndex) { Score = score; EnumIndex = enumIndex; } public static Entry ReadEntry(ExtendedBinaryReader reader) { double score = reader.ReadDouble(); byte enumIndex = reader.ReadByte(); return new Entry(score, enumIndex); } public void Write(BinaryWriter writer) { writer.Write(Score); writer.Write(EnumIndex); } } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/RegulatoryRegion.cs ================================================ using System.Collections.Generic; using Genome; using IO; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; namespace VariantAnnotation.Caches.DataStructures { public sealed class RegulatoryRegion : IRegulatoryRegion { public int Start { get; } public int End { get; } public Chromosome Chromosome { get; } public ICompactId Id { get; } public RegulatoryRegionType Type { get; } public RegulatoryRegion(Chromosome chromosome, int start, int end, CompactId id, RegulatoryRegionType type) { Id = id; Type = type; Start = start; End = end; Chromosome = chromosome; } public static IRegulatoryRegion Read(IBufferedBinaryReader reader, Dictionary chromosomeIndexDictionary) { var refIndex = reader.ReadOptUInt16(); int start = reader.ReadOptInt32(); int end = reader.ReadOptInt32(); var type = (RegulatoryRegionType)reader.ReadByte(); var id = CompactId.Read(reader); return new RegulatoryRegion(chromosomeIndexDictionary[refIndex], start, end, id, type); } public void Write(IExtendedBinaryWriter writer) { writer.WriteOpt(Chromosome.Index); writer.WriteOpt(Start); writer.WriteOpt(End); writer.Write((byte)Type); // ReSharper disable once ImpureMethodCallOnReadonlyValueField Id.Write(writer); } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/RnaEdit.cs ================================================ using IO; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.Caches.DataStructures { public sealed class RnaEdit : IRnaEdit { public int Start { get; } public int End { get; } public string Bases { get; } public VariantType Type { get; set; } public RnaEdit(int start, int end, string bases) { Start = start; End = end; Bases = bases; Type = VariantType.unknown; } public static IRnaEdit Read(BufferedBinaryReader reader) { int start = reader.ReadOptInt32(); int end = reader.ReadOptInt32(); string bases = reader.ReadAsciiString(); return new RnaEdit(start, end, bases); } public void Write(IExtendedBinaryWriter writer) { writer.WriteOpt(Start); writer.WriteOpt(End); writer.WriteOptAscii(Bases); } public int CompareTo(IRnaEdit other) { return Start.CompareTo(other.Start); } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/Transcript.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using Intervals; using IO; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Caches.DataStructures { public sealed class Transcript : ITranscript { public Chromosome Chromosome { get; } public int Start { get; } public int End { get; } public ICompactId Id { get; } public BioType BioType { get; } public bool IsCanonical { get; } public Source Source { get; } public IGene Gene { get; } public ITranscriptRegion[] TranscriptRegions { get; } public ushort NumExons { get; } public int TotalExonLength { get; } public byte StartExonPhase { get; } public int SiftIndex { get; } public int PolyPhenIndex { get; } public ITranslation Translation { get; } public IInterval[] MicroRnas { get; } public int[] Selenocysteines { get; } public IRnaEdit[] RnaEdits { get; } public bool CdsStartNotFound { get; } public bool CdsEndNotFound { get; } public ISequence CodingSequence { get; set; } public ISequence CdnaSequence { get; set; } public Transcript(Chromosome chromosome, int start, int end, ICompactId id, ITranslation translation, BioType bioType, IGene gene, int totalExonLength, byte startExonPhase, bool isCanonical, ITranscriptRegion[] transcriptRegions, ushort numExons, IInterval[] microRnas, int siftIndex, int polyPhenIndex, Source source, bool cdsStartNotFound, bool cdsEndNotFound, int[] selenocysteines, IRnaEdit[] rnaEdits) { Chromosome = chromosome; Start = start; End = end; Id = id; Translation = translation; BioType = bioType; Gene = gene; TotalExonLength = totalExonLength; StartExonPhase = startExonPhase; IsCanonical = isCanonical; TranscriptRegions = transcriptRegions; NumExons = numExons; MicroRnas = microRnas; SiftIndex = siftIndex; PolyPhenIndex = polyPhenIndex; Source = source; CdsStartNotFound = cdsStartNotFound; CdsEndNotFound = cdsEndNotFound; Selenocysteines = selenocysteines; RnaEdits = rnaEdits; } public static ITranscript Read(BufferedBinaryReader reader, Dictionary chromosomeIndexDictionary, IGene[] cacheGenes, ITranscriptRegion[] cacheTranscriptRegions, IInterval[] cacheMirnas, string[] cachePeptideSeqs) { // transcript ushort referenceIndex = reader.ReadOptUInt16(); int start = reader.ReadOptInt32(); int end = reader.ReadOptInt32(); var id = CompactId.Read(reader); // gene int geneIndex = reader.ReadOptInt32(); var gene = cacheGenes[geneIndex]; // encoded data var encoded = EncodedTranscriptData.Read(reader); // transcript regions ITranscriptRegion[] transcriptRegions = encoded.HasTranscriptRegions ? ReadIndices(reader, cacheTranscriptRegions) : null; ushort numExons = reader.ReadOptUInt16(); // protein function predictions int siftIndex = encoded.HasSift ? reader.ReadOptInt32() : -1; int polyphenIndex = encoded.HasPolyPhen ? reader.ReadOptInt32() : -1; // translation var translation = encoded.HasTranslation ? DataStructures.Translation.Read(reader, cachePeptideSeqs) : null; // attributes IInterval[] mirnas = encoded.HasMirnas ? ReadIndices(reader, cacheMirnas) : null; IRnaEdit[] rnaEdits = encoded.HasRnaEdits ? ReadItems(reader, RnaEdit.Read) : null; int[] selenocysteines = encoded.HasSelenocysteines ? ReadItems(reader, x => x.ReadOptInt32()) : null; return new Transcript(chromosomeIndexDictionary[referenceIndex], start, end, id, translation, encoded.BioType, gene, ExonUtilities.GetTotalExonLength(transcriptRegions), encoded.StartExonPhase, encoded.IsCanonical, transcriptRegions, numExons, mirnas, siftIndex, polyphenIndex, encoded.TranscriptSource, encoded.CdsStartNotFound, encoded.CdsEndNotFound, selenocysteines, rnaEdits); } /// /// writes the transcript to the binary writer /// public void Write(IExtendedBinaryWriter writer, Dictionary geneIndices, Dictionary transcriptRegionIndices, Dictionary microRnaIndices, Dictionary peptideIndices) { // transcript writer.WriteOpt(Chromosome.Index); writer.WriteOpt(Start); writer.WriteOpt(End); // ReSharper disable once ImpureMethodCallOnReadonlyValueField Id.Write(writer); // gene writer.WriteOpt(GetIndex(Gene, geneIndices)); // encoded data var encoded = EncodedTranscriptData.GetEncodedTranscriptData(BioType, CdsStartNotFound, CdsEndNotFound, Source, IsCanonical, SiftIndex != -1, PolyPhenIndex != -1, MicroRnas != null, RnaEdits != null, Selenocysteines != null, TranscriptRegions != null, Translation != null, StartExonPhase); encoded.Write(writer); // transcript regions if (encoded.HasTranscriptRegions) WriteIndices(writer, TranscriptRegions, transcriptRegionIndices); writer.WriteOpt(NumExons); // protein function predictions if (encoded.HasSift) writer.WriteOpt(SiftIndex); if (encoded.HasPolyPhen) writer.WriteOpt(PolyPhenIndex); // translation if (encoded.HasTranslation) { // ReSharper disable once PossibleNullReferenceException int peptideIndex = GetIndex(Translation.PeptideSeq, peptideIndices); Translation.Write(writer, peptideIndex); } // attributes if (encoded.HasMirnas) WriteIndices(writer, MicroRnas, microRnaIndices); if (encoded.HasRnaEdits) WriteItems(writer, RnaEdits, (x, y) => x.Write(y)); if (encoded.HasSelenocysteines) WriteItems(writer, Selenocysteines, (x, y) => y.WriteOpt(x)); } private static T[] ReadItems(BufferedBinaryReader reader, Func readFunc) { int numItems = reader.ReadOptInt32(); var items = new T[numItems]; for (var i = 0; i < numItems; i++) items[i] = readFunc(reader); return items; } private static void WriteItems(IExtendedBinaryWriter writer, T[] items, Action writeAction) { writer.WriteOpt(items.Length); foreach (var item in items) writeAction(item, writer); } private static T[] ReadIndices(IBufferedBinaryReader reader, T[] cachedItems) { int numItems = reader.ReadOptInt32(); var items = new T[numItems]; for (var i = 0; i < numItems; i++) { int index = reader.ReadOptInt32(); items[i] = cachedItems[index]; } return items; } private static void WriteIndices(IExtendedBinaryWriter writer, T[] items, IReadOnlyDictionary indices) { writer.WriteOpt(items.Length); foreach (var item in items) writer.WriteOpt(GetIndex(item, indices)); } private static int GetIndex(T item, IReadOnlyDictionary indices) { if (item == null) return -1; if (!indices.TryGetValue(item, out int index)) { throw new InvalidDataException($"Unable to locate the {typeof(T)} in the indices: {item}"); } return index; } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/TranscriptRegion.cs ================================================ using IO; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Caches.DataStructures { public sealed class TranscriptRegion : ITranscriptRegion { public TranscriptRegionType Type { get; } public ushort Id { get; } public int Start { get; } public int End { get; } public int CdnaStart { get; } public int CdnaEnd { get; } public TranscriptRegion(TranscriptRegionType type, ushort id, int start, int end, int cdnaStart, int cdnaEnd) { Type = type; Id = id; Start = start; End = end; CdnaStart = cdnaStart; CdnaEnd = cdnaEnd; } public static ITranscriptRegion Read(BufferedBinaryReader reader) { TranscriptRegionType type = (TranscriptRegionType)reader.ReadByte(); ushort id = reader.ReadOptUInt16(); int genomicStart = reader.ReadOptInt32(); int genomicEnd = reader.ReadOptInt32(); int cdnaStart = reader.ReadOptInt32(); int cdnaEnd = reader.ReadOptInt32(); return new TranscriptRegion(type, id, genomicStart, genomicEnd, cdnaStart, cdnaEnd); } public void Write(IExtendedBinaryWriter writer) { writer.Write((byte)Type); writer.WriteOpt(Id); writer.WriteOpt(Start); writer.WriteOpt(End); writer.WriteOpt(CdnaStart); writer.WriteOpt(CdnaEnd); } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/TranscriptRegionExtensions.cs ================================================ using System; using VariantAnnotation.Algorithms; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Caches.DataStructures { public static class TranscriptRegionExtensions { public static int BinarySearch(this ITranscriptRegion[] regions, int position) { var begin = 0; int end = regions.Length - 1; while (begin <= end) { int index = begin + (end - begin >> 1); var region = regions[index]; if (position >= region.Start && position <= region.End) return index; if (region.End < position) begin = index + 1; else if (position < region.Start) end = index - 1; } return ~begin; } public static (int ExonStart, int ExonEnd, int IntronStart, int IntronEnd) GetExonsAndIntrons( this ITranscriptRegion[] regions, int startIndex, int endIndex) { int affectedStartIndex = GetAffectedRegionIndex(startIndex); int affectedEndIndex = GetAffectedRegionIndex(endIndex); var exons = regions.FindDesiredRegionIds(x => x == TranscriptRegionType.Exon || x == TranscriptRegionType.Gap, affectedStartIndex, affectedEndIndex); var introns = regions.FindDesiredRegionIds(x => x == TranscriptRegionType.Intron, affectedStartIndex, affectedEndIndex); return (exons.Start, exons.End, introns.Start, introns.End); } private static (int Start, int End) FindDesiredRegionIds(this ITranscriptRegion[] regions, Func hasDesiredRegion, int startIndex, int endIndex) { int regionStart = FindFirst(regions, hasDesiredRegion, startIndex, endIndex); int newStartIndex = regionStart != -1 ? regionStart : startIndex; int regionEnd = FindLast(regions, hasDesiredRegion, newStartIndex, endIndex); int startId = regionStart == -1 ? -1 : regions[regionStart].Id; int endId = regionEnd == -1 ? -1 : regions[regionEnd].Id; if (endId < startId) Swap.Int(ref startId, ref endId); return (startId, endId); } private static int FindFirst(ITranscriptRegion[] regions, Func hasDesiredRegion, int startIndex, int endIndex) { for (int i = startIndex; i <= endIndex; i++) if (hasDesiredRegion(regions[i].Type)) return i; return -1; } private static int FindLast(ITranscriptRegion[] regions, Func hasDesiredRegion, int startIndex, int endIndex) { for (int i = endIndex; i >= startIndex; i--) if (hasDesiredRegion(regions[i].Type)) return i; return -1; } private static int GetAffectedRegionIndex(int index) { if (index >= 0) return index; index = ~index; return index == 0 ? 0 : index - 1; } } } ================================================ FILE: VariantAnnotation/Caches/DataStructures/Translation.cs ================================================ using IO; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Caches.DataStructures { public sealed class Translation : ITranslation { public ICodingRegion CodingRegion { get; } public ICompactId ProteinId { get; } public string PeptideSeq { get; } public Translation(ICodingRegion codingRegion, CompactId proteinId, string peptideSeq) { CodingRegion = codingRegion; ProteinId = proteinId; PeptideSeq = peptideSeq; } public static ITranslation Read(BufferedBinaryReader reader, string[] peptideSeqs) { var codingRegion = DataStructures.CodingRegion.Read(reader); var proteinId = CompactId.Read(reader); var peptideIndex = reader.ReadOptInt32(); var peptideSeq = peptideIndex == -1 ? null : peptideSeqs[peptideIndex]; return new Translation(codingRegion, proteinId, peptideSeq); } public void Write(IExtendedBinaryWriter writer, int peptideIndex) { CodingRegion.Write(writer); ProteinId.Write(writer); writer.WriteOpt(peptideIndex); } } } ================================================ FILE: VariantAnnotation/Caches/PredictionCache.cs ================================================ using System.Collections.Generic; using Genome; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.Interface.Providers; namespace VariantAnnotation.Caches { public sealed class PredictionCache : IPredictionCache { private readonly Prediction[] _predictions; public string Name { get; } = string.Empty; public GenomeAssembly Assembly { get; } public IEnumerable DataSourceVersions { get; } = new List(); private readonly string[] _descriptions; public PredictionCache(GenomeAssembly genomeAssembly, Prediction[] predictions, string[] descriptions) { Assembly = genomeAssembly; _predictions = predictions; _descriptions = descriptions; } public PredictionScore GetProteinFunctionPrediction(int predictionIndex, char newAminoAcid, int aaPosition) { var entry = _predictions[predictionIndex].GetPrediction(newAminoAcid, aaPosition); return entry == null ? null : new PredictionScore(_descriptions[entry.EnumIndex], entry.Score); } } } ================================================ FILE: VariantAnnotation/Caches/TranscriptCache.cs ================================================ using System.Collections.Generic; using Genome; using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.Interface.Providers; namespace VariantAnnotation.Caches { public sealed class TranscriptCache : ITranscriptCache { public IIntervalForest TranscriptIntervalForest { get; } public IIntervalForest RegulatoryIntervalForest { get; } public string Name { get; } public GenomeAssembly Assembly { get; } public IEnumerable DataSourceVersions { get; } public TranscriptCache(IEnumerable dataSourceVersions, GenomeAssembly genomeAssembly, IntervalArray[] transcriptIntervalArrays, IntervalArray[] regulatoryRegionIntervalArrays) { Name = "Transcript annotation provider"; DataSourceVersions = dataSourceVersions; Assembly = genomeAssembly; TranscriptIntervalForest = new IntervalForest(transcriptIntervalArrays); RegulatoryIntervalForest = new IntervalForest(regulatoryRegionIntervalArrays); } } } ================================================ FILE: VariantAnnotation/Caches/TranscriptCacheData.cs ================================================ using System; using System.Collections.Generic; using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.IO.Caches; using VariantAnnotation.Providers; namespace VariantAnnotation.Caches { public sealed class TranscriptCacheData { public readonly CacheHeader Header; public readonly IGene[] Genes; public readonly ITranscriptRegion[] TranscriptRegions; public readonly IInterval[] Mirnas; public readonly string[] PeptideSeqs; public readonly IntervalArray[] TranscriptIntervalArrays; public readonly IntervalArray[] RegulatoryRegionIntervalArrays; public TranscriptCacheData(CacheHeader header, IGene[] genes, ITranscriptRegion[] transcriptRegions, IInterval[] mirnas, string[] peptideSeqs, IntervalArray[] transcriptIntervalArrays, IntervalArray[] regulatoryRegionIntervalArrays) { Header = header; Genes = genes; TranscriptRegions = transcriptRegions; Mirnas = mirnas; PeptideSeqs = peptideSeqs; TranscriptIntervalArrays = transcriptIntervalArrays; RegulatoryRegionIntervalArrays = regulatoryRegionIntervalArrays; } public TranscriptCache GetCache() { var dataSourceVersions = GetDataSourceVersions(Header); return new TranscriptCache(dataSourceVersions, Header.Assembly, TranscriptIntervalArrays, RegulatoryRegionIntervalArrays); } private static IEnumerable GetDataSourceVersions(CacheHeader header) { var dataSourceVersions = new List(); if (header == null) return dataSourceVersions; ushort vepVersion = header.Custom.VepVersion; // TODO: Embed the data source version in the next cache file format. This hack let's us handle the SARS-CoV-2 genome DataSourceVersion dataSourceVersion = vepVersion == 0 ? new DataSourceVersion("RefSeq", "NC_045512.2", new DateTime(2020,3,20,0,0,0,DateTimeKind.Utc).Ticks, "Severe acute respiratory syndrome coronavirus 2 (SARS-CoV2)") : new DataSourceVersion("VEP", vepVersion.ToString(), header.CreationTimeTicks, header.Source.ToString()); dataSourceVersions.Add(dataSourceVersion); return dataSourceVersions; } } } ================================================ FILE: VariantAnnotation/Caches/TranscriptIntervalForestExtensions.cs ================================================ using Genome; using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Caches { public static class TranscriptIntervalForestExtensions { public static ITranscript[] GetAllFlankingValues(this IIntervalForest transcriptIntervalForest, IChromosomeInterval interval) => transcriptIntervalForest.GetAllOverlappingValues(interval.Chromosome.Index, interval.Start - interval.Chromosome.FlankingLength, interval.End + interval.Chromosome.FlankingLength); } } ================================================ FILE: VariantAnnotation/Caches/Utilities/ExonUtilities.cs ================================================ using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Caches.Utilities { public static class ExonUtilities { public static int GetTotalExonLength(ITranscriptRegion[] regions) { int totalExonLength = 0; foreach (var region in regions) { if (region.Type != TranscriptRegionType.Exon) continue; totalExonLength += region.End - region.Start + 1; } return totalExonLength; } } } ================================================ FILE: VariantAnnotation/Caches/Utilities/GeneForestGenerator.cs ================================================ using System.Collections.Generic; using System.Linq; using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Utilities; namespace VariantAnnotation.Caches.Utilities { public static class GeneForestGenerator { private static readonly IntervalArray EmptyIntervalArray = new IntervalArray(new Interval[0]); public static IntervalForest GetGeneForest(IntervalArray[] transcriptIntervalArrays) { int numChromosomes = transcriptIntervalArrays.Length; var geneIntervalArrays = new IntervalArray[numChromosomes]; var geneComparer = new GeneComparer(); for (var chrIndex = 0; chrIndex < numChromosomes; chrIndex++) { if (transcriptIntervalArrays[chrIndex] == null) { geneIntervalArrays[chrIndex] = EmptyIntervalArray; continue; // assign an empty IntervalArray to this chr } var geneList = new List(); // keeps the order of genes, as the intervals are already sorted at trasncripts level var geneSet = new HashSet(geneComparer); foreach (var transcriptInterval in transcriptIntervalArrays[chrIndex].Array) { var transcript = transcriptInterval.Value; var gene = transcript.Gene; if (geneSet.Contains(gene)) continue; geneSet.Add(gene); geneList.Add(gene); } geneIntervalArrays[chrIndex] = new IntervalArray(geneList.Select(GetGeneInterval).ToArray()); } return new IntervalForest(geneIntervalArrays); } private static Interval GetGeneInterval(IGene gene) => new Interval(gene.Start, gene.End, gene); } } ================================================ FILE: VariantAnnotation/Caches/Utilities/RnaEditUtilities.cs ================================================ using System; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.Caches.Utilities { public static class RnaEditUtilities { public static VariantType GetRnaEditType(IRnaEdit rnaEdit) { if (string.IsNullOrEmpty(rnaEdit.Bases)) return VariantType.deletion; if (rnaEdit.Start == rnaEdit.End && rnaEdit.Bases.Length == 1) return VariantType.SNV; if (rnaEdit.Start == rnaEdit.End + 1 && !string.IsNullOrEmpty(rnaEdit.Bases)) return VariantType.insertion; if (Math.Abs(rnaEdit.End - rnaEdit.Start) + 1 == rnaEdit.Bases.Length) return VariantType.MNV; return VariantType.unknown; } public static void SetTypesAndSort(IRnaEdit[] rnaEdits) { foreach (var rnaEdit in rnaEdits) { if (rnaEdit.Type != VariantType.unknown) return; rnaEdit.Type = GetRnaEditType(rnaEdit); } Array.Sort(rnaEdits); } } } ================================================ FILE: VariantAnnotation/GeneAnnotation/GeneAnnotationProvider.cs ================================================ using System.Collections.Generic; using System.IO; using System.Linq; using Genome; using OptimizedCore; using VariantAnnotation.Interface.GeneAnnotation; using VariantAnnotation.Interface.Providers; using VariantAnnotation.IO; using VariantAnnotation.NSA; namespace VariantAnnotation.GeneAnnotation { public sealed class GeneAnnotationProvider : IGeneAnnotationProvider { public string Name { get; } public GenomeAssembly Assembly => GenomeAssembly.Unknown; public IEnumerable DataSourceVersions => _ngaReaders.Select(x => x.Version); private readonly List _ngaReaders; public string Annotate(string geneName) { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("name", geneName); var hasAnnotation = false; foreach (var ngaReader in _ngaReaders) { string jsonString = ngaReader.GetAnnotation(geneName); jsonObject.AddStringValue(ngaReader.JsonKey, jsonString, false); if (!string.IsNullOrEmpty(jsonString)) hasAnnotation = true; } if (!hasAnnotation) { StringBuilderPool.GetStringAndReturn(sb); return null; } sb.Append(JsonObject.CloseBrace); return StringBuilderPool.GetStringAndReturn(sb); } public GeneAnnotationProvider(IEnumerable dbStreams) { Name = "Gene annotation provider"; _ngaReaders = new List(); foreach (var dbStream in dbStreams) _ngaReaders.Add(NgaReader.Read(dbStream)); } public void Dispose() {} } } ================================================ FILE: VariantAnnotation/GeneFusions/Calling/BreakEndAdjacency.cs ================================================ namespace VariantAnnotation.GeneFusions.Calling { public sealed record BreakEndAdjacency(BreakPoint Origin, BreakPoint Partner); } ================================================ FILE: VariantAnnotation/GeneFusions/Calling/BreakEndAdjacencyFactory.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Text.RegularExpressions; using Genome; using Variants; namespace VariantAnnotation.GeneFusions.Calling { public static class BreakEndAdjacencyFactory { private const string ReverseBracket = "]"; private static readonly Regex ForwardRegex = new(@"\w+([\[\]])(.+):(\d+)([\[\]])", RegexOptions.Compiled); private static readonly Regex ReverseRegex = new(@"([\[\]])(.+):(\d+)([\[\]])\w+", RegexOptions.Compiled); public static BreakEndAdjacency[] CreateAdjacencies(ISimpleVariant variant, Dictionary refNameToChromosome, bool isInv3, bool isInv5) => variant.Type == VariantType.translocation_breakend ? CreateFromTranslocation(variant, refNameToChromosome) : CreateFromSymbolicAllele(variant, variant.Type, isInv3, isInv5); public static BreakEndAdjacency[] CreateFromTranslocation(ISimpleVariant variant, Dictionary refNameToChromosome) => variant.AltAllele.StartsWith(variant.RefAllele) ? ConvertTranslocation(variant, ForwardRegex, false, 4, refNameToChromosome) : ConvertTranslocation(variant, ReverseRegex, true, 1, refNameToChromosome); private static BreakEndAdjacency[] ConvertTranslocation(ISimpleVariant variant, Regex regex, bool onReverseStrand, int partnerBracketIndex, Dictionary refNameToChromosome) { Match match = regex.Match(variant.AltAllele); if (!match.Success) throw new InvalidDataException( $"Unable to successfully parse the complex rearrangements for the following allele: {variant.AltAllele}"); bool partnerOnReverseStrand = match.Groups[partnerBracketIndex].Value == ReverseBracket; var partnerPosition = Convert.ToInt32(match.Groups[3].Value); string partnerReferenceName = match.Groups[2].Value; Chromosome partnerChromosome = ReferenceNameUtilities.GetChromosome(refNameToChromosome, partnerReferenceName); var origin = new BreakPoint(variant.Chromosome, variant.Start, onReverseStrand); var partner = new BreakPoint(partnerChromosome, partnerPosition, partnerOnReverseStrand); return new[] {new BreakEndAdjacency(origin, partner)}; } public static BreakEndAdjacency[] CreateFromSymbolicAllele(IChromosomeInterval interval, VariantType variantType, bool isInv3, bool isInv5) { // ReSharper disable once SwitchStatementMissingSomeCases return variantType switch { VariantType.deletion => CreateFromDeletion(interval), VariantType.tandem_duplication => CreateFromDuplication(interval), VariantType.inversion => CreateFromInversion(interval, isInv3, isInv5), _ => null }; } // ReSharper disable once UseDeconstructionOnParameter private static BreakEndAdjacency Flip(this BreakEndAdjacency adjacency) { var origin = new BreakPoint(adjacency.Partner.Chromosome, adjacency.Partner.Position, !adjacency.Partner.OnReverseStrand); var partner = new BreakPoint(adjacency.Origin.Chromosome, adjacency.Origin.Position, !adjacency.Origin.OnReverseStrand); return new BreakEndAdjacency(origin, partner); } private static BreakEndAdjacency[] CreateFromDeletion(IChromosomeInterval interval) { // 1 10 . N N[1:21[ var origin = new BreakPoint(interval.Chromosome, interval.Start - 1, false); var remote = new BreakPoint(interval.Chromosome, interval.End + 1, false); var adjacency = new BreakEndAdjacency(origin, remote); return new[] {adjacency, adjacency.Flip()}; } private static BreakEndAdjacency[] CreateFromDuplication(IChromosomeInterval interval) { // 1 1 . N ]1:10]N var origin = new BreakPoint(interval.Chromosome, interval.End, false); var remote = new BreakPoint(interval.Chromosome, interval.Start - 1, false); var adjacency = new BreakEndAdjacency(origin, remote); return new[] {adjacency, adjacency.Flip()}; } private static BreakEndAdjacency[] CreateFromInversion(IChromosomeInterval interval, bool isInv3, bool isInv5) { // 1 10 . N N]1:20] // 1 11 . N [1:21[N BreakPoint origin, origin2, remote, remote2; // ReSharper disable once ConvertIfStatementToSwitchStatement if (!isInv3 && !isInv5) { origin = new BreakPoint(interval.Chromosome, interval.Start - 1, false); remote = new BreakPoint(interval.Chromosome, interval.End, true); origin2 = new BreakPoint(interval.Chromosome, interval.End + 1, true); remote2 = new BreakPoint(interval.Chromosome, interval.Start, false); } else if (isInv3) { origin = new BreakPoint(interval.Chromosome, interval.Start - 1, false); remote = new BreakPoint(interval.Chromosome, interval.End, true); origin2 = new BreakPoint(interval.Chromosome, interval.End, false); remote2 = new BreakPoint(interval.Chromosome, interval.Start - 1, true); } else // isInv5 { origin = new BreakPoint(interval.Chromosome, interval.Start, true); remote = new BreakPoint(interval.Chromosome, interval.End + 1, false); origin2 = new BreakPoint(interval.Chromosome, interval.End + 1, true); remote2 = new BreakPoint(interval.Chromosome, interval.Start, false); } return new[] {new BreakEndAdjacency(origin, remote), new BreakEndAdjacency(origin2, remote2)}; } } } ================================================ FILE: VariantAnnotation/GeneFusions/Calling/BreakPoint.cs ================================================ using Genome; namespace VariantAnnotation.GeneFusions.Calling { public sealed record BreakPoint(Chromosome Chromosome, int Position, bool OnReverseStrand); } ================================================ FILE: VariantAnnotation/GeneFusions/Calling/BreakPointTranscript.cs ================================================ using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.GeneFusions.Calling { public sealed record BreakPointTranscript(ITranscript Transcript, int GenomicPosition, int RegionIndex); } ================================================ FILE: VariantAnnotation/GeneFusions/Calling/GeneFusionCaller.cs ================================================ using System.Collections.Generic; using Genome; using Intervals; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.GeneFusions.HGVS; using VariantAnnotation.GeneFusions.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; using Variants; namespace VariantAnnotation.GeneFusions.Calling { public sealed class GeneFusionCaller { private readonly Dictionary _refNameToChromosome; private readonly IIntervalForest _transcriptIntervalForest; public GeneFusionCaller(Dictionary refNameToChromosome, IIntervalForest transcriptIntervalForest) { _refNameToChromosome = refNameToChromosome; _transcriptIntervalForest = transcriptIntervalForest; } // ReSharper disable once ParameterTypeCanBeEnumerable.Global public void AddGeneFusions(IAnnotatedVariant[] annotatedVariants, bool isImprecise, bool isInv3, bool isInv5) { var transcriptIdToGeneFusions = new Dictionary(); foreach (IAnnotatedVariant annotatedVariant in annotatedVariants) { IVariant variant = annotatedVariant.Variant; if (!variant.IsStructuralVariant) continue; BreakEndAdjacency[] adjacencies = BreakEndAdjacencyFactory.CreateAdjacencies(variant, _refNameToChromosome, isInv3, isInv5); if (adjacencies == null) continue; transcriptIdToGeneFusions.Clear(); foreach (BreakEndAdjacency adjacency in adjacencies) { ITranscript[] originTranscripts = GetOverlappingTranscripts(adjacency.Origin); ITranscript[] partnerTranscripts = GetOverlappingTranscripts(adjacency.Partner); if (originTranscripts == null || partnerTranscripts == null) continue; AddGeneFusionsToDictionary(transcriptIdToGeneFusions, adjacency, originTranscripts, partnerTranscripts, isImprecise); } foreach (IAnnotatedTranscript transcript in annotatedVariant.Transcripts) { string transcriptId = transcript.Transcript.Id.WithVersion; if (!transcriptIdToGeneFusions.TryGetValue(transcriptId, out IAnnotatedGeneFusion[] annotatedGeneFusions)) continue; transcript.AddGeneFusions(annotatedGeneFusions); } } } private ITranscript[] GetOverlappingTranscripts(BreakPoint bp) => bp == null ? null : _transcriptIntervalForest.GetAllOverlappingValues(bp.Chromosome.Index, bp.Position, bp.Position); internal static void AddGeneFusionsToDictionary(Dictionary transcriptIdToGeneFusions, // ReSharper disable once ParameterTypeCanBeEnumerable.Global BreakEndAdjacency adjacency, ITranscript[] originTranscripts, ITranscript[] partnerTranscripts, bool isImprecise) { var geneKeys = new HashSet(); var geneFusions = new List(); foreach (ITranscript originTranscript in originTranscripts) { geneFusions.Clear(); (int originIndex, ITranscriptRegion _) = MappedPositionUtilities.FindRegion(originTranscript.TranscriptRegions, adjacency.Origin.Position); foreach (ITranscript partnerTranscript in partnerTranscripts) { EvaluateGeneFusionCandidate(geneFusions, geneKeys, adjacency, originTranscript, originIndex, partnerTranscript, isImprecise); } if (geneFusions.Count == 0) continue; transcriptIdToGeneFusions[originTranscript.Id.WithVersion] = geneFusions.ToArray(); } } // ReSharper disable once UseDeconstructionOnParameter private static void EvaluateGeneFusionCandidate(List geneFusions, HashSet geneKeys, BreakEndAdjacency adjacency, ITranscript originTranscript, int originIndex, ITranscript partnerTranscript, bool isImprecise) { IGene originGene = originTranscript.Gene; IGene partnerGene = partnerTranscript.Gene; if (!FoundViableGeneFusion(adjacency, originGene, originTranscript, originTranscript.Source, partnerGene, partnerTranscript, partnerTranscript.Source)) return; (int partnerIndex, ITranscriptRegion partnerRegion) = MappedPositionUtilities.FindRegion(partnerTranscript.TranscriptRegions, adjacency.Partner.Position); int? partnerExon = partnerRegion.Type == TranscriptRegionType.Exon ? partnerRegion.Id : null; int? partnerIntron = partnerRegion.Type == TranscriptRegionType.Intron ? partnerRegion.Id : null; var origin = new BreakPointTranscript(originTranscript, adjacency.Origin.Position, originIndex); var partner = new BreakPointTranscript(partnerTranscript, adjacency.Partner.Position, partnerIndex); bool originOnReverseStrand = originGene.OnReverseStrand ^ adjacency.Origin.OnReverseStrand; (BreakPointTranscript first, BreakPointTranscript second) = originOnReverseStrand ? (partner, origin) : (origin, partner); bool isInFrame = !isImprecise && DetermineInFrameFusion(first, second); string hgvsr = HgvsRnaNomenclature.GetHgvs(first, second); (ulong fusionKey, string firstGeneSymbol, uint firstGeneKey, string secondGeneSymbol, uint secondGeneKey) = GetGeneAndFusionKeys(originGene, partnerGene); geneFusions.Add(new AnnotatedGeneFusion(partnerTranscript, partnerExon, partnerIntron, hgvsr, isInFrame, fusionKey, firstGeneSymbol, firstGeneKey, secondGeneSymbol, secondGeneKey)); geneKeys.Add(fusionKey); } internal static (ulong FusionKey, string FirstGeneSymbol, uint FirstGeneKey, string SecondGeneSymbol, uint SecondGeneKey) GetGeneAndFusionKeys(IGene originGene, IGene partnerGene) { (IGene firstGene, IGene secondGene) = SortGenes(originGene, partnerGene); string firstGeneId = firstGene.EnsemblId.WithoutVersion; string secondGeneId = secondGene.EnsemblId.WithoutVersion; uint firstGeneKey = GeneFusionKey.CreateGeneKey(firstGeneId); uint secondGeneKey = GeneFusionKey.CreateGeneKey(secondGeneId); ulong fusionKey = GeneFusionKey.Create(firstGeneKey, secondGeneKey); return (fusionKey, firstGene.Symbol, firstGeneKey, secondGene.Symbol, secondGeneKey); } private static (IGene FirstGene, IGene SecondGene) SortGenes(IGene originGene, IGene partnerGene) { if (originGene.Chromosome.Index == partnerGene.Chromosome.Index) { return originGene.Start < partnerGene.Start ? (originGene, partnerGene) : (partnerGene, originGene); } return originGene.Chromosome.Index < partnerGene.Chromosome.Index ? (originGene, partnerGene) : (partnerGene, originGene); } // ReSharper disable UseDeconstructionOnParameter internal static bool DetermineInFrameFusion(BreakPointTranscript first, BreakPointTranscript second) // ReSharper restore UseDeconstructionOnParameter { ITranscriptRegion firstRegion = first.Transcript.TranscriptRegions[first.RegionIndex]; ITranscriptRegion secondRegion = second.Transcript.TranscriptRegions[second.RegionIndex]; byte? firstCodonPosition = GetCodonPosition(firstRegion, first.Transcript.Translation, first.Transcript.StartExonPhase, first.Transcript.Gene.OnReverseStrand, first.GenomicPosition); byte? secondCodonPosition = GetCodonPosition(secondRegion, second.Transcript.Translation, second.Transcript.StartExonPhase, second.Transcript.Gene.OnReverseStrand, second.GenomicPosition); // nothing to do if we landed outside of the CDS or outside an exon if (firstCodonPosition == null || secondCodonPosition == null) return false; return firstCodonPosition == 1 && secondCodonPosition == 2 || firstCodonPosition == 2 && secondCodonPosition == 3 || firstCodonPosition == 3 && secondCodonPosition == 1; } internal static byte? GetCodonPosition(ITranscriptRegion region, ITranslation translation, byte startExonPhase, bool onReverseStrand, int genomicPosition) { if (translation == null || region.Type != TranscriptRegionType.Exon) return null; var variant = new Interval(genomicPosition, genomicPosition); (int cdnaPosition, int _) = MappedPositionUtilities.GetCdnaPositions(region, region, variant, onReverseStrand, false); (int cdsPosition, int _) = MappedPositionUtilities.GetCdsPositions(translation.CodingRegion, cdnaPosition, cdnaPosition, startExonPhase, false); if (cdsPosition == -1) return null; return (byte) ((cdsPosition - 1) % 3 + 1); } // ReSharper disable once UseDeconstructionOnParameter internal static bool FoundViableGeneFusion(BreakEndAdjacency adjacency, IGene originGene, IChromosomeInterval originInterval, Source originSource, IGene partnerGene, IChromosomeInterval partnerInterval, Source partnerSource) { bool originOnReverseStrand = originGene.OnReverseStrand ^ adjacency.Origin.OnReverseStrand; bool partnerOnReverseStrand = partnerGene.OnReverseStrand ^ adjacency.Partner.OnReverseStrand; bool differentStrand = originOnReverseStrand != partnerOnReverseStrand; bool differentTranscriptSource = originSource != partnerSource; bool sameGeneSymbol = originGene.Symbol == partnerGene.Symbol; bool transcriptAlreadyOverlaps = originInterval.Chromosome.Index == partnerInterval.Chromosome.Index && originInterval.Overlaps(partnerInterval); return !differentStrand && !differentTranscriptSource && !sameGeneSymbol && !transcriptAlreadyOverlaps; } } } ================================================ FILE: VariantAnnotation/GeneFusions/HGVS/HgvsRnaNomenclature.cs ================================================ using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.GeneFusions.Calling; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.GeneFusions.HGVS { public static class HgvsRnaNomenclature { public static string GetHgvs(BreakPointTranscript first, BreakPointTranscript second) { string firstCoordinate = GetHgvsRnaCoordinate(first); string secondCoordinate = GetHgvsRnaCoordinate(second); return $"{first.Transcript.Id.WithVersion}({first.Transcript.Gene.Symbol}):r.?_{firstCoordinate}::{second.Transcript.Id.WithVersion}({second.Transcript.Gene.Symbol}):r.{secondCoordinate}_?"; } // ReSharper disable once UseDeconstructionOnParameter private static string GetHgvsRnaCoordinate(BreakPointTranscript first) { ITranscript transcript = first.Transcript; PositionOffset positionOffset = HgvsUtilities.GetPositionOffset(transcript, first.GenomicPosition, first.RegionIndex, true); return positionOffset.Value; } } } ================================================ FILE: VariantAnnotation/GeneFusions/IO/GeneFusionIndexEntry.cs ================================================ using System; using IO; namespace VariantAnnotation.GeneFusions.IO { public readonly struct GeneFusionIndexEntry { private readonly ulong _geneKey; public readonly ushort Index; public GeneFusionIndexEntry(ulong geneKey, ushort index) { _geneKey = geneKey; Index = index; } public int Compare(ulong geneKey) { if (_geneKey < geneKey) return -1; return _geneKey > geneKey ? 1 : 0; } public static GeneFusionIndexEntry Read(ref ReadOnlySpan byteSpan) { ulong geneKey = SpanBufferBinaryReader.ReadUInt64(ref byteSpan); ushort index = SpanBufferBinaryReader.ReadOptUInt16(ref byteSpan); return new GeneFusionIndexEntry(geneKey, index); } public void Write(ExtendedBinaryWriter writer) { writer.Write(_geneKey); writer.WriteOpt(Index); } } } ================================================ FILE: VariantAnnotation/GeneFusions/IO/GeneFusionJsonReader.cs ================================================ using System; using System.Buffers; using System.Collections.Generic; using System.IO; using System.Text; using Compression.Utilities; using ErrorHandling; using Genome; using IO; using IO.v2; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; namespace VariantAnnotation.GeneFusions.IO { public sealed class GeneFusionJsonReader : IGeneFusionSaReader { public const ushort SupportedFileFormatVersion = 1; public GenomeAssembly Assembly => GenomeAssembly.Unknown; public IDataSourceVersion Version { get; } public string JsonKey { get; } private readonly ExtendedBinaryReader _reader; internal Dictionary FusionKeyToFusions; public GeneFusionJsonReader(Stream stream) { _reader = new ExtendedBinaryReader(stream, Encoding.UTF8); // ReSharper disable once UseDeconstruction Header header = Header.Read(_reader); JsonKey = _reader.ReadString(); CheckHeader(header.FileType, header.FileFormatVersion); Version = DataSourceVersion.Read(_reader); } public static void CheckHeader(FileType fileType, ushort fileFormatVersion) { if (fileType != FileType.GeneFusionJson) throw new InvalidDataException( $"Found an invalid file type ({fileType}) while reading the gene fusions file.") .MakeUserError(); if (fileFormatVersion != SupportedFileFormatVersion) throw new InvalidDataException( $"The gene fusion reader currently supports v{SupportedFileFormatVersion} files, but found v{fileFormatVersion} instead.") .MakeUserError(); } public void LoadAnnotations() { ArrayPool bytePool = ArrayPool.Shared; byte[] bytes = _reader.ReadCompressedByteArray(bytePool); ReadOnlySpan byteSpan = bytes.AsSpan(); int numGeneFusionPairs = SpanBufferBinaryReader.ReadOptInt32(ref byteSpan); FusionKeyToFusions = new Dictionary(numGeneFusionPairs); for (var i = 0; i < numGeneFusionPairs; i++) { ulong geneKey = SpanBufferBinaryReader.ReadUInt64(ref byteSpan); int numJsonEntries = SpanBufferBinaryReader.ReadOptInt32(ref byteSpan); var jsonArray = new string[numJsonEntries]; for (var j = 0; j < numJsonEntries; j++) jsonArray[j] = SpanBufferBinaryReader.ReadUtf8String(ref byteSpan); FusionKeyToFusions[geneKey] = jsonArray; } bytePool.Return(bytes); } public void AddAnnotations(IGeneFusionPair[] fusionPairs, IList supplementaryAnnotations) { var jsonEntries = new List(); foreach (IGeneFusionPair fusionPair in fusionPairs) { if (!FusionKeyToFusions.TryGetValue(fusionPair.FusionKey, out string[] entries)) continue; jsonEntries.AddRange(entries); } if (jsonEntries.Count == 0) return; var sa = new SupplementaryAnnotation(JsonKey, true, false, null, jsonEntries); supplementaryAnnotations.Add(sa); } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: VariantAnnotation/GeneFusions/IO/GeneFusionSourceReader.cs ================================================ using System; using System.Buffers; using System.Collections.Generic; using System.IO; using System.Text; using Compression.Utilities; using ErrorHandling; using Genome; using IO; using IO.v2; using VariantAnnotation.GeneFusions.SA; using VariantAnnotation.GeneFusions.Utilities; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using VariantAnnotation.Providers; namespace VariantAnnotation.GeneFusions.IO { public sealed class GeneFusionSourceReader : IGeneFusionSaReader { public const ushort SupportedFileFormatVersion = 1; private readonly ExtendedBinaryReader _reader; public GenomeAssembly Assembly => GenomeAssembly.Unknown; public IDataSourceVersion Version { get; } public string JsonKey { get; } internal uint[] OncogeneKeys; internal GeneFusionSourceCollection[] Index; internal GeneFusionIndexEntry[] IndexEntries; public GeneFusionSourceReader(Stream stream) { _reader = new ExtendedBinaryReader(stream, Encoding.UTF8); // ReSharper disable once UseDeconstruction Header header = Header.Read(_reader); JsonKey = _reader.ReadString(); CheckHeader(header.FileType, header.FileFormatVersion); Version = DataSourceVersion.Read(_reader); } internal static void CheckHeader(FileType fileType, ushort fileFormatVersion) { if (fileType != FileType.FusionCatcher) throw new InvalidDataException( $"Found an invalid file type ({fileType}) while reading the gene fusions file.") .MakeUserError(); if (fileFormatVersion != SupportedFileFormatVersion) throw new InvalidDataException( $"The gene fusion reader currently supports v{SupportedFileFormatVersion} files, but found v{fileFormatVersion} instead.") .MakeUserError(); } public void LoadAnnotations() { ArrayPool bytePool = ArrayPool.Shared; byte[] bytes = _reader.ReadCompressedByteArray(bytePool); ReadOnlySpan byteSpan = bytes.AsSpan(); int numOncogenes = SpanBufferBinaryReader.ReadOptInt32(ref byteSpan); OncogeneKeys = new uint[numOncogenes]; for (var i = 0; i < numOncogenes; i++) OncogeneKeys[i] = SpanBufferBinaryReader.ReadOptUInt32(ref byteSpan); int indexLength = SpanBufferBinaryReader.ReadOptInt32(ref byteSpan); Index = new GeneFusionSourceCollection[indexLength]; for (var i = 0; i < indexLength; i++) Index[i] = GeneFusionSourceCollection.Read(ref byteSpan); int numIndexEntries = SpanBufferBinaryReader.ReadOptInt32(ref byteSpan); IndexEntries = new GeneFusionIndexEntry[numIndexEntries]; for (var i = 0; i < numIndexEntries; i++) IndexEntries[i] = GeneFusionIndexEntry.Read(ref byteSpan); bytePool.Return(bytes); } public void AddAnnotations(IGeneFusionPair[] fusionPairs, IList supplementaryAnnotations) { var jsonEntries = new List(); foreach (IGeneFusionPair fusionPair in fusionPairs) { ushort? index = IndexEntries.GetIndex(fusionPair.FusionKey); if (index == null) continue; jsonEntries.Add(Index[index.Value].GetJsonEntry(fusionPair, OncogeneKeys)); } if (jsonEntries.Count == 0) return; var sa = new SupplementaryAnnotation(JsonKey, true, false, null, jsonEntries); supplementaryAnnotations.Add(sa); } public void Dispose() => _reader.Dispose(); } } ================================================ FILE: VariantAnnotation/GeneFusions/IO/IGeneFusionSaReader.cs ================================================ using System; using System.Collections.Generic; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.SA; namespace VariantAnnotation.GeneFusions.IO { public interface IGeneFusionSaReader : ISaMetadata, IDisposable { void LoadAnnotations(); // ReSharper disable once ParameterTypeCanBeEnumerable.Global void AddAnnotations(IGeneFusionPair[] fusionPairs, IList supplementaryAnnotations); } } ================================================ FILE: VariantAnnotation/GeneFusions/SA/GeneFusionPair.cs ================================================ using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.GeneFusions.SA { public sealed record GeneFusionPair (ulong FusionKey, string FirstGeneSymbol, uint FirstGeneKey, string SecondGeneSymbol, uint SecondGeneKey) : IGeneFusionPair { public bool Equals(GeneFusionPair other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; return FusionKey == other.FusionKey; } public override int GetHashCode() => FusionKey.GetHashCode(); } } ================================================ FILE: VariantAnnotation/GeneFusions/SA/GeneFusionSource.cs ================================================ // ReSharper disable InconsistentNaming // ReSharper disable UnusedMember.Global namespace VariantAnnotation.GeneFusions.SA { public enum GeneFusionSource : byte { None = 0, Alaei_Mahabadi_18_Cancers, // 18cancer Babiceanu_NonCancerTissues, Bailey_pancreatic_cancers, Bao_gliomas, CACG, Cancer_Genome_Project, CCLE, CCLE_Vellichirammal, // ccle3 ConjoinG, COSMIC, Duplicated_Genes_Database, GTEx_healthy_tissues, Healthy, Healthy_prefrontal_cortex, Healthy_strong_support, // banned Human_Protein_Atlas, Illumina_BodyMap2, NonTumorCellLines, OneK_Genomes_Project, Paralog, Pseudogene, Readthrough, Robinson_prostate_cancers, TumorFusions_normal, TCGA_oesophageal_carcinomas, TCGA_Tumor, // additional data sources (2021-05-25) CCLE_Klign, // ccle2.txt ChimerKB_4, ChimerPub_4, ChimerSeq_4, Known, Mitelman_DB, OncoKB, PCAWG, TCGA, // tcga.txt TumorFusions_tumor, // tcga-cancer.txt TCGA_Gao, // tcga2.txt TCGA_Vellichirammal, // tcga3.txt TICdb } } ================================================ FILE: VariantAnnotation/GeneFusions/SA/GeneFusionSourceCollection.cs ================================================ using System; using System.Collections.Generic; using System.Text; using IO; using OptimizedCore; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.IO; namespace VariantAnnotation.GeneFusions.SA { public sealed class GeneFusionSourceCollection : IEquatable { private readonly bool _isPseudogenePair; private readonly bool _isParalogPair; private readonly bool _isReadthrough; private readonly GeneFusionSource[] _germlineSources; private readonly GeneFusionSource[] _somaticSources; private const int PseudogeneMask = 1; private const int ParalogMask = 2; private const int ReadthroughMask = 4; public GeneFusionSourceCollection(bool isPseudogenePair, bool isParalogPair, bool isReadthrough, GeneFusionSource[] germlineSources, GeneFusionSource[] somaticSources) { _isPseudogenePair = isPseudogenePair; _isParalogPair = isParalogPair; _isReadthrough = isReadthrough; _germlineSources = germlineSources; _somaticSources = somaticSources; } public void Write(ExtendedBinaryWriter writer) { writer.Write(GetFlags()); WriteSourceGroup(writer, _germlineSources); WriteSourceGroup(writer, _somaticSources); } private byte GetFlags() { byte flags = 0; if (_isPseudogenePair) flags |= PseudogeneMask; if (_isParalogPair) flags |= ParalogMask; if (_isReadthrough) flags |= ReadthroughMask; return flags; } // ReSharper disable once SuggestBaseTypeForParameter private static void WriteSourceGroup(ExtendedBinaryWriter writer, GeneFusionSource[] sources) { if (sources == null) { writer.Write((byte)0); return; } writer.WriteOpt(sources.Length); foreach (GeneFusionSource source in sources) writer.Write((byte) source); } public static GeneFusionSourceCollection Read(ref ReadOnlySpan byteSpan) { byte flags = SpanBufferBinaryReader.ReadByte(ref byteSpan); bool isPseudogenePair = (flags & PseudogeneMask) != 0; bool isParalogPair = (flags & ParalogMask) != 0; bool isReadthrough = (flags & ReadthroughMask) != 0; GeneFusionSource[] germlineSources = ReadSources(ref byteSpan); GeneFusionSource[] somaticSources = ReadSources(ref byteSpan); return new GeneFusionSourceCollection(isPseudogenePair, isParalogPair, isReadthrough, germlineSources, somaticSources); } private static GeneFusionSource[] ReadSources(ref ReadOnlySpan byteSpan) { int numSources = SpanBufferBinaryReader.ReadOptInt32(ref byteSpan); if (numSources == 0) return null; var sources = new GeneFusionSource[numSources]; for (var i = 0; i < numSources; i++) { sources[i] = (GeneFusionSource) SpanBufferBinaryReader.ReadByte(ref byteSpan); } return sources; } // ReSharper disable once ParameterTypeCanBeEnumerable.Global public string GetJsonEntry(IGeneFusionPair geneFusionPair, uint[] oncogeneKeys) { StringBuilder sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); var entries = new List(); AddGenes(geneFusionPair, oncogeneKeys, jsonObject); if (_germlineSources != null) AddGeneFusionSource("germlineSources", _germlineSources, entries, jsonObject); if (_somaticSources != null) AddGeneFusionSource("somaticSources", _somaticSources, entries, jsonObject); return StringBuilderPool.GetStringAndReturn(sb); } private void AddGenes(IGeneFusionPair geneFusionPair, uint[] oncogeneKeys, JsonObject jsonObject) { jsonObject.StartObjectWithKey("genes"); AddGene("first", geneFusionPair.FirstGeneKey, geneFusionPair.FirstGeneSymbol, oncogeneKeys, jsonObject); AddGene("second", geneFusionPair.SecondGeneKey, geneFusionPair.SecondGeneSymbol, oncogeneKeys, jsonObject); jsonObject.AddBoolValue("isParalogPair", _isParalogPair); jsonObject.AddBoolValue("isPseudogenePair", _isPseudogenePair); jsonObject.AddBoolValue("isReadthrough", _isReadthrough); jsonObject.EndObject(); } private static void AddGene(string key, uint geneKey, string geneSymbol, uint[] oncogeneKeys, JsonObject jsonObject) { jsonObject.StartObjectWithKey(key); jsonObject.AddStringValue("hgnc", geneSymbol); bool isOncogene = Array.BinarySearch(oncogeneKeys, geneKey) >= 0; jsonObject.AddBoolValue("isOncogene", isOncogene); jsonObject.EndObject(); } // ReSharper disable once ParameterTypeCanBeEnumerable.Local private static void AddGeneFusionSource(string description, GeneFusionSource[] sources, List entries, JsonObject jsonObject) { entries.Clear(); foreach (GeneFusionSource source in sources) { string sourceString = GeneFusionSourceUtilities.Convert(source); if (sourceString != null) entries.Add(sourceString); } jsonObject.AddStringValues(description, entries); } public bool Equals(GeneFusionSourceCollection other) { if (ReferenceEquals(null, other)) return false; if (ReferenceEquals(this, other)) return true; return _isPseudogenePair == other._isPseudogenePair && _isParalogPair == other._isParalogPair && _isReadthrough == other._isReadthrough && _germlineSources.ArrayEqual(other._germlineSources) && _somaticSources.ArrayEqual(other._somaticSources); } public override int GetHashCode() { var hashCode = new HashCode(); hashCode.Add(_isPseudogenePair); hashCode.Add(_isParalogPair); hashCode.Add(_isReadthrough); if (_germlineSources != null) foreach (GeneFusionSource source in _germlineSources) hashCode.Add((byte) source); if (_somaticSources != null) foreach (GeneFusionSource source in _somaticSources) hashCode.Add((byte) source); return hashCode.ToHashCode(); } } } ================================================ FILE: VariantAnnotation/GeneFusions/SA/GeneFusionSourceUtilities.cs ================================================ namespace VariantAnnotation.GeneFusions.SA { public static class GeneFusionSourceUtilities { public static string Convert(GeneFusionSource source) { // ReSharper disable once SwitchExpressionHandlesSomeKnownEnumValuesWithExceptionInDefault return source switch { GeneFusionSource.Alaei_Mahabadi_18_Cancers => "Alaei-Mahabadi 18 cancers", GeneFusionSource.Babiceanu_NonCancerTissues => "Babiceanu non-cancer tissues", GeneFusionSource.Bailey_pancreatic_cancers => "Bailey pancreatic cancers", GeneFusionSource.Bao_gliomas => "Bao gliomas", GeneFusionSource.CACG => "CACG", GeneFusionSource.Cancer_Genome_Project => "Cancer Genome Project", GeneFusionSource.CCLE => "DepMap CCLE", GeneFusionSource.CCLE_Klign => "CCLE Klijn", GeneFusionSource.CCLE_Vellichirammal => "CCLE Vellichirammal", GeneFusionSource.ChimerKB_4 => "ChimerKB 4.0", GeneFusionSource.ChimerPub_4 => "ChimerPub 4.0", GeneFusionSource.ChimerSeq_4 => "ChimerSeq 4.0", GeneFusionSource.ConjoinG => "ConjoinG", GeneFusionSource.COSMIC => "COSMIC", GeneFusionSource.Duplicated_Genes_Database => "Duplicated Genes Database", GeneFusionSource.GTEx_healthy_tissues => "GTEx healthy tissues", GeneFusionSource.Healthy => "Healthy", GeneFusionSource.Healthy_prefrontal_cortex => "Healthy prefrontal cortex", GeneFusionSource.Healthy_strong_support => "Healthy (strong support)", GeneFusionSource.Human_Protein_Atlas => "Human Protein Atlas", GeneFusionSource.Illumina_BodyMap2 => "Illumina Body Map 2.0", GeneFusionSource.Known => "Known", GeneFusionSource.Mitelman_DB => "Mitelman DB", GeneFusionSource.NonTumorCellLines => "non-tumor cell lines", GeneFusionSource.OncoKB => "OncoKB", GeneFusionSource.OneK_Genomes_Project => "1000 Genomes Project", GeneFusionSource.PCAWG => "PCAWG", GeneFusionSource.Robinson_prostate_cancers => "Robinson prostate cancers", GeneFusionSource.TCGA => "TCGA", GeneFusionSource.TCGA_Gao => "TCGA Gao", GeneFusionSource.TCGA_oesophageal_carcinomas => "TCGA oesophageal carcinomas", GeneFusionSource.TCGA_Tumor => "TCGA tumor", GeneFusionSource.TCGA_Vellichirammal => "TCGA Vellichirammal", GeneFusionSource.TICdb => "TICdb", GeneFusionSource.TumorFusions_tumor => "TumorFusions tumor", GeneFusionSource.TumorFusions_normal => "TumorFusions normal", _ => null }; } } } ================================================ FILE: VariantAnnotation/GeneFusions/Utilities/GeneFusionKey.cs ================================================ using System; namespace VariantAnnotation.GeneFusions.Utilities { public static class GeneFusionKey { public static ulong Create(uint num, uint num2) { if (num == 0 || num2 == 0) return 0; (ulong min, ulong max) = num < num2 ? (num, num2) : (num2, num); return min << 32 | max; } public static uint CreateGeneKey(string geneId) { if (geneId == null) return 0; ReadOnlySpan geneSpan = geneId.AsSpan().Slice(4); return uint.Parse(geneSpan); } } } ================================================ FILE: VariantAnnotation/GeneFusions/Utilities/IndexEntryExtensions.cs ================================================ using VariantAnnotation.GeneFusions.IO; namespace VariantAnnotation.GeneFusions.Utilities { public static class IndexEntryExtensions { public static ushort? GetIndex(this GeneFusionIndexEntry[] array, ulong geneKey) { var begin = 0; int end = array.Length - 1; while (begin <= end) { int index = begin + (end - begin >> 1); int ret = array[index].Compare(geneKey); // ReSharper disable once ConvertIfStatementToSwitchStatement if (ret == 0) return array[index].Index; if (ret < 0) begin = index + 1; else end = index - 1; } return null; } } } ================================================ FILE: VariantAnnotation/GenericScore/ChromosomeBlock.cs ================================================ using System.Collections.Generic; using IO; namespace VariantAnnotation.GenericScore { public sealed class ChromosomeBlock { private List ScoreIndexBlocks { get; } public int BlockCount; public readonly long StartingPosition; public ChromosomeBlock(List scoreIndexBlocks, int blockCount, long startingPosition) { ScoreIndexBlocks = scoreIndexBlocks; BlockCount = blockCount; StartingPosition = startingPosition; } /// /// Add the index block to the list of all blocks for each chromosome /// /// public void Add(ScoreIndexBlock indexBlock) { ScoreIndexBlocks.Add(indexBlock); BlockCount++; } /// /// Returns the index block corresponding to the blocknumber /// /// /// public ScoreIndexBlock Get(int blockNumber) { return blockNumber < BlockCount ? ScoreIndexBlocks[blockNumber] : null; } /// /// Serialize the instance to writer stream /// /// public void Write(ExtendedBinaryWriter writer) { writer.WriteOpt(BlockCount); writer.WriteOpt(StartingPosition); foreach (ScoreIndexBlock scoreIndexBlock in ScoreIndexBlocks) { scoreIndexBlock.Write(writer); } } /// /// Deserialize the instance from reader stream /// /// /// public static ChromosomeBlock Read(ExtendedBinaryReader reader) { int blockCount = reader.ReadOptInt32(); long startingPosition = reader.ReadOptInt64(); var scoreIndexBlocks = new List(blockCount); for (var i = 0; i < blockCount; i++) { scoreIndexBlocks.Add(ScoreIndexBlock.Read(reader)); } return new ChromosomeBlock(scoreIndexBlocks, blockCount, startingPosition); } } } ================================================ FILE: VariantAnnotation/GenericScore/EncoderType.cs ================================================ namespace VariantAnnotation.GenericScore; public enum EncoderType: byte { Generic, ZeroToOne } ================================================ FILE: VariantAnnotation/GenericScore/GenericScoreEncoder.cs ================================================ using System; using System.Collections.Generic; using System.Collections.Immutable; using IO; namespace VariantAnnotation.GenericScore; public sealed class GenericScoreEncoder : IScoreEncoder { private readonly byte[] _encodedArray; private readonly Dictionary _scoreMap; private ImmutableDictionary _scoreMapReader; public ushort BytesRequired => 2; private ushort _nextScoreCode; public GenericScoreEncoder() { _encodedArray = new byte[BytesRequired]; _scoreMap = new Dictionary(byte.MaxValue); } public ushort AddScore(double number) { // if the score is already in the map, return the index // this is because the socre and the code, both should be unique if(_scoreMap.TryGetValue(number, out ushort code)) return code; // if the score is not in the map, add it and return the index code = _nextScoreCode++; _scoreMap.Add(number, code); return code; } public byte[] EncodeToBytes(double number) { Array.Clear(_encodedArray, 0, _encodedArray.Length); ushort transformedNumber = AddScore(number); // BitConverter is used as a convenient means of transforming the number into bytes // Only the `BytesRequred` portion is saved, because the converted bytes will not exceed it. Array.Copy(BitConverter.GetBytes(transformedNumber), _encodedArray, BytesRequired); return _encodedArray; } public double DecodeFromBytes(ReadOnlySpan encodedArray) { // Because the scoreMap uses `ushort` return GetScore(BitConverter.ToUInt16(encodedArray)); } private double GetScore(ushort encodedNumber) { return _scoreMapReader.GetValueOrDefault(encodedNumber, double.NaN); } public void Write(ExtendedBinaryWriter writer) { writer.WriteOpt(_scoreMap.Count); foreach ((double score, ushort code) in _scoreMap) { writer.Write(code); writer.Write(score); } } public static GenericScoreEncoder Read(ExtendedBinaryReader reader) { int scoreCount = reader.ReadOptInt32(); var scoreMapReader = new Dictionary(scoreCount); for (var i = 0; i < scoreCount; i++) { scoreMapReader.Add(reader.ReadUInt16(), reader.ReadDouble()); } return new GenericScoreEncoder { _scoreMapReader = scoreMapReader.ToImmutableDictionary() }; } } ================================================ FILE: VariantAnnotation/GenericScore/IScoreEncoder.cs ================================================ using System; using IO; namespace VariantAnnotation.GenericScore; public interface IScoreEncoder { public ushort BytesRequired { get; } public byte[] EncodeToBytes(double number); public double DecodeFromBytes(ReadOnlySpan encodedArray); public void Write(ExtendedBinaryWriter writer); } ================================================ FILE: VariantAnnotation/GenericScore/MetaData.cs ================================================ using System; using CommandLine.Utilities; namespace VariantAnnotation.GenericScore { public sealed class MetaData { private int _totalChromosomeCount; private int _totalBlockCount; private int _blankBlockCount; private uint _compressedChromosomeSize; private uint _uncompressedChromosomeSize; private uint _compressedSize; private uint _uncompressedSize; private uint _unmatchedReferencePositionsCount; private double _totalProcessingTime; private double _chromosomeProcessingTime; private ushort? _lastChromosome = null; private readonly Benchmark _blockBenchmark; private const string DashedLine = "________________________________________________________________"; private double AverageCompressionRatio => CalculateCompressionRatio(_compressedSize, _uncompressedSize); private double AverageCompressedBlockSize => (double) _compressedSize / _totalBlockCount; private double AverageProcessingTime => _totalProcessingTime / _totalBlockCount; private double AverageWriteSpeed => _compressedSize / _totalProcessingTime / 1_000_000; private static double CalculateCompressionRatio(uint compressedSize, uint uncompressedSize) { return compressedSize * 100.0 / uncompressedSize; } public MetaData() { _blockBenchmark = new Benchmark(); } public void AddIndexBlock(ushort chromosomeIndex, int blockNumber, long fileStartingPosition, uint uncompressedSize, uint compressedSize) { double processingTime = _blockBenchmark.GetElapsedTime().TotalSeconds; _totalBlockCount++; if (fileStartingPosition < 0) { _blankBlockCount++; } _uncompressedSize += uncompressedSize; _compressedSize += compressedSize; _uncompressedChromosomeSize += uncompressedSize; _compressedChromosomeSize += compressedSize; _totalProcessingTime += processingTime; _chromosomeProcessingTime += processingTime; PrintFormattedString(chromosomeIndex, blockNumber, uncompressedSize, compressedSize, processingTime); _blockBenchmark.Reset(); } public void AddChromosomeBlock(ushort chromosomeIndex) { _totalChromosomeCount++; if (_lastChromosome == null) { _lastChromosome = chromosomeIndex; return; } PrintFormattedString(_lastChromosome, null, _uncompressedChromosomeSize, _compressedChromosomeSize, _chromosomeProcessingTime); _lastChromosome = chromosomeIndex; _chromosomeProcessingTime = 0; _uncompressedChromosomeSize = 0; _compressedChromosomeSize = 0; } public void TrackUnmatchedReferencePositions() { _unmatchedReferencePositionsCount++; } private static void PrintFormattedString(ushort? chromosomeIndex, int? blockNumber, uint uncompressedSize, uint compressedSize, double processingTime) { string headerLine = $"{chromosomeIndex}:{blockNumber}"; if (blockNumber == null) { headerLine = $"{DashedLine}\n{chromosomeIndex}"; } Console.WriteLine( $"{headerLine}" + $"\t{compressedSize} bytes/{uncompressedSize} bytes\t= " + $"{CalculateCompressionRatio(compressedSize, uncompressedSize):F1} % \t" + $"Processing Time {processingTime:F4} s" ); if (blockNumber == null) { Console.WriteLine($"{DashedLine}"); } } public void PrintWriteMetrics() { PrintFormattedString(_lastChromosome, null, _uncompressedChromosomeSize, _compressedChromosomeSize, _chromosomeProcessingTime); Console.WriteLine( $"{DashedLine}\n" + $"Write Metrics\n" + $"{DashedLine}\n" + $"Total Chromosomes = {_totalChromosomeCount}\n" + $"Total Blocks = {_totalBlockCount}\n" + $"Blank Blocks = {_blankBlockCount}\n" + $"Unmatched Reference Positions = {_unmatchedReferencePositionsCount}\n" + $"Total Compressed Size = {_compressedSize} bytes\n" + $"Total Uncompressed Size = {_uncompressedSize} bytes\n" + $"Total Processing Time = {_totalProcessingTime:F3} seconds\n" + $"Average Compressed Block Size = {AverageCompressedBlockSize:F0} bytes\n" + $"Average Compression Ratio = {AverageCompressionRatio:F1} %\n" + $"Average Processing Time = {AverageProcessingTime:F4} seconds\n" + $"Average Writing Speed = {AverageWriteSpeed:F4} MB/second\n" + $"{DashedLine}" ); } } } ================================================ FILE: VariantAnnotation/GenericScore/ReaderSettings.cs ================================================ using System; using IO; namespace VariantAnnotation.GenericScore { public sealed class ReaderSettings { public readonly bool IsPositional; public readonly EncoderType EncoderType; public readonly IScoreEncoder ScoreEncoder; public readonly ScoreJsonEncoder ScoreJsonEncoder; public readonly string[] Nucleotides; public readonly int BlockLength; public ushort BytesRequired => ScoreEncoder.BytesRequired; public ReaderSettings( bool isPositional, EncoderType encoderType, IScoreEncoder scoreEncoder, ScoreJsonEncoder scoreJsonEncoder, string[] nucleotides, int blockLength ) { IsPositional = isPositional; EncoderType = encoderType; ScoreEncoder = scoreEncoder; ScoreJsonEncoder = scoreJsonEncoder; BlockLength = blockLength; Nucleotides = nucleotides; } public static ReaderSettings Read(ExtendedBinaryReader reader) { bool isPositional = reader.ReadBoolean(); var encoderType = (EncoderType) reader.ReadByte(); IScoreEncoder scoreEncoder = encoderType switch { EncoderType.ZeroToOne => ZeroToOneScoreEncoder.Read(reader), EncoderType.Generic => GenericScoreEncoder.Read(reader), _ => throw new Exception("Unknown score encoder") }; ScoreJsonEncoder scoreJsonEncoder = ScoreJsonEncoder.Read(reader); byte nucleotideCount = reader.ReadByte(); var nucleotides = new string[nucleotideCount]; for (var i = 0; i < nucleotideCount; i++) { string value = reader.ReadAsciiString(); nucleotides[i] = value; } int blockLength = reader.ReadOptInt32(); return new ReaderSettings( isPositional, encoderType, scoreEncoder, scoreJsonEncoder, nucleotides, blockLength ); } public void Write(ExtendedBinaryWriter writer) { writer.Write(IsPositional); writer.Write((byte) EncoderType); ScoreEncoder.Write(writer); ScoreJsonEncoder.Write(writer); writer.Write((byte) Nucleotides.Length); foreach (string key in Nucleotides) { writer.WriteOptAscii(key); } writer.WriteOpt(BlockLength); } } } ================================================ FILE: VariantAnnotation/GenericScore/ScoreBlock.cs ================================================ using System; using Compression.Algorithms; using IO; namespace VariantAnnotation.GenericScore { public sealed class ScoreBlock { private readonly ICompressionAlgorithm _compressionAlgorithm; private readonly byte[] _compressedBytes; private readonly byte[] _uncompressedBytes; private uint _cursorPosition; private readonly int _blockSize; public ScoreBlock(ICompressionAlgorithm compressionAlgorithm, int blockSize) { _compressionAlgorithm = compressionAlgorithm; _blockSize = blockSize; int compressedBlockSize = _compressionAlgorithm.GetCompressedBufferBounds(_blockSize); _compressedBytes = new byte[compressedBlockSize]; _uncompressedBytes = new byte[_blockSize]; Clear(); } private void Clear() { Array.Fill(_uncompressedBytes, byte.MaxValue); _cursorPosition = 0; } public bool IsFull() { return _cursorPosition == _blockSize; } public void Add(uint memoryIndex, byte[] variableArray, uint arraySize) { Array.Copy(variableArray, 0, _uncompressedBytes, memoryIndex, arraySize); _cursorPosition = (memoryIndex + arraySize); } public (uint uncompressedSize, int compressedSize) Write(ExtendedBinaryWriter writer) { int compressedSize = _compressionAlgorithm.Compress( _uncompressedBytes, _blockSize, _compressedBytes, _compressedBytes.Length ); writer.Write(_compressedBytes, 0, compressedSize); uint uncompressedSize = _cursorPosition; Clear(); return (uncompressedSize, compressedSize); } } } ================================================ FILE: VariantAnnotation/GenericScore/ScoreIndex.cs ================================================ using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using Genome; using IO; using IO.v2; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace VariantAnnotation.GenericScore { public sealed class ScoreIndex { private readonly ExtendedBinaryWriter _writer; private readonly int _blockLength; private readonly ushort _scoreSize; private readonly byte _nucleotideCount; private readonly Dictionary _nucleotideIndexMapper; public readonly GenomeAssembly Assembly; public readonly int SchemaVersion; private readonly Header _indexHeader; private readonly int _filePairId; public readonly IDataSourceVersion Version; private readonly MetaData _metaData; private Dictionary _chromosomeBlocks; public readonly ReaderSettings ReaderSettings; public ScoreIndex( ExtendedBinaryWriter indexWriter, ReaderSettings readerSettings, GenomeAssembly assembly, IDataSourceVersion version, int schemaVersion, Header indexHeader, int filePairId ) { _writer = indexWriter; Assembly = assembly; Version = version; SchemaVersion = schemaVersion; _indexHeader = indexHeader; _filePairId = filePairId; ReaderSettings = readerSettings; _chromosomeBlocks = new Dictionary(); _metaData = new MetaData(); string[] nucleotides = readerSettings.Nucleotides; _nucleotideCount = (byte) nucleotides.Length; _scoreSize = readerSettings.BytesRequired; _blockLength = _nucleotideCount * readerSettings.BlockLength * _scoreSize; // Nucleotide to position mapping _nucleotideIndexMapper = new Dictionary(); for (ushort i = 0; i < _nucleotideCount; i++) { _nucleotideIndexMapper[readerSettings.Nucleotides[i]] = (ushort) (i * _scoreSize); } } /// /// Add the block to index /// /// /// /// /// public void Add(ushort chromIndex, long filePosition, int compressedSize, uint uncompressedSize) { // Create index block and add to chromosome block var indexBlock = new ScoreIndexBlock(filePosition, compressedSize); _chromosomeBlocks[chromIndex].Add(indexBlock); int blockNumber = GetLastBlockNumber(chromIndex); _metaData.AddIndexBlock(chromIndex, blockNumber, filePosition, uncompressedSize, (uint) compressedSize); } public void AddChromosomeBlock(ushort chromIndex, int chromosomeStartingPosition) { _chromosomeBlocks[chromIndex] = new ChromosomeBlock(new List(), 0, chromosomeStartingPosition); _metaData.AddChromosomeBlock(chromIndex); } public void TrackUnmatchedReferencePositions() { _metaData.TrackUnmatchedReferencePositions(); } private void WriteHeader() { _indexHeader.Write(_writer); _writer.WriteOpt(_filePairId); _writer.Write(SaCommon.GuardInt); } private static void CheckHeader(Header header) { (FileType fileType, ushort fileFormatVersion) = header; if (fileType != FileType.GsaIndex) throw new UserErrorException($"The file type {fileType} version {fileFormatVersion} " + $"is not supported by this reader {FileType.GsaIndex}"); } private static (Header indexHeader, int filePairId) ReadHeader(ExtendedBinaryReader reader, int expectedFilePairId) { Header indexHeader = Header.Read(reader); CheckHeader(indexHeader); int filePairId = reader.ReadOptInt32(); uint guardInt = reader.ReadUInt32(); if (guardInt != SaCommon.GuardInt || filePairId != expectedFilePairId) { throw new UserErrorException("Unable to read the index"); } return (indexHeader, filePairId); } /// /// Serialize the instance to writer stream /// public void Write() { WriteHeader(); _writer.Write((byte) Assembly); Version.Write(_writer); _writer.WriteOpt(SchemaVersion); _writer.WriteOpt(_chromosomeBlocks.Count); // Write the Chromsome Blocks foreach ((ushort index, ChromosomeBlock chromosomeBlocks) in _chromosomeBlocks) { _writer.WriteOpt(index); chromosomeBlocks.Write(_writer); } ReaderSettings.Write(_writer); _metaData.PrintWriteMetrics(); } /// /// Deserialize the instance from reader stream /// /// /// /// public static ScoreIndex Read(Stream stream, int dataFilePairId) { using (var memStream = new MemoryStream()) using (var reader = new ExtendedBinaryReader(memStream)) { stream.CopyTo(memStream); //reading all bytes in stream to memStream memStream.Position = 0; (Header indexHeader, int filePairId) = ReadHeader(reader, dataFilePairId); GenomeAssembly assembly = (GenomeAssembly) reader.ReadByte(); IDataSourceVersion version = DataSourceVersion.Read(reader); int schemaVersion = reader.ReadOptInt32(); int chromCount = reader.ReadOptInt32(); // read the chromblocks var chromBlocks = new Dictionary(chromCount); for (var i = 0; i < chromCount; i++) { var chromIndex = reader.ReadOptUInt16(); chromBlocks[chromIndex] = ChromosomeBlock.Read(reader); } ReaderSettings readerSettings = ReaderSettings.Read(reader); var scoreIndex = new ScoreIndex( null, readerSettings, assembly, version, schemaVersion, indexHeader, filePairId ) { _chromosomeBlocks = chromBlocks, }; return scoreIndex; } } /// /// Return the file position of the block containing the given chromosome and chromosomal position /// /// /// /// public long GetFilePosition(ushort chromIndex, int position) { if (_chromosomeBlocks == null || !_chromosomeBlocks.TryGetValue(chromIndex, out var chromosomeBlock)) return -1; int blockNumber = GetBlockNumber(chromosomeBlock, position); if (blockNumber < 0) return -1; return chromosomeBlock.Get(blockNumber) != null ? chromosomeBlock.Get(blockNumber).FilePosition : -1; } /// /// Returns the block number which would contain the position /// Because each block is of a known size, (e.g. 10_000) /// the first position (e.g 10_001) can be used to find the file position /// Example: /// blockNumber = (354_011 - 10_001) / 10_000 = 45th block contains the position 354_011 /// /// /// /// private int GetBlockNumber(ChromosomeBlock chromosomeBlock, int position) { // Position is less than start position if (position < chromosomeBlock.StartingPosition) return -1; (int blockNumber, _) = PositionToBlockLocation(position, (int) chromosomeBlock.StartingPosition); // Position is outside the last block if (blockNumber >= chromosomeBlock.BlockCount) return -1; return blockNumber; } public int GetBlockNumber(ushort chromosomeIndex, int position) { if (_chromosomeBlocks == null || !_chromosomeBlocks.TryGetValue(chromosomeIndex, out ChromosomeBlock chromosomeBlock)) return -1; return GetBlockNumber(chromosomeBlock, position); } public int GetBytesToRead(ushort chromIndex, int blockNumber) { return _chromosomeBlocks[chromIndex].Get(blockNumber).BytesWritten; } public int GetBlockLength() { return _blockLength; } public uint GetNucleotideCount() { return _nucleotideCount; } public ushort? GetNucleotidePosition(string saItemAltAllele) { if (!_nucleotideIndexMapper.ContainsKey(saItemAltAllele)) return null; return _nucleotideIndexMapper[saItemAltAllele]; } public (int blockNumber, int localBlockIndex) PositionToBlockLocation(int position, int startingPosition) { // Position is less than start position if (position < startingPosition) return (-1, -1); int deltaPosition = (position - startingPosition) * _nucleotideCount * _scoreSize; return (deltaPosition / _blockLength, deltaPosition % _blockLength); } private (int blockNumber, int localBlockIndex) PositionToBlockLocation(ChromosomeBlock chromosomeBlock, int position) { return PositionToBlockLocation(position, (int) chromosomeBlock.StartingPosition); } public (int blockNumber, int localBlockIndex) PositionToBlockLocation(ushort chromosomeIndex, int position) { if (_chromosomeBlocks == null || !_chromosomeBlocks.TryGetValue(chromosomeIndex, out var chromosomeBlock)) return (-1, -1); return PositionToBlockLocation(chromosomeBlock, position); } public Dictionary GetChromosomeBlocks() { return _chromosomeBlocks; } public int GetLastBlockNumber(ushort chromosomeIndex) { return _chromosomeBlocks[chromosomeIndex].BlockCount - 1; } } } ================================================ FILE: VariantAnnotation/GenericScore/ScoreIndexBlock.cs ================================================ using IO; namespace VariantAnnotation.GenericScore { public sealed class ScoreIndexBlock { public readonly long FilePosition; public readonly int BytesWritten; public ScoreIndexBlock(long filePosition, int bytesWritten) { FilePosition = filePosition; BytesWritten = bytesWritten; } /// /// Deserialize the instance from reader stream /// /// /// public static ScoreIndexBlock Read(ExtendedBinaryReader reader) { long filePosition = reader.ReadOptInt64(); int bytesWritten = reader.ReadOptInt32(); return new ScoreIndexBlock(filePosition, bytesWritten); } /// /// Serialize the instance to writer stream /// /// public void Write(ExtendedBinaryWriter writer) { writer.WriteOpt(FilePosition); writer.WriteOpt(BytesWritten); } } } ================================================ FILE: VariantAnnotation/GenericScore/ScoreJsonEncoder.cs ================================================ using IO; namespace VariantAnnotation.GenericScore { public sealed class ScoreJsonEncoder { public readonly string JsonKey; private readonly string _jsonSubKey; public string JsonRepresentation(T data) { if (_jsonSubKey != null) return $"\"{_jsonSubKey}\":{data}"; return data.ToString(); } public ScoreJsonEncoder(string jsonKey, string jsonSubKey) { JsonKey = jsonKey; _jsonSubKey = jsonSubKey; } public void Write(ExtendedBinaryWriter writer) { writer.WriteOptAscii(JsonKey); writer.WriteOptAscii(_jsonSubKey); } public static ScoreJsonEncoder Read(ExtendedBinaryReader reader) { return new ScoreJsonEncoder( reader.ReadAsciiString(), reader.ReadAsciiString() ); } } } ================================================ FILE: VariantAnnotation/GenericScore/ScoreReader.cs ================================================ using System; using System.Buffers; using System.IO; using Compression.Algorithms; using ErrorHandling.Exceptions; using Genome; using IO; using IO.v2; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.SA; namespace VariantAnnotation.GenericScore { public sealed class ScoreReader : ISaMetadata { private const int FileFormatVersion = 1; private readonly ExtendedBinaryReader _reader; public GenomeAssembly Assembly { get; } private readonly ScoreIndex _index; public IDataSourceVersion Version { get; } public string JsonKey { get; } private readonly ICompressionAlgorithm _compressionAlgorithm = new Zstandard(); private readonly byte[] _uncompressedBlock; private readonly byte[] _compressedBlock; private long? _lastFileLocation; private readonly int _encodedScoreSize; private ScoreReader(ScoreIndex scoreIndex, ExtendedBinaryReader dataFileReader) { _index = scoreIndex; _reader = dataFileReader; Assembly = _index.Assembly; Version = _index.Version; JsonKey = _index.ReaderSettings.ScoreJsonEncoder.JsonKey; if (_index.SchemaVersion != SaCommon.SchemaVersion) throw new UserErrorException( $"SA schema version mismatch. Expected {SaCommon.SchemaVersion}, observed {_index.SchemaVersion} for {JsonKey}"); _encodedScoreSize = _index.ReaderSettings.BytesRequired; _uncompressedBlock = ArrayPool.Shared.Rent(_index.GetBlockLength()); int compressedBlockSize = _compressionAlgorithm.GetCompressedBufferBounds(_index.GetBlockLength()); _compressedBlock = ArrayPool.Shared.Rent(compressedBlockSize); } public static ScoreReader Read(Stream dataStream, Stream indexStream) { var dataFileReader = new ExtendedBinaryReader(dataStream); int filePairId = ReadHeader(dataFileReader); ScoreIndex index = ScoreIndex.Read(indexStream, filePairId); return new ScoreReader(index, dataFileReader); } private static void CheckHeader(Header header) { (FileType fileType, ushort fileFormatVersion) = header; if (fileType != FileType.GsaWriter || fileFormatVersion != FileFormatVersion) { throw new UserErrorException( $"The file type {fileType} version {fileFormatVersion} is not supported by this reader " + $"{FileType.GsaWriter} version {FileFormatVersion}." ); } } private static int ReadHeader(ExtendedBinaryReader dataFileReader) { Header header = Header.Read(dataFileReader); CheckHeader(header); int filePairId = dataFileReader.ReadOptInt32(); uint guardInt = dataFileReader.ReadUInt32(); if (guardInt != SaCommon.GuardInt) { throw new UserErrorException("The data file may be corrupted"); } return filePairId; } private bool GetUncompressedBlock(ushort chromIndex, int position) { long fileLocation = _index.GetFilePosition(chromIndex, position); if (fileLocation < 0) return false; // Reuse the current block if (_lastFileLocation == fileLocation) return true; _lastFileLocation = fileLocation; Array.Clear(_uncompressedBlock, 0, _uncompressedBlock.Length); _reader.BaseStream.Position = fileLocation; int blockNumber = _index.GetBlockNumber(chromIndex, position); int bytesToRead = _index.GetBytesToRead(chromIndex, blockNumber); _reader.BaseStream.Read(_compressedBlock, 0, bytesToRead); _compressionAlgorithm.Decompress(_compressedBlock, bytesToRead, _uncompressedBlock, _index.GetBlockLength()); return true; } public double GetScore(ushort chromosomeIndex, int position, string allele) { if (_index.ReaderSettings.IsPositional) allele = "N"; if (!GetUncompressedBlock(chromosomeIndex, position)) return double.NaN; (_, int localBlockIndex) = _index.PositionToBlockLocation(chromosomeIndex, position); ushort? allelePosition = _index.GetNucleotidePosition(allele); if (allelePosition == null) return double.NaN; Span score = _uncompressedBlock.AsSpan(localBlockIndex + (ushort) allelePosition, _encodedScoreSize); return _index.ReaderSettings.ScoreEncoder.DecodeFromBytes(score); } public string GetAnnotationJson(ushort chromosomeIndex, int position, string altAllele) { double score = GetScore(chromosomeIndex, position, altAllele); return double.IsNaN(score) ? null : _index.ReaderSettings.ScoreJsonEncoder.JsonRepresentation(score); } } } ================================================ FILE: VariantAnnotation/GenericScore/ZeroToOneScoreEncoder.cs ================================================ using System; using ErrorHandling.Exceptions; using IO; namespace VariantAnnotation.GenericScore { public sealed class ZeroToOneScoreEncoder : IScoreEncoder { private readonly byte[] _encodedArray; private readonly int _numberOfDigits; private readonly uint _maxNumber; private readonly double _maxScore; public ushort BytesRequired { get; } public ZeroToOneScoreEncoder(int numberOfDigits, double maxScore) { _numberOfDigits = numberOfDigits; _maxScore = maxScore; _maxNumber = (uint) Math.Pow(10, _numberOfDigits); BytesRequired = (ushort) Math.Ceiling(_numberOfDigits / Math.Log10(256)); _encodedArray = new byte[BytesRequired]; } public void Write(ExtendedBinaryWriter writer) { writer.WriteOpt(_numberOfDigits); writer.Write(_maxScore); } public static ZeroToOneScoreEncoder Read(ExtendedBinaryReader reader) { return new ZeroToOneScoreEncoder(reader.ReadOptInt32(), reader.ReadDouble()); } public byte[] EncodeToBytes(double number) { Array.Clear(_encodedArray, 0, _encodedArray.Length); if (double.IsNaN(number)) { Array.Fill(_encodedArray, byte.MaxValue); return _encodedArray; } uint transformedNumber = TransformToUint(number); // BitConverter is used as a convenient means of transforming the number into bytes // Only the `BytesRequred` portion is saved, because the converted bytes will not exceed it. Array.Copy(BitConverter.GetBytes(transformedNumber), _encodedArray, BytesRequired); return _encodedArray; } public double DecodeFromBytes(ReadOnlySpan encodedArray) { if (encodedArray[^1] == byte.MaxValue) return double.NaN; var count = 0; var shift = 0; // because a variable lenght enodedarray is received, the BitConverter cannot be used directly foreach (byte b in encodedArray) { count |= (b & byte.MaxValue) << shift; shift += 8; } return TransformToDouble((uint) count); } private uint TransformToUint(double number) { if (number > _maxScore) throw new UserErrorException("Score may not be larger than maximum score"); return (uint) Math.Round(number * _maxNumber / _maxScore); } private double TransformToDouble(uint encodedNumber) { return encodedNumber * _maxScore / _maxNumber; } } } ================================================ FILE: VariantAnnotation/IO/Caches/CacheHeader.cs ================================================ using System.IO; using System.Text; namespace VariantAnnotation.IO.Caches { public sealed class CacheHeader : Header { public readonly TranscriptCacheCustomHeader Custom; public CacheHeader(Header header, TranscriptCacheCustomHeader customHeader) : base(header.Identifier, header.SchemaVersion, header.DataVersion, header.Source, header.CreationTimeTicks, header.Assembly) { Custom = customHeader; } public new void Write(BinaryWriter writer) { base.Write(writer); Custom.Write(writer); } public static CacheHeader Read(Stream stream) { CacheHeader header; using (var reader = new BinaryReader(stream, Encoding.Default, true)) { var baseHeader = Read(reader); var customHeader = TranscriptCacheCustomHeader.Read(reader); header = new CacheHeader(baseHeader, customHeader); } return header; } } } ================================================ FILE: VariantAnnotation/IO/Caches/Header.cs ================================================ using System.IO; using Genome; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.IO.Caches { public class Header { public readonly string Identifier; public readonly ushort SchemaVersion; public readonly ushort DataVersion; public readonly Source Source; public readonly long CreationTimeTicks; public readonly GenomeAssembly Assembly; public Header(string identifier, ushort schemaVersion, ushort dataVersion, Source source, long creationTimeTicks, GenomeAssembly genomeAssembly) { Identifier = identifier; SchemaVersion = schemaVersion; DataVersion = dataVersion; Source = source; CreationTimeTicks = creationTimeTicks; Assembly = genomeAssembly; } protected void Write(BinaryWriter writer) { writer.Write(Identifier); writer.Write(SchemaVersion); writer.Write(DataVersion); writer.Write((byte)Source); writer.Write(CreationTimeTicks); writer.Write((byte)Assembly); } protected static Header Read(BinaryReader reader) { string identifier = reader.ReadString(); ushort schemaVersion = reader.ReadUInt16(); ushort dataVersion = reader.ReadUInt16(); var source = (Source)reader.ReadByte(); long creationTimeTicks = reader.ReadInt64(); var genomeAssembly = (GenomeAssembly)reader.ReadByte(); return new Header(identifier, schemaVersion, dataVersion, source, creationTimeTicks, genomeAssembly); } } } ================================================ FILE: VariantAnnotation/IO/Caches/PredictionCacheCustomHeader.cs ================================================ using System.IO; using VariantAnnotation.Caches.DataStructures; namespace VariantAnnotation.IO.Caches { public sealed class PredictionCacheCustomHeader { public readonly IndexEntry[] Entries; public PredictionCacheCustomHeader(IndexEntry[] entries) => Entries = entries; public void Write(BinaryWriter writer) { writer.Write((ushort)Entries.Length); foreach (var entry in Entries) entry.Write(writer); } public static PredictionCacheCustomHeader Read(BinaryReader reader) { ushort numReferenceSeqs = reader.ReadUInt16(); var entries = new IndexEntry[numReferenceSeqs]; for (var i = 0; i < numReferenceSeqs; i++) entries[i].Read(reader); return new PredictionCacheCustomHeader(entries); } } } ================================================ FILE: VariantAnnotation/IO/Caches/PredictionCacheReader.cs ================================================ using System; using System.IO; using System.IO.Compression; using System.Text; using Compression.Algorithms; using Compression.FileHandling; using IO; using VariantAnnotation.Caches; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.Caches; namespace VariantAnnotation.IO.Caches { public sealed class PredictionCacheReader : IDisposable { private readonly ExtendedBinaryReader _reader; private readonly BlockStream _blockStream; private readonly string[] _predictionDescriptions; private readonly IndexEntry[] _indexEntries; private readonly int _numRefSeqs; public readonly PredictionHeader Header; public PredictionCacheReader(Stream stream, string[] predictionDescriptions) { _blockStream = new BlockStream(new Zstandard(), stream, CompressionMode.Decompress); Header = PredictionHeader.Read(stream, _blockStream); _reader = new ExtendedBinaryReader(_blockStream, Encoding.Default, true); _predictionDescriptions = predictionDescriptions; _indexEntries = Header.Custom.Entries; _numRefSeqs = _indexEntries.Length; } public void Dispose() { _reader.Dispose(); _blockStream.Dispose(); } /// /// parses the database cache file and populates the specified lists and interval trees /// public IPredictionCache Read(ushort refIndex) { if (refIndex >= _numRefSeqs) return null; var predictions = GetPredictions(refIndex); return new PredictionCache(Header.Assembly, predictions, _predictionDescriptions); } public Prediction[] GetPredictions(ushort refIndex) { var indexEntry = _indexEntries[refIndex]; _blockStream.SetBlockPosition(indexEntry.FileOffset); var predictions = new Prediction[indexEntry.Count]; for (var i = 0; i < indexEntry.Count; i++) predictions[i] = Prediction.Read(_reader, Header.LookupTable); return predictions; } public static readonly string[] SiftDescriptions = new string[]{"tolerated", "deleterious", "tolerated - low confidence", "deleterious - low confidence"}; public static readonly string[] PolyphenDescriptions = new string[]{"probably damaging", "possibly damaging", "benign", "unknown"}; } } ================================================ FILE: VariantAnnotation/IO/Caches/PredictionHeader.cs ================================================ using System.IO; using System.Text; using Compression.FileHandling; using IO; using VariantAnnotation.Caches.DataStructures; namespace VariantAnnotation.IO.Caches { public sealed class PredictionHeader : Header { public readonly PredictionCacheCustomHeader Custom; public readonly Prediction.Entry[] LookupTable; public PredictionHeader(Header header, PredictionCacheCustomHeader customHeader, Prediction.Entry[] lookupTable) : base(header.Identifier, header.SchemaVersion, header.DataVersion, header.Source, header.CreationTimeTicks, header.Assembly) { Custom = customHeader; LookupTable = lookupTable; } public new void Write(BinaryWriter writer) { base.Write(writer); Custom.Write(writer); } public static PredictionHeader Read(Stream stream, BlockStream blockStream) { Header baseHeader; PredictionCacheCustomHeader customHeader; Prediction.Entry[] lookupTable; using (var reader = new BinaryReader(stream, Encoding.Default, true)) { baseHeader = Read(reader); customHeader = PredictionCacheCustomHeader.Read(reader); } using (var reader = new ExtendedBinaryReader(blockStream, Encoding.Default, true)) { lookupTable = ReadLookupTable(reader); } return new PredictionHeader(baseHeader, customHeader, lookupTable); } private static Prediction.Entry[] ReadLookupTable(ExtendedBinaryReader reader) { int numEntries = reader.ReadInt32(); var lut = new Prediction.Entry[numEntries]; for (var i = 0; i < numEntries; i++) lut[i] = Prediction.Entry.ReadEntry(reader); return lut; } } } ================================================ FILE: VariantAnnotation/IO/Caches/TranscriptCacheCustomHeader.cs ================================================ using System.IO; namespace VariantAnnotation.IO.Caches { public sealed class TranscriptCacheCustomHeader { public readonly ushort VepVersion; private readonly long _vepReleaseTicks; public TranscriptCacheCustomHeader(ushort vepVersion, long vepReleaseTicks) { VepVersion = vepVersion; _vepReleaseTicks = vepReleaseTicks; } public void Write(BinaryWriter writer) { writer.Write(_vepReleaseTicks); writer.Write(VepVersion); } public static TranscriptCacheCustomHeader Read(BinaryReader reader) { long vepReleaseTicks = reader.ReadInt64(); ushort vepVersion = reader.ReadUInt16(); return new TranscriptCacheCustomHeader(vepVersion, vepReleaseTicks); } } } ================================================ FILE: VariantAnnotation/IO/Caches/TranscriptCacheReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using Compression.Algorithms; using Compression.FileHandling; using Genome; using Intervals; using IO; using VariantAnnotation.Caches; using VariantAnnotation.Caches.DataStructures; namespace VariantAnnotation.IO.Caches { public sealed class TranscriptCacheReader : IDisposable { private readonly BufferedBinaryReader _reader; public readonly CacheHeader Header; public TranscriptCacheReader(Stream stream) { Header = CacheHeader.Read(stream); var blockStream = new BlockStream(new Zstandard(), stream, CompressionMode.Decompress); _reader = new BufferedBinaryReader(blockStream); } public void Dispose() => _reader.Dispose(); /// /// parses the database cache file and populates the specified lists and interval trees /// public TranscriptCacheData Read(Dictionary refIndexToChromosome) { var genes = ReadItems(_reader, () => Gene.Read(_reader, refIndexToChromosome)); var transcriptRegions = ReadItems(_reader, () => TranscriptRegion.Read(_reader)); var mirnas = ReadItems(_reader, () => IntervalExtensions.Read(_reader)); var peptideSeqs = ReadItems(_reader, () => _reader.ReadAsciiString()); var regulatoryRegions = ReadIntervals(_reader, () => RegulatoryRegion.Read(_reader, refIndexToChromosome)); var transcripts = ReadIntervals(_reader, () => Transcript.Read(_reader, refIndexToChromosome, genes, transcriptRegions, mirnas, peptideSeqs)); return new TranscriptCacheData(Header, genes, transcriptRegions, mirnas, peptideSeqs, transcripts, regulatoryRegions); } private static IntervalArray[] ReadIntervals(IBufferedBinaryReader reader, Func readMethod) where T : IInterval { var numRefSeqs = reader.ReadOptInt32(); var intervalArrays = new IntervalArray[numRefSeqs]; for (int refSeqIndex = 0; refSeqIndex < numRefSeqs; refSeqIndex++) { var numItems = reader.ReadOptInt32(); if (numItems == 0) continue; var intervals = new Interval[numItems]; for (int i = 0; i < numItems; i++) { var item = readMethod(); intervals[i] = new Interval(item.Start, item.End, item); } intervalArrays[refSeqIndex] = new IntervalArray(intervals); } CheckGuard(reader); return intervalArrays; } internal static T[] ReadItems(IBufferedBinaryReader reader, Func readMethod) { var numItems = reader.ReadOptInt32(); var items = new T[numItems]; for (int i = 0; i < numItems; i++) items[i] = readMethod(); CheckGuard(reader); return items; } /// /// check if the section guard is in place /// internal static void CheckGuard(IBufferedBinaryReader reader) { uint observedGuard = reader.ReadUInt32(); if (observedGuard != CacheConstants.GuardInt) { throw new InvalidDataException($"Expected a guard integer ({CacheConstants.GuardInt}), but found another value: ({observedGuard})"); } } } } ================================================ FILE: VariantAnnotation/IO/IntervalExtensions.cs ================================================ using Intervals; using IO; namespace VariantAnnotation.IO { public static class IntervalExtensions { public static IInterval Read(IBufferedBinaryReader reader) { int start = reader.ReadOptInt32(); int end = reader.ReadOptInt32(); return new Interval(start, end); } public static void Write(this IInterval interval, IExtendedBinaryWriter writer) { writer.WriteOpt(interval.Start); writer.WriteOpt(interval.End); } } } ================================================ FILE: VariantAnnotation/IO/JsonCommon.cs ================================================ namespace VariantAnnotation.IO { public static class JsonCommon { public const int SchemaVersion = 6; public const string FrequencyRoundingFormat = "0.######"; public const string Chromosome = "chromosome"; public const string Begin = "begin"; public const string End = "end"; public const string FailedFilter = "failedFilter"; public const string VariantId = "variantId"; public const string VariantType = "variantType"; public const string AllAlleleCount = "allAc"; public const string AfrAlleleCount = "afrAc"; public const string AmrAlleleCount = "amrAc"; public const string EasAlleleCount = "easAc"; public const string EurAlleleCount = "eurAc"; public const string OthAlleleCount = "othAc"; public const string FemaleAlleleCount = "femaleAc"; public const string MaleAlleleCount = "maleAc"; public const string AllAlleleFrequency = "allAf"; public const string AfrAlleleFrequency = "afrAf"; public const string AmrAlleleFrequency = "amrAf"; public const string EasAlleleFrequency = "easAf"; public const string EurAlleleFrequency = "eurAf"; public const string OthAlleleFrequency = "othAf"; public const string FemaleAlleleFrequency = "femaleAf"; public const string MaleAlleleFrequency = "maleAf"; public const string AllAlleleNumber = "allAn"; public const string AfrAlleleNumber = "afrAn"; public const string AmrAlleleNumber = "amrAn"; public const string EasAlleleNumber = "easAn"; public const string EurAlleleNumber = "eurAn"; public const string OthAlleleNumber = "othAn"; public const string FemaleAlleleNumber = "femaleAn"; public const string MaleAlleleNumber = "maleAn"; public const string AllHomCount = "allHc"; public const string AfrHomCount = "afrHc"; public const string AmrHomCount = "amrHc"; public const string EasHomCount = "easHc"; public const string EurHomCount = "eurHc"; public const string OthHomCount = "othHc"; public const string FemaleHomCount = "femaleHc"; public const string MaleHomCount = "maleHc"; } } ================================================ FILE: VariantAnnotation/IO/JsonObject.cs ================================================ using System.Collections.Generic; using System.Globalization; using System.Linq; using System.Text; using VariantAnnotation.Interface.IO; namespace VariantAnnotation.IO { public sealed class JsonObject { private readonly StringBuilder _sb; private bool _needsComma; private int _nestedLevel; public const char Comma = ','; private const char DoubleQuote = '\"'; public const char OpenBracket = '['; public const char CloseBracket = ']'; public const char OpenBrace = '{'; public const char CloseBrace = '}'; private const string ColonString = "\":"; public JsonObject(StringBuilder sb) => _sb = sb; private void AddKey(string description) { _sb.Append(DoubleQuote); _sb.Append(description); _sb.Append(ColonString); } public void StartObjectWithKey(string objectKey) { if (_needsComma) _sb.Append(Comma); _sb.Append(DoubleQuote); _sb.Append(objectKey); _sb.Append(ColonString); _sb.Append(OpenBrace); _needsComma = false; _nestedLevel++; } public bool AddBoolValue(string description, bool b, bool outputFalse = false) { // we do not want to print out false flags by default. if (!b && !outputFalse) return false; if (_needsComma) _sb.Append(Comma); AddKey(description); _sb.Append(b ? "true" : "false"); _needsComma = true; return true; } public bool AddIntValue(string description, int? i) { if (i == null) return false; if (_needsComma) _sb.Append(Comma); AddKey(description); _sb.Append(i); _needsComma = true; return true; } public bool AddUIntValue(string description, uint? i) { if (i == null) return false; if (_needsComma) _sb.Append(Comma); AddKey(description); _sb.Append(i); _needsComma = true; return true; } public void AddIntValues(string description, int[] values) { if (values == null || values.Length == 0) return; // removing '.'s from the list of values var valueList = values.Select(value => value.ToString()).ToList(); AddStringValues(description, valueList, false); _needsComma = true; } public bool AddDoubleValue(string description, double? d, string format = "0.####") { if (d == null) return false; if (_needsComma) _sb.Append(Comma); AddKey(description); _sb.Append(d.Value.ToString(format, CultureInfo.InvariantCulture)); _needsComma = true; return true; } public void AddDoubleValues(string description, double[] values, string format = "0.####") { if (values == null || values.Length == 0) return; var valueList = values.Select(value => value.ToString(format)).ToList(); AddStringValues(description, valueList, false); _needsComma = true; } public bool AddStringValue(string description, string s, bool useQuote = true) { if (string.IsNullOrEmpty(s) || s == ".") return false; if (_needsComma) _sb.Append(Comma); AddKey(description); if (useQuote) _sb.Append(DoubleQuote); _sb.Append(s); if (useQuote) _sb.Append(DoubleQuote); _needsComma = true; return true; } public bool AddStringValues(string description, IEnumerable values, bool useQuote = true) { if (values == null) return false; var validEntries = new List(); foreach (string value in values) if (value != ".") validEntries.Add(value); if (validEntries.Count == 0) return false; if (_needsComma) _sb.Append(Comma); AddKey(description); _sb.Append(OpenBracket); var needsComma = false; foreach (string value in validEntries) { if (needsComma) _sb.Append(Comma); if (useQuote) _sb.Append(DoubleQuote); _sb.Append(value); if (useQuote) _sb.Append(DoubleQuote); needsComma = true; } _sb.Append(CloseBracket); _needsComma = true; return true; } public bool AddStringValues(string description, IEnumerable sbs, bool useQuote = true) { if (sbs == null) return false; var validEntries = new List(); foreach (var sb in sbs) if (sb.Length > 0 && sb[0] != '.') validEntries.Add(sb); if (validEntries.Count == 0) return false; if (_needsComma) _sb.Append(Comma); AddKey(description); _sb.Append(OpenBracket); var needsComma = false; foreach (var value in validEntries) { if (needsComma) _sb.Append(Comma); if (useQuote) _sb.Append(DoubleQuote); _sb.Append(value); if (useQuote) _sb.Append(DoubleQuote); needsComma = true; } _sb.Append(CloseBracket); _needsComma = true; return true; } public void AddObjectValue(string description, T value) where T : IJsonSerializer { if (value == null) return; if (_needsComma) _sb.Append(Comma); AddKey(description); value.SerializeJson(_sb); _needsComma = true; } public bool AddObjectValues(string description, IEnumerable values) where T : IJsonSerializer { if (values == null) return false; if (_needsComma) _sb.Append(Comma); AddKey(description); _sb.Append(OpenBracket); var needsComma = false; foreach (var value in values) { // comma handling if (needsComma) _sb.Append(Comma); else needsComma = true; value.SerializeJson(_sb); } _sb.Append(CloseBracket); _needsComma = true; return true; } public void StartObject() { _sb.Append(OpenBrace); _needsComma = false; _nestedLevel++; } public void EndObject() { _sb.Append(CloseBrace); _needsComma = true; _nestedLevel--; } public void EndAllObjects() { _sb.Append(CloseBrace, _nestedLevel); } } } ================================================ FILE: VariantAnnotation/IO/JsonWriter.cs ================================================ using System.Collections.Generic; using System.IO; using System.Text; using Compression.FileHandling; using Jasix; using Jasix.DataStructures; using OptimizedCore; using VariantAnnotation.Interface; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; namespace VariantAnnotation.IO { public sealed class JsonWriter : IJsonWriter { private readonly StreamWriter _writer; private bool _firstEntry; private bool _positionFieldClosed; private readonly bool _leaveOpen; private readonly BgzipTextWriter _bgzipTextWriter; private readonly OnTheFlyIndexCreator _jasixIndexCreator; private JsonWriter(Stream jsonStream, Stream indexStream, string annotator, string creationTime, string vepDataVersion, List dataSourceVersions, string genomeAssembly, string[] sampleNames, bool leaveOpen) : this(GetProperWriter(jsonStream), indexStream, annotator, creationTime, vepDataVersion, dataSourceVersions, genomeAssembly, sampleNames, leaveOpen) { } public JsonWriter(Stream jsonStream, Stream indexStream, IAnnotationResources annotationResources, string creationTime, string[] sampleNames, bool leaveOpen) : this(jsonStream, indexStream, annotationResources.AnnotatorVersionTag, creationTime, annotationResources.VepDataVersion, annotationResources.DataSourceVersions, annotationResources.SequenceProvider.Assembly.ToString(), sampleNames, leaveOpen) { } private static StreamWriter GetProperWriter(Stream jsonStream) => jsonStream is BlockGZipStream stream ? new BgzipTextWriter(stream) : new StreamWriter(jsonStream); public JsonWriter(StreamWriter writer, Stream indexStream, string annotator, string creationTime, string vepDataVersion, List dataSourceVersions, string genomeAssembly, string[] sampleNames, bool leaveOpen) { _writer = writer; _writer.NewLine = "\n"; _firstEntry = true; _positionFieldClosed = false; _leaveOpen = leaveOpen; _bgzipTextWriter = writer as BgzipTextWriter; _jasixIndexCreator = _bgzipTextWriter != null ? new OnTheFlyIndexCreator(indexStream) : null; WriteHeader(annotator, creationTime, genomeAssembly, JsonCommon.SchemaVersion, vepDataVersion, dataSourceVersions, sampleNames); } private void WriteHeader(string annotator, string creationTime, string genomeAssembly, int schemaVersion, string vepDataVersion, IEnumerable dataSourceVersions, string[] sampleNames) { BeginSection(JasixCommons.HeaderSectionTag); var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); sb.Append($"{{\"{JasixCommons.HeaderSectionTag}\":{{"); jsonObject.AddStringValue("annotator", annotator); jsonObject.AddStringValue("creationTime", creationTime); jsonObject.AddStringValue("genomeAssembly", genomeAssembly); jsonObject.AddIntValue("schemaVersion", schemaVersion); jsonObject.AddStringValue("dataVersion", vepDataVersion); jsonObject.AddObjectValues("dataSources", dataSourceVersions); if (sampleNames != null) jsonObject.AddStringValues("samples", sampleNames); sb.Append($"}},\"{JasixCommons.PositionsSectionTag}\":[\n"); _writer.Write(StringBuilderPool.GetStringAndReturn(sb)); if(_bgzipTextWriter != null) EndSection(JasixCommons.HeaderSectionTag); } public void Dispose() { WriteFooter(); _writer?.Flush(); _jasixIndexCreator?.Flush(); if (_leaveOpen) return; _writer?.Dispose(); _jasixIndexCreator?.Dispose(); } // due to the flush, the end of a section will point to the next to last block for a section. // e.g. if positions start at block 2 and end at block 10, blocks 2..9 contains positions. private void BeginSection(string section) { if (_bgzipTextWriter == null) return; _bgzipTextWriter.Flush(); _jasixIndexCreator.BeginSection(section, _bgzipTextWriter.Position); } private void EndSection(string section) { if (_bgzipTextWriter == null) return; _bgzipTextWriter.Flush(); _jasixIndexCreator.EndSection(section, _bgzipTextWriter.Position); } public void WritePosition(IPosition position, string entry) { if (string.IsNullOrEmpty(entry)) return; _jasixIndexCreator?.Add(position, _bgzipTextWriter.Position); if (_firstEntry) { BeginSection(JasixCommons.PositionsSectionTag); } else _writer.WriteLine(","); _firstEntry = false; _writer.Write(entry); } public void WritePosition(IPosition position, StringBuilder sb) { if (sb == null || sb.Length == 0) return; _jasixIndexCreator?.Add(position, _bgzipTextWriter.Position); if (_firstEntry) { BeginSection(JasixCommons.PositionsSectionTag); } else _writer.WriteLine(","); _firstEntry = false; _writer.Write(sb); } public void WriteGenes(IEnumerable annotatedGenes) { _positionFieldClosed = true; EndSection(JasixCommons.PositionsSectionTag); _writer.Write("\n]"); if (annotatedGenes == null) return; _writer.Write($",\"{JasixCommons.GenesSectionTag}\":[\n"); BeginSection(JasixCommons.GenesSectionTag); var sb = StringBuilderPool.Get(); var firstGeneEntry = true; foreach (string jsonString in annotatedGenes) { if (!firstGeneEntry) sb.Append(",\n"); sb.Append(jsonString); firstGeneEntry = false; } var json = StringBuilderPool.GetStringAndReturn(sb); _writer.Write(json); EndSection(JasixCommons.GenesSectionTag); _writer.WriteLine(); _writer.Write("]"); } private void WriteFooter() { if (!_positionFieldClosed) { EndSection(JasixCommons.PositionsSectionTag); _writer.WriteLine(); _writer.Write("]"); } _writer.WriteLine("}"); } } } ================================================ FILE: VariantAnnotation/IO/SampleExtensions.cs ================================================ using OptimizedCore; using VariantAnnotation.Interface.Positions; namespace VariantAnnotation.IO { public static class SampleExtensions { public static string GetJsonString(this ISample sample) { var sb = StringBuilderPool.Get(); var jsonObject = new JsonObject(sb); // data section sb.Append(JsonObject.OpenBrace); jsonObject.AddBoolValue("isEmpty", sample.IsEmpty); jsonObject.AddStringValue("genotype", sample.Genotype); jsonObject.AddDoubleValues("variantFrequencies", sample.VariantFrequencies); jsonObject.AddIntValue("totalDepth", sample.TotalDepth); jsonObject.AddIntValue("genotypeQuality", sample.GenotypeQuality); jsonObject.AddIntValue("copyNumber", sample.CopyNumber); jsonObject.AddIntValue("minorHaplotypeCopyNumber", sample.MinorHaplotypeCopyNumber); jsonObject.AddIntValues("repeatUnitCounts", sample.RepeatUnitCounts); jsonObject.AddIntValues("alleleDepths", sample.AlleleDepths); jsonObject.AddBoolValue("failedFilter", sample.FailedFilter); jsonObject.AddIntValues("splitReadCounts", sample.SplitReadCounts); jsonObject.AddIntValues("pairedEndReadCounts", sample.PairedEndReadCounts); jsonObject.AddBoolValue("isDeNovo", sample.IsDeNovo); jsonObject.AddDoubleValue("deNovoQuality", sample.DeNovoQuality); jsonObject.AddStringValues("diseaseAffectedStatuses", sample.DiseaseAffectedStatuses); jsonObject.AddDoubleValue("artifactAdjustedQualityScore", sample.ArtifactAdjustedQualityScore, "0.#"); jsonObject.AddDoubleValue("likelihoodRatioQualityScore", sample.LikelihoodRatioQualityScore, "0.#"); if (sample.IsLossOfHeterozygosity.HasValue) jsonObject.AddBoolValue("lossOfHeterozygosity", sample.IsLossOfHeterozygosity.Value); jsonObject.AddDoubleValue("somaticQuality", sample.SomaticQuality, "0.#"); jsonObject.AddStringValues("heteroplasmyPercentile", sample.HeteroplasmyPercentile, false); jsonObject.AddIntValue("binCount", sample.BinCount); if(sample.CustomFields != null && !sample.CustomFields.IsEmpty()) jsonObject.AddObjectValue("vcfSampleInfo", sample.CustomFields); sb.Append(JsonObject.CloseBrace); return StringBuilderPool.GetStringAndReturn(sb); } } } ================================================ FILE: VariantAnnotation/NSA/NgaReader.cs ================================================ using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Text; using Compression.Algorithms; using Compression.FileHandling; using ErrorHandling.Exceptions; using IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace VariantAnnotation.NSA { public sealed class NgaReader { public readonly IDataSourceVersion Version; public readonly string JsonKey; private readonly bool _isArray; private readonly Dictionary> _geneSymbolToJsonStrings; private NgaReader(IDataSourceVersion version, string jsonKey, bool isArray, Dictionary> geneSymbolToJsonStrings) { Version = version; JsonKey = jsonKey; _isArray = isArray; _geneSymbolToJsonStrings = geneSymbolToJsonStrings; } public static NgaReader Read(Stream stream) { (IDataSourceVersion version, string jsonKey, bool isArray) = ReadHeader(stream); Dictionary> geneSymbolToJsonStrings; using (var blockStream = new BlockStream(new Zstandard(), stream, CompressionMode.Decompress)) using (var reader = new ExtendedBinaryReader(blockStream)) { int geneCount = reader.ReadOptInt32(); geneSymbolToJsonStrings = new Dictionary>(geneCount); for (var i = 0; i < geneCount; i++) { string geneSymbol = reader.ReadAsciiString(); int numEntries = reader.ReadOptInt32(); var entries = new List(numEntries); for (var j = 0; j < numEntries; j++) { entries.Add(reader.ReadString()); } geneSymbolToJsonStrings[geneSymbol] = entries; } } return new NgaReader(version, jsonKey, isArray, geneSymbolToJsonStrings); } private static (IDataSourceVersion Version, string JsonKey, bool IsArray) ReadHeader(Stream stream) { IDataSourceVersion version; string jsonKey; bool isArray; using (var reader = new ExtendedBinaryReader(stream, Encoding.UTF8, true)) { string identifier = reader.ReadString(); if (identifier != SaCommon.NgaIdentifier) { throw new InvalidDataException($"Expected the NGA identifier ({SaCommon.NgaIdentifier}), but found another value: ({identifier})"); } version = DataSourceVersion.Read(reader); jsonKey = reader.ReadString(); isArray = reader.ReadBoolean(); ushort schemaVersion = reader.ReadUInt16(); if (schemaVersion != SaCommon.SchemaVersion) { throw new UserErrorException($"Expected the schema version {SaCommon.SchemaVersion}, but found another value: ({schemaVersion}) for {jsonKey}"); } uint guard = reader.ReadUInt32(); if (guard != SaCommon.GuardInt) { throw new InvalidDataException($"Expected a guard integer ({SaCommon.GuardInt}), but found another value: ({guard})"); } } return (version, jsonKey, isArray); } public string GetAnnotation(string geneName) => _geneSymbolToJsonStrings.TryGetValue(geneName, out List annotations) ? GetJsonString(annotations) : null; private string GetJsonString(List annotations) { if (_isArray) return "[" + string.Join(',', annotations) + "]"; return annotations[0]; } } } ================================================ FILE: VariantAnnotation/NSA/NsaBlock.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Compression.Algorithms; using IO; namespace VariantAnnotation.NSA { public sealed class NsaBlock:IDisposable { private readonly ICompressionAlgorithm _compressionAlgorithm; private readonly byte[] _compressedBlock; private readonly byte[] _uncompressedBlock; private int _compressedLength; private int _uncompressedLength; private readonly ExtendedBinaryWriter _writer; public int BlockOffset => (int)_writer.BaseStream.Position; private int _firstPosition; private int _lastPosition; private int _count; private readonly ExtendedBinaryReader _blockReader; private readonly MemoryStream _blockStream; public NsaBlock(ICompressionAlgorithm compressionAlgorithm, int size) { _compressionAlgorithm = compressionAlgorithm; _uncompressedBlock = new byte[size]; _blockStream = new MemoryStream(_uncompressedBlock); _blockReader = new ExtendedBinaryReader(_blockStream); _writer = new ExtendedBinaryWriter(new MemoryStream(_uncompressedBlock)); int compressedBlockSize = compressionAlgorithm.GetCompressedBufferBounds(size); _compressedBlock = new byte[compressedBlockSize]; } public void Read(ExtendedBinaryReader reader) { _compressedLength = reader.ReadOptInt32(); _firstPosition = reader.ReadOptInt32(); //_lastPosition = reader.ReadOptInt32(); _count = reader.ReadOptInt32(); reader.Read(_compressedBlock, 0, _compressedLength); _uncompressedLength = _compressionAlgorithm.Decompress(_compressedBlock, _compressedLength, _uncompressedBlock, _uncompressedBlock.Length); _blockStream.Position = 0; } //read block but do not uncompress public void ReadCompressedBytes(ExtendedBinaryReader reader) { _compressedLength = reader.ReadOptInt32(); _firstPosition = reader.ReadOptInt32(); //_lastPosition = reader.ReadOptInt32(); _count = reader.ReadOptInt32(); reader.Read(_compressedBlock, 0, _compressedLength); } //write a block that has not been uncompressed public void WriteCompressedBytes(ExtendedBinaryWriter writer) { writer.WriteOpt(_compressedLength); writer.WriteOpt(_firstPosition); //writer.WriteOpt(_lastPosition); writer.WriteOpt(_count); writer.Write(_compressedBlock, 0, _compressedLength); } public bool HasSpace(int length) { return BlockOffset + length + 2 * sizeof(int) <= _uncompressedBlock.Length; //saving space for length and position } public void Add(byte[] data, int length, int position) { if (!HasSpace(length)) return; if (_writer.BaseStream.Position == 0) { _firstPosition = position; _lastPosition = position; } _writer.WriteOpt(length); _writer.WriteOpt(position - _lastPosition); _writer.Write(data, 0, length); _lastPosition = position; _count++; } public int AddAnnotations(List vcfPositions, int j, List annotationItems) { if (_uncompressedLength == 0) return j; _blockStream.Position = 0; var position = _firstPosition; var i = 0; var length = _blockReader.ReadOptInt32(); position += _blockReader.ReadOptInt32(); while (i < _count && j < vcfPositions.Count) { if (position < vcfPositions[j]) { _blockStream.Position += length; //this position is not needed, move to next length = _blockReader.ReadOptInt32(); position += _blockReader.ReadOptInt32(); i++; continue; } if (vcfPositions[j] < position) { //go to next position from vcf j++; continue; } //positions have matched var data = _blockReader.ReadBytes(length); annotationItems.Add(new AnnotationItem(position, data)); j++; i++; length = _blockReader.ReadOptInt32(); position += _blockReader.ReadOptInt32(); } return j; } public (int firstPosition, int lastPosition, int numBytes) Write(ExtendedBinaryWriter writer) { var compressedLength = _compressionAlgorithm.Compress(_uncompressedBlock, BlockOffset, _compressedBlock, _compressedBlock.Length); writer.WriteOpt(compressedLength); writer.WriteOpt(_firstPosition); //writer.WriteOpt(_lastPosition); writer.WriteOpt(_count); writer.Write(_compressedBlock, 0, compressedLength); _writer.BaseStream.Position = 0; return (_firstPosition, _lastPosition, compressedLength); } public void Clear() { _count = 0; _firstPosition = -1; _lastPosition = -1; _compressedLength = 0; _uncompressedLength = 0; _blockStream.Position = 0; } public void Dispose() { _writer?.Dispose(); _blockReader?.Dispose(); _blockStream?.Dispose(); } } } ================================================ FILE: VariantAnnotation/NSA/NsaIndex.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; namespace VariantAnnotation.NSA { public sealed class NsaIndex { private readonly Dictionary> _chromBlocks; private ushort _chromIndex = ushort.MaxValue; private readonly ExtendedBinaryWriter _writer; public readonly GenomeAssembly Assembly; public readonly IDataSourceVersion Version; public readonly string JsonKey; public readonly int SchemaVersion; public readonly bool IsArray; public readonly bool MatchByAllele; public readonly bool IsPositional; public IEnumerable ChromosomeIndices => _chromBlocks.Keys; public Dictionary> GetBlocks() => _chromBlocks; public List GetChromBlocks(ushort chromIndex) => _chromBlocks[chromIndex]; public NsaIndex(ExtendedBinaryWriter indexWriter, GenomeAssembly assembly, IDataSourceVersion version, string jsonKey, bool matchByAllele, bool isArray, int schemaVersion, bool isPositional) { _writer = indexWriter; MatchByAllele = matchByAllele; JsonKey = jsonKey; Version = version; Assembly = assembly; IsArray = isArray; IsPositional = isPositional; indexWriter.Write((byte)assembly); version.Write(indexWriter); indexWriter.WriteOptAscii(jsonKey); indexWriter.Write(matchByAllele); indexWriter.Write(isArray); indexWriter.WriteOpt(schemaVersion); indexWriter.Write(isPositional); _chromBlocks = new Dictionary>(); } public void Add(ushort chromIndex, int start, int end, long filePosition, int dataLength) { _chromIndex = chromIndex; if (! _chromBlocks.ContainsKey(_chromIndex)) { _chromBlocks[_chromIndex] = new List(); } var indexBlock = new NsaIndexBlock(start, end, filePosition, dataLength); _chromBlocks[_chromIndex].Add(indexBlock); } public void Write() { _writer.WriteOpt(_chromBlocks.Count); foreach ((ushort index, List chunks) in _chromBlocks) { _writer.WriteOpt(index); _writer.WriteOpt(chunks.Count); foreach (NsaIndexBlock chunk in chunks) { chunk.Write(_writer); } } } public void Write(Dictionary> chromBlocks) { _writer.WriteOpt(chromBlocks.Count); foreach ((ushort index, List chunks) in chromBlocks) { _writer.WriteOpt(index); _writer.WriteOpt(chunks.Count); foreach (NsaIndexBlock chunk in chunks) { chunk.Write(_writer); } } } public NsaIndex(Stream stream) { using (var memStream = new MemoryStream()) using (var memReader = new ExtendedBinaryReader(memStream)) { stream.CopyTo(memStream);//reading all bytes in stream to memStream memStream.Position = 0; Assembly = (GenomeAssembly)memReader.ReadByte(); Version = DataSourceVersion.Read(memReader); JsonKey = memReader.ReadAsciiString(); MatchByAllele = memReader.ReadBoolean(); IsArray = memReader.ReadBoolean(); SchemaVersion = memReader.ReadOptInt32(); IsPositional = memReader.ReadBoolean(); var chromCount = memReader.ReadOptInt32(); _chromBlocks = new Dictionary>(chromCount); for (var i = 0; i < chromCount; i++) { var chromIndex = memReader.ReadOptUInt16(); var chunkCount = memReader.ReadOptInt32(); _chromBlocks[chromIndex] = new List(chunkCount); for (var j = 0; j < chunkCount; j++) _chromBlocks[chromIndex].Add(new NsaIndexBlock(memReader)); } } } public long GetFileLocation(ushort chromIndex, int start) { if (_chromBlocks == null || !_chromBlocks.TryGetValue(chromIndex, out var chunks)) return -1; var index = BinarySearch(chunks, start); if (index < 0) return -1; return chunks[index].FilePosition; } public (long startFilePosition, int chunkCount) GetFileRange(ushort chromIndex, int start, int end) { //create a static empty entry. if (_chromBlocks == null || !_chromBlocks.TryGetValue(chromIndex, out var chunks)) return (-1, 0); long startFilePosition = -1; long endFilePosition = -1; int startChunkIndex = BinarySearch(chunks, start); int endChunkIndex = BinarySearch(chunks, end); if (startChunkIndex < 0) startChunkIndex = ~startChunkIndex; if (startChunkIndex == chunks.Count) return (-1, 0); //start lands after the last chunk=> nothing to return if (startChunkIndex < chunks.Count) startFilePosition = chunks[startChunkIndex].FilePosition; if (endChunkIndex < 0) endChunkIndex = ~endChunkIndex - 1; //if end lands on a gap, return the the chunk to the left of end if (endChunkIndex < 0) return (-1, 0); //end lands before the first chunk => nothing to return if (endChunkIndex < chunks.Count) endFilePosition = chunks[endChunkIndex].FilePosition + chunks[endChunkIndex].Length; if (endFilePosition < startFilePosition) return (-1, 0); //both begin and end landed on the same gap. return (startFilePosition, endChunkIndex - startChunkIndex + 1); } private static int BinarySearch(List chunks, int position) { var begin = 0; int end = chunks.Count - 1; while (begin <= end) { int index = begin + (end - begin >> 1); int ret = chunks[index].CompareTo(position); if (ret == 0) return index; if (ret < 0) begin = index + 1; else end = index - 1; } return ~begin; } } } ================================================ FILE: VariantAnnotation/NSA/NsaIndexBlock.cs ================================================ using System; using IO; namespace VariantAnnotation.NSA { public sealed class NsaIndexBlock { public readonly int Start; public readonly int End; public readonly long FilePosition; public readonly int Length; public NsaIndexBlock(int start, int end, long filePosition, int length) { Start = start; End = end; FilePosition = filePosition; Length = length; } [Obsolete("Use a factory method instead of an extra constructor.")] public NsaIndexBlock(ExtendedBinaryReader reader) { Start = reader.ReadOptInt32(); End = reader.ReadOptInt32(); FilePosition = reader.ReadOptInt64(); Length = reader.ReadOptInt32(); } public void Write(ExtendedBinaryWriter writer) { writer.WriteOpt(Start); writer.WriteOpt(End); writer.WriteOpt(FilePosition); writer.WriteOpt(Length); } public int CompareTo(int position) { if (Start <= position && position <= End) return 0; return Start.CompareTo(position); } } } ================================================ FILE: VariantAnnotation/NSA/NsaReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Compression.Algorithms; using ErrorHandling.Exceptions; using Genome; using IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.SA; namespace VariantAnnotation.NSA { public sealed class AnnotationItem { public readonly int Position; public readonly byte[] Data; public AnnotationItem(int position, byte[] data) { Position = position; Data = data; } } public sealed class NsaReader : INsaReader { private readonly Stream _stream; private readonly ExtendedBinaryReader _reader; public GenomeAssembly Assembly { get; } private readonly NsaIndex _index; public IDataSourceVersion Version { get; } private readonly NsaBlock _block; public string JsonKey { get; } public bool MatchByAllele { get; } public bool IsArray { get; } public bool IsPositional { get; } public IEnumerable ChromosomeIndices => _index.ChromosomeIndices; private readonly List _annotations; private readonly int _blockSize; private ExtendedBinaryReader _annotationReader; private MemoryStream _annotationStream; private byte[] _annotationBuffer; public NsaReader(Stream dataStream, Stream indexStream, int blockSize = SaCommon.DefaultBlockSize) { _stream = dataStream; _blockSize = blockSize; _reader = new ExtendedBinaryReader(_stream); _block = new NsaBlock(new Zstandard(), blockSize); _index = new NsaIndex(indexStream); Assembly = _index.Assembly; Version = _index.Version; JsonKey = _index.JsonKey; MatchByAllele = _index.MatchByAllele; IsArray = _index.IsArray; IsPositional = _index.IsPositional; if (_index.SchemaVersion != SaCommon.SchemaVersion) throw new UserErrorException($"SA schema version mismatch. Expected {SaCommon.SchemaVersion}, observed {_index.SchemaVersion} for {JsonKey}"); _annotations = new List(64 * 1024); _annotationBuffer = new byte[1024*1024]; _annotationStream = new MemoryStream(_annotationBuffer); _annotationReader = new ExtendedBinaryReader(_annotationStream); } public void PreLoad(Chromosome chrom, List positions) { if (positions == null || positions.Count == 0) return; _annotations.Clear(); for (var i = 0; i < positions.Count;) { int position = positions[i]; long fileLocation = _index.GetFileLocation(chrom.Index, position); if (fileLocation == -1) { i++; continue; } //only reconnect if necessary if (_reader.BaseStream.Position != fileLocation) _reader.BaseStream.Position = fileLocation; _block.Read(_reader); var newIndex = _block.AddAnnotations(positions, i, _annotations); if (newIndex == i) i++; //no positions were found in this block else i = newIndex; } } public List GetIndexBlocks(ushort chromIndex) => _index.GetChromBlocks(chromIndex); public bool HasDataBlocks(ushort chromIndex) { var (location, _) = _index.GetFileRange(chromIndex, 1, int.MaxValue); return location != -1; } public IEnumerable GetCompressedBlocks(ushort chromIndex) { var (location, blockCount) = _index.GetFileRange(chromIndex, 1, int.MaxValue); if (location == -1) yield break; _reader.BaseStream.Position = location; for (var i = 0; i < blockCount; i++) { var block = new NsaBlock(new Zstandard(), _blockSize); block.ReadCompressedBytes(_reader); yield return block; } } private void ExtractAnnotations(byte[] data, List<(string refAllele, string altAllele, string annotation)> annotations) { if (_annotationBuffer.Length < data.Length) { _annotationBuffer = new byte[2 *data.Length]; _annotationReader.Dispose(); _annotationStream?.Dispose(); _annotationStream = new MemoryStream(_annotationBuffer); _annotationReader = new ExtendedBinaryReader(_annotationStream); } Array.Copy(data, _annotationBuffer, data.Length); _annotationStream.Position = 0; if (IsPositional) { var positionalAnno = _annotationReader.ReadString(); annotations.Add((null, null, positionalAnno)); return; } int count = _annotationReader.ReadOptInt32(); for (var i = 0; i < count; i++) { string refAllele = _annotationReader.ReadAsciiString(); string altAllele = _annotationReader.ReadAsciiString(); string annotation = _annotationReader.ReadString(); annotations.Add((refAllele ?? "", altAllele ?? "", annotation)); } } public void GetAnnotation(int position, List<(string refAllele, string altAllele, string annotation)> annotations) { annotations.Clear(); int index = BinarySearch(position); if(index < 0) return; ExtractAnnotations(_annotations[index].Data, annotations); } private int BinarySearch(int position) { var begin = 0; int end = _annotations.Count - 1; while (begin <= end) { int index = begin + (end - begin >> 1); int ret = _annotations[index].Position.CompareTo(position); if (ret == 0) return index; if (ret < 0) begin = index + 1; else end = index - 1; } return ~begin; } public void Dispose() { _stream?.Dispose(); _block?.Dispose(); _annotationStream?.Dispose(); _annotationReader?.Dispose(); } } } ================================================ FILE: VariantAnnotation/NSA/NsiReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using System.Text; using Compression.Algorithms; using Compression.FileHandling; using ErrorHandling.Exceptions; using Genome; using Intervals; using IO; using VariantAnnotation.Algorithms; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.IO; using VariantAnnotation.Providers; using VariantAnnotation.SA; using Variants; namespace VariantAnnotation.NSA { public sealed class NsiReader : INsiReader { public GenomeAssembly Assembly { get; } public IDataSourceVersion Version { get; } public string JsonKey { get; } public ReportFor ReportFor { get; } private readonly IntervalForest _intervalForest; private NsiReader(GenomeAssembly assembly, IDataSourceVersion version, string jsonKey, ReportFor reportFor, IntervalArray[] intervalArrays) { Assembly = assembly; Version = version; JsonKey = jsonKey; ReportFor = reportFor; _intervalForest = new IntervalForest(intervalArrays); } public static NsiReader Read(Stream stream) { (IDataSourceVersion version, GenomeAssembly assembly, string jsonKey, ReportFor reportFor, int schemaVersion) = ReadHeader(stream); if (schemaVersion != SaCommon.SchemaVersion) throw new UserErrorException($"Schema version mismatch!! Expected {SaCommon.SchemaVersion}, observed {schemaVersion} for {jsonKey}"); using (var blockStream = new BlockStream(new Zstandard(), stream, CompressionMode.Decompress)) using (var reader = new ExtendedBinaryReader(blockStream)) { int count = reader.ReadOptInt32(); var suppIntervals = new Dictionary>>(); for (var i = 0; i < count; i++) { var saInterval = SuppInterval.Read(reader); if (suppIntervals.TryGetValue(saInterval.Chromosome.Index, out var intervals)) intervals.Add(new Interval(saInterval.Start, saInterval.End, saInterval.GetJsonString())); else suppIntervals[saInterval.Chromosome.Index] = new List> { new Interval(saInterval.Start, saInterval.End, saInterval.GetJsonString()) }; } var maxChromIndex = suppIntervals.Keys.Max(); var intervalArrays = new IntervalArray[maxChromIndex + 1]; for (ushort i = 0; i < intervalArrays.Length; i++) { intervalArrays[i] = suppIntervals.ContainsKey(i) ? new IntervalArray(suppIntervals[i].ToArray()) : null; } return new NsiReader(assembly, version, jsonKey, reportFor, intervalArrays); } } private static (IDataSourceVersion, GenomeAssembly, string, ReportFor, int) ReadHeader(Stream stream) { using (var reader = new ExtendedBinaryReader(stream, Encoding.UTF8, true)) { var identifier = reader.ReadAsciiString(); if(identifier != SaCommon.NsiIdentifier) throw new InvalidDataException($"Failed to find identifier!!Expected: {SaCommon.NsiIdentifier}, observed:{identifier}"); var version = DataSourceVersion.Read(reader); var assembly = (GenomeAssembly)reader.ReadByte(); var jsonKey = reader.ReadAsciiString(); var reportFor = (ReportFor)reader.ReadByte(); int schemaVersion = reader.ReadInt32(); var guard = reader.ReadUInt32(); if (guard != SaCommon.GuardInt) throw new InvalidDataException($"Failed to find guard int!!Expected: {SaCommon.GuardInt}, observed:{guard}"); return (version, assembly, jsonKey, reportFor, schemaVersion); } } public IEnumerable GetAnnotation(IVariant variant) { var start = variant.Start; var end = variant.End; // for insertions, the end position is one past the last base if (end < start) Swap.Int(ref start, ref end); var overlappingSvs = _intervalForest.GetAllOverlappingIntervals(variant.Chromosome.Index, start, end); if (overlappingSvs == null) return null; var jsonStrings = new List(); foreach (var interval in overlappingSvs) { var (reciprocalOverlap, annotationOverlap) = SuppIntervalUtilities.GetOverlapFractions( new ChromosomeInterval(variant.Chromosome, interval.Begin, interval.End), variant); jsonStrings.Add(AddOverlapToAnnotation(interval.Value, reciprocalOverlap, annotationOverlap)); } return jsonStrings; } private static string AddOverlapToAnnotation(string jsonString, double? reciprocalOverlap, double? annotationOverlap) { if (reciprocalOverlap != null) jsonString+=JsonObject.Comma + "\"reciprocalOverlap\":" + reciprocalOverlap.Value.ToString("0.#####"); if (annotationOverlap != null) jsonString += JsonObject.Comma + "\"annotationOverlap\":" + annotationOverlap.Value.ToString("0.#####"); return jsonString; } public bool OverlapsAny(IChromosomeInterval variant) { return _intervalForest.OverlapsAny(variant.Chromosome.Index, variant.Start, variant.End); } } } ================================================ FILE: VariantAnnotation/NSA/NsiWriter.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Linq; using System.Text; using Compression.Algorithms; using Compression.FileHandling; using Genome; using IO; using VariantAnnotation.Interface.SA; using VariantAnnotation.Providers; using VariantAnnotation.SA; namespace VariantAnnotation.NSA { public sealed class NsiWriter:IDisposable { private readonly Stream _stream; private readonly ExtendedBinaryWriter _writer; private readonly bool _leaveOpen; public NsiWriter(Stream stream, DataSourceVersion version, GenomeAssembly assembly, string jsonKey, ReportFor reportFor, int schemaVersion, bool leaveOpen = false) { _stream = stream; _leaveOpen = leaveOpen; WriteHeader(version, assembly, jsonKey, reportFor, schemaVersion); var blockStream = new BlockStream(new Zstandard(), stream, CompressionMode.Compress); _writer = new ExtendedBinaryWriter(blockStream, Encoding.UTF8, leaveOpen); } private void WriteHeader(DataSourceVersion version, GenomeAssembly assembly, string jsonKey, ReportFor reportFor, int schemaVersion) { using (var writer = new ExtendedBinaryWriter(_stream, Encoding.UTF8, true)) { writer.WriteOptAscii(SaCommon.NsiIdentifier); version.Write(writer); writer.Write((byte)assembly); writer.WriteOptAscii(jsonKey); writer.Write((byte)reportFor); writer.Write(schemaVersion); writer.Write(SaCommon.GuardInt); } } public void Write(IEnumerable siItems) { var sortedItems = siItems.OrderBy(x => x.Chromosome.Index).ThenBy(x => x.Start).ThenBy(x => x.End).ToList(); Console.WriteLine($"Writing {sortedItems.Count} intervals to database..."); _writer.WriteOpt(sortedItems.Count); foreach (ISuppIntervalItem item in sortedItems) { _writer.WriteOptAscii(item.Chromosome.EnsemblName); _writer.WriteOptAscii(item.Chromosome.UcscName); _writer.WriteOpt(item.Chromosome.Index); _writer.WriteOpt(item.Start); _writer.WriteOpt(item.End); _writer.Write(item.GetJsonString()); } } public void Dispose() { _writer?.Dispose(); if(!_leaveOpen) _stream?.Dispose(); } } } ================================================ FILE: VariantAnnotation/NSA/RefMinorDbReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using Genome; using IO; using VariantAnnotation.SA; namespace VariantAnnotation.NSA { public sealed class RefMinorDbReader : IDisposable { private readonly ExtendedBinaryReader _reader; private readonly RefMinorIndex _index; private readonly Dictionary _annotations; private readonly Stream _dbStream, _indexStream; public RefMinorDbReader(Stream dbStream, Stream indexStream) { _dbStream = dbStream; _indexStream = indexStream; _reader = new ExtendedBinaryReader(dbStream); _index = new RefMinorIndex(new ExtendedBinaryReader(indexStream)); _annotations = new Dictionary(); if (_index.SchemaVersion != SaCommon.SchemaVersion) throw new UserErrorException($"SA schema version mismatch. Expected {SaCommon.SchemaVersion}, observed {_index.SchemaVersion}"); } private Chromosome _chromosome; private void PreLoad(Chromosome chrom) { _annotations.Clear(); _chromosome = chrom; (long startLocation, int numBytes, int refMinorCount) = _index.GetFileRange(chrom.Index); if (startLocation == -1) return; _reader.BaseStream.Position = startLocation; var buffer = _reader.ReadBytes(numBytes); using (var memStream = new MemoryStream(buffer)) using(var reader = new ExtendedBinaryReader(memStream)) { for (var i = 0; i < refMinorCount; i++) { var position = reader.ReadOptInt32(); var globalMajor = reader.ReadAsciiString(); _annotations[position] = globalMajor; } } } public string GetGlobalMajorAllele(Chromosome chromosome, int position) { if (_chromosome == null || chromosome.Index != _chromosome.Index) PreLoad(chromosome); return _annotations.TryGetValue(position, out string globalMajor) ? globalMajor : null; } public void Dispose() { _dbStream?.Dispose(); _indexStream?.Dispose(); _reader?.Dispose(); } } } ================================================ FILE: VariantAnnotation/NSA/RefMinorIndex.cs ================================================ using System.Collections.Generic; using Genome; using IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; namespace VariantAnnotation.NSA { public sealed class RefMinorIndex { private readonly ExtendedBinaryWriter _writer; private readonly Dictionary _chromBlocks; private readonly IDataSourceVersion _version; private readonly GenomeAssembly _assembly; public readonly int SchemaVersion; public RefMinorIndex(ExtendedBinaryWriter writer, GenomeAssembly assembly, IDataSourceVersion version, int schemaVersion) { _writer = writer; _chromBlocks = new Dictionary(); _assembly = assembly; _version = version; SchemaVersion = schemaVersion; } private ushort _chromIndex = ushort.MaxValue; private long _chromLocation =-1; private int _blockLength =-1; private int _count; public void Add(ushort chromIndex, long location) { if (_chromIndex != chromIndex) { _blockLength = (int) (location - _chromLocation); //if you try to add a chrom twice (i.e. the positions are not sorted by chrom), this will throw an exception _chromBlocks.Add(_chromIndex, (_chromLocation, _blockLength, _count)); _chromIndex = chromIndex; _chromLocation = location; _count = 1; } else _count++; } public (long location, int numBytes, int count) GetFileRange(ushort chromIndex) { return _chromBlocks.TryGetValue(chromIndex, out var locationSize) ? locationSize : (-1, -1, 0); } public void Write(long finalLocation) { _blockLength = (int)(finalLocation - _chromLocation); //adding the last chrom to index _chromBlocks.Add(_chromIndex, (_chromLocation, _blockLength, _count)); _writer.Write((byte)_assembly); _version.Write(_writer); _writer.WriteOpt(SchemaVersion); _writer.WriteOpt(_chromBlocks.Count); foreach ((ushort chromIndex, (long location, int numBytes, int count)) in _chromBlocks) { _writer.WriteOpt(chromIndex); _writer.WriteOpt(location); _writer.WriteOpt(numBytes); _writer.WriteOpt(count); } } public RefMinorIndex(ExtendedBinaryReader reader) { _assembly = (GenomeAssembly) reader.ReadByte(); _version = DataSourceVersion.Read(reader); SchemaVersion = reader.ReadOptInt32(); var chromCount = reader.ReadOptInt32(); _chromBlocks= new Dictionary(chromCount); for (int i = 0; i < chromCount; i++) { var chromIndex = reader.ReadOptUInt16(); var location = reader.ReadOptInt64(); var numBytes = reader.ReadOptInt32(); int count = reader.ReadOptInt32(); _chromBlocks.Add(chromIndex, (location, numBytes, count)); } } } } ================================================ FILE: VariantAnnotation/NSA/SuppInterval.cs ================================================ using Genome; using IO; using VariantAnnotation.Interface.SA; namespace VariantAnnotation.NSA { public sealed class SuppInterval : ISuppIntervalItem { public int Start { get; } public int End { get; } public Chromosome Chromosome { get; } private readonly string _jsonString; private SuppInterval(Chromosome chromosome, int start, int end, string jsonString) { Chromosome = chromosome; Start = start; End = end; _jsonString = jsonString; } public static SuppInterval Read(ExtendedBinaryReader reader) { string ensemblName = reader.ReadAsciiString(); string ucscName = reader.ReadAsciiString(); ushort chromIndex = reader.ReadOptUInt16(); var chromosome = new Chromosome(ucscName, ensemblName, null, null, 1, chromIndex); var start = reader.ReadOptInt32(); var end = reader.ReadOptInt32(); var jsonString = reader.ReadString(); return new SuppInterval(chromosome, start, end, jsonString); } public string GetJsonString() => _jsonString; } } ================================================ FILE: VariantAnnotation/NSA/SuppIntervalUtilities.cs ================================================ using System; using Genome; using Variants; namespace VariantAnnotation.NSA { public static class SuppIntervalUtilities { public static (double? ReciprocalOverlap, double? AnnotationOverlap) GetOverlapFractions( ChromosomeInterval saInterval, ISimpleVariant variant) { if (saInterval.Chromosome.Index != variant.Chromosome.Index) return (null, null); //skip for insertions if (saInterval.Start >= saInterval.End || variant.Type == VariantType.insertion) return (null, null); //skip for break-ends if (variant.Type == VariantType.translocation_breakend) return (null, null); if (!Intervals.Utilities.Overlaps(saInterval.Start, saInterval.End, variant.Start, variant.End)) return (null, null); var overlapSize = (double)(Math.Min(saInterval.End, variant.End) - Math.Max(saInterval.Start, variant.Start) + 1); int annoSize = saInterval.End - saInterval.Start + 1; int varSize = variant.End - variant.Start + 1; int maxSize = Math.Max(annoSize, varSize); return (overlapSize / maxSize, overlapSize / annoSize); } } } ================================================ FILE: VariantAnnotation/NSA/SupplementaryAnnotation.cs ================================================ using System.Collections.Generic; using System.Text; using ErrorHandling.Exceptions; using VariantAnnotation.Interface.SA; namespace VariantAnnotation.NSA { public sealed class SupplementaryAnnotation:ISupplementaryAnnotation { public string JsonKey { get; } private readonly bool _isArray; private readonly bool _isPositional; private readonly string _jsonString; private readonly IEnumerable _jsonStrings; public SupplementaryAnnotation(string key, bool isArray, bool isPositional, string jsonString, IEnumerable jsonStrings) { JsonKey = key; _isArray = isArray; _isPositional = isPositional; _jsonString = jsonString; _jsonStrings = jsonStrings; if (_isArray && _jsonStrings == null) { throw new UserErrorException($"No list of json strings provided for a supplementary annotation of array type!! JsonKey: {JsonKey}"); } if (!_isArray && string.IsNullOrEmpty(jsonString)) throw new UserErrorException("ERROR: No json string provided for a supplementary annotation of non-array type!!"); } public void SerializeJson(StringBuilder sb) { if (_isPositional) { sb.Append(_jsonString); return; } if (!_isArray) { sb.Append('{'); sb.Append(_jsonString); sb.Append('}'); } else { sb.Append('['); var firstString = true; foreach (var jsonString in _jsonStrings) { if (!firstString) sb.Append(','); if (!jsonString.StartsWith("\"rs")) { sb.Append('{'); sb.Append(jsonString); sb.Append('}'); } else sb.Append(jsonString); firstString = false; } sb.Append(']'); } } } } ================================================ FILE: VariantAnnotation/PerformanceMetrics.cs ================================================ using System; using System.Diagnostics; using CommandLine.Utilities; using Genome; using IO; namespace VariantAnnotation { public sealed class PerformanceMetrics { public readonly TimeKeeper Cache = new TimeKeeper(); public readonly TimeKeeper Annotation = new TimeKeeper(); public readonly TimeKeeper Preload = new TimeKeeper(); public readonly TimeKeeper SaPositionScan = new TimeKeeper(); public void ShowAnnotationEntry(Chromosome chromosome, int numVariants) { Annotation.Stop(); string referenceName = GetPaddedField(chromosome.UcscName, 38); string preloadTime = Preload.GetTime(); string annotationTime = Annotation.GetTime(); double variantsPerSecond = Annotation.GetIterationsPerSecond(numVariants); Logger.WriteLine($"{referenceName} {preloadTime} {annotationTime} {variantsPerSecond,11:N0}"); } public void ShowCacheLoad() { Cache.Stop(); string time = Cache.GetTime(); Logger.WriteLine($"Cache {time}"); } public void ShowSaPositionScanLoad(int numPositions) { SaPositionScan.Stop(); string time = SaPositionScan.GetTime(); double positionsPerSecond = SaPositionScan.GetIterationsPerSecond(numPositions); Logger.WriteLine($"SA Position Scan {time} {positionsPerSecond,11:N0}"); } private static string GetPaddedField(string s, int fieldLength) { if (s.Length > fieldLength) return s.Substring(0, fieldLength - 3) + "..."; return s.PadRight(fieldLength, ' '); } public static void ShowAnnotationHeader() => MetricsCommon.DisplayHeader("\nReference Preload Annotation Variants/s"); public static void ShowInitializationHeader() => MetricsCommon.DisplayHeader("Initialization Time Positions/s"); public void ShowSummaryTable() { MetricsCommon.DisplayHeader("\nSummary Time Percent"); long processTicks = GetTotalProcessTicks(); long initializationTicks = Cache.TotalTicks + SaPositionScan.TotalTicks; long annotationTicks = Annotation.TotalTicks; long preloadTicks = Preload.TotalTicks; ShowSummaryEntry("Initialization", initializationTicks, processTicks); ShowSummaryEntry("Preload", preloadTicks, processTicks); ShowSummaryEntry("Annotation", annotationTicks, processTicks); } private void ShowSummaryEntry(string description, long entryTicks, long processTicks) { string paddedDescription = GetPaddedField(description, 50); string time = Benchmark.ToHumanReadable(TimeSpan.FromTicks(entryTicks)); double percentage = entryTicks / (double) processTicks * 100.0; Logger.WriteLine($"{paddedDescription} {time} {percentage, 9:0.0} %"); } private static long GetTotalProcessTicks() => DateTime.Now.Ticks - Process.GetCurrentProcess().StartTime.Ticks; } public sealed class TimeKeeper { public long TotalTicks { get; private set; } private readonly Benchmark _benchmark = new Benchmark(); private TimeSpan _elapsedTime; public void Stop() { _elapsedTime = _benchmark.GetElapsedTime(); TotalTicks += _elapsedTime.Ticks; } public void Start() => _benchmark.Reset(); public string GetTime() => Benchmark.ToHumanReadable(_elapsedTime); public double GetIterationsPerSecond(int num) => Benchmark.GetElapsedIterationsPerSecond(_elapsedTime, num); } public static class MetricsCommon { private const int LineLength = 75; private static readonly string Divider = new string('-', LineLength); public static void DisplayHeader(string s) { Logger.SetBold(); Logger.WriteLine(s); Logger.ResetColor(); Logger.WriteLine(Divider); } } } ================================================ FILE: VariantAnnotation/PhyloP/NpdIndex.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; namespace VariantAnnotation.PhyloP { public sealed class NpdIndex { private readonly Dictionary _chromRanges; private readonly ExtendedBinaryWriter _writer; public readonly IDataSourceVersion Version; public readonly GenomeAssembly Assembly; public readonly int SchemaVersion; private readonly string _jsonKey; public readonly Dictionary ScoreMap; public const int MaxChromLength = 250_000_000; public NpdIndex(Stream stream, GenomeAssembly assembly, IDataSourceVersion version, string jsonKey, int schemaVersion) { _writer = new ExtendedBinaryWriter(stream); Assembly = assembly; Version = version; _jsonKey = jsonKey; SchemaVersion = schemaVersion; _chromRanges = new Dictionary(32); } public void Add(ushort chromIndex, long location, int byteCount) { _chromRanges.Add(chromIndex, (location, byteCount)); } public (long location, int numBytes) GetFileRange(ushort chromIndex) { return _chromRanges.TryGetValue(chromIndex, out var fileRange) ? fileRange: (-1, -1); } public void Write(Dictionary scoreMap) { _writer.Write((byte)Assembly); Version.Write(_writer); _writer.WriteOptAscii(_jsonKey); _writer.WriteOpt(SchemaVersion); _writer.WriteOpt(_chromRanges.Count); foreach ((ushort chromIndex, (long location, int byteCount)) in _chromRanges) { _writer.WriteOpt(chromIndex); _writer.WriteOpt(location); _writer.WriteOpt(byteCount); } _writer.WriteOpt(scoreMap.Count); foreach ((double score, byte code) in scoreMap) { _writer.Write(score); _writer.Write(code); } } public NpdIndex(ExtendedBinaryReader reader) { Assembly = (GenomeAssembly)reader.ReadByte(); Version = DataSourceVersion.Read(reader); _jsonKey = reader.ReadAsciiString(); SchemaVersion = reader.ReadOptInt32(); var chromCount = reader.ReadOptInt32(); _chromRanges = new Dictionary(chromCount); for (int i = 0; i < chromCount; i++) { var chromIndex = reader.ReadOptUInt16(); var location = reader.ReadOptInt64(); var numBytes = reader.ReadOptInt32(); _chromRanges.Add(chromIndex, (location, numBytes)); } var scoreCount = reader.ReadOptInt32(); var scoreMap = new Dictionary(scoreCount); for (int i = 0; i < scoreCount; i++) { var score = reader.ReadDouble(); var code = reader.ReadByte(); scoreMap.Add(score, code); } ScoreMap = scoreMap; } } } ================================================ FILE: VariantAnnotation/PhyloP/NpdReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Compression.Algorithms; using ErrorHandling.Exceptions; using Genome; using IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.SA; namespace VariantAnnotation.PhyloP { public sealed class NpdReader:IDisposable { private readonly ExtendedBinaryReader _reader; private readonly byte[] _scores; private readonly Zstandard _zstd; private readonly Dictionary _scoreMap; private readonly NpdIndex _index; public GenomeAssembly Assembly { get; } public IDataSourceVersion Version { get; } private readonly Stream _dbStream; private readonly Stream _indexStream; public NpdReader(Stream dbStream, Stream indexStream) { _dbStream = dbStream; _indexStream = indexStream; _reader = new ExtendedBinaryReader(dbStream); _index = new NpdIndex(new ExtendedBinaryReader(indexStream)); Assembly = _index.Assembly; Version = _index.Version; if (_index.SchemaVersion != SaCommon.SchemaVersion) throw new UserErrorException($"SA schema version mismatch. Expected {SaCommon.SchemaVersion}, observed {_index.SchemaVersion}"); var scoreMap= new Dictionary(); foreach ((double score, byte code)in _index.ScoreMap) { scoreMap.Add(code, score); } _scoreMap = scoreMap; _zstd = new Zstandard(); _scores = new byte[NpdIndex.MaxChromLength]; } private Chromosome _chromosome; private int _lastPhylopPosition; private void PreLoad(Chromosome chrom) { _chromosome = chrom; (long startLocation, int numBytes) = _index.GetFileRange(chrom.Index); if (startLocation == -1) { _lastPhylopPosition = -1; return; } _reader.BaseStream.Position = startLocation; var buffer = _reader.ReadBytes(numBytes); _lastPhylopPosition = _zstd.Decompress(buffer, buffer.Length, _scores, _scores.Length); } public double? GetAnnotation(Chromosome chromosome, int position) { if (_chromosome==null || chromosome.Index != _chromosome.Index) PreLoad(chromosome); if (position >= _lastPhylopPosition) return null; var scoreCode = _scores[position - 1]; if (scoreCode == 0) return null; return _scoreMap[scoreCode]; } public void Dispose() { _reader?.Dispose(); _dbStream?.Dispose(); _indexStream?.Dispose(); } } } ================================================ FILE: VariantAnnotation/Pools/AnnotatedPositionPool.cs ================================================ using Microsoft.Extensions.ObjectPool; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Positions; namespace VariantAnnotation.Pools { public static class AnnotatedPositionPool { private static readonly ObjectPool Pool = new DefaultObjectPool(new DefaultPooledObjectPolicy(), 4); public static AnnotatedPosition Get(IPosition position, IAnnotatedVariant[] annotatedVariants) { var annotatedPosition = Pool.Get(); annotatedPosition.Initialize(position, annotatedVariants); return annotatedPosition; } public static void Return(AnnotatedPosition ap) => Pool.Return(ap); } } ================================================ FILE: VariantAnnotation/Pools/AnnotatedTranscriptPool.cs ================================================ using System.Collections.Generic; using Microsoft.Extensions.ObjectPool; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Pools { public static class AnnotatedTranscriptPool { private static readonly ObjectPool Pool = new DefaultObjectPool(new DefaultPooledObjectPolicy(), 16); public static AnnotatedTranscript Get(ITranscript transcript, string referenceAminoAcids, string alternateAminoAcids, string referenceCodons, string alternateCodons, IMappedPosition mappedPosition, string hgvsCoding, string hgvsProtein, PredictionScore sift, PredictionScore polyphen, List consequences, bool? completeOverlap) { var annotatedTranscript = Pool.Get(); annotatedTranscript.Initialize(transcript, referenceAminoAcids, alternateAminoAcids, referenceCodons, alternateCodons, mappedPosition, hgvsCoding, hgvsProtein, sift, polyphen, consequences, completeOverlap); return annotatedTranscript; } public static void Return(AnnotatedTranscript annotatedTranscript) => Pool.Return(annotatedTranscript); } } ================================================ FILE: VariantAnnotation/Pools/AnnotatedVariantPool.cs ================================================ using Microsoft.Extensions.ObjectPool; using VariantAnnotation.AnnotatedPositions; using Variants; namespace VariantAnnotation.Pools { public static class AnnotatedVariantPool { private static readonly ObjectPool Pool = new DefaultObjectPool(new DefaultPooledObjectPolicy(), 8); public static AnnotatedVariant Get(IVariant variant) { var annotatedVariant = Pool.Get(); annotatedVariant.Initialize(variant); return annotatedVariant; } public static void Return(AnnotatedVariant av) => Pool.Return(av); } } ================================================ FILE: VariantAnnotation/Pools/VariantPool.cs ================================================ using Genome; using Microsoft.Extensions.ObjectPool; using Variants; namespace VariantAnnotation.Pools { public static class VariantPool { private static readonly ObjectPool Pool = new DefaultObjectPool(new DefaultPooledObjectPolicy(), 8); public static Variant Get(Chromosome chromosome, int start, int end, string refAllele, string altAllele, VariantType variantType, string variantId, bool isRefMinor, bool isDecomposed, bool isRecomposed, string[] linkedVids, AnnotationBehavior behavior, bool isStructuralVariant) { var variant = Pool.Get(); variant.Initialize( chromosome, start, end, refAllele, altAllele, variantType, variantId, isRefMinor, isDecomposed, isRecomposed, linkedVids, behavior, isStructuralVariant); return variant; } public static void Return(Variant variant) => Pool.Return(variant); } } ================================================ FILE: VariantAnnotation/ProteinConservation/ProteinConservationCommon.cs ================================================ namespace VariantAnnotation.ProteinConservation { public static class ProteinConservationCommon { public const string FileSuffix = ".pcs"; public const int SchemaVersion = 1; } } ================================================ FILE: VariantAnnotation/ProteinConservation/ProteinConservationItem.cs ================================================ namespace VariantAnnotation.ProteinConservation { public sealed class ProteinConservationItem { public readonly string TranscriptId; public readonly string Chromosome; public readonly string ProteinSequence; public readonly byte[] Scores; public ProteinConservationItem(string chrom, string transcriptId, string proteinSequence, byte[] scores) { Chromosome = chrom; TranscriptId = transcriptId; ProteinSequence = proteinSequence; Scores = scores; } } } ================================================ FILE: VariantAnnotation/ProteinConservation/ProteinConservationReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Providers; namespace VariantAnnotation.ProteinConservation { public sealed class ProteinConservationReader:IDisposable { private GenomeAssembly Assembly { get; } private readonly ExtendedBinaryReader _reader; public readonly IDataSourceVersion Version; public ProteinConservationReader(Stream stream) { _reader = new ExtendedBinaryReader(stream); var schemaVersion = _reader.ReadOptInt32(); if(schemaVersion != ProteinConservationCommon.SchemaVersion) throw new Exception($"Schema version mismatch found. Observed: {schemaVersion}, expected: {ProteinConservationCommon.SchemaVersion}"); Assembly = (GenomeAssembly) _reader.ReadByte(); Version = DataSourceVersion.Read(_reader); } public IEnumerable GetItems() { TranscriptConservationScores score; while ((score = TranscriptConservationScores.Read(_reader))!=null) { if (score.IsEmpty()) break; yield return score; } } public void Dispose() =>_reader?.Dispose(); } } ================================================ FILE: VariantAnnotation/ProteinConservation/TranscriptConservationScores.cs ================================================ using System; using IO; namespace VariantAnnotation.ProteinConservation { public sealed class TranscriptConservationScores { public readonly string TranscriptId; public readonly byte[] ConservationScores; public TranscriptConservationScores(string id, byte[] scores) { //removing versions for ensembl only TranscriptId = id; ConservationScores = scores; } public static TranscriptConservationScores Read(ExtendedBinaryReader reader) { var id = reader.ReadAsciiString(); var count = reader.ReadOptInt32(); var scores = reader.ReadBytes(count); var item = new TranscriptConservationScores(id, scores); return item.IsEmpty() ? null : item; } public void Write(ExtendedBinaryWriter writer) { writer.WriteOptAscii(TranscriptId); writer.WriteOpt(ConservationScores.Length); writer.Write(ConservationScores); } public static TranscriptConservationScores GetEmptyItem() => new TranscriptConservationScores("", Array.Empty()); public bool IsEmpty() => string.IsNullOrEmpty(TranscriptId) && ConservationScores.Length == 0; } } ================================================ FILE: VariantAnnotation/Providers/ConservationScoreProvider.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.PhyloP; using Variants; namespace VariantAnnotation.Providers { public sealed class ConservationScoreProvider : IAnnotationProvider { private NpdReader _phylopReader; public string Name { get; } public GenomeAssembly Assembly => _phylopReader.Assembly; public IEnumerable DataSourceVersions => _versions; private readonly List _versions = new(); public ConservationScoreProvider() { Name = "Conservation score provider"; } public ConservationScoreProvider AddPhylopReader(Stream dbStream, Stream indexStream) { if (dbStream == null || indexStream == null) return this; _phylopReader = new NpdReader(dbStream, indexStream); _versions.Add(_phylopReader.Version); return this; } public void Annotate(IAnnotatedPosition annotatedPosition) { foreach (var annotatedVariant in annotatedPosition.AnnotatedVariants) { if (annotatedVariant.Variant.Type != VariantType.SNV) continue; if (_phylopReader != null) annotatedVariant.PhylopScore = _phylopReader.GetAnnotation(annotatedPosition.Position.Chromosome, annotatedVariant.Variant.Start); } } public void PreLoad(Chromosome chromosome, List positions) { throw new NotImplementedException(); } public void Dispose() { _phylopReader?.Dispose(); } } } ================================================ FILE: VariantAnnotation/Providers/DataSourceVersion.cs ================================================ using System.Collections.Generic; using System.Text; using IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.IO; using VariantAnnotation.Utilities; namespace VariantAnnotation.Providers { public sealed class DataSourceVersion : IDataSourceVersion, ISerializable { public string Name { get; } public string Description { get; } public string Version { get; } public long ReleaseDateTicks { get; } public DataSourceVersion(string name, string version, long releaseDateTicks, string description = null) { Name = name; Description = description; Version = version; ReleaseDateTicks = releaseDateTicks; } public static IDataSourceVersion Read(ExtendedBinaryReader reader) { var name = reader.ReadAsciiString(); var version = reader.ReadAsciiString(); var releaseDateTicks = reader.ReadOptInt64(); var description = reader.ReadAsciiString(); return new DataSourceVersion(name, version, releaseDateTicks, description); } public void Write(IExtendedBinaryWriter writer) { writer.WriteOptAscii(Name); writer.WriteOptAscii(Version); writer.WriteOpt(ReleaseDateTicks); writer.WriteOptAscii(Description); } private string GetReleaseDate() => Date.GetDate(ReleaseDateTicks); public override string ToString() => "dataSource=" + Name + ",version:" + Version + ",release date:" + GetReleaseDate(); public void SerializeJson(StringBuilder sb) { var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); jsonObject.AddStringValue("name", Name); jsonObject.AddStringValue("version", Version); if (Description != null) jsonObject.AddStringValue("description", Description.Trim()); if (ReleaseDateTicks != 0) jsonObject.AddStringValue("releaseDate", GetReleaseDate()); sb.Append(JsonObject.CloseBrace); } } public sealed class DataSourceVersionComparer : EqualityComparer { public override bool Equals(IDataSourceVersion x, IDataSourceVersion y) { return string.Equals(x.Name, y.Name) && string.Equals(x.Description, y.Description) && string.Equals(x.Version, y.Version) && x.ReleaseDateTicks == y.ReleaseDateTicks; } public override int GetHashCode(IDataSourceVersion obj) { unchecked { var hashCode = obj.Name.GetHashCode(); if (obj.Description != null) hashCode = (hashCode * 397) ^ obj.Description.GetHashCode(); if (obj.Version != null) hashCode = (hashCode * 397) ^ obj.Version.GetHashCode(); hashCode = (hashCode * 397) ^ obj.ReleaseDateTicks.GetHashCode(); return hashCode; } } } } ================================================ FILE: VariantAnnotation/Providers/LcrProvider.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.NSA; namespace VariantAnnotation.Providers { public class LcrProvider: IAnnotationProvider { public string Name => "Lcr provider"; public GenomeAssembly Assembly { get; } public IEnumerable DataSourceVersions { get; } private readonly NsiReader _nsiReader; public LcrProvider(Stream stream) { _nsiReader = NsiReader.Read(stream); Assembly = _nsiReader.Assembly; DataSourceVersions = new[] { _nsiReader.Version }; } public void Dispose() { // nsiReaders are not disposable. They read from the input stream and disposes it in the Read method. } public void Annotate(IAnnotatedPosition annotatedPosition) { foreach (var annotatedVariant in annotatedPosition.AnnotatedVariants) { annotatedVariant.InLowComplexityRegion = _nsiReader.OverlapsAny(annotatedVariant.Variant); } } public void PreLoad(Chromosome chromosome, List positions) { throw new System.NotImplementedException(); } } } ================================================ FILE: VariantAnnotation/Providers/NsaProvider.cs ================================================ using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; using ErrorHandling.Exceptions; using Genome; using VariantAnnotation.GeneFusions.IO; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.Interface.SA; using VariantAnnotation.NSA; using Variants; namespace VariantAnnotation.Providers { public sealed class NsaProvider : IAnnotationProvider { public string Name => "Supplementary annotation provider"; public GenomeAssembly Assembly { get; } public IEnumerable DataSourceVersions { get; } private readonly INsaReader[] _nsaReaders; private readonly INsiReader[] _nsiReaders; private readonly IGeneFusionSaReader[] _fusionReaders; private readonly bool _hasFusionReaders; private bool _hasLoadedGeneFusions; private readonly List<(string refAllele, string altAllele, string jsonString)> _annotations = new(); public NsaProvider(INsaReader[] nsaReaders, INsiReader[] nsiReaders, IGeneFusionSaReader[] fusionReaders) { _nsaReaders = nsaReaders; _nsiReaders = nsiReaders; _fusionReaders = fusionReaders; if (fusionReaders != null && fusionReaders.Length > 0) _hasFusionReaders = true; (List variant, List position, List all) = OrganizeReaders(nsaReaders, nsiReaders, fusionReaders); (Assembly, DataSourceVersions) = GetReaderMetadata(all); CheckDuplicatePositionKeys(position); CheckDuplicateVariantKeys(variant); } private static (List Variant, List Position, List All) OrganizeReaders( INsaReader[] nsaReaders, INsiReader[] nsiReaders, IGeneFusionSaReader[] fusionReaders) { List variant = new(); List position = new(); List all = new(); if (nsaReaders != null) { foreach (INsaReader reader in nsaReaders) { variant.Add(reader); all.Add(reader); } } if (nsiReaders != null) { foreach (INsiReader reader in nsiReaders) { position.Add(reader); all.Add(reader); } } if (fusionReaders != null) { foreach (IGeneFusionSaReader reader in fusionReaders) { variant.Add(reader); all.Add(reader); } } return (variant, position, all); } private static void CheckDuplicateVariantKeys(List readers) { var jsonKeys = new HashSet(); foreach (ISaMetadata reader in readers) CheckJsonKey(reader.JsonKey, "variant-level (.nsa or fusion)", jsonKeys); } private static void CheckDuplicatePositionKeys(List readers) { var jsonKeys = new HashSet(); foreach (ISaMetadata reader in readers) CheckJsonKey(reader.JsonKey, "position-level (.nsi)", jsonKeys); } private static void CheckJsonKey(string jsonKey, string description, HashSet jsonKeys) { if (jsonKeys.Contains(jsonKey)) throw new UserErrorException($"Duplicate {description} JSON keys found for: {jsonKey}"); jsonKeys.Add(jsonKey); } private static (GenomeAssembly Assembly, IEnumerable Versions) GetReaderMetadata(List readers) { HashSet assemblies = new(); List versions = new(); var sb = new StringBuilder(); foreach (ISaMetadata reader in readers) { if (reader.Assembly != GenomeAssembly.rCRS && reader.Assembly != GenomeAssembly.Unknown) assemblies.Add(reader.Assembly); versions.Add(reader.Version); sb.AppendLine($"{reader.Version}, Assembly: {reader.Assembly}"); } if (assemblies.Count == 1) return (assemblies.First(), versions); throw new UserErrorException($"Multiple genome assemblies detected in Supplementary annotation directory.\n{sb}"); } public void Annotate(IAnnotatedPosition annotatedPosition) { if (_nsaReaders != null) AddPositionAndAlleleAnnotations(annotatedPosition); if (_nsiReaders != null) GetStructuralVariantAnnotations(annotatedPosition); if (_hasFusionReaders && annotatedPosition.Position.HasStructuralVariant) GetGeneFusionAnnotations(annotatedPosition); } private void GetGeneFusionAnnotations(IAnnotatedPosition annotatedPosition) { foreach (IAnnotatedVariant variant in annotatedPosition.AnnotatedVariants) { IGeneFusionPair[] fusionPairs = GetGeneFusionPairs(variant); if (fusionPairs == null) continue; // this only needs to happen if we have a gene fusion if (!_hasLoadedGeneFusions) LoadGeneFusions(); foreach (IGeneFusionSaReader reader in _fusionReaders) reader.AddAnnotations(fusionPairs, variant.SaList); } } private void LoadGeneFusions() { foreach (IGeneFusionSaReader reader in _fusionReaders) reader.LoadAnnotations(); _hasLoadedGeneFusions = true; } private static IGeneFusionPair[] GetGeneFusionPairs(IAnnotatedVariant variant) { var fusionPairs = new HashSet(); foreach (IAnnotatedTranscript transcript in variant.Transcripts) transcript.AddGeneFusionPairs(fusionPairs); return fusionPairs.Count == 0 ? null : fusionPairs.ToArray(); } private void GetStructuralVariantAnnotations(IAnnotatedPosition annotatedPosition) { bool needSaIntervals = annotatedPosition.AnnotatedVariants.Any(x => x.Variant.Behavior.NeedSaInterval); bool needSmallAnnotation = annotatedPosition.AnnotatedVariants.Any(x => x.Variant.Behavior == AnnotationBehavior.SmallVariants); foreach (INsiReader nsiReader in _nsiReaders) { IPosition position = annotatedPosition.Position; if (nsiReader.ReportFor == ReportFor.SmallVariants && !needSmallAnnotation) continue; if (nsiReader.ReportFor == ReportFor.StructuralVariants && !needSaIntervals) continue; IEnumerable annotations = nsiReader.GetAnnotation(position.Variants[0]); if (annotations == null) continue; annotatedPosition.SupplementaryIntervals.Add(new SupplementaryAnnotation(nsiReader.JsonKey, true, false, null, annotations)); } } private void AddPositionAndAlleleAnnotations(IAnnotatedPosition annotatedPosition) { foreach (IAnnotatedVariant annotatedVariant in annotatedPosition.AnnotatedVariants) { bool needSaPosition = annotatedVariant.Variant.Behavior.NeedSaPosition; bool needSaAllele = annotatedVariant.Variant.Behavior.NeedSaAllele; if (!needSaPosition && !needSaAllele) continue; AddSmallAnnotations(annotatedVariant, needSaPosition, needSaAllele); } } private void AddSmallAnnotations(IAnnotatedVariant annotatedVariant, bool needSaPosition, bool needSaAllele) { foreach (INsaReader nsaReader in _nsaReaders) { IVariant variant = annotatedVariant.Variant; nsaReader.GetAnnotation(variant.Start, _annotations); if (_annotations.Count == 0) continue; if (nsaReader.IsPositional && needSaPosition) { AddPositionalAnnotation(_annotations, annotatedVariant, nsaReader); continue; } if (nsaReader.MatchByAllele && needSaAllele) AddAlleleSpecificAnnotation(nsaReader, _annotations, annotatedVariant, variant); else AddNonAlleleSpecificAnnotations(_annotations, variant, annotatedVariant, nsaReader); } } private static void AddPositionalAnnotation(IEnumerable<(string refAllele, string altAllele, string annotation)> annotations, IAnnotatedVariant annotatedVariant, INsaReader nsaReader) { // e.g. ancestral allele, global minor allele string jsonString = annotations.First().annotation; annotatedVariant.SaList.Add(new SupplementaryAnnotation(nsaReader.JsonKey, nsaReader.IsArray, nsaReader.IsPositional, jsonString, null)); } private static void AddNonAlleleSpecificAnnotations(IEnumerable<(string refAllele, string altAllele, string annotation)> annotations, IVariant variant, IAnnotatedVariant annotatedVariant, INsaReader nsaReader) { var jsonStrings = new List(); foreach ((string refAllele, string altAllele, string jsonString) in annotations) { if (refAllele == variant.RefAllele && altAllele == variant.AltAllele) jsonStrings.Add(jsonString + ",\"isAlleleSpecific\":true"); else jsonStrings.Add(jsonString); } if (jsonStrings.Count > 0) annotatedVariant.SaList.Add(new SupplementaryAnnotation(nsaReader.JsonKey, nsaReader.IsArray, nsaReader.IsPositional, null, jsonStrings)); } private static void AddAlleleSpecificAnnotation(INsaReader nsaReader, IEnumerable<(string refAllele, string altAllele, string annotation)> annotations, IAnnotatedVariant annotatedVariant, IVariant variant) { if (nsaReader.IsArray) { var jsonStrings = new List(); foreach ((string refAllele, string altAllele, string jsonString) in annotations) { if (refAllele == variant.RefAllele && altAllele == variant.AltAllele) jsonStrings.Add(jsonString); } if (jsonStrings.Count > 0) annotatedVariant.SaList.Add(new SupplementaryAnnotation(nsaReader.JsonKey, nsaReader.IsArray, nsaReader.IsPositional, null, jsonStrings)); } else foreach ((string refAllele, string altAllele, string jsonString) in annotations) { if (refAllele != variant.RefAllele || altAllele != variant.AltAllele) continue; annotatedVariant.SaList.Add(new SupplementaryAnnotation(nsaReader.JsonKey, nsaReader.IsArray, nsaReader.IsPositional, jsonString, null)); break; } } public void PreLoad(Chromosome chromosome, List positions) { Task[] preloadTasks = _nsaReaders.Select(x => DoPreload(x, chromosome, positions)).ToArray(); Task.WaitAll(preloadTasks); foreach (Task preloadTask in preloadTasks) preloadTask.Dispose(); } private static Task DoPreload(INsaReader nsaReader, Chromosome chromosome, List positions) => Task.Run(() => { nsaReader.PreLoad(chromosome, positions); }); public void Dispose() { if (_nsaReaders != null) foreach (INsaReader reader in _nsaReaders) reader.Dispose(); if (_fusionReaders != null) foreach (IGeneFusionSaReader reader in _fusionReaders) reader.Dispose(); } } } ================================================ FILE: VariantAnnotation/Providers/ProteinConservationProvider.cs ================================================ using System; using System.Collections.Generic; using System.IO; using VariantAnnotation.Interface.Providers; using VariantAnnotation.ProteinConservation; namespace VariantAnnotation.Providers { public sealed class ProteinConservationProvider:IDisposable { private readonly ProteinConservationReader _reader; public string Name => "Amino acid conservation score provider"; public IDataSourceVersion Version => _reader.Version; private readonly Dictionary _conservationScores; public ProteinConservationProvider(Stream stream) { _reader = new ProteinConservationReader(stream); _conservationScores = new Dictionary(100_000); } public void Load() { foreach (var item in _reader.GetItems()) { _conservationScores.Add(item.TranscriptId, item.ConservationScores); } } public int GetConservationScore(string transcriptId, int position) { if (_conservationScores.TryGetValue(transcriptId, out var scores)) return position < scores.Length ? scores[position - 1] : -1; return -1; } public void Dispose() =>_reader?.Dispose(); } } ================================================ FILE: VariantAnnotation/Providers/RefMinorProvider.cs ================================================ using System.IO; using Genome; using VariantAnnotation.Interface.Providers; using VariantAnnotation.NSA; namespace VariantAnnotation.Providers { public sealed class RefMinorProvider : IRefMinorProvider { private readonly RefMinorDbReader _reader; public RefMinorProvider(Stream dbStream, Stream indexStream) { _reader = new RefMinorDbReader(dbStream, indexStream); } public string GetGlobalMajorAllele(Chromosome chromosome, int pos) => _reader.GetGlobalMajorAllele(chromosome, pos); public void Dispose() { _reader?.Dispose(); } } } ================================================ FILE: VariantAnnotation/Providers/ReferenceSequenceProvider.cs ================================================ using System.Collections.Generic; using System.IO; using Genome; using Intervals; using ReferenceSequence.IO; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; namespace VariantAnnotation.Providers { public sealed class ReferenceSequenceProvider : ISequenceProvider { public Dictionary RefNameToChromosome => _sequenceReader.RefNameToChromosome; public Dictionary RefIndexToChromosome => _sequenceReader.RefIndexToChromosome; public GenomeAssembly Assembly => _sequenceReader.Assembly; public string Name => "Reference sequence provider"; public IEnumerable DataSourceVersions => null; public ISequence Sequence { get; } private ushort _currentChromosomeIndex = 65534; // guaranteed to be updated private readonly CompressedSequenceReader _sequenceReader; public ReferenceSequenceProvider(Stream stream) { _sequenceReader = new CompressedSequenceReader(stream); Sequence = _sequenceReader.Sequence; } public void Annotate(IAnnotatedPosition annotatedPosition) { if (annotatedPosition.AnnotatedVariants == null) return; annotatedPosition.CytogeneticBand = Sequence.CytogeneticBands.Find(annotatedPosition.Position.Chromosome, annotatedPosition.Position.Start, annotatedPosition.Position.End); // we don't want HGVS g. nomenclature for structural variants or STRs if (annotatedPosition.Position.HasStructuralVariant || annotatedPosition.Position.HasShortTandemRepeat) return; string refSeqAccession = annotatedPosition.Position.Chromosome.RefSeqAccession; foreach (var annotatedVariant in annotatedPosition.AnnotatedVariants) { annotatedVariant.HgvsgNotation = HgvsgNotation.GetNotation(refSeqAccession, annotatedVariant.Variant, Sequence, new Interval(0, Sequence.Length)); } } public void PreLoad(Chromosome chromosome, List positions) { throw new System.NotImplementedException(); } public void LoadChromosome(Chromosome chromosome) { if (chromosome.Index == _currentChromosomeIndex) return; _sequenceReader.GetCompressedSequence(chromosome); _currentChromosomeIndex = chromosome.Index; } public void Dispose() => _sequenceReader?.Dispose(); } } ================================================ FILE: VariantAnnotation/Providers/ScoreProvider.cs ================================================ using System.Collections.Generic; using System.Linq; using System.Text; using ErrorHandling.Exceptions; using Genome; using VariantAnnotation.GenericScore; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.NSA; using Variants; namespace VariantAnnotation.Providers; public sealed class ScoreProvider : IAnnotationProvider { public string Name => "Supplementary annotation provider"; public GenomeAssembly Assembly { get; } public IEnumerable DataSourceVersions { get; } private readonly ScoreReader[] _scoreReaders; public ScoreProvider(ScoreReader[] scoreReaders) { _scoreReaders = scoreReaders; (Assembly, DataSourceVersions) = GetReadersMetadata(); } public void Annotate(IAnnotatedPosition annotatedPosition) { foreach (ScoreReader scoreReader in _scoreReaders) { foreach (IAnnotatedVariant annotatedVariant in annotatedPosition.AnnotatedVariants) { IVariant variant = annotatedVariant.Variant; // Score provider is only limited to SNV type calls if (variant.Type != VariantType.SNV) continue; Chromosome chromosome = variant.Chromosome; string jsonString = scoreReader.GetAnnotationJson(chromosome.Index, variant.Start, variant.AltAllele); if (jsonString == null) continue; annotatedVariant.SaList.Add(new SupplementaryAnnotation( scoreReader.JsonKey, false, true, jsonString, null )); } } } private (GenomeAssembly Assembly, IEnumerable Versions) GetReadersMetadata() { HashSet assemblies = new(); List versions = new(); var sb = new StringBuilder(); foreach (ScoreReader reader in _scoreReaders) { if (reader.Assembly != GenomeAssembly.rCRS && reader.Assembly != GenomeAssembly.Unknown) assemblies.Add(reader.Assembly); versions.Add(reader.Version); sb.AppendLine($"{reader.Version}, Assembly: {reader.Assembly}"); } if (assemblies.Count == 1) return (assemblies.First(), versions); throw new UserErrorException($"Multiple genome assemblies detected in Supplementary annotation directory.\n{sb}"); } public void PreLoad(Chromosome chromosome, List positions) { } public void Dispose() { } } ================================================ FILE: VariantAnnotation/Providers/TranscriptAnnotationProvider.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using ErrorHandling.Exceptions; using Genome; using Intervals; using IO; using OptimizedCore; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.Caches; using VariantAnnotation.GeneFusions.Calling; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using VariantAnnotation.IO.Caches; using VariantAnnotation.TranscriptAnnotation; using Variants; namespace VariantAnnotation.Providers { public sealed class TranscriptAnnotationProvider : ITranscriptAnnotationProvider { private readonly ITranscriptCache _transcriptCache; private readonly ISequence _sequence; public string Name { get; } public GenomeAssembly Assembly { get; } public IEnumerable DataSourceVersions { get; } public IntervalArray[] TranscriptIntervalArrays { get; } public ushort VepVersion { get; } private readonly Stream _siftStream; private readonly Stream _polyphenStream; private readonly PredictionCacheReader _siftReader; private readonly PredictionCacheReader _polyphenReader; private IPredictionCache _siftCache; private IPredictionCache _polyphenCache; private ushort _currentRefIndex = ushort.MaxValue; private readonly ProteinConservationProvider _conservationProvider; private readonly GeneFusionCaller _fusionCaller; public TranscriptAnnotationProvider(string pathPrefix, ISequenceProvider sequenceProvider, ProteinConservationProvider conservationProvider) { Name = "Transcript annotation provider"; _sequence = sequenceProvider.Sequence; _conservationProvider = conservationProvider; using (var stream = PersistentStreamUtils.GetReadStream(CacheConstants.TranscriptPath(pathPrefix))) { (_transcriptCache, TranscriptIntervalArrays, VepVersion) = InitiateCache(stream, sequenceProvider.RefIndexToChromosome, sequenceProvider.Assembly); } _fusionCaller = new GeneFusionCaller(sequenceProvider.RefNameToChromosome, _transcriptCache.TranscriptIntervalForest); Assembly = _transcriptCache.Assembly; DataSourceVersions = _transcriptCache.DataSourceVersions; // TODO: this is not great. We should not be using IEnumerables if we have to resort to strange stuff like this if (conservationProvider != null) DataSourceVersions = DataSourceVersions.Concat(new[] {conservationProvider.Version}); _siftStream = PersistentStreamUtils.GetReadStream(CacheConstants.SiftPath(pathPrefix)); _siftReader = new PredictionCacheReader(_siftStream, PredictionCacheReader.SiftDescriptions); _polyphenStream = PersistentStreamUtils.GetReadStream(CacheConstants.PolyPhenPath(pathPrefix)); _polyphenReader = new PredictionCacheReader(_polyphenStream, PredictionCacheReader.PolyphenDescriptions); } private static (TranscriptCache Cache, IntervalArray[] TranscriptIntervalArrays, ushort VepVersion) InitiateCache(Stream stream, Dictionary refIndexToChromosome, GenomeAssembly refAssembly) { using var reader = new TranscriptCacheReader(stream); ushort vepVersion = reader.Header.Custom.VepVersion; CheckHeaderVersion(reader.Header, refAssembly); TranscriptCacheData cacheData = reader.Read(refIndexToChromosome); TranscriptCache cache = cacheData.GetCache(); return (cache, cacheData.TranscriptIntervalArrays, vepVersion); } private static void CheckHeaderVersion(Header header, GenomeAssembly refAssembly) { if (header.Assembly != refAssembly) throw new UserErrorException(GetAssemblyErrorMessage(header.Assembly, refAssembly)); if (header.SchemaVersion != CacheConstants.SchemaVersion) throw new UserErrorException( $"Expected the cache schema version ({CacheConstants.SchemaVersion}) to be identical to the schema version in the cache header ({header.SchemaVersion})"); } private static string GetAssemblyErrorMessage(GenomeAssembly cacheAssembly, GenomeAssembly refAssembly) { StringBuilder sb = StringBuilderPool.Get(); sb.AppendLine("Not all of the data sources have the same genome assembly:"); sb.AppendLine($"- Using {refAssembly}: Reference sequence provider"); sb.AppendLine($"- Using {cacheAssembly}: Transcript annotation provider"); return StringBuilderPool.GetStringAndReturn(sb); } public void Annotate(IAnnotatedPosition annotatedPosition) { if (annotatedPosition.AnnotatedVariants == null || annotatedPosition.AnnotatedVariants.Length == 0) return; IPosition position = annotatedPosition.Position; ushort refIndex = position.Chromosome.Index; LoadPredictionCaches(refIndex); AddRegulatoryRegions(annotatedPosition.AnnotatedVariants, _transcriptCache.RegulatoryIntervalForest); AddTranscripts(annotatedPosition.AnnotatedVariants); if (position.HasStructuralVariant) _fusionCaller.AddGeneFusions(annotatedPosition.AnnotatedVariants, annotatedPosition.Position.InfoData.IsImprecise, position.InfoData.IsInv3, position.InfoData.IsInv5); } private void AddTranscripts(IAnnotatedVariant[] annotatedVariants) { foreach (var annotatedVariant in annotatedVariants) { var variant = annotatedVariant.Variant; if (variant.Behavior.MinimalTranscriptAnnotation) continue; ITranscript[] transcripts = _transcriptCache.TranscriptIntervalForest.GetAllFlankingValues(variant); if (transcripts == null) continue; IList annotatedTranscripts = TranscriptAnnotationFactory.GetAnnotatedTranscripts(variant, transcripts, _sequence, _siftCache, _polyphenCache); if (annotatedTranscripts.Count == 0) continue; foreach (IAnnotatedTranscript annotatedTranscript in annotatedTranscripts) { AddConservationScore(annotatedTranscript); } foreach (IAnnotatedTranscript annotatedTranscript in annotatedTranscripts) annotatedVariant.Transcripts.Add(annotatedTranscript); } } private void AddConservationScore(IAnnotatedTranscript annotatedTranscript) { if (_conservationProvider == null) return; if (annotatedTranscript.MappedPosition == null) return; var scores = new List(); int start = annotatedTranscript.MappedPosition.ProteinStart; int end = annotatedTranscript.MappedPosition.ProteinEnd; if (start == -1 || end == -1) return; for (int aaPos = start; aaPos <= end; aaPos++) { string transcriptId = annotatedTranscript.Transcript.Id.WithVersion; int score = _conservationProvider.GetConservationScore(transcriptId, aaPos); if (score == -1) return; //don't add conservation scores scores.Add(1.0 * score / 100); } annotatedTranscript.ConservationScores = scores; } public void PreLoad(Chromosome chromosome, List positions) => throw new NotImplementedException(); private void LoadPredictionCaches(ushort refIndex) { if (refIndex == _currentRefIndex) return; if (refIndex == ushort.MaxValue) { ClearCache(); return; } _siftCache = _siftReader.Read(refIndex); _polyphenCache = _polyphenReader.Read(refIndex); _currentRefIndex = refIndex; } private void ClearCache() { _siftCache = null; _polyphenCache = null; _currentRefIndex = ushort.MaxValue; } private static void AddRegulatoryRegions(IAnnotatedVariant[] annotatedVariants, IIntervalForest regulatoryIntervalForest) { foreach (IAnnotatedVariant annotatedVariant in annotatedVariants) { if (!annotatedVariant.Variant.Behavior.NeedRegulatoryRegions) continue; // In case of insertions, the base(s) are assumed to be inserted at the end position // if this is an insertion just before the beginning of the regulatory element, this takes care of it IVariant variant = annotatedVariant.Variant; int variantBegin = variant.Type == VariantType.insertion ? variant.End : variant.Start; if (SkipLargeVariants(variantBegin, variant.End)) continue; IRegulatoryRegion[] regulatoryRegions = regulatoryIntervalForest.GetAllOverlappingValues(variant.Chromosome.Index, variantBegin, variant.End); if (regulatoryRegions == null) continue; foreach (IRegulatoryRegion regulatoryRegion in regulatoryRegions) { // if the insertion is at the end, its past the feature and therefore not overlapping if (variant.Type == VariantType.insertion && variant.End == regulatoryRegion.End) continue; annotatedVariant.RegulatoryRegions.Add(RegulatoryRegionAnnotator.Annotate(variant, regulatoryRegion)); } } } private const int MaxSvLengthForRegulatoryRegionAnnotation = 50000; private static bool SkipLargeVariants(int begin, int end) => end - begin + 1 > MaxSvLengthForRegulatoryRegionAnnotation; public void Dispose() { _siftReader?.Dispose(); _polyphenReader?.Dispose(); _siftStream?.Dispose(); _polyphenStream?.Dispose(); } } } ================================================ FILE: VariantAnnotation/Providers/VersionProvider.cs ================================================ using IO; using ReferenceSequence; using VariantAnnotation.Interface.Providers; using VariantAnnotation.SA; namespace VariantAnnotation.Providers { public sealed class VersionProvider : IVersionProvider { public string DataVersion { get; } = $"Cache version: {CacheConstants.DataVersion}, Supplementary annotation version: {SaCommon.DataVersion}, Reference version: {ReferenceSequenceCommon.HeaderVersion}"; } } ================================================ FILE: VariantAnnotation/SA/CustomAnnotationCategories.cs ================================================ namespace VariantAnnotation.SA { public enum CustomAnnotationCategories:byte { Unknown, AlleleCount, AlleleNumber, AlleleFrequency, Prediction, Filter, Description, Identifier, HomozygousCount, Score } } ================================================ FILE: VariantAnnotation/SA/JsonDataType.cs ================================================ namespace VariantAnnotation.SA { public enum JsonDataType : byte { String, Bool, Number, Array, Object } public static class BacisJsonTypeExtension { public static string ToTypeString(this JsonDataType jsonDataType) { switch (jsonDataType) { case JsonDataType.String: return "string"; case JsonDataType.Bool: return "boolean"; case JsonDataType.Number: return "number"; case JsonDataType.Array: return "array"; case JsonDataType.Object: return "object"; default: return ""; } } public static string GetSchemaKey(this JsonDataType jsonDataType) { switch (jsonDataType) { case JsonDataType.Array: return "items"; case JsonDataType.Object: return "properties"; default: return ""; } } public static bool IsComplexType(this JsonDataType jsonDataType) { switch (jsonDataType) { case JsonDataType.Array: case JsonDataType.Object: return true; default: return false; } } } } ================================================ FILE: VariantAnnotation/SA/SaCommon.cs ================================================ namespace VariantAnnotation.SA { public static class SaCommon { public const int DefaultBlockSize = 8 * 1024 * 1024; public const ushort DataVersion = 66; public const ushort SchemaVersion = 22; public const ushort NsiSchemaVersion = 22; public const double RefMinorThreshold = 0.95; public const uint GuardInt = 4041327495; public const string NgaIdentifier = "NirvanaGenes"; public const string NsiIdentifier = "NirvanaSupplementaryIntervals"; public const string IndexSuffix = ".idx"; public const string SaFileSuffix = ".nsa"; public const string GsaFileSuffix = ".gsa"; public const string PhylopFileSuffix = ".npd"; public const string RefMinorFileSuffix = ".rma"; public const string IntervalFileSuffix = ".nsi"; public const string LcrFileSuffix = ".lcr"; public const string GeneFileSuffix = ".nga"; public const string GeneFusionSourceSuffix = ".gfs"; public const string GeneFusionJsonSuffix = ".gfj"; public const string JsonSchemaSuffix = ".schema"; public const string DbsnpTag = "dbsnp"; public const string GlobalAlleleTag = "globalAllele"; public const string OneKgenTag = "oneKg"; public const string AncestralAlleleTag = "ancestralAllele"; public const string RefMinorTag = "refMinor"; public const string GnomadTag = "gnomad"; public const string GnomadExomeTag = "gnomadExome"; public const string ClinvarTag = "clinvar"; public const string CosmicTag = "cosmic"; public const string CosmicCnvTag = "cosmicCnv"; public const string OnekSvTag = "oneKg"; public const string DgvTag = "dgv"; public const string ClinGenTag = "clingen"; public const string MitoMapTag = "mitomap"; public const string TopMedTag = "topmed"; public const string PhylopTag = "phylopScore"; public const string OmimTag = "omim"; public const string GnomadGeneScoreTag = "gnomAD"; public const string GnomadStructuralVariant = "gnomAD-preview"; public const string DosageSensitivityTag = "clingenDosageSensitivityMap"; public const string DiseaseValidityTag = "clingenGeneValidity"; public const string SpliceAiTag = "spliceAI"; public const string PrimateAiTag = "primateAI"; public const string MitoHeteroplasmyTag = "heteroplasmy"; public const string RevelTag = "revel"; public const string DannTag = "dann"; public const string GerpTag = "gerp"; public const string LowComplexityRegionTag = "lowComplexityRegion"; public const string GmeTag = "gmeVariome"; public const string DecipherTag = "decipher"; public const string Score = "Score"; } } ================================================ FILE: VariantAnnotation/TranscriptAnnotation/FlankingTranscriptAnnotator.cs ================================================ using System.Collections.Generic; using VariantAnnotation.AnnotatedPositions.Consequence; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Pools; namespace VariantAnnotation.TranscriptAnnotation { public static class FlankingTranscriptAnnotator { public static IAnnotatedTranscript GetAnnotatedTranscript(int variantEnd, ITranscript transcript) { bool isDownStream = variantEnd < transcript.Start == transcript.Gene.OnReverseStrand; List consequences = Consequences.DetermineFlankingVariantEffects(isDownStream); return AnnotatedTranscriptPool.Get(transcript, null, null, null, null, null, null, null, null, null, consequences, false); } } } ================================================ FILE: VariantAnnotation/TranscriptAnnotation/FullTranscriptAnnotator.cs ================================================ using System.Collections.Generic; using Genome; using Intervals; using VariantAnnotation.Algorithms; using VariantAnnotation.AnnotatedPositions; using VariantAnnotation.AnnotatedPositions.Consequence; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using VariantAnnotation.Pools; using Variants; namespace VariantAnnotation.TranscriptAnnotation { public static class FullTranscriptAnnotator { public static IAnnotatedTranscript GetAnnotatedTranscript(ITranscript transcript, IVariant leftShiftedVariant, ISequence refSequence, IPredictionCache siftCache, IPredictionCache polyphenCache, AminoAcids aminoAcids) { var rightShiftedVariant = VariantRotator.Right(leftShiftedVariant, transcript, refSequence, transcript.Gene.OnReverseStrand); var leftAnnotation = AnnotateTranscript(transcript, leftShiftedVariant, aminoAcids, refSequence); var rightAnnotation = ReferenceEquals(leftShiftedVariant, rightShiftedVariant) ? leftAnnotation : AnnotateTranscript(transcript, rightShiftedVariant, aminoAcids, refSequence); List consequences = GetConsequences(transcript, transcript.Gene.OnReverseStrand, leftShiftedVariant, leftAnnotation.VariantEffect); var refAllele = rightAnnotation.TranscriptRefAllele; var altAllele = rightAnnotation.TranscriptAltAllele; var hgvsCoding = HgvsCodingNomenclature.GetHgvscAnnotation(transcript, rightShiftedVariant, refSequence, rightAnnotation.Position.RegionStartIndex, rightAnnotation.Position.RegionEndIndex, refAllele, altAllele); var hgvsProtein = HgvsProteinNomenclature.GetHgvsProteinAnnotation(transcript, rightAnnotation.RefAminoAcids, rightAnnotation.AltAminoAcids, rightAnnotation.TranscriptAltAllele, rightAnnotation.Position, rightAnnotation.VariantEffect, rightShiftedVariant, refSequence, hgvsCoding, leftShiftedVariant.Chromosome.UcscName == "chrM"); var predictionScores = GetPredictionScores(leftAnnotation.Position, leftAnnotation.RefAminoAcids, leftAnnotation.AltAminoAcids, siftCache, polyphenCache, transcript.SiftIndex, transcript.PolyPhenIndex); return AnnotatedTranscriptPool.Get(transcript, leftAnnotation.RefAminoAcids, leftAnnotation.AltAminoAcids, leftAnnotation.RefCodons, leftAnnotation.AltCodons, leftAnnotation.Position, hgvsCoding, hgvsProtein, predictionScores.Sift, predictionScores.PolyPhen, consequences, false); } private static (VariantEffect VariantEffect, IMappedPosition Position, string RefAminoAcids, string AltAminoAcids, string RefCodons, string AltCodons, string TranscriptAltAllele, string TranscriptRefAllele) AnnotateTranscript(ITranscript transcript, ISimpleVariant variant, AminoAcids aminoAcids, ISequence refSequence) { bool onReverseStrand = transcript.Gene.OnReverseStrand; var start = MappedPositionUtilities.FindRegion(transcript.TranscriptRegions, variant.Start); var end = MappedPositionUtilities.FindRegion(transcript.TranscriptRegions, variant.End); var position = GetMappedPosition(transcript.TranscriptRegions, start.Region, start.Index, end.Region, end.Index, variant, onReverseStrand, transcript.Translation?.CodingRegion, transcript.StartExonPhase, variant.Type == VariantType.insertion); var codingSequence = GetCodingSequence(transcript, refSequence); var cdnaSequence = GetCdnaSequence(transcript, refSequence); string transcriptAltAllele = HgvsUtilities.GetTranscriptAllele(variant.AltAllele, onReverseStrand); var codons = Codons.GetCodons(transcriptAltAllele, position.CdsStart, position.CdsEnd, position.ProteinStart, position.ProteinEnd, codingSequence); var aa = aminoAcids.Translate(codons.Reference, codons.Alternate); (aa, position.ProteinStart, position.ProteinEnd) = TryTrimAminoAcidsAndUpdateProteinPositions(aa, position.ProteinStart, position.ProteinEnd); (position.CoveredCdnaStart, position.CoveredCdnaEnd) = transcript.TranscriptRegions.GetCoveredCdnaPositions(position.CdnaStart, start.Index, position.CdnaEnd, end.Index, onReverseStrand); (position.CoveredCdsStart, position.CoveredCdsEnd, position.CoveredProteinStart, position.CoveredProteinEnd) = MappedPositionUtilities.GetCoveredCdsAndProteinPositions(position.CoveredCdnaStart, position.CoveredCdnaEnd, transcript.StartExonPhase, transcript.Translation?.CodingRegion); var transcriptRefAllele = GetTranscriptRefAllele(position, cdnaSequence, variant, onReverseStrand); SequenceChange coveredAa; // only generate the covered version of ref & alt alleles when CDS start/end is -1 if (position.CdsStart == -1 || position.CdsEnd == -1) { coveredAa = GetCoveredAa(aminoAcids, transcriptAltAllele, position.CoveredCdsStart, position.CoveredCdsEnd, position.CoveredProteinStart, position.CoveredProteinEnd, codingSequence); (coveredAa, position.CoveredProteinStart, position.CoveredProteinEnd) = TryTrimAminoAcidsAndUpdateProteinPositions(coveredAa, position.CoveredProteinStart, position.CoveredProteinEnd); } else { coveredAa = aa; position.CoveredProteinStart = position.ProteinStart; position.CoveredProteinEnd = position.ProteinEnd; } var positionalEffect = GetPositionalEffect(transcript, variant, position, aa.Reference, aa.Alternate, position.CoveredCdnaStart, position.CoveredCdnaEnd, position.CoveredCdsStart, position.CoveredCdsEnd); var variantEffect = new VariantEffect(positionalEffect, variant, transcript, aa.Reference, aa.Alternate, codons.Reference, codons.Alternate, position.ProteinStart, coveredAa.Reference, coveredAa.Alternate); return (variantEffect, position, aa.Reference, aa.Alternate, codons.Reference, codons.Alternate, transcriptAltAllele, transcriptRefAllele); } private static string GetTranscriptRefAllele(IMappedPosition position, ISequence cdnaSequence, ISimpleVariant variant, bool onReverseStrand) { var variantRef = HgvsUtilities.GetTranscriptAllele(variant.RefAllele, onReverseStrand); if (position == null || cdnaSequence == null) return variantRef; var start = position.CoveredCdnaStart; var end = position.CoveredCdnaEnd; if (start == -1 && end == -1) return variantRef; if (start != -1 && end != -1 && end < start) Swap.Int(ref start, ref end); return cdnaSequence.Substring(start - 1, end - start + 1); } private static string GetCodingFromCdna(ICodingRegion codingRegion, ISequence cdnaSequence) { if (codingRegion == null) return null; return cdnaSequence.Substring(codingRegion.CdnaStart - 1, codingRegion.CdnaEnd - codingRegion.CdnaStart + 1); } internal static (SequenceChange AaChange, int ProteinStart, int ProteinEnd) TryTrimAminoAcidsAndUpdateProteinPositions(SequenceChange aaChange, int proteinStart, int proteinEnd) { (int newStart, string newReference, string newAlternate) = BiDirectionalTrimmer.Trim(proteinStart, aaChange.Reference, aaChange.Alternate); return string.IsNullOrEmpty(newReference) ? (aaChange, proteinStart, proteinEnd) : (new SequenceChange(newReference, newAlternate), newStart, newStart + newReference.Length - 1); } private static SequenceChange GetCoveredAa(AminoAcids aminoAcids, string transcriptAltAllele, int coveredCdsStart, int coveredCdsEnd, int coveredProteinStart, int coveredProteinEnd, ISequence codingSequence) { var codonsChange = Codons.GetCodons(transcriptAltAllele, coveredCdsStart, coveredCdsEnd, coveredProteinStart, coveredProteinEnd, codingSequence); return aminoAcids.Translate(codonsChange.Reference, codonsChange.Alternate); } private static ISequence GetCodingSequence(ITranscript transcript, ISequence refSequence) { if (transcript.Translation == null) return null; return transcript.CodingSequence ?? (transcript.CodingSequence = new CodingSequence(refSequence, transcript.Translation.CodingRegion, transcript.TranscriptRegions, transcript.Gene.OnReverseStrand, transcript.StartExonPhase, transcript.RnaEdits)); } private static ISequence GetCdnaSequence(ITranscript transcript, ISequence refSequence) { return transcript.CdnaSequence ?? (transcript.CdnaSequence = new CdnaSequence(refSequence, transcript.Translation?.CodingRegion, transcript.TranscriptRegions, transcript.Gene.OnReverseStrand, transcript.RnaEdits)); } private static IMappedPosition GetMappedPosition(ITranscriptRegion[] regions, ITranscriptRegion startRegion, int startIndex, ITranscriptRegion endRegion, int endIndex, IInterval variant, bool onReverseStrand, ICodingRegion codingRegion, byte startExonPhase, bool isInsertion) { (int cdnaStart, int cdnaEnd) = MappedPositionUtilities.GetCdnaPositions(startRegion, endRegion, variant, onReverseStrand, isInsertion); if (onReverseStrand) Swap.Int(ref cdnaStart, ref cdnaEnd); (int cdsStart, int cdsEnd) = MappedPositionUtilities.GetCdsPositions(codingRegion, cdnaStart, cdnaEnd, startExonPhase, isInsertion); int proteinStart = MappedPositionUtilities.GetProteinPosition(cdsStart); int proteinEnd = MappedPositionUtilities.GetProteinPosition(cdsEnd); (int exonStart, int exonEnd, int intronStart, int intronEnd) = regions.GetExonsAndIntrons(startIndex, endIndex); return new MappedPosition(cdnaStart, cdnaEnd, cdsStart, cdsEnd, proteinStart, proteinEnd, exonStart, exonEnd, intronStart, intronEnd, startIndex, endIndex); } private static TranscriptPositionalEffect GetPositionalEffect(ITranscript transcript, ISimpleVariant variant, IMappedPosition position, string refAminoAcid, string altAminoAcid, int coveredCdnaStart, int coveredCdnaEnd, int coveredCdsStart, int coveredCdsEnd) { bool startCodonInsertionWithNoImpact = variant.Type == VariantType.insertion && position.ProteinStart <= 1 && altAminoAcid.EndsWith(refAminoAcid); var positionalEffect = new TranscriptPositionalEffect(); positionalEffect.DetermineIntronicEffect(transcript.TranscriptRegions, variant, variant.Type); positionalEffect.DetermineExonicEffect(transcript, variant, position, coveredCdnaStart, coveredCdnaEnd, coveredCdsStart, coveredCdsEnd, variant.AltAllele, startCodonInsertionWithNoImpact); return positionalEffect; } private static List GetConsequences(IInterval transcript, bool onReverseStrand, IVariant variant, IVariantEffect variantEffect) { OverlapType overlapType = Intervals.Utilities.GetOverlapType(transcript.Start, transcript.End, variant.Start, variant.End); EndpointOverlapType endpointOverlapType = Intervals.Utilities.GetEndpointOverlapType(transcript.Start, transcript.End, variant.Start, variant.End); var featureEffect = new FeatureVariantEffects(overlapType, endpointOverlapType, onReverseStrand, variant.Type, variant.IsStructuralVariant); var consequence = new Consequences(variant.Type, variantEffect, featureEffect); consequence.DetermineSmallVariantEffects(); return consequence.GetConsequences(); } private static (PredictionScore Sift, PredictionScore PolyPhen) GetPredictionScores(IMappedPosition position, string refAminoAcid, string altAminoAcid, IPredictionCache siftCache, IPredictionCache polyphenCache, int siftIndex, int polyphenIndex) { if (!NeedPredictionScore(position.ProteinStart, position.ProteinEnd, refAminoAcid, altAminoAcid) || position.ProteinStart == -1) return (null, null); var newAminoAcid = altAminoAcid[0]; var siftScore = GetPredictionScore(position.ProteinStart, newAminoAcid, siftCache, siftIndex); var polyphenScore = GetPredictionScore(position.ProteinStart, newAminoAcid, polyphenCache, polyphenIndex); return (siftScore, polyphenScore); } private static bool NeedPredictionScore(int proteinStart, int proteinEnd, string referenceAminoAcids, string alternateAminoAcids) { return proteinStart != -1 && proteinEnd != -1 && proteinStart == proteinEnd && referenceAminoAcids.Length == 1 && alternateAminoAcids.Length == 1 && referenceAminoAcids != alternateAminoAcids; } private static PredictionScore GetPredictionScore(int proteinPosition, char newAminoAcid, IPredictionCache predictionCache, int predictionIndex) { return predictionIndex == -1 ? null : predictionCache?.GetProteinFunctionPrediction(predictionIndex, newAminoAcid, proteinPosition); } } } ================================================ FILE: VariantAnnotation/TranscriptAnnotation/ReducedTranscriptAnnotator.cs ================================================ using System.Collections.Generic; using Intervals; using VariantAnnotation.AnnotatedPositions.Consequence; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Caches.DataStructures; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Pools; using Variants; namespace VariantAnnotation.TranscriptAnnotation { public static class ReducedTranscriptAnnotator { public static IAnnotatedTranscript GetAnnotatedTranscript(ITranscript transcript, IVariant variant) { bool completeOverlap = variant.Contains(transcript); var mappedPosition = completeOverlap ? null : GetMappedPosition(transcript.TranscriptRegions, variant); List consequences = GetConsequences(transcript, transcript.Gene.OnReverseStrand, variant); return AnnotatedTranscriptPool.Get(transcript, null, null, null, null, mappedPosition, null, null, null, null, consequences, completeOverlap); } private static IMappedPosition GetMappedPosition(ITranscriptRegion[] regions, IInterval variant) { (int startIndex, _) = MappedPositionUtilities.FindRegion(regions, variant.Start); (int endIndex, _) = MappedPositionUtilities.FindRegion(regions, variant.End); (int exonStart, int exonEnd, int intronStart, int intronEnd) = regions.GetExonsAndIntrons(startIndex, endIndex); return new MappedPosition(-1, -1, -1, -1, -1, -1, exonStart, exonEnd, intronStart, intronEnd, startIndex, endIndex); } private static List GetConsequences(IInterval transcript, bool onReverseStrand, IVariant variant) { OverlapType overlapType = Intervals.Utilities.GetOverlapType(transcript.Start, transcript.End, variant.Start, variant.End); EndpointOverlapType endpointOverlapType = Intervals.Utilities.GetEndpointOverlapType(transcript.Start, transcript.End, variant.Start, variant.End); var featureEffect = new FeatureVariantEffects(overlapType, endpointOverlapType, onReverseStrand, variant.Type, true); var consequence = new Consequences(variant.Type, null, featureEffect); consequence.DetermineStructuralVariantEffect(variant); return consequence.GetConsequences(); } } } ================================================ FILE: VariantAnnotation/TranscriptAnnotation/RohTranscriptAnnotator.cs ================================================ using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Pools; namespace VariantAnnotation.TranscriptAnnotation { public static class RohTranscriptAnnotator { public static IAnnotatedTranscript GetAnnotatedTranscript(ITranscript transcript) { return transcript.IsCanonical ? AnnotatedTranscriptPool.Get(transcript, null, null, null, null, null, null, null, null, null, null, null) : null; } } } ================================================ FILE: VariantAnnotation/TranscriptAnnotation/SequenceChange.cs ================================================ namespace VariantAnnotation.TranscriptAnnotation { public struct SequenceChange { public readonly string Reference; public readonly string Alternate; public SequenceChange(string reference, string alternate) { Reference = reference; Alternate = alternate; } } } ================================================ FILE: VariantAnnotation/TranscriptAnnotation/TranscriptAnnotationFactory.cs ================================================ using System.Collections.Generic; using Genome; using Intervals; using VariantAnnotation.AnnotatedPositions.Transcript; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Caches; using Variants; namespace VariantAnnotation.TranscriptAnnotation { public static class TranscriptAnnotationFactory { private static readonly AminoAcids AminoAcidsProvider = new AminoAcids(false); private static readonly AminoAcids MitoAminoAcidsProvider = new AminoAcids(true); public static IList GetAnnotatedTranscripts(IVariant variant, ITranscript[] transcriptCandidates, ISequence compressedSequence, IPredictionCache siftCache, IPredictionCache polyphenCache) { var annotatedTranscripts = new List(); foreach (var transcript in transcriptCandidates) { var annotationStatus = DecideAnnotationStatus(variant, transcript, variant.Behavior, variant.Chromosome.FlankingLength); var annotatedTranscript = GetAnnotatedTranscript(variant, compressedSequence, transcript, annotationStatus, siftCache, polyphenCache); if (annotatedTranscript != null) annotatedTranscripts.Add(annotatedTranscript); } return annotatedTranscripts; } private static IAnnotatedTranscript GetAnnotatedTranscript(IVariant variant, ISequence compressedSequence, ITranscript transcript, Status annotationStatus, IPredictionCache siftCache, IPredictionCache polyphenCache) { IAnnotatedTranscript annotatedTranscript = null; // ReSharper disable once SwitchStatementMissingSomeCases switch (annotationStatus) { case Status.FlankingAnnotation: annotatedTranscript = FlankingTranscriptAnnotator.GetAnnotatedTranscript(variant.End, transcript); break; case Status.ReducedAnnotation: annotatedTranscript = ReducedTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant); break; case Status.RohAnnotation: annotatedTranscript = RohTranscriptAnnotator.GetAnnotatedTranscript(transcript); break; case Status.FullAnnotation: var acidsProvider = variant.Chromosome.UcscName == "chrM" ? MitoAminoAcidsProvider : AminoAcidsProvider; annotatedTranscript = FullTranscriptAnnotator.GetAnnotatedTranscript(transcript, variant, compressedSequence, siftCache, polyphenCache, acidsProvider); break; } return annotatedTranscript; } internal static Status DecideAnnotationStatus(IInterval variant, IInterval transcript, AnnotationBehavior behavior, int flankingLength) { bool overlapsTranscript = variant.Overlaps(transcript); if (!behavior.ReducedTranscriptAnnotation) { // handle small variants if (overlapsTranscript) return Status.FullAnnotation; if (behavior.NeedFlankingTranscripts && variant.Overlaps(transcript, flankingLength)) return Status.FlankingAnnotation; } else if (overlapsTranscript) { // handle large variants if (behavior.CanonicalTranscriptOnly) return Status.RohAnnotation; return Status.ReducedAnnotation; } return Status.NoAnnotation; } public enum Status { NoAnnotation, FlankingAnnotation, ReducedAnnotation, FullAnnotation, RohAnnotation } } } ================================================ FILE: VariantAnnotation/Utilities/BaseFormatting.cs ================================================ namespace VariantAnnotation.Utilities { public static class BaseFormatting { public static string EmptyToDash(string bases) => bases == "" ? "-" : bases; } } ================================================ FILE: VariantAnnotation/Utilities/Date.cs ================================================ using System; namespace VariantAnnotation.Utilities { public static class Date { public static string CurrentTimeStamp => DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"); public static string GetDate(long ticks) => new DateTime(ticks).ToString("yyyy-MM-dd"); } } ================================================ FILE: VariantAnnotation/Utilities/FormatUtilities.cs ================================================ namespace VariantAnnotation.Utilities { public static class FormatUtilities { public static (string Id, byte Version) SplitVersion(string s) { if (s == null) return (null, 0); int lastPeriodPos = s.LastIndexOf('.'); if (lastPeriodPos == -1) return (s, 0); string id = s.Substring(0, lastPeriodPos); string remaining = s.Substring(lastPeriodPos + 1); return !byte.TryParse(remaining, out byte version) ? (s, (byte)1) : (id, version); } } } ================================================ FILE: VariantAnnotation/Utilities/GeneComparer.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Utilities { public sealed class GeneComparer : EqualityComparer { public override bool Equals(IGene x, IGene y) { return x.Start == y.Start && x.End == y.End && x.Chromosome.Index == y.Chromosome.Index && x.OnReverseStrand == y.OnReverseStrand && x.Symbol == y.Symbol && x.EntrezGeneId.WithVersion == y.EntrezGeneId.WithVersion && x.EnsemblId.WithVersion == y.EnsemblId.WithVersion && x.HgncId == y.HgncId; } public override int GetHashCode(IGene obj) { string entrezGeneId = obj.EntrezGeneId.WithVersion; string ensemblId = obj.EnsemblId.WithVersion; unchecked { int hashCode = obj.Start; hashCode = (hashCode * 397) ^ obj.End; hashCode = (hashCode * 397) ^ obj.Chromosome.Index; hashCode = (hashCode * 397) ^ obj.OnReverseStrand.GetHashCode(); hashCode = (hashCode * 397) ^ obj.Symbol.GetHashCode(); if (entrezGeneId != null) hashCode = (hashCode * 397) ^ entrezGeneId.GetHashCode(); if (ensemblId != null) hashCode = (hashCode * 397) ^ ensemblId.GetHashCode(); hashCode = (hashCode * 397) ^ obj.HgncId; return hashCode; } } } } ================================================ FILE: VariantAnnotation/VariantAnnotation.csproj ================================================  net6.0 ..\bin\$(Configuration) ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/BioType.cs ================================================ namespace VariantAnnotation.Interface.AnnotatedPositions { public enum BioType : byte { // ReSharper disable InconsistentNaming aligned_transcript, ambiguous_orf, antisense, antisense_RNA, bidirectional_promoter_lncRNA, guide_RNA, IG_pseudogene, IG_C_gene, IG_C_pseudogene, IG_D_gene, IG_J_gene, IG_J_pseudogene, IG_V_gene, IG_V_pseudogene, lincRNA, lncRNA, macro_lncRNA, mRNA, miRNA, Mt_rRNA, Mt_tRNA, non_coding, nonsense_mediated_decay, non_stop_decay, other, polymorphic_pseudogene, processed_pseudogene, processed_transcript, protein_coding, pseudogene, retained_intron, retrotransposed, RNase_MRP_RNA, RNase_P_RNA, rRNA, ribozyme, misc_RNA, sRNA, scRNA, scaRNA, sense_intronic, sense_overlapping, SRP_RNA, snoRNA, snRNA, telomerase_RNA, three_prime_overlapping_ncRNA, transcribed_processed_pseudogene, transcribed_unitary_pseudogene, transcribed_unprocessed_pseudogene, TEC, translated_processed_pseudogene, translated_unprocessed_pseudogene, TR_C_gene, TR_D_gene, TR_J_gene, TR_J_pseudogene, tRNA, TR_V_gene, TR_V_pseudogene, unitary_pseudogene, unprocessed_pseudogene, vaultRNA, Y_RNA // ReSharper restore InconsistentNaming } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/ConsequenceTag.cs ================================================ namespace VariantAnnotation.Interface.AnnotatedPositions { public enum ConsequenceTag : byte { // ReSharper disable InconsistentNaming coding_sequence_variant, copy_number_increase, copy_number_decrease, copy_number_change, downstream_gene_variant, feature_elongation, five_prime_duplicated_transcript, five_prime_UTR_variant, frameshift_variant, incomplete_terminal_codon_variant, inframe_deletion, inframe_insertion, start_lost, start_retained_variant, intron_variant, missense_variant, mature_miRNA_variant, non_coding_transcript_exon_variant, non_coding_transcript_variant, NMD_transcript_variant, protein_altering_variant, regulatory_region_variant, regulatory_region_ablation, regulatory_region_amplification, splice_acceptor_variant, splice_donor_variant, splice_region_variant, stop_gained, stop_lost, stop_retained_variant, synonymous_variant, three_prime_duplicated_transcript, three_prime_UTR_variant, transcript_amplification, transcript_ablation, feature_truncation, upstream_gene_variant, short_tandem_repeat_change, short_tandem_repeat_expansion, short_tandem_repeat_contraction, transcript_variant, unidirectional_gene_fusion // ReSharper restore InconsistentNaming } public static class ConsequenceUtil { public static string GetConsequence(ConsequenceTag consequence) { if (consequence == ConsequenceTag.five_prime_UTR_variant) return "5_prime_UTR_variant"; return consequence == ConsequenceTag.three_prime_UTR_variant ? "3_prime_UTR_variant" : consequence.ToString(); } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IAnnotatedGeneFusion.cs ================================================ // ReSharper disable InconsistentNaming using VariantAnnotation.Interface.IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IAnnotatedGeneFusion : IGeneFusionPair, IJsonSerializer { ITranscript transcript { get; } int? exon { get; } int? intron { get; } string hgvsr { get; } bool isInFrame { get; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IAnnotatedPosition.cs ================================================ using System.Collections.Generic; using System.Text; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.SA; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IAnnotatedPosition { IPosition Position { get; } string CytogeneticBand { get; set; } IAnnotatedVariant[] AnnotatedVariants { get; } IList SupplementaryIntervals { get; } StringBuilder GetJsonStringBuilder(); } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IAnnotatedRegulatoryRegion.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IAnnotatedRegulatoryRegion:IJsonSerializer { IRegulatoryRegion RegulatoryRegion { get; } IEnumerable Consequences { get; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IAnnotatedTranscript.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IAnnotatedTranscript : IJsonSerializer { ITranscript Transcript { get; } string ReferenceAminoAcids { get; } string AlternateAminoAcids { get; } string ReferenceCodons { get; } string AlternateCodons { get; } IMappedPosition MappedPosition { get; } string HgvsCoding { get; } string HgvsProtein { get; } PredictionScore Sift { get; } PredictionScore PolyPhen { get; } List Consequences { get; } bool? CompleteOverlap { get; } List ConservationScores { get; set; } void AddGeneFusions(IAnnotatedGeneFusion[] geneFusions); void AddGeneFusionPairs(HashSet geneKeys); public void Initialize(ITranscript transcript, string referenceAminoAcids, string alternateAminoAcids, string referenceCodons, string alternateCodons, IMappedPosition mappedPosition, string hgvsCoding, string hgvsProtein, PredictionScore sift, PredictionScore polyphen, List consequences, bool? completeOverlap); } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IAnnotatedVariant.cs ================================================ using System.Collections.Generic; using System.Text; using VariantAnnotation.Interface.SA; using Variants; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IAnnotatedVariant { IVariant Variant { get; } string HgvsgNotation { get; set; } IList RegulatoryRegions { get; } IList Transcripts { get; } IList SaList { get; } ISupplementaryAnnotation RepeatExpansionPhenotypes { get; set; } double? PhylopScore { get; set; } double? GerpScore { get; set; } bool InLowComplexityRegion { get; set; } StringBuilder GetJsonStringBuilder(string originalChromName); } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/ICodingRegion.cs ================================================ using Intervals; using IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface ICodingRegion : IInterval, ISerializable { int CdnaStart { get; } int CdnaEnd { get; } int Length { get; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/ICompactId.cs ================================================ using IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface ICompactId : ISerializable { bool IsEmpty(); string WithVersion { get; } string WithoutVersion { get; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IFeatureVariantEffects.cs ================================================ namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IFeatureVariantEffects { bool Ablation(); bool Amplification(); bool Truncation(); bool Elongation(); bool FivePrimeDuplicatedTranscript(); bool ThreePrimeDuplicatedTranscript(); } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IGene.cs ================================================ using Genome; using IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IGene : IChromosomeInterval, ISerializable { bool OnReverseStrand { get; } string Symbol { get; } ICompactId EntrezGeneId { get; } ICompactId EnsemblId { get; } int HgncId { get; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IGeneFusion.cs ================================================ using VariantAnnotation.Interface.IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IGeneFusion : IJsonSerializer { int? Exon { get; } int? Intron { get; } string HgvsCoding { get; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IGeneFusionPair.cs ================================================ namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IGeneFusionPair { ulong FusionKey { get; } string FirstGeneSymbol { get; } uint FirstGeneKey { get; } string SecondGeneSymbol { get; } uint SecondGeneKey { get; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IMappedPosition.cs ================================================ namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IMappedPosition { int ProteinStart { get; set; } int ProteinEnd { get; set; } int CdsStart { get; } int CdsEnd { get; } int CdnaStart { get; } int CdnaEnd { get; } int ExonStart { get; } int ExonEnd { get; } int IntronStart { get; } int IntronEnd { get; } int RegionStartIndex { get; } int RegionEndIndex { get; } int CoveredProteinStart { get; set; } int CoveredProteinEnd { get; set; } int CoveredCdsStart { get; set; } int CoveredCdsEnd { get; set; } int CoveredCdnaStart { get; set; } int CoveredCdnaEnd { get; set; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IRegulatoryRegion.cs ================================================ using Genome; using IO; using VariantAnnotation.Interface.Caches; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IRegulatoryRegion : IChromosomeInterval, ISerializable { ICompactId Id { get; } RegulatoryRegionType Type { get; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IRnaEdit.cs ================================================ using System; using Intervals; using IO; using Variants; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IRnaEdit : IInterval, ISerializable, IComparable { string Bases { get; } VariantType Type { get; set; } } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/ITranscript.cs ================================================ using System.Collections.Generic; using Genome; using Intervals; using IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface ITranscript : IChromosomeInterval { ICompactId Id { get; } BioType BioType { get; } bool IsCanonical { get; } Source Source { get; } IGene Gene { get; } ITranscriptRegion[] TranscriptRegions { get; } ushort NumExons { get; } int TotalExonLength { get; } byte StartExonPhase { get; } int SiftIndex { get; } int PolyPhenIndex { get; } ITranslation Translation { get; } IInterval[] MicroRnas { get; } int[] Selenocysteines { get; } IRnaEdit[] RnaEdits { get; } bool CdsStartNotFound { get; } bool CdsEndNotFound { get; } ISequence CodingSequence { get; set; } ISequence CdnaSequence { get; set; } void Write(IExtendedBinaryWriter writer, Dictionary geneIndices, Dictionary transcriptRegionIndices, Dictionary microRnaIndices, Dictionary peptideIndices); } public enum Source : byte { None, RefSeq, Ensembl, BothRefSeqAndEnsembl } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/ITranscriptRegion.cs ================================================ using Intervals; using IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface ITranscriptRegion : IInterval, ISerializable { TranscriptRegionType Type { get; } ushort Id { get; } int CdnaStart { get; } int CdnaEnd { get; } } public enum TranscriptRegionType : byte { Exon, Gap, Intron } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/ITranslation.cs ================================================ using IO; namespace VariantAnnotation.Interface.AnnotatedPositions { public interface ITranslation { ICodingRegion CodingRegion { get; } ICompactId ProteinId { get; } string PeptideSeq { get; } void Write(IExtendedBinaryWriter writer, int peptideIndex); } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/IVariantEffect.cs ================================================ namespace VariantAnnotation.Interface.AnnotatedPositions { public interface IVariantEffect { bool IsStopLost(); bool IsStopRetained(); bool IsStartLost(); bool IsFrameshiftVariant(); bool IsMatureMirnaVariant(); bool IsSpliceDonorVariant(); bool IsSpliceAcceptorVariant(); bool IsStopGained(); bool IsInframeInsertion(); bool IsInframeDeletion(); bool IsMissenseVariant(); bool IsProteinAlteringVariant(); bool IsSpliceRegionVariant(); bool IsIncompleteTerminalCodonVariant(); bool IsStartRetained(); bool IsSynonymousVariant(); bool IsCodingSequenceVariant(); bool IsFivePrimeUtrVariant(); bool IsThreePrimeUtrVariant(); bool IsNonCodingTranscriptExonVariant(); bool IsWithinIntron(); bool IsNonsenseMediatedDecayTranscriptVariant(); bool IsNonCodingTranscriptVariant(); } } ================================================ FILE: VariantAnnotation.Interface/AnnotatedPositions/PredictionScore.cs ================================================ namespace VariantAnnotation.Interface.AnnotatedPositions { public sealed class PredictionScore { public readonly double Score; public readonly string Prediction; public PredictionScore(string prediction, double score) { Prediction = prediction; Score = score; } } } ================================================ FILE: VariantAnnotation.Interface/Caches/IPredictionCache.cs ================================================ using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; namespace VariantAnnotation.Interface.Caches { public interface IPredictionCache : IProvider { PredictionScore GetProteinFunctionPrediction(int predictionIndex, char newAminoAcid, int aaPosition); } } ================================================ FILE: VariantAnnotation.Interface/Caches/ITranscriptCache.cs ================================================ using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Providers; namespace VariantAnnotation.Interface.Caches { public interface ITranscriptCache : IProvider { IIntervalForest TranscriptIntervalForest { get; } IIntervalForest RegulatoryIntervalForest { get; } } } ================================================ FILE: VariantAnnotation.Interface/Caches/RegulatoryRegionType.cs ================================================ namespace VariantAnnotation.Interface.Caches { public enum RegulatoryRegionType : byte { // ReSharper disable InconsistentNaming CTCF_binding_site, enhancer, open_chromatin_region, promoter, promoter_flanking_region, TF_binding_site, mature_protein_region // ReSharper restore InconsistentNaming } } ================================================ FILE: VariantAnnotation.Interface/Constants.cs ================================================ namespace VariantAnnotation.Interface { public static class Constants { public const string Authors = "Stromberg, Roy, Platzer, Siddiqui, Ouyang, et al"; } } ================================================ FILE: VariantAnnotation.Interface/GeneAnnotation/IGeneAnnotationProvider.cs ================================================ using System; using VariantAnnotation.Interface.Providers; namespace VariantAnnotation.Interface.GeneAnnotation { public interface IGeneAnnotationProvider : IProvider, IDisposable { string Annotate(string geneName); } } ================================================ FILE: VariantAnnotation.Interface/IAnnotationResources.cs ================================================ using System; using System.Collections.Generic; using VariantAnnotation.Interface.GeneAnnotation; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; namespace VariantAnnotation.Interface { public interface IAnnotationResources : IDisposable { ISequenceProvider SequenceProvider { get; } ITranscriptAnnotationProvider TranscriptAnnotationProvider { get; } IAnnotationProvider SaProvider { get; } IAnnotationProvider ConservationProvider { get; } IRefMinorProvider RefMinorProvider { get; } IGeneAnnotationProvider GeneAnnotationProvider { get; } IMitoHeteroplasmyProvider MitoHeteroplasmyProvider { get; } IAnnotator Annotator { get; } IVariantIdCreator VidCreator { get; } List DataSourceVersions { get; } string VepDataVersion { get; } string AnnotatorVersionTag { get; set; } bool ForceMitochondrialAnnotation { get; } long InputStartVirtualPosition { get; } void SingleVariantPreLoad(IPosition position); } } ================================================ FILE: VariantAnnotation.Interface/IAnnotator.cs ================================================ using System.Collections.Generic; using Genome; using VariantAnnotation.Interface.AnnotatedPositions; using VariantAnnotation.Interface.Positions; namespace VariantAnnotation.Interface { public interface IAnnotator { GenomeAssembly Assembly { get; } IAnnotatedPosition Annotate(IPosition position); IEnumerable GetGeneAnnotations(); void EnableMitochondrialAnnotation(); } } ================================================ FILE: VariantAnnotation.Interface/IO/IJsonSerializer.cs ================================================ using System.Text; namespace VariantAnnotation.Interface.IO { public interface IJsonSerializer { void SerializeJson(StringBuilder sb); } } ================================================ FILE: VariantAnnotation.Interface/IO/IJsonWriter.cs ================================================ using System; using VariantAnnotation.Interface.Positions; namespace VariantAnnotation.Interface.IO { public interface IJsonWriter : IDisposable { void WritePosition(IPosition position, string entry); } } ================================================ FILE: VariantAnnotation.Interface/IO/IVcfReader.cs ================================================ using System; namespace VariantAnnotation.Interface.IO { public interface IVcfReader : IDisposable { bool IsRcrsMitochondrion { get; } } } ================================================ FILE: VariantAnnotation.Interface/IO/VcfCommon.cs ================================================ using System.Collections.Generic; namespace VariantAnnotation.Interface.IO { public static class VcfCommon { public const string ChromosomeHeader = "#CHROM"; public const string GatkNonRefAllele = ""; private const string MissingValue = "."; public const int MinNumColumnsSampleGenotypes = 10; // define the column names public const int ChromIndex = 0; public const int PosIndex = 1; public const int IdIndex = 2; public const int RefIndex = 3; public const int AltIndex = 4; public const int QualIndex = 5; public const int FilterIndex = 6; public const int InfoIndex = 7; public const int FormatIndex = 8; public const int GenotypeIndex = 9; private static readonly HashSet NonInformativeAltAllele = new HashSet {"<*>", "*", "", GatkNonRefAllele}; public static readonly HashSet ReferenceAltAllele = new HashSet {MissingValue, GatkNonRefAllele}; public static bool IsNonInformativeAltAllele(string altAllele) => NonInformativeAltAllele.Contains(altAllele); } } ================================================ FILE: VariantAnnotation.Interface/IVariantIdCreator.cs ================================================ using Genome; namespace VariantAnnotation.Interface { public interface IVariantIdCreator { string Create(ISequence sequence, VariantCategory category, string svType, Chromosome chromosome, int start, int end, string refAllele, string altAllele, string repeatUnit); (int Start, string RefAllele, string AltAllele) Normalize(ISequence sequence, int start, string refAllele, string altAllele); } } ================================================ FILE: VariantAnnotation.Interface/Positions/ICustomFields.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.IO; namespace VariantAnnotation.Interface.Positions { public interface ICustomFields: IJsonSerializer { void Add(string key, string value); void Clear(); bool IsEmpty(); } } ================================================ FILE: VariantAnnotation.Interface/Positions/IInfoData.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.IO; namespace VariantAnnotation.Interface.Positions { public interface IInfoData { int[] CiEnd { get; } int[] CiPos { get; } int? End { get; } double? RecalibratedQuality { get; } int? JointSomaticNormalQuality { get; } int? RefRepeatCount { get; } string RepeatUnit { get; } double? StrandBias { get; } int? SvLength { get; } string SvType { get; } double? FisherStrandBias { get; } double? MappingQuality { get; } string BreakendEventId { get; } bool IsImprecise { get; } ICustomFields CustomKeyValues{ get; } // for old version of Manta, but still required by Encore bool IsInv3 { get; } bool IsInv5 { get; } double? LogOddsRatio { get; } } } ================================================ FILE: VariantAnnotation.Interface/Positions/IPosition.cs ================================================ using Variants; namespace VariantAnnotation.Interface.Positions { public interface IPosition : ISimplePosition { double? Quality { get; } string[] Filters { get; } IVariant[] Variants { get; } ISample[] Samples { get; } IInfoData InfoData { get; } bool HasStructuralVariant { get; } bool HasShortTandemRepeat { get; } } } ================================================ FILE: VariantAnnotation.Interface/Positions/ISample.cs ================================================ namespace VariantAnnotation.Interface.Positions { public interface ISample { int[] AlleleDepths { get; } float? ArtifactAdjustedQualityScore { get; } // PEPE int? CopyNumber { get; } string[] DiseaseAffectedStatuses { get; } // SMN1 bool FailedFilter { get; } string Genotype { get; } int? GenotypeQuality { get; } bool IsDeNovo { get; } double? DeNovoQuality { get; } bool IsEmpty { get; } float? LikelihoodRatioQualityScore { get; } // PEPE int[] PairedEndReadCounts { get; } // Manta int[] RepeatUnitCounts { get; } // ExpansionHunter int[] SplitReadCounts { get; } // Manta int? TotalDepth { get; } double[] VariantFrequencies { get; } int? MinorHaplotypeCopyNumber { get; } double? SomaticQuality { get; } bool? IsLossOfHeterozygosity { get; } string[] HeteroplasmyPercentile { get; } int? BinCount { get; } public ICustomFields CustomFields { get; } } } ================================================ FILE: VariantAnnotation.Interface/Positions/ISimplePosition.cs ================================================ using System.Collections.Generic; using Genome; namespace VariantAnnotation.Interface.Positions { public interface ISimplePosition : IChromosomeInterval { string RefAllele { get; } string[] AltAlleles { get; } string[] VcfFields { get; } bool[] IsDecomposed { get; } bool IsRecomposed { get; } string[] Vids { get; } List[] LinkedVids { get; } } } ================================================ FILE: VariantAnnotation.Interface/Providers/IAnnotationProvider.cs ================================================ using System; using System.Collections.Generic; using Genome; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Interface.Providers { public interface IAnnotationProvider : IProvider, IDisposable { void Annotate(IAnnotatedPosition annotatedPosition); void PreLoad(Chromosome chromosome, List positions); } } ================================================ FILE: VariantAnnotation.Interface/Providers/IDataSourceVersion.cs ================================================ using IO; using VariantAnnotation.Interface.IO; namespace VariantAnnotation.Interface.Providers { public interface IDataSourceVersion : IJsonSerializer { string Name { get; } string Description { get; } string Version { get; } long ReleaseDateTicks { get; } void Write(IExtendedBinaryWriter writer); } } ================================================ FILE: VariantAnnotation.Interface/Providers/IMitoHeteroplasmyProvider.cs ================================================ using Variants; namespace VariantAnnotation.Interface.Providers { public interface IMitoHeteroplasmyProvider : IProvider { double?[] GetVrfPercentiles(IVariant[] variants, double[] vrfs); } } ================================================ FILE: VariantAnnotation.Interface/Providers/IProvider.cs ================================================ using System.Collections.Generic; using Genome; namespace VariantAnnotation.Interface.Providers { public interface IProvider { string Name { get; } GenomeAssembly Assembly { get; } IEnumerable DataSourceVersions { get; } } } ================================================ FILE: VariantAnnotation.Interface/Providers/IRefMinorProvider.cs ================================================ using System; using Genome; namespace VariantAnnotation.Interface.Providers { public interface IRefMinorProvider:IDisposable { string GetGlobalMajorAllele(Chromosome chromosome, int pos); } } ================================================ FILE: VariantAnnotation.Interface/Providers/ISequenceProvider.cs ================================================ using System.Collections.Generic; using Genome; namespace VariantAnnotation.Interface.Providers { public interface ISequenceProvider : IAnnotationProvider { ISequence Sequence { get; } Dictionary RefNameToChromosome { get; } Dictionary RefIndexToChromosome { get; } void LoadChromosome(Chromosome chromosome); } } ================================================ FILE: VariantAnnotation.Interface/Providers/ITranscriptAnnotationProvider.cs ================================================ using Intervals; using VariantAnnotation.Interface.AnnotatedPositions; namespace VariantAnnotation.Interface.Providers { public interface ITranscriptAnnotationProvider : IAnnotationProvider { IntervalArray[] TranscriptIntervalArrays { get; } ushort VepVersion { get; } } } ================================================ FILE: VariantAnnotation.Interface/SA/INsaReader.cs ================================================ using System; using System.Collections.Generic; using Genome; namespace VariantAnnotation.Interface.SA { public interface INsaReader : ISaMetadata, IDisposable { bool MatchByAllele { get; } bool IsArray { get; } bool IsPositional { get; } void GetAnnotation(int position, List<(string refAllele, string altAllele, string annotation)> annotations); void PreLoad(Chromosome chrom, List positions); } } ================================================ FILE: VariantAnnotation.Interface/SA/INsiReader.cs ================================================ using System.Collections.Generic; using Variants; namespace VariantAnnotation.Interface.SA { public interface INsiReader : ISaMetadata { ReportFor ReportFor { get; } IEnumerable GetAnnotation(IVariant variant); } } ================================================ FILE: VariantAnnotation.Interface/SA/ISaMetadata.cs ================================================ using Genome; using VariantAnnotation.Interface.Providers; namespace VariantAnnotation.Interface.SA { public interface ISaMetadata { GenomeAssembly Assembly { get; } IDataSourceVersion Version { get; } string JsonKey { get; } } } ================================================ FILE: VariantAnnotation.Interface/SA/ISupplementaryAnnotation.cs ================================================ using VariantAnnotation.Interface.IO; namespace VariantAnnotation.Interface.SA { public interface ISupplementaryAnnotation : IJsonSerializer { string JsonKey { get; } } } ================================================ FILE: VariantAnnotation.Interface/SA/ISupplementaryDataItem.cs ================================================ using Genome; namespace VariantAnnotation.Interface.SA { public interface ISupplementaryDataItem { Chromosome Chromosome { get; } int Position { get; set; } string RefAllele { get; set; } string AltAllele { get; set; } string GetJsonString(); string InputLine { get; } } } ================================================ FILE: VariantAnnotation.Interface/SA/ISupplementaryInterval.cs ================================================ namespace VariantAnnotation.Interface.SA { public enum ReportFor { None, AllVariants, SmallVariants, StructuralVariants } } ================================================ FILE: VariantAnnotation.Interface/SA/IsuppGeneItem.cs ================================================ namespace VariantAnnotation.Interface.SA { public interface ISuppGeneItem { string GeneSymbol { get; } string GetJsonString(); } } ================================================ FILE: VariantAnnotation.Interface/SA/IsuppIntervalItem.cs ================================================ using Genome; namespace VariantAnnotation.Interface.SA { public interface ISuppIntervalItem : IChromosomeInterval { string GetJsonString(); } } ================================================ FILE: VariantAnnotation.Interface/VariantAnnotation.Interface.csproj ================================================  net6.0 ..\bin\$(Configuration) ================================================ FILE: VariantAnnotation.Interface/VariantCategory.cs ================================================ namespace VariantAnnotation.Interface { public enum VariantCategory { // ReSharper disable InconsistentNaming Reference, SmallVariant, SV, CNV, RepeatExpansion, ROH // ReSharper restore InconsistentNaming } } ================================================ FILE: Variants/AnnotationBehavior.cs ================================================ namespace Variants { public sealed class AnnotationBehavior { public readonly bool CanonicalTranscriptOnly; public readonly bool NeedFlankingTranscripts; public readonly bool NeedRegulatoryRegions; public readonly bool NeedSaInterval; public readonly bool NeedSaPosition; public readonly bool NeedSaAllele; public readonly bool MinimalTranscriptAnnotation; public readonly bool ReducedTranscriptAnnotation; public static readonly AnnotationBehavior SmallVariants = new AnnotationBehavior(false, false, true, true, false, true, true, false); public static readonly AnnotationBehavior NonInformativeAlleles = new AnnotationBehavior(false, true, false, false, false, false, false, false); public static readonly AnnotationBehavior StructuralVariants = new AnnotationBehavior(false, false, false, true, true, false, false, true); public static readonly AnnotationBehavior BreakendVariants = new AnnotationBehavior(false, false, false, true, true, false, true, true); public static readonly AnnotationBehavior RepeatExpansions = new AnnotationBehavior(false, false, false, true, false, false, false, true); public static readonly AnnotationBehavior RunsOfHomozygosity = new AnnotationBehavior(true, false, false, false, false, false, false, true); private AnnotationBehavior(bool canonicalTranscriptOnly, bool minimalTranscriptAnnotation, bool needFlankingTranscripts, bool needRegulatoryRegions, bool needSaInterval, bool needSaPosition, bool needSaAllele, bool reducedTranscriptAnnotation) { CanonicalTranscriptOnly = canonicalTranscriptOnly; MinimalTranscriptAnnotation = minimalTranscriptAnnotation; NeedFlankingTranscripts = needFlankingTranscripts; NeedRegulatoryRegions = needRegulatoryRegions; NeedSaInterval = needSaInterval; NeedSaPosition = needSaPosition; NeedSaAllele = needSaAllele; ReducedTranscriptAnnotation = reducedTranscriptAnnotation; } } } ================================================ FILE: Variants/BiDirectionalTrimmer.cs ================================================ namespace Variants { public static class BiDirectionalTrimmer { public static (int Start, string RefAllele, string AltAllele) Trim(int start, string refAllele, string altAllele) { // do not trim if ref and alt are same if (refAllele == altAllele) return (start, refAllele, altAllele); if (refAllele == null) refAllele = ""; if (altAllele == null) altAllele = ""; // trimming at the start var i = 0; while (i < refAllele.Length && i < altAllele.Length && refAllele[i] == altAllele[i]) i++; if (i > 0) { start += i; altAllele = altAllele.Substring(i); refAllele = refAllele.Substring(i); } // trimming at the end var j = 0; while (j < refAllele.Length && j < altAllele.Length && refAllele[refAllele.Length - j - 1] == altAllele[altAllele.Length - j - 1]) j++; if (j <= 0) return (start, refAllele, altAllele); altAllele = altAllele.Substring(0, altAllele.Length - j); refAllele = refAllele.Substring(0, refAllele.Length - j); return (start, refAllele, altAllele); } } } ================================================ FILE: Variants/ISimpleVariant.cs ================================================ using Genome; namespace Variants { public interface ISimpleVariant : IChromosomeInterval { string RefAllele { get; } string AltAllele { get; } VariantType Type { get; } } } ================================================ FILE: Variants/IVariant.cs ================================================ namespace Variants { public interface IVariant : ISimpleVariant { string VariantId { get; } bool IsRefMinor { get; } bool IsRecomposed { get; } bool IsDecomposed { get; } string[] LinkedVids { get; } AnnotationBehavior Behavior { get; } bool IsStructuralVariant { get; } } } ================================================ FILE: Variants/RepeatExpansion.cs ================================================ using Genome; namespace Variants { public sealed class RepeatExpansion : IVariant { public Chromosome Chromosome { get; } public int Start { get; } public int End { get; } public string RefAllele { get; } public string AltAllele { get; } public string VariantId { get; } public VariantType Type { get; } = VariantType.short_tandem_repeat_variation; public bool IsRefMinor { get; } = false; public bool IsRecomposed { get; } = false; public bool IsDecomposed { get; } = false; public string[] LinkedVids { get; } = null; public AnnotationBehavior Behavior { get; } = AnnotationBehavior.RepeatExpansions; public bool IsStructuralVariant { get; } = false; public readonly int RepeatCount; public readonly int? RefRepeatCount; public RepeatExpansion(Chromosome chromosome, int start, int end, string refAllele, string altAllele, string variantId, int repeatCount, int? refRepeatCount) { Chromosome = chromosome; Start = start; End = end; RefAllele = refAllele; AltAllele = altAllele; VariantId = variantId; RepeatCount = repeatCount; RefRepeatCount = refRepeatCount; } } } ================================================ FILE: Variants/SimpleVariant.cs ================================================ using Genome; namespace Variants { public sealed class SimpleVariant : ISimpleVariant { public int Start { get; } public int End { get; } public Chromosome Chromosome { get; } public string RefAllele { get; } public string AltAllele { get; } public VariantType Type { get; } public SimpleVariant(Chromosome chromosome, int start, int end, string refAllele, string altAllele, VariantType type) { Chromosome = chromosome; Start = start; End = end; RefAllele = refAllele; AltAllele = altAllele; Type = type; } } } ================================================ FILE: Variants/Variant.cs ================================================ using Genome; namespace Variants { public sealed class Variant : IVariant { public Chromosome Chromosome { get; private set; } public int Start { get; private set;} public int End { get; private set;} public string RefAllele { get; private set;} public string AltAllele { get; private set;} public VariantType Type { get; private set;} public string VariantId { get; private set;} public bool IsRefMinor { get; private set;} public bool IsRecomposed { get; private set;} public bool IsDecomposed { get; private set;} public string[] LinkedVids { get; private set;} public AnnotationBehavior Behavior { get; private set;} public bool IsStructuralVariant { get; private set;} public void Initialize(Chromosome chromosome, int start, int end, string refAllele, string altAllele, VariantType variantType, string variantId, bool isRefMinor, bool isDecomposed, bool isRecomposed, string[] linkedVids, AnnotationBehavior behavior, bool isStructuralVariant) { Chromosome = chromosome; Start = start; End = end; RefAllele = refAllele; AltAllele = altAllele; Type = variantType; VariantId = variantId; IsRefMinor = isRefMinor; IsRecomposed = isRecomposed; IsDecomposed = isDecomposed; LinkedVids = linkedVids; Behavior = behavior; IsStructuralVariant = isStructuralVariant; } } } ================================================ FILE: Variants/VariantRotator.cs ================================================ using System; using Genome; using Intervals; namespace Variants { public static class VariantRotator { public const int MaxDownstreamLength = 500; public static ISimpleVariant Right(ISimpleVariant simpleVariant, IInterval rotateRegion, ISequence refSequence, bool onReverseStrand) { if (refSequence == null) return simpleVariant; if (simpleVariant.Type != VariantType.deletion && simpleVariant.Type != VariantType.insertion) return simpleVariant; if (VariantStartOverlapsRegion(simpleVariant, rotateRegion, onReverseStrand)) return simpleVariant; // if variant is before the transcript start, do not perform 3 prime shift string rotatingBases = GetRotatingBases(simpleVariant, onReverseStrand); string downStreamSeq = GetDownstreamSeq(simpleVariant, rotateRegion, refSequence, onReverseStrand, rotatingBases); string combinedSequence = rotatingBases + downStreamSeq; int shiftStart, shiftEnd; var hasShifted = false; // probably a VEP bug, just use it for consistency int numBases = rotatingBases.Length; for (shiftStart = 0, shiftEnd = numBases; shiftEnd < combinedSequence.Length; shiftStart++, shiftEnd++) { if (combinedSequence[shiftStart] != combinedSequence[shiftEnd]) break; hasShifted = true; } if (!hasShifted) return simpleVariant; // create a new alternative allele string rotatedSequence = combinedSequence.Substring(shiftStart, numBases); int rotatedStart = simpleVariant.Start + shiftStart; int rotatedEnd = simpleVariant.End + shiftStart; if (onReverseStrand) { rotatedSequence = SequenceUtilities.GetReverseComplement(rotatedSequence); rotatedStart = simpleVariant.Start - shiftStart; rotatedEnd = simpleVariant.End - shiftStart; } string rotatedRefAllele = simpleVariant.RefAllele; string rotatedAltAllele = simpleVariant.AltAllele; if (simpleVariant.Type == VariantType.insertion) rotatedAltAllele = rotatedSequence; else rotatedRefAllele = rotatedSequence; return new SimpleVariant(simpleVariant.Chromosome, rotatedStart, rotatedEnd, rotatedRefAllele, rotatedAltAllele, simpleVariant.Type); } private static string GetDownstreamSeq(IInterval simpleVariant, IInterval rotateRegion, ISequence refSequence, bool onReverseStrand, string rotatingBases) { int basesToEnd = onReverseStrand ? simpleVariant.Start - rotateRegion.Start : rotateRegion.End - simpleVariant.End; int downStreamLength = Math.Min(basesToEnd, Math.Max(rotatingBases.Length, MaxDownstreamLength)); // for large rotatingBases, we need to factor in its length but still make sure that we do not go past the end of transcript string downStreamSeq = onReverseStrand ? SequenceUtilities.GetReverseComplement( refSequence.Substring(simpleVariant.Start - 1 - downStreamLength, downStreamLength)) : refSequence.Substring(simpleVariant.End, downStreamLength); return downStreamSeq; } private static string GetRotatingBases(ISimpleVariant simpleVariant, bool onReverseStrand) { string rotatingBases = simpleVariant.Type == VariantType.insertion ? simpleVariant.AltAllele : simpleVariant.RefAllele; rotatingBases = onReverseStrand ? SequenceUtilities.GetReverseComplement(rotatingBases) : rotatingBases; return rotatingBases; } private static bool VariantStartOverlapsRegion(IInterval variant, IInterval region, bool onReverseStrand) { if (onReverseStrand) { return variant.End > region.End || region.Start >= variant.End; } return variant.Start < region.Start || region.End <= variant.Start; } } } ================================================ FILE: Variants/VariantType.cs ================================================ namespace Variants { public enum VariantType { // ReSharper disable InconsistentNaming unknown = 0, // small variants SNV = 2, insertion = 3, deletion = 4, indel = 5, MNV = 6, // structural variants duplication = 10, complex_structural_alteration = 11, structural_alteration = 12, tandem_duplication = 13, translocation_breakend = 14, inversion = 15, mobile_element_insertion = 16, mobile_element_deletion = 17, novel_sequence_insertion = 18, short_tandem_repeat_variation = 19, // CNVs copy_number_variation = 30, copy_number_loss = 31, copy_number_gain = 32, // non variants run_of_homozygosity = 33, // misc reference = 42, non_informative_allele = 43 } // ReSharper restore InconsistentNaming } ================================================ FILE: Variants/VariantUtils.cs ================================================ using System; using Genome; namespace Variants { public static class VariantUtils { public const int MaxUpstreamLength = 500; public static int MaxShiftLength = 0; public static (int start, string refAllele, string altAllele) TrimAndLeftAlign(int start, string refAllele, string altAllele, ISequence refSequence) { var initialStart = start; var (newStart, newRefAllele, newAltAllele) = TrimAndLeftAlign(start, refAllele, altAllele, refSequence, 50); while (newStart != start) { start = newStart; refAllele = newRefAllele; altAllele = newAltAllele; (newStart, newRefAllele, newAltAllele) = TrimAndLeftAlign(start, refAllele, altAllele, refSequence, 50); } // keeping track of maximum bases shifted if (MaxShiftLength < Math.Abs(initialStart - newStart)) MaxShiftLength = Math.Abs(initialStart - newStart); return (newStart, newRefAllele, newAltAllele); } /// /// Left aligns the variant using base rotation /// /// Tuple of new position, ref and alt allele private static (int start, string refAllele, string altAllele) TrimAndLeftAlign(int start, string refAllele, string altAllele, ISequence refSequence, int maxUpstreamLength) { if (IsStructuralVariant(altAllele)) return (start, refAllele, altAllele); // we have to check this before the trimming since it depends on the padding base bool isLeftShiftPossible = IsLeftShiftPossible(refAllele, altAllele); (start, refAllele, altAllele) = BiDirectionalTrimmer.Trim(start, refAllele, altAllele); // alignment only makes sense for insertion and deletion if (!(altAllele.Length == 0 || refAllele.Length == 0)) return (start, refAllele, altAllele); if(! isLeftShiftPossible) return (start, refAllele, altAllele); // base checking to make sure we can safely left shift if (IfRefBaseMismatched(start, refAllele, refSequence)) return (start, refAllele, altAllele); // adjust the max upstream length when you are near the beginning of the chrom if (maxUpstreamLength >= start) maxUpstreamLength = start - 1; string upstreamSeq = refSequence.Substring(start - maxUpstreamLength - 1, maxUpstreamLength); // compressed seq is 0 based string combinedSeq = upstreamSeq; int repeatLength; int i; if (refAllele.Length > altAllele.Length) { // deletion combinedSeq += refAllele; repeatLength = refAllele.Length; for (i = combinedSeq.Length - 1; i >= repeatLength; i--, start--) { if (combinedSeq[i] != combinedSeq[i - repeatLength]) break; } string newRefAllele = combinedSeq.Substring(i + 1 - repeatLength, repeatLength); return (start, newRefAllele, ""); //alt is empty for deletion } //insertion combinedSeq += altAllele; repeatLength = altAllele.Length; for (i = combinedSeq.Length - 1; i >= repeatLength; i--, start--) { if (combinedSeq[i] != combinedSeq[i - repeatLength]) break; } string newAltAllele = combinedSeq.Substring(i + 1 - repeatLength, repeatLength); return (start, "", newAltAllele); } private static bool IfRefBaseMismatched(int start, string refAllele, ISequence refSequence) { return refSequence != null && !string.IsNullOrEmpty(refAllele) && refAllele != refSequence.Substring(start - 1, refAllele.Length); } // we have a padding base we can check if its possible to left shift at all public static bool IsLeftShiftPossible(string refAllele, string altAllele) { if (refAllele == altAllele) return false; if (string.IsNullOrEmpty(refAllele) || string.IsNullOrEmpty(altAllele)) return true; if (refAllele.Length == 1) return refAllele[0] == altAllele[altAllele.Length - 1]; if (altAllele.Length == 1) return altAllele[0] == refAllele[refAllele.Length - 1]; return true; } private static bool IsStructuralVariant(string altAllele) { return altAllele.StartsWith('<') || altAllele.Contains('[') || altAllele.Contains(']'); } } } ================================================ FILE: Variants/Variants.csproj ================================================ net6.0 ..\bin\$(Configuration) ================================================ FILE: Vcf/AssemblyInfo.cs ================================================ using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("UnitTests")] ================================================ FILE: Vcf/IVcfFilter.cs ================================================ using System.IO; using Genome; namespace Vcf { public interface IVcfFilter { void FastForward(StreamReader reader); string GetNextLine(StreamReader reader); bool PassedTheEnd(Chromosome chromosome, int position); } } ================================================ FILE: Vcf/Info/CustomFields.cs ================================================ using System.Collections.Generic; using System.Text; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.IO; namespace Vcf.Info { public sealed class CustomFields:ICustomFields { private readonly Dictionary _keyValues=new (); public void Add(string key, string value) { _keyValues.Add(key, value); } public void Clear() { _keyValues.Clear(); } public bool IsEmpty() =>_keyValues.Count == 0; public void SerializeJson(StringBuilder sb) { var jsonObject = new JsonObject(sb); sb.Append(JsonObject.OpenBrace); foreach (var (key, value) in _keyValues) { jsonObject.AddStringValue(key, value); } sb.Append(JsonObject.CloseBrace); } public override string ToString() { var sb = StringBuilderPool.Get(); SerializeJson(sb); var s = sb.ToString(); StringBuilderPool.Return(sb); return s; } } } ================================================ FILE: Vcf/Info/InfoData.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.Positions; namespace Vcf.Info { public sealed record InfoData(string BreakendEventId, int[] CiEnd, int[] CiPos, int? End, double? FisherStrandBias, bool IsImprecise, bool IsInv3, bool IsInv5, int? JointSomaticNormalQuality, double? MappingQuality, double? RecalibratedQuality, int? RefRepeatCount, string RepeatUnit, double? StrandBias, int? SvLength, string SvType, double? LogOddsRatio, ICustomFields CustomKeyValues) : IInfoData; public sealed class InfoDataBuilder { public string BreakendEventId; public int[] CiEnd; public int[] CiPos; public int? End; public double? FisherStrandBias; public bool IsImprecise; public bool IsInv3; public bool IsInv5; public int? JointSomaticNormalQuality; public double? MappingQuality; public double? RecalibratedQuality; public int? RefRepeatCount; public string RepeatUnit; public double? StrandBias; public int? SvLength; public string SvType; public double? LogOddsRatio; public ICustomFields CustomFields=new CustomFields(); public InfoData Create() => new(BreakendEventId, CiEnd, CiPos, End, FisherStrandBias, IsImprecise, IsInv3, IsInv5, JointSomaticNormalQuality, MappingQuality, RecalibratedQuality, RefRepeatCount, RepeatUnit, StrandBias, SvLength, SvType, LogOddsRatio, CustomFields); public void Reset() { BreakendEventId = null; CiEnd = null; CiPos = null; End = null; FisherStrandBias = null; IsImprecise = false; IsInv3 = false; IsInv5 = false; JointSomaticNormalQuality = null; MappingQuality = null; RecalibratedQuality = null; RefRepeatCount = null; RepeatUnit = null; StrandBias = null; SvLength = null; SvType = null; LogOddsRatio = null; CustomFields.Clear(); } } } ================================================ FILE: Vcf/Info/VcfInfoParser.cs ================================================ using System; using System.Collections.Generic; using OptimizedCore; using VariantAnnotation.Interface.Positions; namespace Vcf.Info { public static class VcfInfoParser { private static readonly InfoDataBuilder Builder = new(); private static readonly Dictionary EmptyDictionary = new(); public static IInfoData Parse(string infoField, HashSet customInfoKeys=null) { if (string.IsNullOrEmpty(infoField)) return null; Dictionary infoKeyValue = ExtractInfoFields(infoField); Builder.Reset(); foreach ((string key, string value) in infoKeyValue) { // ReSharper disable once SwitchStatementMissingSomeCases switch (key) { case "CIEND": Builder.CiEnd = value.SplitToArray(); break; case "CIPOS": Builder.CiPos = value.SplitToArray(); break; case "END": Builder.End = value.GetNullableInt(); break; case "EVENT": Builder.BreakendEventId = value; break; case "REF": Builder.RefRepeatCount = Convert.ToInt32(value); break; case "RU": Builder.RepeatUnit = value; break; case "SB": Builder.StrandBias = value.GetNullableValue(double.TryParse); break; case "FS": Builder.FisherStrandBias = value.GetNullableValue(double.TryParse); break; case "MQ": Builder.MappingQuality = value.GetNullableValue(double.TryParse); break; case "QSI_NT": case "SOMATICSCORE": case "QSS_NT": Builder.JointSomaticNormalQuality = value.GetNullableInt(); break; case "SVLEN": Builder.SvLength = value.GetNullableInt(); if (Builder.SvLength != null) Builder.SvLength = Math.Abs(Builder.SvLength.Value); break; case "SVTYPE": Builder.SvType = value; break; case "VQSR": Builder.RecalibratedQuality = value.GetNullableValue(double.TryParse); break; case "IMPRECISE": Builder.IsImprecise = true; break; case "INV3": Builder.IsInv3 = true; break; case "INV5": Builder.IsInv5 = true; break; case "LOD": Builder.LogOddsRatio = Convert.ToDouble(value); break; } if (customInfoKeys != null && customInfoKeys.Contains(key)) { Builder.CustomFields.Add(key, value); } } return Builder.Create(); } private static Dictionary ExtractInfoFields(string infoField) { if (infoField == ".") return EmptyDictionary; var infoKeyValue = new Dictionary(); foreach (string field in infoField.OptimizedSplit(';')) { (string key, string value) = field.OptimizedKeyValue(); value ??= "true"; infoKeyValue[key] = value; } return infoKeyValue; } } } ================================================ FILE: Vcf/NullVcfFilter.cs ================================================ using System.IO; using Genome; namespace Vcf { public sealed class NullVcfFilter : IVcfFilter { public void FastForward(StreamReader reader) { //stupid sonarQube requires a comment here } public string GetNextLine(StreamReader reader) => reader.ReadLine(); public bool PassedTheEnd(Chromosome chromosome, int position) => false; } } ================================================ FILE: Vcf/Position.cs ================================================ using System.Collections.Generic; using Genome; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using Variants; using Vcf.Info; using Vcf.Sample; using Vcf.VariantCreator; namespace Vcf { public sealed class Position : IPosition { public Chromosome Chromosome { get; private set;} public int Start { get; private set;} public int End { get; private set;} public string RefAllele { get; private set;} public string[] AltAlleles { get; private set;} public double? Quality { get; private set;} public string[] Filters { get; private set;} public IVariant[] Variants { get; private set;} public ISample[] Samples { get; private set;} public IInfoData InfoData { get; private set;} public bool HasStructuralVariant { get; private set;} public bool HasShortTandemRepeat { get; private set;} public string[] VcfFields { get; private set;} public bool[] IsDecomposed { get; private set;} public bool IsRecomposed { get; private set;} public string[] Vids { get; private set;} public List[] LinkedVids { get; private set;} public void Initialize(Chromosome chromosome, int start, int end, string refAllele, string[] altAlleles, double? quality, string[] filters, IVariant[] variants, ISample[] samples, IInfoData infoData, string[] vcfFields, bool[] isDecomposed, bool isRecomposed) { Chromosome = chromosome; Start = start; End = end; RefAllele = refAllele; AltAlleles = altAlleles; Quality = quality; Filters = filters; Variants = variants; Samples = samples; InfoData = infoData; VcfFields = vcfFields; IsDecomposed = isDecomposed; (HasStructuralVariant, HasShortTandemRepeat) = CheckVariants(variants); Vids = null; LinkedVids = null; } private static (bool HasStructuralVariant, bool HasShortTandemRepeat) CheckVariants(IVariant[] variants) { if (variants == null) return (false, false); var hasStructuralVariant = false; var hasShortTandemRepeat = false; // ReSharper disable once LoopCanBeConvertedToQuery foreach (var variant in variants) { if (variant.IsStructuralVariant) hasStructuralVariant = true; if (variant.Type == VariantType.short_tandem_repeat_variation) hasShortTandemRepeat = true; } return (hasStructuralVariant, hasShortTandemRepeat); } public static IPosition ToPosition(ISimplePosition simplePosition, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, VariantFactory variantFactory, bool enableDq = false, HashSet customInfoKeys=null) { if (simplePosition == null) return null; sequenceProvider.LoadChromosome(simplePosition.Chromosome); string[] vcfFields = simplePosition.VcfFields; string[] altAlleles = vcfFields[VcfCommon.AltIndex].OptimizedSplit(','); bool isReference = altAlleles.Length == 1 && VcfCommon.ReferenceAltAllele.Contains(altAlleles[0]); string globalMajorAllele = isReference ? refMinorProvider?.GetGlobalMajorAllele(simplePosition.Chromosome, simplePosition.Start) : null; bool isRefMinor = isReference && globalMajorAllele != null; if (isReference && !isRefMinor) return GetReferencePosition(simplePosition); var infoData = VcfInfoParser.Parse(vcfFields[VcfCommon.InfoIndex],customInfoKeys); int end = ExtractEnd(infoData, simplePosition.Start, simplePosition.RefAllele.Length); double? quality = vcfFields[VcfCommon.QualIndex].GetNullableValue(double.TryParse); string[] filters = vcfFields[VcfCommon.FilterIndex].OptimizedSplit(';'); IVariant[] variants = variantFactory.CreateVariants(simplePosition.Chromosome, simplePosition.Start, end, simplePosition.RefAllele, altAlleles, infoData, simplePosition.IsDecomposed, simplePosition.IsRecomposed, simplePosition.LinkedVids, globalMajorAllele); ISample[] samples = vcfFields.ToSamples(variantFactory.FormatIndices, simplePosition, variants, mitoHeteroplasmyProvider, enableDq); return PositionPool.Get(simplePosition.Chromosome, simplePosition.Start, end, simplePosition.RefAllele, altAlleles, quality, filters, variants, samples, infoData, vcfFields, simplePosition.IsDecomposed, simplePosition.IsRecomposed); } private static IPosition GetReferencePosition(ISimplePosition simplePosition) => PositionPool.Get(simplePosition.Chromosome, simplePosition.Start, simplePosition.Start, simplePosition.RefAllele, simplePosition.AltAlleles, null, null, null, null, null, simplePosition.VcfFields, simplePosition.IsDecomposed, simplePosition.IsRecomposed); private static int ExtractEnd(IInfoData infoData, int start, int refAlleleLength) { if (infoData.End != null) return infoData.End.Value; return start + refAlleleLength - 1; } } } ================================================ FILE: Vcf/PositionPool.cs ================================================ using Genome; using Microsoft.Extensions.ObjectPool; using VariantAnnotation.Interface.Positions; using Variants; namespace Vcf { public static class PositionPool { private static readonly ObjectPool Pool = new DefaultObjectPool(new DefaultPooledObjectPolicy(), 4); public static Position Get(Chromosome chromosome, int start, int end, string refAllele, string[] altAlleles, double? quality, string[] filters, IVariant[] variants, ISample[] samples, IInfoData infoData, string[] vcfFields, bool[] isDecomposed, bool isRecomposed) { var position = Pool.Get(); position.Initialize( chromosome, start, end, refAllele, altAlleles, quality, filters, variants, samples,infoData, vcfFields, isDecomposed, isRecomposed); return position; } public static void Return(Position position) => Pool.Return(position); } } ================================================ FILE: Vcf/Sample/BooleanExtensions.cs ================================================ namespace Vcf.Sample { internal static class BooleanExtensions { internal static bool GetFailedFilter(this string ftField) { if (string.IsNullOrEmpty(ftField)) return false; return ftField != "PASS" && ftField != "."; } internal static bool IsDeNovo(this string dnField) { if (string.IsNullOrEmpty(dnField)) return false; return dnField == "DeNovo"; } } } ================================================ FILE: Vcf/Sample/FormatIndices.cs ================================================ using System.Collections.Generic; using OptimizedCore; namespace Vcf.Sample { public sealed class FormatIndices { // ReSharper disable InconsistentNaming internal int? AD; internal int? AQ; internal int? CN; internal int? DN; internal int? DP; internal int? DST; internal int? FT; internal int? GQ; internal int? GT; internal int? LQ; internal int? PR; internal int? REPCN; internal int? SR; internal int? VF; internal int? MCN; internal int? SQ; internal int? BC; //legacy fields internal int? TAR; internal int? TIR; internal int? AU; internal int? CU; internal int? GU; internal int? TU; internal int? MCC; internal int? GQX; internal int? DPI; internal int? DQ; // ReSharper restore InconsistentNaming internal int NumColumns; // custom fields internal readonly Dictionary CustomFields; private void Clear() { AD = null; AQ = null; CN = null; DN = null; DP = null; DST = null; FT = null; GQ = null; GT = null; LQ = null; PR = null; REPCN = null; SR = null; VF = null; MCN = null; SQ = null; BC = null; // legacy sample fields TAR = null; TIR = null; AU = null; CU = null; GU = null; TU = null; MCC = null; GQX = null; DPI = null; DQ = null; // custom fields if (CustomFields == null) return; foreach (var field in CustomFields.Keys) { CustomFields[field] = null; } } public FormatIndices(HashSet customFields=null) { if (customFields == null) return; CustomFields = new(); foreach (var field in customFields) { CustomFields[field] = null; } } internal void Set(string formatColumn) { Clear(); if (formatColumn == null) return; string[] formatCols = formatColumn.OptimizedSplit(':'); NumColumns = formatCols.Length; for (var index = 0; index < NumColumns; index++) { // ReSharper disable once SwitchStatementMissingSomeCases var formatKey = formatCols[index]; switch (formatKey) { case "AD": AD = index; break; case "AQ": AQ = index; break; case "CN": CN = index; break; case "DN": DN = index; break; case "DP": DP = index; break; case "DST": DST = index; break; case "FT": FT = index; break; case "GQ": GQ = index; break; case "GT": GT = index; break; case "LQ": LQ = index; break; case "PR": PR = index; break; case "REPCN": REPCN = index; break; case "SR": SR = index; break; case "VF": VF = index; break; case "MCN": MCN = index; break; case "SQ": SQ = index; break; case "BC": BC = index; break; //LEGACY case "TAR": TAR = index; break; case "TIR": TIR = index; break; case "AU": AU = index; break; case "GU": GU = index; break; case "CU": CU = index; break; case "TU": TU = index; break; case "GQX": GQX = index; break; case "DPI": DPI = index; break; case "MCC": MCC = index; break; case "DQ": DQ = index; break; default: if(CustomFields!=null && CustomFields.ContainsKey(formatKey)) CustomFields[formatKey] = index; break; } } } } } ================================================ FILE: Vcf/Sample/Legacy/AlleleDepths.cs ================================================ using OptimizedCore; namespace Vcf.Sample.Legacy { internal static class AlleleDepths { /// /// returns the allele depths given different sources of information /// public static int[] GetAlleleDepths(IntermediateSampleFields intermediateSampleFields) { int[] ad = null; // use TAR & TIR if (intermediateSampleFields.TAR != null && intermediateSampleFields.TIR != null) ad = GetAlleleDepthsUsingTarTir(intermediateSampleFields); // use allele counts if (ad == null && intermediateSampleFields.TotalAlleleCount != null) ad = GetAlleleDepthsUsingAlleleCounts(intermediateSampleFields); // use allele depths if (ad == null && intermediateSampleFields.FormatIndices.AD != null) ad = GetAlleleDepthsUsingAd(intermediateSampleFields); return ad; } /// /// returns the variant frequency using TIR and TAR /// private static int[] GetAlleleDepthsUsingTarTir(IntermediateSampleFields intermediateSampleFields) { if (intermediateSampleFields.TIR == null || intermediateSampleFields.TAR == null || intermediateSampleFields.AltAlleles.Length > 1) return null; return new[] { intermediateSampleFields.TAR.Value, intermediateSampleFields.TIR.Value }; } /// /// returns the allele depths using allele counts /// private static int[] GetAlleleDepthsUsingAlleleCounts(IntermediateSampleFields intermediateSampleFields) { if (intermediateSampleFields.TotalAlleleCount == null) return null; // sanity check: make sure all alternate alleles are SNVs if (intermediateSampleFields.VcfRefAllele.Length != 1 || !intermediateSampleFields.AltAlleles.AreAllAltAllelesSingleBase()) return null; var ad = new int[intermediateSampleFields.AltAlleles.Length + 1]; // handle reference allele var ac = GetAlleleCountString(intermediateSampleFields.VcfRefAllele, intermediateSampleFields); if (ac == null) return null; ad[0] = ac.Value; // handle alternate alleles var index = 1; foreach (string altAllele in intermediateSampleFields.AltAlleles) { ac = GetAlleleCountString(altAllele, intermediateSampleFields); if (ac == null) return null; ad[index++] = ac.Value; } return ad; } /// /// returns the appropriate allele count string given the supplied base /// private static int? GetAlleleCountString(string s, IntermediateSampleFields intermediateSampleFields) { int? ac = null; // ReSharper disable once SwitchStatementMissingSomeCases switch (s) { case "A": ac = intermediateSampleFields.ACount; break; case "C": ac = intermediateSampleFields.CCount; break; case "G": ac = intermediateSampleFields.GCount; break; case "T": ac = intermediateSampleFields.TCount; break; } return ac; } /// /// returns the allele depths using allele depths /// private static int[] GetAlleleDepthsUsingAd(IntermediateSampleFields intermediateSampleFields) { if (intermediateSampleFields.FormatIndices.AD == null || intermediateSampleFields.SampleColumns.Length <= intermediateSampleFields.FormatIndices.AD.Value) return null; var ad = intermediateSampleFields.SampleColumns[intermediateSampleFields.FormatIndices.AD.Value].OptimizedSplit(','); if (ad[0] == ".") return null; int nAllele = ad.Length; var alleleDepths = new int[nAllele]; for (var i = 0; i < nAllele; i++) { (int number, bool foundError) = ad[i].OptimizedParseInt32(); if (foundError) return null; alleleDepths[i] = number; } return alleleDepths; } } } ================================================ FILE: Vcf/Sample/Legacy/FailedFilter.cs ================================================ namespace Vcf.Sample.Legacy { internal static class FailedFilter { public static bool GetFailedFilter(IntermediateSampleFields intermediateSampleFields) { if (intermediateSampleFields.FormatIndices.FT == null) return false; if (intermediateSampleFields.FormatIndices.FT.Value >= intermediateSampleFields.SampleColumns.Length) return false; string filterValue = intermediateSampleFields.SampleColumns[intermediateSampleFields.FormatIndices.FT.Value]; return filterValue != "PASS" && filterValue != "."; } } } ================================================ FILE: Vcf/Sample/Legacy/Genotype.cs ================================================ namespace Vcf.Sample.Legacy { internal static class Genotype { public static string GetGenotype(IntermediateSampleFields intermediateSampleFields) { if (intermediateSampleFields.FormatIndices.GT == null) return null; string genotype = intermediateSampleFields.SampleColumns[intermediateSampleFields.FormatIndices.GT.Value]; return genotype == "." ? null : genotype; } } } ================================================ FILE: Vcf/Sample/Legacy/GenotypeQuality.cs ================================================ using OptimizedCore; namespace Vcf.Sample.Legacy { internal static class GenotypeQuality { public static int? GetGenotypeQuality(IntermediateSampleFields intermediateSampleFields) { bool hasGqx = intermediateSampleFields.FormatIndices.GQX != null; bool hasGq = intermediateSampleFields.FormatIndices.GQ != null; if (!hasGqx && !hasGq) return null; int gqIndex = hasGqx ? intermediateSampleFields.FormatIndices.GQX.Value : intermediateSampleFields.FormatIndices.GQ.Value; if (intermediateSampleFields.SampleColumns.Length <= gqIndex) return null; string gq = intermediateSampleFields.SampleColumns[gqIndex]; (int number, bool foundError) = gq.OptimizedParseInt32(); return foundError ? null : (int?)number; } } } ================================================ FILE: Vcf/Sample/Legacy/IntermediateSampleFields.cs ================================================ using OptimizedCore; using VariantAnnotation.Interface.IO; namespace Vcf.Sample.Legacy { public sealed class IntermediateSampleFields { public FormatIndices FormatIndices { get; } public string[] SampleColumns { get; } public string[] AltAlleles { get; } public int? TotalAlleleCount { get; } public string VcfRefAllele { get; } public int? MajorChromosomeCount { get; } public int? CopyNumber { get; } // ReSharper disable InconsistentNaming public float? AQ { get; } public float? LQ { get; } public double? VF { get; } public int? TIR { get; } public int? TAR { get; } public int? ACount { get; } public int? CCount { get; } public int? GCount { get; } public int? TCount { get; } public string[] DST { get; } // ReSharper restore InconsistentNaming // ReSharper disable once SuggestBaseTypeForParameter public IntermediateSampleFields(string[] vcfColumns, FormatIndices formatIndices, string[] sampleCols) { VcfRefAllele = vcfColumns[VcfCommon.RefIndex]; AltAlleles = vcfColumns[VcfCommon.AltIndex].OptimizedSplit(','); FormatIndices = formatIndices; SampleColumns = sampleCols; (TAR, TIR) = GetLinkedIntegers(GetFirstValue(GetString(formatIndices.TAR, sampleCols)), GetFirstValue(GetString(formatIndices.TIR, sampleCols))); MajorChromosomeCount = GetInteger(GetString(formatIndices.MCC, sampleCols)); DST = GetStrings(GetString(formatIndices.DST, sampleCols)); AQ = GetFloat(GetString(formatIndices.AQ, sampleCols)); LQ = GetFloat(GetString(formatIndices.LQ, sampleCols)); VF = GetDouble(GetString(formatIndices.VF, sampleCols)); CopyNumber = GetCopyNumber(GetString(formatIndices.CN, sampleCols), vcfColumns[VcfCommon.AltIndex].Contains("STR")); (ACount, CCount, GCount, TCount, TotalAlleleCount) = GetAlleleCounts( GetString(formatIndices.AU, sampleCols), GetString(formatIndices.CU, sampleCols), GetString(formatIndices.GU, sampleCols), GetString(formatIndices.TU, sampleCols)); } // ReSharper disable once SuggestBaseTypeForParameter private static string GetString(int? index, string[] cols) { if (index == null || index >= cols.Length) return null; string s = cols[index.Value]; return s == "." ? null : s; } private static float? GetFloat(string s) { if (s == null) return null; if (float.TryParse(s, out float ret)) return ret; return null; } private static double? GetDouble(string s) { if (s == null) return null; if (double.TryParse(s, out double ret)) return ret; return null; } private static int? GetInteger(string s) { if (s == null) return null; (int number, bool foundError) = s.OptimizedParseInt32(); return foundError ? null : (int?)number; } private static (int?, int?) GetLinkedIntegers(string s, string s2) { var num = GetInteger(s); var num2 = GetInteger(s2); if (num == null || num2 == null) return (null, null); return (num, num2); } private static string[] GetStrings(string s) => s?.OptimizedSplit(','); private static int? GetCopyNumber(string s, bool containsStr) { if (s == null || containsStr) return null; return GetInteger(s); } private static (int?, int?, int?, int?, int?) GetAlleleCounts(string au, string cu, string gu, string tu) { if (au == null || cu == null || gu == null || tu == null) return (null, null, null, null, null); var a = GetInteger(GetFirstValue(au)); var c = GetInteger(GetFirstValue(cu)); var g = GetInteger(GetFirstValue(gu)); var t = GetInteger(GetFirstValue(tu)); var total = a == null || c == null || g == null || t == null ? null : a + c + g + t; return (a, c, g, t, total); } private static string GetFirstValue(string s) => GetStrings(s)?[0]; } } ================================================ FILE: Vcf/Sample/Legacy/LegacySampleFieldExtractor.cs ================================================ using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; namespace Vcf.Sample.Legacy { public sealed class LegacySampleFieldExtractor { private readonly string[] _vcfColumns; private readonly FormatIndices _formatIndices; private readonly int? _infoDepth; internal LegacySampleFieldExtractor(string[] vcfColumns, FormatIndices formatIndices) { _vcfColumns = vcfColumns; _infoDepth = GetInfoDepth(vcfColumns[VcfCommon.InfoIndex]); _formatIndices = formatIndices; } private static int? GetInfoDepth(string infoColumn) { var splits = infoColumn.OptimizedSplit(';'); foreach (string split in splits) { if(!split.StartsWith("DP")) continue; var depth = int.Parse(split.Split('=')[1]); return depth; } // no DP field present return null; } internal ISample ExtractSample(string sampleColumn) { // sanity check: make sure we have a format column if (_formatIndices == null || string.IsNullOrEmpty(sampleColumn)) return Sample.EmptySample; var sampleColumns = sampleColumn.OptimizedSplit(':'); // handle missing sample columns if (sampleColumns.Length == 1 && sampleColumns[0] == ".") return Sample.EmptySample; var sampleFields = new IntermediateSampleFields(_vcfColumns, _formatIndices, sampleColumns); var alleleDepths = AlleleDepths.GetAlleleDepths(sampleFields); bool failedFilter = FailedFilter.GetFailedFilter(sampleFields); string genotype = Genotype.GetGenotype(sampleFields); var genotypeQuality = GenotypeQuality.GetGenotypeQuality(sampleFields); var totalDepth = TotalDepth.GetTotalDepth(_infoDepth, sampleFields); double? denovoQuality = sampleColumns.GetString(_formatIndices.DQ).GetDouble(); var variantFrequencies = LegacyVariantFrequency.GetVariantFrequencies(sampleFields); var splitReadCounts = ReadCounts.GetSplitReadCounts(sampleFields); var pairEndReadCounts = ReadCounts.GetPairEndReadCounts(sampleFields); bool isLossOfHeterozygosity = sampleFields.MajorChromosomeCount != null && sampleFields.CopyNumber != null && sampleFields.MajorChromosomeCount.Value == sampleFields.CopyNumber.Value && sampleFields.CopyNumber.Value > 1; var sample = new Sample(alleleDepths, sampleFields.AQ, sampleFields.CopyNumber, sampleFields.DST, failedFilter, genotype, genotypeQuality, false, denovoQuality, sampleFields.LQ, pairEndReadCounts, null, splitReadCounts, totalDepth, variantFrequencies, null, null, isLossOfHeterozygosity, null, null); return sample; } } } ================================================ FILE: Vcf/Sample/Legacy/LegacyVariantFrequency.cs ================================================ using OptimizedCore; namespace Vcf.Sample.Legacy { internal static class LegacyVariantFrequency { private static readonly double[] ZeroVf = { 0.0 }; public static double[] GetVariantFrequencies(IntermediateSampleFields sampleFields) { double[] vf = null; // use VF if (sampleFields.VF != null) vf = GetVariantFrequenciesUsingVf(sampleFields); // use TAR & TIR if (sampleFields.TAR != null && sampleFields.TIR != null) vf = GetVariantFrequenciesUsingTarTir(sampleFields); // use allele counts if (vf == null && sampleFields.TotalAlleleCount != null) vf = GetVariantFrequenciesUsingAlleleCounts(sampleFields); // use allele depths if (vf == null && sampleFields.FormatIndices.AD != null) vf = GetVariantFrequenciesUsingAlleleDepths(sampleFields); return vf; } private static double[] GetVariantFrequenciesUsingVf(IntermediateSampleFields sampleFields) { if (sampleFields.AltAlleles.Length > 1 || sampleFields.VF == null) return null; return new[] { sampleFields.VF.Value }; } private static double[] GetVariantFrequenciesUsingAlleleCounts(IntermediateSampleFields sampleFields) { bool isRefSingleBase = sampleFields.VcfRefAllele.Length == 1; bool areAllAltsSingleBase = sampleFields.AltAlleles.AreAllAltAllelesSingleBase(); bool isReference = sampleFields.AltAlleles.Length == 1 && sampleFields.AltAlleles[0] == "."; // for this to work we need a single-base reference allele and all raw allele counts must be available if (sampleFields.TotalAlleleCount == null || isReference || !isRefSingleBase || !areAllAltsSingleBase) return null; int numAltAlleles = sampleFields.AltAlleles.Length; var variantFreqs = new double[numAltAlleles]; if (sampleFields.TotalAlleleCount == 0) return variantFreqs; for (var i = 0; i < numAltAlleles; i++) { int alleleCount = GetAlleleCount(sampleFields, i); variantFreqs[i] = alleleCount / (double)sampleFields.TotalAlleleCount; } return variantFreqs; } internal static bool AreAllAltAllelesSingleBase(this string[] altAlleles) { foreach (string altAllele in altAlleles) if (altAllele.Length != 1) return false; return true; } private static int GetAlleleCount(IntermediateSampleFields sampleFields, int alleleIndex) { string altAllele = sampleFields.AltAlleles[alleleIndex]; var alleleCount = 0; // ReSharper disable once SwitchStatementMissingSomeCases switch (altAllele) { case "A": alleleCount = sampleFields.ACount ?? 0; break; case "C": alleleCount = sampleFields.CCount ?? 0; break; case "G": alleleCount = sampleFields.GCount ?? 0; break; case "T": alleleCount = sampleFields.TCount ?? 0; break; } return alleleCount; } private static double[] GetVariantFrequenciesUsingTarTir(IntermediateSampleFields sampleFields) { // TAR and TIR: never observed with multiple alternate alleles if (sampleFields.TIR == null || sampleFields.TAR == null || sampleFields.AltAlleles.Length > 1) return null; if (sampleFields.TIR + sampleFields.TAR == 0) return ZeroVf; var tir = (double)sampleFields.TIR; var tar = (double)sampleFields.TAR; return new[] { tir / (tar + tir) }; } private static double[] GetVariantFrequenciesUsingAlleleDepths(IntermediateSampleFields sampleFields) { if (sampleFields.FormatIndices.AD == null || sampleFields.SampleColumns.Length <= sampleFields.FormatIndices.AD.Value) return null; int numAltAlleles = sampleFields.AltAlleles.Length; var variantFreqs = new double[numAltAlleles]; string adField = sampleFields.SampleColumns[sampleFields.FormatIndices.AD.Value]; (var alleleDepths, bool allValuesAreValid, int totalDepth) = GetAlleleDepths(adField); if (!allValuesAreValid || numAltAlleles != alleleDepths.Length) return null; // sanity check: make sure we handle NaNs properly if (totalDepth == 0) return variantFreqs; for (var alleleIndex = 0; alleleIndex < numAltAlleles; alleleIndex++) { variantFreqs[alleleIndex] = alleleDepths[alleleIndex] / (double)totalDepth; } return variantFreqs; } private static (int[] AlleleDepths, bool AllValuesAreValid, int totalDepth) GetAlleleDepths(string adField) { var adFields = adField.OptimizedSplit(','); var alleleDepths = new int[adFields.Length - 1]; var totalDepth = 0; for (var i = 0; i < adFields.Length; i++) { (int ad, bool foundError) = adFields[i].OptimizedParseInt32(); if(foundError) return (null, false, totalDepth); if (i > 0) alleleDepths[i - 1] = ad; totalDepth += ad; } return (alleleDepths, true, totalDepth); } } } ================================================ FILE: Vcf/Sample/Legacy/ReadCounts.cs ================================================ using OptimizedCore; namespace Vcf.Sample.Legacy { internal static class ReadCounts { public static int[] GetPairEndReadCounts(IntermediateSampleFields intermediateSampleFields) { if (intermediateSampleFields.FormatIndices.PR == null) return null; var readCounts = intermediateSampleFields.SampleColumns[intermediateSampleFields.FormatIndices.PR.Value].OptimizedSplit(','); var pairEndReadCounts = new int[readCounts.Length]; for (var i = 0; i < pairEndReadCounts.Length; i++) { (int number, bool foundError) = readCounts[i].OptimizedParseInt32(); if (foundError) return null; pairEndReadCounts[i] = number; } return pairEndReadCounts; } public static int[] GetSplitReadCounts(IntermediateSampleFields intermediateSampleFields) { if (intermediateSampleFields.FormatIndices.SR == null) return null; var splitReadCounts = intermediateSampleFields.SampleColumns[intermediateSampleFields.FormatIndices.SR.Value].OptimizedSplit(','); var splitReads = new int[splitReadCounts.Length]; for (var i = 0; i < splitReads.Length; i++) { (int number, bool foundError) = splitReadCounts[i].OptimizedParseInt32(); if (foundError) return null; splitReads[i] = number; } return splitReads; } } } ================================================ FILE: Vcf/Sample/Legacy/TotalDepth.cs ================================================ using OptimizedCore; namespace Vcf.Sample.Legacy { internal static class TotalDepth { public static int? GetTotalDepth(int? infoDepth, IntermediateSampleFields intermediateSampleFields) { // use TAR & TIR if (intermediateSampleFields.TAR != null && intermediateSampleFields.TIR != null) return GetTotalDepthUsingTarTir(intermediateSampleFields); // use base counts if (intermediateSampleFields.TotalAlleleCount != null) return GetTotalDepthUsingAlleleCounts(intermediateSampleFields); // use DPI if (intermediateSampleFields.FormatIndices.DPI != null) return GetTotalDepthUsingDpi(intermediateSampleFields); // use DP if (intermediateSampleFields.FormatIndices.DP != null) return GetTotalDepthUsingDp(intermediateSampleFields); // use INFO DP (Pisces) return infoDepth; } private static int? GetTotalDepthUsingTarTir(IntermediateSampleFields intermediateSampleFields) => intermediateSampleFields.TAR + intermediateSampleFields.TIR; private static int? GetTotalDepthUsingAlleleCounts(IntermediateSampleFields intermediateSampleFields) => intermediateSampleFields.TotalAlleleCount; private static int? GetTotalDepthUsingDpi(IntermediateSampleFields intermediateSampleFields) { if (intermediateSampleFields.FormatIndices.DPI == null || intermediateSampleFields.SampleColumns.Length <= intermediateSampleFields.FormatIndices.DPI.Value) return null; string depth = intermediateSampleFields.SampleColumns[intermediateSampleFields.FormatIndices.DPI.Value]; (int number, bool foundError) = depth.OptimizedParseInt32(); return foundError ? null : (int?)number; } private static int? GetTotalDepthUsingDp(IntermediateSampleFields intermediateSampleFields) { if (intermediateSampleFields.FormatIndices.DP == null || intermediateSampleFields.SampleColumns.Length <= intermediateSampleFields.FormatIndices.DP.Value) return null; string depth = intermediateSampleFields.SampleColumns[intermediateSampleFields.FormatIndices.DP.Value]; (int number, bool foundError) = depth.OptimizedParseInt32(); return foundError ? null : (int?)number; } } } ================================================ FILE: Vcf/Sample/Sample.cs ================================================ using System.Collections.Generic; using VariantAnnotation.Interface.Positions; using Vcf.Info; namespace Vcf.Sample { public sealed class Sample : ISample { public int[] AlleleDepths { get; } public float? ArtifactAdjustedQualityScore { get; } // PEPE public int? CopyNumber { get; } public string[] DiseaseAffectedStatuses { get; } // SMN1 public bool FailedFilter { get; } public string Genotype { get; } public int? GenotypeQuality { get; } public bool IsDeNovo { get; } public double? DeNovoQuality { get; } //for legacy callers only public bool IsEmpty { get; } public float? LikelihoodRatioQualityScore { get; } // PEPE public int[] PairedEndReadCounts { get; } // Manta public int[] RepeatUnitCounts { get; } // ExpansionHunter public int[] SplitReadCounts { get; } // Manta public int? TotalDepth { get; } public double[] VariantFrequencies { get; } public int? MinorHaplotypeCopyNumber { get; } public double? SomaticQuality { get; } public bool? IsLossOfHeterozygosity { get; } public string[] HeteroplasmyPercentile { get; } public int? BinCount { get; } public ICustomFields CustomFields { get; } public static readonly Sample EmptySample = new Sample(null, null, null, null, false, null, null, false, null, null, null, null, null, null, null, null, null, null, null, null); public Sample(int[] alleleDepths, float? artifactAdjustedQualityScore, int? copyNumber, string[] diseaseAffectedStatuses, bool failedFilter, string genotype, int? genotypeQuality, bool isDeNovo, double? deNovoQuality, float? likelihoodRatioQualityScore, int[] pairedEndReadCounts, int[] repeatUnitCounts, int[] splitReadCounts, int? totalDepth, double[] variantFrequencies, int? minorHaplotypeCopyNumber, double? somaticQuality, bool? isLossOfHeterozygosity, string[] heteroplasmyPercentile, int? binCount, ICustomFields customFields=null) { AlleleDepths = alleleDepths; ArtifactAdjustedQualityScore = artifactAdjustedQualityScore; CopyNumber = copyNumber; DiseaseAffectedStatuses = diseaseAffectedStatuses; FailedFilter = failedFilter; Genotype = genotype; GenotypeQuality = genotypeQuality; IsDeNovo = isDeNovo; DeNovoQuality = deNovoQuality; LikelihoodRatioQualityScore = likelihoodRatioQualityScore; PairedEndReadCounts = pairedEndReadCounts; RepeatUnitCounts = repeatUnitCounts; SplitReadCounts = splitReadCounts; TotalDepth = totalDepth; VariantFrequencies = variantFrequencies; IsLossOfHeterozygosity = isLossOfHeterozygosity; HeteroplasmyPercentile = heteroplasmyPercentile; MinorHaplotypeCopyNumber = minorHaplotypeCopyNumber; SomaticQuality = somaticQuality; BinCount = binCount; CustomFields = customFields; IsEmpty = AlleleDepths == null && ArtifactAdjustedQualityScore == null && CopyNumber == null && DiseaseAffectedStatuses == null && Genotype == null && GenotypeQuality == null && LikelihoodRatioQualityScore == null && PairedEndReadCounts == null && RepeatUnitCounts == null && SplitReadCounts == null && TotalDepth == null && VariantFrequencies == null && IsLossOfHeterozygosity == null && MinorHaplotypeCopyNumber == null && SomaticQuality == null && HeteroplasmyPercentile == null && DeNovoQuality == null && BinCount == null && !FailedFilter && !IsDeNovo && (CustomFields == null || CustomFields.IsEmpty()); } } } ================================================ FILE: Vcf/Sample/SampleFieldExtractor.cs ================================================ using System.Collections.Generic; using System.Linq; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using Variants; using Vcf.Info; using Vcf.Sample.Legacy; namespace Vcf.Sample { internal static class SampleFieldExtractor { internal static ISample[] ToSamples(this string[] vcfColumns, FormatIndices formatIndices, ISimplePosition simplePosition, IVariant[] variants, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, bool enableDq=false, HashSet customFormatKeys=null) { if (vcfColumns.Length < VcfCommon.MinNumColumnsSampleGenotypes) return null; int numSamples = vcfColumns.Length - VcfCommon.MinNumColumnsSampleGenotypes + 1; var samples = new ISample[numSamples]; formatIndices.Set(vcfColumns[VcfCommon.FormatIndex]); var legacySampleExtractor = IsLegacyVariantCaller(formatIndices) ? new LegacySampleFieldExtractor(vcfColumns, formatIndices) : null; for (int index = VcfCommon.GenotypeIndex; index < vcfColumns.Length; index++) { samples[index - VcfCommon.GenotypeIndex] = ExtractSample(vcfColumns[index], formatIndices, simplePosition, variants, mitoHeteroplasmyProvider, legacySampleExtractor, enableDq, customFormatKeys); } return samples; } internal static ISample ExtractSample(string sampleColumn, FormatIndices formatIndices, ISimplePosition simplePosition, IVariant[] variants, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, LegacySampleFieldExtractor legacyExtractor = null, bool enableDq=false, HashSet customFormatKeys=null) { // sanity check: make sure we have a format column if (string.IsNullOrEmpty(sampleColumn)) return Sample.EmptySample; string[] sampleColumns = sampleColumn.OptimizedSplit(':', formatIndices.NumColumns); if (sampleColumns.Length == 1 && sampleColumns[0] == ".") return Sample.EmptySample; sampleColumns.NormalizeNulls(); if (legacyExtractor != null) { return legacyExtractor.ExtractSample(sampleColumn); } int[] alleleDepths = sampleColumns.GetString(formatIndices.AD).GetIntegers(); float? artifactAdjustedQualityScore = sampleColumns.GetString(formatIndices.AQ).GetFloat(); int? copyNumber = sampleColumns.GetString(formatIndices.CN).GetInteger(); string[] diseaseAffectedStatuses = sampleColumns.GetString(formatIndices.DST).GetStrings(); bool failedFilter = sampleColumns.GetString(formatIndices.FT).GetFailedFilter(); string genotype = sampleColumns.GetString(formatIndices.GT); int? genotypeQuality = sampleColumns.GetString(formatIndices.GQ).GetInteger(); bool isDeNovo = sampleColumns.GetString(formatIndices.DN).IsDeNovo(); double? deNovoQuality = enableDq? sampleColumns.GetString(formatIndices.DQ).GetDouble():null; float? likelihoodRatioQualityScore = sampleColumns.GetString(formatIndices.LQ).GetFloat(); int[] pairedEndReadCounts = sampleColumns.GetString(formatIndices.PR).GetIntegers(); int[] repeatUnitCounts = sampleColumns.GetString(formatIndices.REPCN).GetIntegers('/'); int[] splitReadCounts = sampleColumns.GetString(formatIndices.SR).GetIntegers(); int? totalDepth = sampleColumns.GetString(formatIndices.DP).GetInteger(); double? variantFrequency = sampleColumns.GetString(formatIndices.VF).GetDouble(); int? minorHaplotypeCopyNumber = sampleColumns.GetString(formatIndices.MCN).GetInteger(); double? somaticQuality = sampleColumns.GetString(formatIndices.SQ).GetDouble(); int? binCount = sampleColumns.GetString(formatIndices.BC).GetInteger(); CustomFields customFields = new CustomFields(); if (formatIndices.CustomFields != null) { foreach (var (key, index) in formatIndices.CustomFields) { if (index == null) continue; var value = sampleColumns.GetString(index); if (string.IsNullOrEmpty(value) || value==".") continue; customFields.Add(key, sampleColumns.GetString(index)); } } double[] variantFrequencies = VariantFrequency.GetVariantFrequencies(variantFrequency, alleleDepths, simplePosition.AltAlleles.Length); string[] mitoHeteroplasmyPercentiles = mitoHeteroplasmyProvider?.GetVrfPercentiles(variants, variantFrequencies)?.Select(x => x?.ToString("0.##") ?? "null").ToArray(); var isLoh = GetLoh(copyNumber, minorHaplotypeCopyNumber, genotype); var sample = new Sample(alleleDepths, artifactAdjustedQualityScore, copyNumber, diseaseAffectedStatuses, failedFilter, genotype, genotypeQuality, isDeNovo, deNovoQuality, likelihoodRatioQualityScore, pairedEndReadCounts, repeatUnitCounts, splitReadCounts, totalDepth, variantFrequencies, minorHaplotypeCopyNumber, somaticQuality, isLoh, mitoHeteroplasmyPercentiles, binCount, customFields); return sample; } private static bool? GetLoh(int? copyNumber, int? minorHaplotypeCopyNumber, string genotype) { if (!minorHaplotypeCopyNumber.HasValue || !copyNumber.HasValue) return null; return (genotype == "1/2" || genotype == "1|2") && minorHaplotypeCopyNumber == 0 && copyNumber >= 2; } private static bool IsLegacyVariantCaller(FormatIndices formatIndices) { return formatIndices.TAR != null || formatIndices.TIR != null || formatIndices.AU != null || formatIndices.GU != null || formatIndices.CU != null || formatIndices.TU != null || formatIndices.GQX != null || formatIndices.DPI != null || formatIndices.MCC != null; } internal static void NormalizeNulls(this string[] cols) { for (var i = 0; i < cols.Length; i++) { string col = cols[i]; if (col == null) continue; if (col.Length == 0 || col == ".") cols[i] = null; } } } } ================================================ FILE: Vcf/Sample/SampleParsingExtensions.cs ================================================ using System; using OptimizedCore; namespace Vcf.Sample { public static class SampleParsingExtensions { internal static string GetString(this string[] cols, int? index) => index == null ? null : cols[index.Value]; internal static float? GetFloat(this string s) { if (s == null) return null; if (float.TryParse(s, out float num)) return num; return null; } internal static double? GetDouble(this string s) { if (s == null) return null; if (double.TryParse(s, out double num)) return num; return null; } internal static int? GetInteger(this string s) { if (s == null) return null; (int number, bool foundError) = s.OptimizedParseInt32(); return foundError ? null : (int?)number; } internal static string[] GetStrings(this string s) => s?.OptimizedSplit(','); public static int[] GetIntegers(this string s, char delimiter = ',') { if (s == null) return null; string[] cols = s.OptimizedSplit(delimiter); var values = new int[cols.Length]; for (var i = 0; i < values.Length; i++) { (int number, bool foundError) = cols[i].OptimizedParseInt32(); if (foundError) return null; values[i] = number; } return values; } } } ================================================ FILE: Vcf/Sample/VariantFrequency.cs ================================================ namespace Vcf.Sample { internal static class VariantFrequency { public static double[] GetVariantFrequencies(double? vfField, int[] alleleDepths, int numAltAlleles) { // use VF double[] vf = GetVariantFrequenciesUsingVf(vfField, numAltAlleles > 1) ?? GetVariantFrequenciesUsingAlleleDepths(alleleDepths, numAltAlleles); // use allele depths return vf; } private static double[] GetVariantFrequenciesUsingVf(double? vf, bool multipleAltAlleles) { if (multipleAltAlleles || vf == null) return null; return new[] { vf.Value }; } private static double[] GetVariantFrequenciesUsingAlleleDepths(int[] alleleDepths, int numAltAlleles) { if (alleleDepths == null) return null; if (numAltAlleles + 1 != alleleDepths.Length) return null; var variantFreqs = new double[numAltAlleles]; var totalDepth = 0; foreach (int ad in alleleDepths) totalDepth += ad; if (totalDepth == 0) return variantFreqs; for (var alleleIndex = 0; alleleIndex < numAltAlleles; alleleIndex++) { variantFreqs[alleleIndex] = alleleDepths[alleleIndex + 1] / (double)totalDepth; } return variantFreqs; } } } ================================================ FILE: Vcf/SimplePosition.cs ================================================ using System.Collections.Generic; using Genome; using OptimizedCore; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; namespace Vcf { public sealed class SimplePosition : ISimplePosition { public int Start { get; } public int End { get; private set; } public Chromosome Chromosome { get; } public string RefAllele { get; } public string[] AltAlleles { get; } public string[] VcfFields { get; private set; } public bool[] IsDecomposed { get; private set; } public bool IsRecomposed { get; private set; } public string[] Vids { get; private set; } public List[] LinkedVids { get; private set; } internal SimplePosition(Chromosome chromosome, int start, string refAllele, string[] altAlleles) { Chromosome = chromosome; Start = start; RefAllele = refAllele; AltAlleles = altAlleles; } public static SimplePosition GetSimplePosition(Chromosome chromosome, int position, string[] vcfFields, IVcfFilter vcfFilter) { if (vcfFilter.PassedTheEnd(chromosome, position)) return null; string refAllele = vcfFields[VcfCommon.RefIndex]; string altAlleleField = vcfFields[VcfCommon.AltIndex]; string[] altAlleles = altAlleleField.OptimizedSplit(','); int numAltAlleles = altAlleles.Length; return new SimplePosition(chromosome, position, refAllele, altAlleles) { End = altAlleleField.OptimizedStartsWith('<') || altAlleleField == "*" ? -1 : position + refAllele.Length - 1, VcfFields = vcfFields, IsDecomposed = new bool[numAltAlleles], Vids = new string[numAltAlleles], LinkedVids = new List[numAltAlleles] }; } } } ================================================ FILE: Vcf/StringExtensions.cs ================================================ using System; using OptimizedCore; namespace Vcf { public static class StringExtensions { public delegate bool TryParse(string str, out T value); public static int? GetNullableInt(this string str) { (int number, bool foundError) = str.OptimizedParseInt32(); return foundError ? null : (int?) number; } public static T? GetNullableValue(this string str, TryParse parseFunc) where T : struct { try { if (parseFunc(str, out T val)) return val; return null; } catch (InvalidCastException) { return null; } } public static int[] SplitToArray(this string s) { try { string[] cols = s.OptimizedSplit(','); var values = new int[cols.Length]; for (var i = 0; i < cols.Length; i++) { (int number, bool foundError) = cols[i].OptimizedParseInt32(); if (foundError) return null; values[i] = number; } return values; } catch (InvalidCastException) { return null; } } } } ================================================ FILE: Vcf/VariantCreator/CnvCreator.cs ================================================ using Genome; using VariantAnnotation.Pools; using Variants; namespace Vcf.VariantCreator { public static class CnvCreator { public static IVariant Create(Chromosome chromosome, int start, int end, string refAllele, string altAllele, string vid) { var variantType = GetVariantType(altAllele); return VariantPool.Get(chromosome, start + 1, end, refAllele, altAllele, variantType, vid, false, false, false, null, AnnotationBehavior.StructuralVariants, true); } // For old style allelic CNV calls (e.g. , , etc.), // do not try to determine the overall copy number gain or loss // - for allele-specific you'll probably introduce inconsistency // - for normal , you'll probably get type wrong for MT, sex chromosomes, etc. private static VariantType GetVariantType(string altAllele) { if (altAllele == "") return VariantType.copy_number_loss; return altAllele == "" ? VariantType.copy_number_gain : VariantType.copy_number_variation; } } } ================================================ FILE: Vcf/VariantCreator/LegacyVariantId.cs ================================================ using System; using System.Collections.Generic; using System.IO; using System.Security.Cryptography; using System.Text; using System.Text.RegularExpressions; using Genome; using OptimizedCore; using VariantAnnotation.Interface; using Variants; namespace Vcf.VariantCreator { public sealed class LegacyVariantId : IVariantIdCreator { private readonly Dictionary _refNameToChromosome; public LegacyVariantId(Dictionary refNameToChromosome) => _refNameToChromosome = refNameToChromosome; public string Create(ISequence sequence, VariantCategory category, string svType, Chromosome chromosome, int start, int end, string refAllele, string altAllele, string repeatUnit) { switch (category) { case VariantCategory.Reference: return $"{chromosome.EnsemblName}:{start}:{end}:{refAllele}"; case VariantCategory.SV: return GetSvVid(_refNameToChromosome, svType, chromosome, start, end, refAllele, altAllele); case VariantCategory.CNV: return GetCnvVid(chromosome, start, end, altAllele); case VariantCategory.RepeatExpansion: return GetRepeatExpansionVid(chromosome, start,end, altAllele, repeatUnit); case VariantCategory.ROH: return $"{chromosome.EnsemblName}:{start + 1}:{end}:ROH"; case VariantCategory.SmallVariant: var variantType = SmallVariantCreator.GetVariantType(refAllele, altAllele); return GetSmallVariantVid(chromosome, start, end, altAllele, variantType); default: throw new ArgumentOutOfRangeException(nameof(category), category, null); } } public (int Start, string RefAllele, string AltAllele) Normalize(ISequence sequence, int start, string refAllele, string altAllele) { if (altAllele.Contains('[') || altAllele.Contains(']')) return (start, refAllele, altAllele); return BiDirectionalTrimmer.Trim(start, refAllele, altAllele); } private static string GetSvVid(Dictionary refNameToChromosome, string svType, Chromosome chromosome, int start, int end, string refAllele, string altAllele) { var variantType = StructuralVariantCreator.GetVariantType(altAllele, svType); switch (variantType) { case VariantType.insertion: return $"{chromosome.EnsemblName}:{start + 1}:{end}:INS"; case VariantType.deletion: return $"{chromosome.EnsemblName}:{start + 1}:{end}"; case VariantType.duplication: return $"{chromosome.EnsemblName}:{start + 1}:{end}:DUP"; case VariantType.tandem_duplication: return $"{chromosome.EnsemblName}:{start + 1}:{end}:TDUP"; case VariantType.translocation_breakend: (Chromosome chromosome2, int position2, bool isSuffix1, bool isSuffix2) = ParseBreakendAltAllele(refNameToChromosome, refAllele, altAllele); char orientation1 = isSuffix1 ? '-' : '+'; char orientation2 = isSuffix2 ? '+' : '-'; return $"{chromosome.EnsemblName}:{start}:{orientation1}:{chromosome2.EnsemblName}:{position2}:{orientation2}"; case VariantType.inversion: return $"{chromosome.EnsemblName}:{start + 1}:{end}:Inverse"; case VariantType.mobile_element_insertion: return $"{chromosome.EnsemblName}:{start + 1}:{end}:MEI"; default: return $"{chromosome.EnsemblName}:{start + 1}:{end}"; } } private static (Chromosome Chromosome2, int Position2, bool IsSuffix1, bool IsSuffix2) ParseBreakendAltAllele( Dictionary refNameToChromosome, string refAllele, string altAllele) { string referenceName2; int position2; bool isSuffix2; const string forwardBreakEnd = "["; if (altAllele.StartsWith(refAllele)) { var forwardRegex = new Regex(@"\w+([\[\]])(.+):(\d+)([\[\]])", RegexOptions.Compiled); Match match = forwardRegex.Match(altAllele); if (!match.Success) throw new InvalidDataException( "Unable to successfully parse the complex rearrangements for the following allele: " + altAllele); isSuffix2 = match.Groups[4].Value == forwardBreakEnd; position2 = Convert.ToInt32(match.Groups[3].Value); referenceName2 = match.Groups[2].Value; return (ReferenceNameUtilities.GetChromosome(refNameToChromosome, referenceName2), position2, false, isSuffix2); } else { var reverseRegex = new Regex(@"([\[\]])(.+):(\d+)([\[\]])\w+", RegexOptions.Compiled); Match match = reverseRegex.Match(altAllele); if (!match.Success) throw new InvalidDataException( "Unable to successfully parse the complex rearrangements for the following allele: " + altAllele); isSuffix2 = match.Groups[1].Value == forwardBreakEnd; position2 = Convert.ToInt32(match.Groups[3].Value); referenceName2 = match.Groups[2].Value; return (ReferenceNameUtilities.GetChromosome(refNameToChromosome, referenceName2), position2, true, isSuffix2); } } private static string GetCnvVid(Chromosome chromosome, int start, int end, string altAllele) { start++; switch (altAllele) { case "": return $"{chromosome.EnsemblName}:{start}:{end}:CNV"; case "": return $"{chromosome.EnsemblName}:{start}:{end}:CDEL"; case "": return $"{chromosome.EnsemblName}:{start}:{end}:CDUP"; } // ReSharper disable once PossibleNullReferenceException string trimmedAltAllele = altAllele.Substring(1, altAllele.Length - 2); return $"{chromosome.EnsemblName}:{start}:{end}:{trimmedAltAllele}"; } internal static string GetSmallVariantVid(Chromosome chromosome, int start, int end, string altAllele, VariantType variantType) { switch (variantType) { case VariantType.SNV: return $"{chromosome.EnsemblName}:{start}:{altAllele}"; case VariantType.insertion: return $"{chromosome.EnsemblName}:{start}:{end}:{GetInsertedAltAllele(altAllele)}"; case VariantType.deletion: return $"{chromosome.EnsemblName}:{start}:{end}"; case VariantType.MNV: case VariantType.indel: return $"{chromosome.EnsemblName}:{start}:{end}:{GetInsertedAltAllele(altAllele)}"; case VariantType.non_informative_allele: return $"{chromosome.EnsemblName}:{start}:*"; default: throw new ArgumentOutOfRangeException(nameof(variantType), variantType, null); } } private static string GetInsertedAltAllele(string altAllele) { if (altAllele.Length <= 32) return altAllele; string insAltAllele; using (var md5Hash = MD5.Create()) { var md5Builder = StringBuilderPool.Get(); byte[] data = md5Hash.ComputeHash(Encoding.UTF8.GetBytes(altAllele)); md5Builder.Clear(); foreach (byte b in data) md5Builder.Append(b.ToString("x2")); insAltAllele = StringBuilderPool.GetStringAndReturn(md5Builder); } return insAltAllele; } private static string GetRepeatExpansionVid(Chromosome chromosome, int start, int end, string altAllele, string repeatUnit) { string repeatCount = altAllele.Trim('<', '>').Substring(3); return $"{chromosome.EnsemblName}:{start + 1}:{end}:{repeatUnit}:{repeatCount}"; } } } ================================================ FILE: Vcf/VariantCreator/ReferenceVariantCreator.cs ================================================ using Genome; using VariantAnnotation.Interface; using Variants; namespace Vcf.VariantCreator { public static class ReferenceVariantCreator { public static IVariant[] Create(IVariantIdCreator vidCreator, ISequence sequence, Chromosome chromosome, int start, int end, string refAllele, string altAllele, string globalMajorAllele) { bool isRefMinor = end == start && globalMajorAllele != null; if (!isRefMinor) return null; string vid = vidCreator.Create(sequence, VariantCategory.SmallVariant, null, chromosome, start, end, refAllele, altAllele, null); return new[] { SmallVariantCreator.Create(chromosome, start, end, globalMajorAllele, refAllele, false, false, null, vid, true) }; } } } ================================================ FILE: Vcf/VariantCreator/RepeatExpansionCreator.cs ================================================ using ErrorHandling.Exceptions; using Genome; using OptimizedCore; using Variants; namespace Vcf.VariantCreator { public static class RepeatExpansionCreator { public static IVariant Create(Chromosome chromosome, int start, int end, string refAllele, string altAllele, int? refRepeatCount, string vid) { (int repeatCount, bool foundError) = altAllele.Trim('<', '>').Substring(3).OptimizedParseInt32(); if (foundError) throw new UserErrorException($"Invalid alt allele ({altAllele}) found at {chromosome.UcscName}:{start}"); start++; return new RepeatExpansion(chromosome, start, end, refAllele, altAllele, vid, repeatCount, refRepeatCount); } } } ================================================ FILE: Vcf/VariantCreator/RohVariantCreator.cs ================================================ using Genome; using VariantAnnotation.Pools; using Variants; namespace Vcf.VariantCreator { public static class RohVariantCreator { public static IVariant Create(Chromosome chromosome, int start, int end, string refAllele, string altAllele, string variantId) => VariantPool.Get(chromosome, start + 1, end, refAllele, altAllele, VariantType.run_of_homozygosity, variantId, false, false, false, null, AnnotationBehavior.RunsOfHomozygosity, true); } } ================================================ FILE: Vcf/VariantCreator/SmallVariantCreator.cs ================================================ using Genome; using VariantAnnotation.Interface.IO; using VariantAnnotation.Pools; using Variants; namespace Vcf.VariantCreator { public static class SmallVariantCreator { public static IVariant Create(Chromosome chromosome, int start, int end, string refAllele, string altAllele, bool isDecomposed, bool isRecomposed, string[] linkedVids, string vid, bool isRefMinor) { var variantType = GetVariantType(refAllele, altAllele); var annotationBehavior = variantType == VariantType.non_informative_allele ? AnnotationBehavior.NonInformativeAlleles : AnnotationBehavior.SmallVariants; return VariantPool.Get(chromosome, start, end, refAllele, altAllele, variantType, vid, isRefMinor, isDecomposed, isRecomposed, linkedVids, annotationBehavior, false); } public static VariantType GetVariantType(string refAllele, string altAllele) { if (VcfCommon.IsNonInformativeAltAllele(altAllele)) return VariantType.non_informative_allele; int referenceAlleleLen = refAllele.Length; int alternateAlleleLen = altAllele.Length; if (alternateAlleleLen != referenceAlleleLen) { if (alternateAlleleLen == 0 && referenceAlleleLen > 0) return VariantType.deletion; if (alternateAlleleLen > 0 && referenceAlleleLen == 0) return VariantType.insertion; return VariantType.indel; } var variantType = alternateAlleleLen == 1 ? VariantType.SNV : VariantType.MNV; return variantType; } } } ================================================ FILE: Vcf/VariantCreator/StructuralVariantCreator.cs ================================================ using Genome; using VariantAnnotation.Pools; using Variants; namespace Vcf.VariantCreator { public static class StructuralVariantCreator { public static IVariant Create(Chromosome chromosome, int start, int end, string refAllele, string altAllele, string svType, string vid) { VariantType variantType = GetVariantType(altAllele, svType); AnnotationBehavior behavior = variantType == VariantType.translocation_breakend ? AnnotationBehavior.BreakendVariants : AnnotationBehavior.StructuralVariants; if (variantType != VariantType.translocation_breakend) start++; return VariantPool.Get(chromosome, start, end, refAllele, altAllele, variantType, vid, false, false, false, null, behavior, true); } public static VariantType GetVariantType(string altAllele, string svType) { switch (svType) { case "DEL": return VariantType.deletion; case "INS": return VariantType.insertion; case "DUP": return altAllele == "" ? VariantType.tandem_duplication : VariantType.duplication; case "INV": return VariantType.inversion; case "TDUP": return VariantType.tandem_duplication; case "BND": return VariantType.translocation_breakend; case "CNV": return VariantType.copy_number_variation; case "STR": return VariantType.short_tandem_repeat_variation; case "ALU": return VariantType.mobile_element_insertion; case "LINE1": return VariantType.mobile_element_insertion; case "LOH": return VariantType.copy_number_variation; case "SVA": return VariantType.mobile_element_insertion; default: return VariantType.unknown; } } } } ================================================ FILE: Vcf/VariantCreator/VariantFactory.cs ================================================ using System; using System.Collections.Generic; using System.IO; using Genome; using OptimizedCore; using VariantAnnotation.Interface; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using Variants; using Vcf.Sample; namespace Vcf.VariantCreator { public sealed class VariantFactory { private readonly IVariantIdCreator _vidCreator; private readonly ISequence _sequence; public readonly FormatIndices FormatIndices; public VariantFactory(ISequence sequence, IVariantIdCreator vidCreator, HashSet customSampleInfoKeys=null) { _sequence = sequence; _vidCreator = vidCreator; FormatIndices = new FormatIndices(customSampleInfoKeys); } public IVariant[] CreateVariants(Chromosome chromosome, int start, int end, string refAllele, string[] altAlleles, IInfoData infoData, bool[] isDecomposedByAllele, bool isRecomposed, List[] linkedVids, string globalMajorAllele) { bool isReference = globalMajorAllele != null; if (isReference) return ReferenceVariantCreator.Create(_vidCreator, _sequence, chromosome, start, end, refAllele, altAlleles[0], globalMajorAllele); var variantCategory = GetVariantCategory(altAlleles[0], infoData.SvType); var variants = new List(altAlleles.Length); for (var i = 0; i < altAlleles.Length; i++) { #if (!NI_ALLELE) if (VcfCommon.IsNonInformativeAltAllele(altAlleles[i])) continue; #endif string altAllele = altAlleles[i]; bool isDecomposed = isDecomposedByAllele[i]; if (isDecomposed && isRecomposed) throw new InvalidDataException("A variant can't be both decomposed and recomposed"); (int shiftedStart, string shiftedRef, string shiftedAlt) = _vidCreator.Normalize(_sequence, start, refAllele, altAllele); if (variantCategory == VariantCategory.SmallVariant || variantCategory == VariantCategory.Reference) end = shiftedStart + shiftedRef.Length - 1; variants.Add(GetVariant(chromosome, shiftedStart, end, shiftedRef, shiftedAlt, infoData, variantCategory, isDecomposed, isRecomposed, linkedVids?[i]?.ToArray())); } return variants.Count == 0 ? null : variants.ToArray(); } internal static VariantCategory GetVariantCategory(string firstAltAllele, string svType) { bool isSymbolicAllele = IsSymbolicAllele(firstAltAllele); if (IsBreakend(firstAltAllele)) return VariantCategory.SV; if (!isSymbolicAllele) return VariantCategory.SmallVariant; if (firstAltAllele == "") return VariantCategory.ROH; if (firstAltAllele.StartsWith(" altAllele.Contains("[") || altAllele.Contains("]"); private static bool IsSymbolicAllele(string altAllele) => altAllele.OptimizedStartsWith('<') && altAllele.OptimizedEndsWith('>') && !VcfCommon.IsNonInformativeAltAllele(altAllele); private IVariant GetVariant(Chromosome chromosome, int start, int end, string refAllele, string altAllele, IInfoData infoData, VariantCategory category, bool isDecomposed, bool isRecomposed, string[] linkedVids) { string vid = _vidCreator.Create(_sequence, category, infoData.SvType, chromosome, start, end, refAllele, altAllele, infoData.RepeatUnit); int svEnd = infoData.End ?? start; // ReSharper disable once SwitchStatementMissingSomeCases switch (category) { case VariantCategory.SmallVariant: return SmallVariantCreator.Create(chromosome, start, end, refAllele, altAllele, isDecomposed, isRecomposed, linkedVids, vid, false); case VariantCategory.ROH: return RohVariantCreator.Create(chromosome, start, svEnd, refAllele, altAllele, vid); case VariantCategory.SV: return StructuralVariantCreator.Create(chromosome, start, svEnd, refAllele, altAllele, infoData.SvType, vid); case VariantCategory.CNV: return CnvCreator.Create(chromosome, start, svEnd, refAllele, altAllele, vid); case VariantCategory.RepeatExpansion: return RepeatExpansionCreator.Create(chromosome, start, svEnd, refAllele, altAllele, infoData.RefRepeatCount, vid); default: throw new NotImplementedException($"Unrecognized variant category: {category}"); } } } } ================================================ FILE: Vcf/VariantCreator/VariantId.cs ================================================ using Genome; using VariantAnnotation.Interface; using Variants; namespace Vcf.VariantCreator { public sealed class VariantId : IVariantIdCreator { public string Create(ISequence sequence, VariantCategory category, string svType, Chromosome chromosome, int start, int end, string refAllele, string altAllele, string repeatUnit) { if (altAllele == ".") altAllele = refAllele; // fix N reference if (refAllele == "N") { refAllele = sequence.Substring(start - 1, 1); } // add padding bases if (string.IsNullOrEmpty(refAllele) || string.IsNullOrEmpty(altAllele)) { start--; string paddingBase = sequence.Substring(start - 1, 1); refAllele = paddingBase + refAllele; altAllele = paddingBase + altAllele; } if (category == VariantCategory.SmallVariant || category == VariantCategory.Reference || svType == "BND") { return GetVid(chromosome.EnsemblName, start, refAllele, altAllele); } if (category == VariantCategory.RepeatExpansion) svType = "STR"; return GetLongVid(chromosome.EnsemblName, start, end, refAllele, altAllele, svType); } public (int Start, string RefAllele, string AltAllele) Normalize(ISequence sequence, int start, string refAllele, string altAllele) => VariantUtils.TrimAndLeftAlign(start, refAllele, altAllele, sequence); private static string GetVid(string chromosomeName, int paddedPosition, string paddedRefAllele, string paddedAltAllele) => chromosomeName + '-' + paddedPosition + '-' + paddedRefAllele + '-' + paddedAltAllele; private static string GetLongVid(string chromosomeName, int paddedPosition, int endPosition, string paddedRefAllele, string paddedAltAllele, string svType) => chromosomeName + '-' + paddedPosition + '-' + endPosition + '-' + paddedRefAllele + '-' + paddedAltAllele + '-' + svType; } } ================================================ FILE: Vcf/Vcf.csproj ================================================  net6.0 ..\bin\$(Configuration) ================================================ FILE: Vcf/VcfFilter.cs ================================================ using System.IO; using Genome; using OptimizedCore; using VariantAnnotation.Interface.IO; namespace Vcf { public sealed class VcfFilter : IVcfFilter { private readonly GenomicRange _genomicRange; private readonly GenomicRangeChecker _genomicRangeChecker; internal string BufferedLine; public VcfFilter(GenomicRange genomicRange) { _genomicRange = genomicRange; _genomicRangeChecker = new GenomicRangeChecker(genomicRange); } public void FastForward(StreamReader reader) { string line; while ((line = reader.ReadLine()) != null) { if (line.StartsWith('#')) continue; string[] fields = line.OptimizedSplit('\t'); string chrName = fields[VcfCommon.ChromIndex]; if (chrName != _genomicRange.Start.Chromosome.UcscName && chrName != _genomicRange.Start.Chromosome.EnsemblName) continue; (int position, bool foundError) = fields[VcfCommon.PosIndex].OptimizedParseInt32(); if (foundError) throw new InvalidDataException($"Unable to convert the VCF position to an integer: {fields[VcfCommon.PosIndex]}"); if (position < _genomicRange.Start.Position) continue; BufferedLine = line; return; } } public string GetNextLine(StreamReader reader) { if (BufferedLine == null) { return reader.ReadLine(); } string bufferedLine = BufferedLine; BufferedLine = null; return bufferedLine; } public bool PassedTheEnd(Chromosome chromosome, int position) => _genomicRangeChecker.OutOfRange(chromosome, position); } } ================================================ FILE: Vcf/VcfReader.cs ================================================ using System; using System.Collections.Generic; using System.IO; using ErrorHandling.Exceptions; using Genome; using OptimizedCore; using VariantAnnotation.Interface; using VariantAnnotation.Interface.IO; using VariantAnnotation.Interface.Positions; using VariantAnnotation.Interface.Providers; using Vcf.VariantCreator; namespace Vcf { public sealed class VcfReader : IVcfReader { private readonly StreamReader _headerReader; private readonly StreamReader _reader; private readonly VariantFactory _variantFactory; private readonly IRefMinorProvider _refMinorProvider; private readonly ISequenceProvider _sequenceProvider; private readonly Dictionary _refNameToChromosome; private readonly IVcfFilter _vcfFilter; private readonly IMitoHeteroplasmyProvider _mitoHeteroplasmyProvider; public bool IsRcrsMitochondrion { get; private set; } public string VcfLine { get; private set; } public GenomeAssembly InferredGenomeAssembly { get; private set; } = GenomeAssembly.Unknown; private string[] _sampleNames; private List _headerLines; private readonly Queue _queuedPositions = new Queue(); private readonly HashSet _observedReferenceNames = new HashSet(); private string _currentReferenceName; public string[] GetSampleNames() => _sampleNames; public readonly bool EnableDq; public readonly HashSet CustomInfoKeys; private VcfReader( StreamReader headerReader, StreamReader vcfLineReader, ISequenceProvider sequenceProvider, IRefMinorProvider refMinorProvider, IVcfFilter vcfFilter, IVariantIdCreator vidCreator, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, bool enableDq = false, HashSet customInfoKeys = null, HashSet customSampleInfoKeys=null ) { _headerReader = headerReader; _reader = vcfLineReader; _variantFactory = new VariantFactory(sequenceProvider.Sequence, vidCreator, customSampleInfoKeys); _sequenceProvider = sequenceProvider; _refMinorProvider = refMinorProvider; _vcfFilter = vcfFilter; _refNameToChromosome = sequenceProvider.RefNameToChromosome; _mitoHeteroplasmyProvider = mitoHeteroplasmyProvider; EnableDq = enableDq; CustomInfoKeys = customInfoKeys; } public static VcfReader Create(StreamReader headerReader, StreamReader vcfLineReader, ISequenceProvider sequenceProvider, IRefMinorProvider refMinorProvider, IVcfFilter vcfFilter, IVariantIdCreator vidCreator, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, bool enableDq = false, HashSet customInfoKeys=null, HashSet customSampleInfoKeys=null) { var vcfReader = new VcfReader(headerReader, vcfLineReader, sequenceProvider, refMinorProvider, vcfFilter, vidCreator, mitoHeteroplasmyProvider, enableDq, customInfoKeys, customSampleInfoKeys); vcfReader.ParseHeader(); return vcfReader; } private void ParseHeader() { _headerLines = new List(); string line; while ((line = _headerReader.ReadLine()) != null) { CheckContigId(line); _headerLines.Add(line); if (line.StartsWith(VcfCommon.ChromosomeHeader)) break; } ValidateVcfHeader(); _sampleNames = ExtractSampleNames(line); _vcfFilter.FastForward(_reader); } private void CheckContigId(string line) { string[] chromAndLengthInfo = GetChromAndLengthInfo(line); if (chromAndLengthInfo.Length == 0) return; if (!_refNameToChromosome.TryGetValue(chromAndLengthInfo[0], out Chromosome chromosome)) return; if (!int.TryParse(chromAndLengthInfo[1], out int length)) return; var assemblyThisChrom = ContigInfo.GetGenomeAssembly(chromosome, length); if (assemblyThisChrom == GenomeAssembly.rCRS) { IsRcrsMitochondrion = true; return; } if (!GenomeAssemblyHelper.AutosomeAndAllosomeAssemblies.Contains(assemblyThisChrom)) return; if (InferredGenomeAssembly == GenomeAssembly.Unknown) InferredGenomeAssembly = assemblyThisChrom; if (InferredGenomeAssembly != assemblyThisChrom) throw new UserErrorException($"Inconsistent genome assemblies inferred:\ncurrent line \"{line}\" indicates {assemblyThisChrom}, whereas the lines above it indicate {InferredGenomeAssembly}."); } internal static string[] GetChromAndLengthInfo(string line) { if (!line.StartsWith("##contig=(); if (!line.Contains(",length=")) return Array.Empty(); string[] chromAndLength = line.TrimEnd('>').Substring(13).Split(",length="); return chromAndLength.Length == 2 ? chromAndLength : Array.Empty(); } private void ValidateVcfHeader() { if (_headerLines.Count == 0 || !_headerLines[0].StartsWith("##fileformat=VCFv")) throw new UserErrorException("Please provide a valid VCF file with proper fileformat field."); if (!_headerLines[_headerLines.Count - 1].StartsWith(VcfCommon.ChromosomeHeader)) throw new UserErrorException($"Could not find the vcf header line starting with {VcfCommon.ChromosomeHeader}. Is this a valid vcf file?"); } private static string[] ExtractSampleNames(string line) { string[] cols = line.OptimizedSplit('\t'); bool hasSampleGenotypes = cols.Length >= VcfCommon.MinNumColumnsSampleGenotypes; if (!hasSampleGenotypes) return null; int numSamples = cols.Length - VcfCommon.GenotypeIndex; var samples = new string[numSamples]; for (var i = 0; i < numSamples; i++) samples[i] = cols[VcfCommon.GenotypeIndex + i]; return samples; } private ISimplePosition GetNextSimplePosition() { while (_queuedPositions.Count == 0) { VcfLine = _vcfFilter.GetNextLine(_reader); if (VcfLine != null) { string[] vcfFields = VcfLine.OptimizedSplit('\t'); var chromosome = ReferenceNameUtilities.GetChromosome(_refNameToChromosome, vcfFields[VcfCommon.ChromIndex]); CheckVcfOrder(vcfFields[VcfCommon.ChromIndex]); (int start, bool foundError) = vcfFields[VcfCommon.PosIndex].OptimizedParseInt32(); if (foundError) throw new InvalidDataException($"Unable to convert the VCF position to an integer: {vcfFields[VcfCommon.PosIndex]}"); if (InconsistentSampleFields(vcfFields)) { int sampleCount = _sampleNames?.Length ?? 0; throw new UserErrorException($"Inconsistent number of sample fields in line:\n{VcfLine}\nExpected number of sample fields: {sampleCount}"); } _queuedPositions.Enqueue(SimplePosition.GetSimplePosition(chromosome, start, vcfFields, _vcfFilter)); } if (VcfLine == null) break; } return _queuedPositions.Count == 0 ? null : _queuedPositions.Dequeue(); } private bool InconsistentSampleFields(string[] vcfFields) { int sampleCount = _sampleNames?.Length ?? 0; if (sampleCount != 0) { return vcfFields.Length != VcfCommon.FormatIndex + 1 + sampleCount; } return vcfFields.Length != VcfCommon.InfoIndex + 1; } private void CheckVcfOrder(string referenceName) { if (referenceName == _currentReferenceName) return; if (_observedReferenceNames.Contains(referenceName)) { throw new FileNotSortedException("The current input vcf file is not sorted. Please sort the vcf file before running variant annotation using a tool like vcf-sort in vcftools."); } _observedReferenceNames.Add(referenceName); _currentReferenceName = referenceName; } public IPosition GetNextPosition() => Position.ToPosition(GetNextSimplePosition(), _refMinorProvider, _sequenceProvider, _mitoHeteroplasmyProvider, _variantFactory, EnableDq, CustomInfoKeys); public void Dispose() => _reader?.Dispose(); } }