Repository: LinearDesignSoftware/LinearDesign Branch: main Commit: f0126ca89a8b Files: 22 Total size: 257.8 KB Directory structure: gitextract_f87ovany/ ├── .gitattributes ├── Makefile ├── README.md ├── coding_wheel.txt ├── codon_usage_freq_table_human.csv ├── codon_usage_freq_table_yeast.csv ├── gflags.py ├── license.txt ├── lineardesign ├── src/ │ ├── Utils/ │ │ ├── base.h │ │ ├── codon.h │ │ ├── common.h │ │ ├── constants.h │ │ ├── flat.h │ │ ├── network.h │ │ ├── reader.h │ │ └── utility_v.h │ ├── backtrace_iter.cc │ ├── beam_cky_parser.cc │ ├── beam_cky_parser.h │ └── linear_design.cpp └── testseq ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ *.cpp linguist-language=c++ *.h linguist-language=c++ ================================================ FILE: Makefile ================================================ CLA=clang++ CXX=g++ CXXFLAGS=-std=c++11 -Ofast -DFINAL_CHECK -DSPECIAL_HP -fpermissive DEPS=src/beam_cky_parser.cc src/beam_cky_parser.h src/backtrace_iter.cc src/Utils/reader.h src/Utils/network.h src/Utils/codon.h src/Utils/utility_v.h src/Utils/common.h src/Utils/base.h BIN=bin/LinearDesign_2D UNAME_S := $(shell uname -s) UNAME_M := $(shell uname -m) lineardesign_2D: $(DEPS) @echo "Compiling" $@ "from" $< "..." 
chmod +x lineardesign mkdir -p ./bin export LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ifeq ($(UNAME_S), Linux) if $(CXX) $(CXXFLAGS) src/linear_design.cpp -o bin/LinearDesign_2D src/Utils/libraries/LinearDesign_linux64.so; then \ echo "Linux system; compiled with g++; finished."; \ echo "Compilation Succeed!"; \ else \ echo "Try another .so file."; \ if $(CXX) $(CXXFLAGS) src/linear_design.cpp -o bin/LinearDesign_2D src/Utils/libraries/LinearDesign_linux64_old.so; then \ echo "Linux system; compiled with g++; finished."; \ echo "Compilation Succeed!"; \ else \ echo "Compilation failed! Make sure it is either Linux-64 or Mac."; \ fi \ fi else if [[ $(UNAME_M) == 'arm64' ]]; then \ if $(CLA) $(CXXFLAGS) src/linear_design.cpp -o bin/LinearDesign_2D src/Utils/libraries/LinearDesign_Mac_M1.so; then \ echo "Mac M1 system; compiled with clang++; finished."; \ echo "Compilation Succeed!"; \ echo "You may encounter a pop-up message at the first run. If so, please go to System Preferences -> Security & Privacy -> General to allow LinearDesign_Mac_M1.so to open. See README.md for details."; \ else \ echo "Compilation failed! Make sure it is either Linux-64 or Mac."; \ fi \ else \ if $(CLA) $(CXXFLAGS) src/linear_design.cpp -o bin/LinearDesign_2D src/Utils/libraries/LinearDesign_Mac_x86.so; then \ echo "Mac x86_64 system; compiled with clang++; finished."; \ echo "Compilation Succeed!"; \ echo "You may encounter a pop-up message at the first run. If so, please go to System Preferences -> Security & Privacy -> General to allow LinearDesign_Mac_x86.so to open. See README.md for details."; \ else \ echo "Compilation failed! 
Make sure it is either Linux-64 or Mac."; \ fi \ fi endif .PHONY : clean clean: rm -f $(BIN) ================================================ FILE: README.md ================================================ Baidu Research Logo # Algorithm for Optimized mRNA Design Improves Stability and Immunogenicity (LinearDesign) ![GitHub all releases](https://img.shields.io/github/downloads/LinearDesignSoftware/LinearDesign/total) This repository contains the source code for the LinearDesign project. He Zhang†, Liang Zhang†, Ang Lin†, Congcong Xu†, Ziyu Li, Kaibo Liu, Boxiang Liu, Xiaopin Ma, Fanfan Zhao, Huiling Jiang, Chunxiu Chen, Haifa Shen, Hangwen Li*, David H. Mathews*, Yujian Zhang*, Liang Huang†*#. Algorithm for Optimized mRNA Design Improves Stability and Immunogenicity. Nature [https://doi.org/10.1038/s41586-023-06127-z](https://doi.org/10.1038/s41586-023-06127-z) (2023) † contributed equally, \* corresponding authors, # lead corresponding author For questions, please contact the lead corresponding author, Liang Huang. ## Dependencies Clang 11.0.0 (or above) or GCC 4.8.5 (or above) python2.7 ## To Compile ``` make ``` ## To Run The LinearDesign program can be run with: ``` echo SEQUENCE | ./lineardesign [OPTIONS] OR cat FASTA_FILE | ./lineardesign [OPTIONS] ``` OPTIONS: ``` --lambda LAMBDA or -l LAMBDA ``` Set LAMBDA, a hyperparameter balancing MFE and CAI. (default 0.0) ``` --codonusage FILE_NAME or -c FILE_NAME ``` Import a Codon Usage Frequency Table. See "codon_usage_freq_table_human.csv" for the format. (default: using human codon usage frequency table) ``` --verbose or -v ``` Print out more details. (default False) For Macbook, users may encounter a pop-up message at the first run. For Mac-M1 system, the message is: ``` "LinearDesign_Mac_M1.so" can't be opened because Apple cannot check it for malicious software. ``` For Mac-Intel system, the message is: ``` "LinearDesign_Mac_x86.so" cannot be opened because it is from an unidentified developer. 
``` If so, please go to "System Preferences -> Security & Privacy -> General" to allow LinearDesign_Mac_M1.so (or LinearDesign_Mac_x86.so) to open. ## Example: Single Sequence Design ``` echo MNDTEAI | ./lineardesign mRNA sequence: AUGAACGAUACGGAGGCGAUC mRNA structure: ......(((.((....))))) mRNA folding free energy: -1.10 kcal/mol; mRNA CAI: 0.695 ``` ## Example: Multiple Sequences Design with Option --lambda (-l) ``` cat testseq | ./lineardesign --lambda 3 >seq1 mRNA sequence: AUGCCAAACACCCUGGCAUGCCCC mRNA structure: ((((((.......))))))..... mRNA folding free energy: -6.00 kcal/mol; mRNA CAI: 0.910 >seq2 mRNA sequence: AUGCUGGAUCAGGUGAACAAGCUGAAGUACCCAGAGGUGAGCCUGACCUGA mRNA structure: .....((.((((((..((...(((.......)))..))..))))))))... mRNA folding free energy: -13.50 kcal/mol; mRNA CAI: 0.979 ``` ## Example: Option --codonusage (-c) ``` echo MNDTEAI | ./lineardesign -l 0.3 --codonusage codon_usage_freq_table_yeast.csv mRNA sequence: AUGAAUGAUACGGAAGCGAUC mRNA structure: ......(((.((....))))) mRNA folding free energy: -1.10 kcal/mol; mRNA CAI: 0.670 ``` ## Example: Option --verbose (-v) ``` echo MNDTEAI | ./lineardesign --verbose Input protein: MNDTEAI Using lambda = 0; Using codon frequency table = codon_usage_freq_table_human.csv mRNA sequence: AUGAACGAUACGGAGGCGAUC mRNA structure: ......(((.((....))))) mRNA folding free energy: -1.10 kcal/mol; mRNA CAI: 0.695 Runtime: 0.002 seconds ``` ## Declarations Baidu Research has filed a patent for the LinearDesign algorithm that lists He Zhang, Liang Zhang, Ziyu Li, Kaibo Liu, Boxiang Liu, and Liang Huang as inventors. 
================================================ FILE: coding_wheel.txt ================================================ Phe U U CU Leu C U GCUA U U GA Ser U C GCUA A G CU Tyr U A CU STOP U A GA U G A Cys U G CU Trp U G G Pro C C GCUA His C A CU Gln C A GA Arg C G GCUA A G GA Ile A U CUA Met A U G Thr A C GCUA Asn A A CU Lys A A GA Val G U GCUA Asp G A CU Glu G A GA Gly G G GCUA Ala G C GCUA ================================================ FILE: codon_usage_freq_table_human.csv ================================================ #,, UAA,*,0.28 UAG,*,0.2 UGA,*,0.52 GCU,A,0.26 GCC,A,0.4 GCA,A,0.23 GCG,A,0.11 UGU,C,0.45 UGC,C,0.55 GAU,D,0.46 GAC,D,0.54 GAA,E,0.42 GAG,E,0.58 UUU,F,0.45 UUC,F,0.55 GGU,G,0.16 GGC,G,0.34 GGA,G,0.25 GGG,G,0.25 CAU,H,0.41 CAC,H,0.59 AUU,I,0.36 AUC,I,0.48 AUA,I,0.16 AAA,K,0.42 AAG,K,0.58 UUA,L,0.07 UUG,L,0.13 CUU,L,0.13 CUC,L,0.2 CUA,L,0.07 CUG,L,0.41 AUG,M,1 AAU,N,0.46 AAC,N,0.54 CCU,P,0.28 CCC,P,0.33 CCA,P,0.27 CCG,P,0.11 CAA,Q,0.25 CAG,Q,0.75 CGU,R,0.08 CGC,R,0.19 CGA,R,0.11 CGG,R,0.21 AGA,R,0.2 AGG,R,0.2 UCU,S,0.18 UCC,S,0.22 UCA,S,0.15 UCG,S,0.06 AGU,S,0.15 AGC,S,0.24 ACU,T,0.24 ACC,T,0.36 ACA,T,0.28 ACG,T,0.12 GUU,V,0.18 GUC,V,0.24 GUA,V,0.11 GUG,V,0.47 UGG,W,1 UAU,Y,0.43 UAC,Y,0.57 ================================================ FILE: codon_usage_freq_table_yeast.csv ================================================ #,, UAA,*,0.48 UAG,*,0.24 UGA,*,0.29 GCU,A,0.38 GCC,A,0.22 GCA,A,0.29 GCG,A,0.11 UGU,C,0.63 UGC,C,0.37 GAU,D,0.65 GAC,D,0.35 GAA,E,0.71 GAG,E,0.29 UUU,F,0.59 UUC,F,0.41 GGU,G,0.47 GGC,G,0.19 GGA,G,0.22 GGG,G,0.12 CAU,H,0.64 CAC,H,0.36 AUU,I,0.46 AUC,I,0.26 AUA,I,0.27 AAA,K,0.58 AAG,K,0.42 UUA,L,0.28 UUG,L,0.29 CUU,L,0.13 CUC,L,0.06 CUA,L,0.14 CUG,L,0.11 AUG,M,1 AAU,N,0.59 AAC,N,0.41 CCU,P,0.31 CCC,P,0.15 CCA,P,0.41 CCG,P,0.12 CAA,Q,0.69 CAG,Q,0.31 CGU,R,0.15 CGC,R,0.06 CGA,R,0.07 CGG,R,0.04 AGA,R,0.48 AGG,R,0.21 UCU,S,0.26 UCC,S,0.16 UCA,S,0.21 UCG,S,0.1 AGU,S,0.16 AGC,S,0.11 ACU,T,0.35 ACC,T,0.22 ACA,T,0.3 ACG,T,0.13 
GUU,V,0.39 GUC,V,0.21 GUA,V,0.21 GUG,V,0.19 UGG,W,1 UAU,Y,0.56 UAC,Y,0.44 ================================================ FILE: gflags.py ================================================ #!/usr/bin/env python # Copyright (c) 2007, Google Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following disclaimer # in the documentation and/or other materials provided with the # distribution. # * Neither the name of Google Inc. nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # --- # Author: Chad Lester # Design and style contributions by: # Amit Patel, Bogdan Cocosel, Daniel Dulitz, Eric Tiedemann, # Eric Veach, Laurence Gonsalves, Matthew Springer # Code reorganized a bit by Craig Silverstein """This module is used to define and parse command line flags. This module defines a *distributed* flag-definition policy: rather than an application having to define all flags in or near main(), each python module defines flags that are useful to it. When one python module imports another, it gains access to the other's flags. (This is implemented by having all modules share a common, global registry object containing all the flag information.) Flags are defined through the use of one of the DEFINE_xxx functions. The specific function used determines how the flag is parsed, checked, and optionally type-converted, when it's seen on the command line. IMPLEMENTATION: DEFINE_* creates a 'Flag' object and registers it with a 'FlagValues' object (typically the global FlagValues FLAGS, defined here). The 'FlagValues' object can scan the command line arguments and pass flag arguments to the corresponding 'Flag' objects for value-checking and type conversion. The converted flag values are available as attributes of the 'FlagValues' object. Code can access the flag through a FlagValues object, for instance gflags.FLAGS.myflag. Typically, the __main__ module passes the command line arguments to gflags.FLAGS for parsing. At bottom, this module calls getopt(), so getopt functionality is supported, including short- and long-style flags, and the use of -- to terminate flags. Methods defined by the flag module will throw 'FlagsError' exceptions. The exception argument will be a human-readable string. FLAG TYPES: This is a list of the DEFINE_*'s that you can do. All flags take a name, default value, help-string, and optional 'short' name (one-letter name). Some flags have other arguments, which are described with the flag. 
DEFINE_string: takes any input, and interprets it as a string. DEFINE_bool or DEFINE_boolean: typically does not take an argument: say --myflag to set FLAGS.myflag to true, or --nomyflag to set FLAGS.myflag to false. Alternately, you can say --myflag=true or --myflag=t or --myflag=1 or --myflag=false or --myflag=f or --myflag=0 DEFINE_float: takes an input and interprets it as a floating point number. Takes optional args lower_bound and upper_bound; if the number specified on the command line is out of range, it will raise a FlagError. DEFINE_integer: takes an input and interprets it as an integer. Takes optional args lower_bound and upper_bound as for floats. DEFINE_enum: takes a list of strings which represents legal values. If the command-line value is not in this list, raise a flag error. Otherwise, assign to FLAGS.flag as a string. DEFINE_list: Takes a comma-separated list of strings on the commandline. Stores them in a python list object. DEFINE_spaceseplist: Takes a space-separated list of strings on the commandline. Stores them in a python list object. Example: --myspacesepflag "foo bar baz" DEFINE_multistring: The same as DEFINE_string, except the flag can be specified more than once on the commandline. The result is a python list object (list of strings), even if the flag is only on the command line once. DEFINE_multi_int: The same as DEFINE_integer, except the flag can be specified more than once on the commandline. The result is a python list object (list of ints), even if the flag is only on the command line once. SPECIAL FLAGS: There are a few flags that have special meaning: --help prints a list of all the flags in a human-readable fashion --helpshort prints a list of all key flags (see below). --helpxml prints a list of all flags, in XML format. DO NOT parse the output of --help and --helpshort. Instead, parse the output of --helpxml. As we add new flags, we may add new XML elements. 
Hence, make sure your parser does not crash when it encounters new XML elements. --flagfile=foo read flags from foo. --undefok=f1,f2 ignore unrecognized option errors for f1,f2. For boolean flags, you should use --undefok=boolflag, and --boolflag and --noboolflag will be accepted. Do not use --undefok=noboolflag. -- as in getopt(), terminates flag-processing NOTE ON --flagfile: Flags may be loaded from text files in addition to being specified on the commandline. Any flags you don't feel like typing, throw them in a file, one flag per line, for instance: --myflag=myvalue --nomyboolean_flag You then specify your file with the special flag '--flagfile=somefile'. You CAN recursively nest flagfile= tokens OR use multiple files on the command line. Lines beginning with a single hash '#' or a double slash '//' are comments in your flagfile. Any flagfile= will be interpreted as having a relative path from the current working directory rather than from the place the file was included from: myPythonScript.py --flagfile=config/somefile.cfg If somefile.cfg includes further --flagfile= directives, these will be referenced relative to the original CWD, not from the directory the including flagfile was found in! The caveat applies to people who are including a series of nested files in a different dir than they are executing out of. Relative path names are always from CWD, not from the directory of the parent include flagfile. We do now support '~' expanded directory names. Absolute path names ALWAYS work! EXAMPLE USAGE: import gflags FLAGS = gflags.FLAGS # Flag names are globally defined! So in general, we need to be # careful to pick names that are unlikely to be used by other libraries. # If there is a conflict, we'll get an error at import time. gflags.DEFINE_string('name', 'Mr. 
President', 'your name') gflags.DEFINE_integer('age', None, 'your age in years', lower_bound=0) gflags.DEFINE_boolean('debug', False, 'produces debugging output') gflags.DEFINE_enum('gender', 'male', ['male', 'female'], 'your gender') def main(argv): try: argv = FLAGS(argv) # parse flags except gflags.FlagsError, e: print '%s\\nUsage: %s ARGS\\n%s' % (e, sys.argv[0], FLAGS) sys.exit(1) if FLAGS.debug: print 'non-flag arguments:', argv print 'Happy Birthday', FLAGS.name if FLAGS.age is not None: print 'You are a %s, who is %d years old' % (FLAGS.gender, FLAGS.age) if __name__ == '__main__': main(sys.argv) KEY FLAGS: As we already explained, each module gains access to all flags defined by all the other modules it transitively imports. In the case of non-trivial scripts, this means a lot of flags ... For documentation purposes, it is good to identify the flags that are key (i.e., really important) to a module. Clearly, the concept of "key flag" is a subjective one. When trying to determine whether a flag is key to a module or not, assume that you are trying to explain your module to a potential user: which flags would you really like to mention first? We'll describe shortly how to declare which flags are key to a module. For the moment, assume we know the set of key flags for each module. Then, if you use the app.py module, you can use the --helpshort flag to print only the help for the flags that are key to the main module, in a human-readable format. NOTE: If you need to parse the flag help, do NOT use the output of --help / --helpshort. That output is meant for human consumption, and may be changed in the future. Instead, use --helpxml; flags that are key for the main module are marked there with a <key>yes</key> element. The set of key flags for a module M is composed of: 1. Flags defined by module M by calling a DEFINE_* function. 2. Flags that module M explicitly declares as key by using the function DECLARE_key_flag(<flag_name>) 3. 
Key flags of other modules that M specifies by using the function ADOPT_module_key_flags(<other_module>) This is a "bulk" declaration of key flags: each flag that is key for <other_module> becomes key for the current module too. Notice that if you do not use the functions described at points 2 and 3 above, then --helpshort prints information only about the flags defined by the main module of our script. In many cases, this behavior is good enough. But if you move part of the main module code (together with the related flags) into a different module, then it is nice to use DECLARE_key_flag / ADOPT_module_key_flags and make sure --helpshort lists all relevant flags (otherwise, your code refactoring may confuse your users). Note: each of DECLARE_key_flag / ADOPT_module_key_flags has its own pluses and minuses: DECLARE_key_flag is more targeted and may lead to a more focused --helpshort documentation. ADOPT_module_key_flags is good for cases when an entire module is considered key to the current script. Also, it does not require updates to client scripts when a new flag is added to the module. EXAMPLE USAGE 2 (WITH KEY FLAGS): Consider an application that contains the following three files (two auxiliary modules and a main module): File libfoo.py: import gflags gflags.DEFINE_integer('num_replicas', 3, 'Number of replicas to start') gflags.DEFINE_boolean('rpc2', True, 'Turn on the usage of RPC2.') ... some code ... File libbar.py: import gflags gflags.DEFINE_string('bar_gfs_path', '/gfs/path', 'Path to the GFS files for libbar.') gflags.DEFINE_string('email_for_bar_errors', 'bar-team@google.com', 'Email address for bug reports about module libbar.') gflags.DEFINE_boolean('bar_risky_hack', False, 'Turn on an experimental and buggy optimization.') ... some code ... File myscript.py: import gflags import libfoo import libbar gflags.DEFINE_integer('num_iterations', 0, 'Number of iterations.') # Declare that all flags that are key for libfoo are # key for this module too. 
gflags.ADOPT_module_key_flags(libfoo) # Declare that the flag --bar_gfs_path (defined in libbar) is key # for this module. gflags.DECLARE_key_flag('bar_gfs_path') ... some code ... When myscript is invoked with the flag --helpshort, the resulted help message lists information about all the key flags for myscript: --num_iterations, --num_replicas, --rpc2, and --bar_gfs_path (in addition to the special flags --help and --helpshort). Of course, myscript uses all the flags declared by it (in this case, just --num_replicas) or by any of the modules it transitively imports (e.g., the modules libfoo, libbar). E.g., it can access the value of FLAGS.bar_risky_hack, even if --bar_risky_hack is not declared as a key flag for myscript. """ #import cgi import getopt import os import re import string import sys # Are we running at least python 2.2? try: if tuple(sys.version_info[:3]) < (2,2,0): raise NotImplementedError("requires python 2.2.0 or later") except AttributeError: # a very old python, that lacks sys.version_info raise NotImplementedError("requires python 2.2.0 or later") # If we're not running at least python 2.2.1, define True, False, and bool. # Thanks, Guido, for the code. try: True, False, bool except NameError: False = 0 True = 1 def bool(x): if x: return True else: return False # Are we running under pychecker? _RUNNING_PYCHECKER = 'pychecker.python' in sys.modules def _GetCallingModule(): """Returns the name of the module that's calling into this module. We generally use this function to get the name of the module calling a DEFINE_foo... function. """ # Walk down the stack to find the first globals dict that's not ours. 
for depth in range(1, sys.getrecursionlimit()): if not sys._getframe(depth).f_globals is globals(): module_name = __GetModuleName(sys._getframe(depth).f_globals) if module_name is not None: return module_name raise AssertionError("No module was found") # module exceptions: class FlagsError(Exception): """The base class for all flags errors.""" pass class DuplicateFlag(FlagsError): """Raised if there is a flag naming conflict.""" pass # A DuplicateFlagError conveys more information than a # DuplicateFlag. Since there are external modules that create # DuplicateFlags, the interface to DuplicateFlag shouldn't change. class DuplicateFlagError(DuplicateFlag): def __init__(self, flagname, flag_values): self.flagname = flagname message = "The flag '%s' is defined twice." % self.flagname flags_by_module = flag_values.FlagsByModuleDict() for module in flags_by_module: for flag in flags_by_module[module]: if flag.name == flagname or flag.short_name == flagname: message = message + " First from " + module + "," break message = message + " Second from " + _GetCallingModule() DuplicateFlag.__init__(self, message) class IllegalFlagValue(FlagsError): """The flag command line argument is illegal.""" pass class UnrecognizedFlag(FlagsError): """Raised if a flag is unrecognized.""" pass # An UnrecognizedFlagError conveys more information than an # UnrecognizedFlag. Since there are external modules that create # DuplicateFlags, the interface to DuplicateFlag shouldn't change. class UnrecognizedFlagError(UnrecognizedFlag): def __init__(self, flagname): self.flagname = flagname UnrecognizedFlag.__init__( self, "Unknown command line flag '%s'" % flagname) # Global variable used by expvar _exported_flags = {} _help_width = 80 # width of help output def GetHelpWidth(): """Returns: an integer, the width of help lines that is used in TextWrap.""" return _help_width def CutCommonSpacePrefix(text): """Removes a common space prefix from the lines of a multiline text. 
If the first line does not start with a space, it is left as it is and only in the remaining lines a common space prefix is being searched for. That means the first line will stay untouched. This is especially useful to turn doc strings into help texts. This is because some people prefer to have the doc comment start already after the apostrophy and then align the following lines while others have the apostrophies on a seperately line. The function also drops trailing empty lines and ignores empty lines following the initial content line while calculating the initial common whitespace. Args: text: text to work on Returns: the resulting text """ text_lines = text.splitlines() # Drop trailing empty lines while text_lines and not text_lines[-1]: text_lines = text_lines[:-1] if text_lines: # We got some content, is the first line starting with a space? if text_lines[0] and text_lines[0][0].isspace(): text_first_line = [] else: text_first_line = [text_lines.pop(0)] # Calculate length of common leading whitesppace (only over content lines) common_prefix = os.path.commonprefix([line for line in text_lines if line]) space_prefix_len = len(common_prefix) - len(common_prefix.lstrip()) # If we have a common space prefix, drop it from all lines if space_prefix_len: for index in xrange(len(text_lines)): if text_lines[index]: text_lines[index] = text_lines[index][space_prefix_len:] return '\n'.join(text_first_line + text_lines) return '' def TextWrap(text, length=None, indent='', firstline_indent=None, tabs=' '): """Wraps a given text to a maximum line length and returns it. We turn lines that only contain whitespaces into empty lines. We keep new lines and tabs (e.g., we do not treat tabs as spaces). 
Args: text: text to wrap length: maximum length of a line, includes indentation if this is None then use GetHelpWidth() indent: indent for all but first line firstline_indent: indent for first line; if None, fall back to indent tabs: replacement for tabs Returns: wrapped text Raises: FlagsError: if indent not shorter than length FlagsError: if firstline_indent not shorter than length """ # Get defaults where callee used None if length is None: length = GetHelpWidth() if indent is None: indent = '' if len(indent) >= length: raise FlagsError('Indent must be shorter than length') # In line we will be holding the current line which is to be started # with indent (or firstline_indent if available) and then appended # with words. if firstline_indent is None: firstline_indent = '' line = indent else: line = firstline_indent if len(firstline_indent) >= length: raise FlagsError('First iline indent must be shorter than length') # If the callee does not care about tabs we simply convert them to # spaces If callee wanted tabs to be single space then we do that # already here. if not tabs or tabs == ' ': text = text.replace('\t', ' ') else: tabs_are_whitespace = not tabs.strip() line_regex = re.compile('([ ]*)(\t*)([^ \t]+)', re.MULTILINE) # Split the text into lines and the lines with the regex above. The # resulting lines are collected in result[]. For each split we get the # spaces, the tabs and the next non white space (e.g. next word). result = [] for text_line in text.splitlines(): # Store result length so we can find out whether processing the next # line gave any new content old_result_len = len(result) # Process next line with line_regex. For optimization we do an rstrip(). # - process tabs (changes either line or word, see below) # - process word (first try to squeeze on line, then wrap or force wrap) # Spaces found on the line are ignored, they get added while wrapping as # needed. 
for spaces, current_tabs, word in line_regex.findall(text_line.rstrip()): # If tabs weren't converted to spaces, handle them now if current_tabs: # If the last thing we added was a space anyway then drop # it. But let's not get rid of the indentation. if (((result and line != indent) or (not result and line != firstline_indent)) and line[-1] == ' '): line = line[:-1] # Add the tabs, if that means adding whitespace, just add it at # the line, the rstrip() code while shorten the line down if # necessary if tabs_are_whitespace: line += tabs * len(current_tabs) else: # if not all tab replacement is whitespace we prepend it to the word word = tabs * len(current_tabs) + word # Handle the case where word cannot be squeezed onto current last line if len(line) + len(word) > length and len(indent) + len(word) <= length: result.append(line.rstrip()) line = indent + word word = '' # No space left on line or can we append a space? if len(line) + 1 >= length: result.append(line.rstrip()) line = indent else: line += ' ' # Add word and shorten it up to allowed line length. Restart next # line with indent and repeat, or add a space if we're done (word # finished) This deals with words that caanot fit on one line # (e.g. indent + word longer than allowed line length). while len(line) + len(word) >= length: line += word result.append(line[:length]) word = line[length:] line = indent # Default case, simply append the word and a space if word: line += word + ' ' # End of input line. If we have content we finish the line. If the # current line is just the indent but we had content in during this # original line then we need to add an emoty line. if (result and line != indent) or (not result and line != firstline_indent): result.append(line.rstrip()) elif len(result) == old_result_len: result.append('') line = indent return '\n'.join(result) def DocToHelp(doc): """Takes a __doc__ string and reformats it as help.""" # Get rid of starting and ending white space. 
Using lstrip() or even # strip() could drop more than maximum of first line and right space # of last line. doc = doc.strip() # Get rid of all empty lines whitespace_only_line = re.compile('^[ \t]+$', re.M) doc = whitespace_only_line.sub('', doc) # Cut out common space at line beginnings doc = CutCommonSpacePrefix(doc) # Just like this module's comment, comments tend to be aligned somehow. # In other words they all start with the same amount of white space # 1) keep double new lines # 2) keep ws after new lines if not empty line # 3) all other new lines shall be changed to a space # Solution: Match new lines between non white space and replace with space. doc = re.sub('(?<=\S)\n(?=\S)', ' ', doc, re.M) return doc def __GetModuleName(globals_dict): """Given a globals dict, returns the name of the module that defines it. Args: globals_dict: A dictionary that should correspond to an environment providing the values of the globals. Returns: A string (the name of the module) or None (if the module could not be identified. """ for name, module in sys.modules.iteritems(): if getattr(module, '__dict__', None) is globals_dict: if name == '__main__': return sys.argv[0] return name return None def _GetMainModule(): """Returns the name of the module from which execution started.""" for depth in range(1, sys.getrecursionlimit()): try: globals_of_main = sys._getframe(depth).f_globals except ValueError: return __GetModuleName(globals_of_main) raise AssertionError("No module was found") # lhuang: main entry here class FlagValues: """Registry of 'Flag' objects. A 'FlagValues' can then scan command line arguments, passing flag arguments through to the 'Flag' objects that it owns. It also provides easy access to the flag values. 
Typically only one 'FlagValues' object is needed by an application: gflags.FLAGS This class is heavily overloaded: 'Flag' objects are registered via __setitem__: FLAGS['longname'] = x # register a new flag The .value attribute of the registered 'Flag' objects can be accessed as attributes of this 'FlagValues' object, through __getattr__. Both the long and short name of the original 'Flag' objects can be used to access its value: FLAGS.longname # parsed flag value FLAGS.x # parsed flag value (short name) Command line arguments are scanned and passed to the registered 'Flag' objects through the __call__ method. Unparsed arguments, including argv[0] (e.g. the program name) are returned. argv = FLAGS(sys.argv) # scan command line arguments The original registered Flag objects can be retrieved through the use of the dictionary-like operator, __getitem__: x = FLAGS['longname'] # access the registered Flag object The str() operator of a 'FlagValues' object provides help for all of the registered 'Flag' objects. """ def __init__(self): # Since everything in this class is so heavily overloaded, the only # way of defining and using fields is to access __dict__ directly. # Dictionary: flag name (string) -> Flag object. self.__dict__['__flags'] = {} # Dictionary: module name (string) -> list of Flag objects that are defined # by that module. self.__dict__['__flags_by_module'] = {} # Dictionary: module name (string) -> list of Flag objects that are # key for that module. self.__dict__['__key_flags_by_module'] = {} def FlagDict(self): return self.__dict__['__flags'] def FlagsByModuleDict(self): """Returns the dictionary of module_name -> list of defined flags. Returns: A dictionary. Its keys are module names (strings). Its values are lists of Flag objects. """ return self.__dict__['__flags_by_module'] def KeyFlagsByModuleDict(self): """Returns the dictionary of module_name -> list of key flags. Returns: A dictionary. Its keys are module names (strings). 
Its values are lists of Flag objects.
    """
    return self.__dict__['__key_flags_by_module']

  def _RegisterFlagByModule(self, module_name, flag):
    """Records the module that defines a specific flag.

    We keep track of which flag is defined by which module so that we
    can later sort the flags by module.

    Args:
      module_name: A string, the name of a Python module.
      flag: A Flag object, a flag that is defined by that module.
    """
    flags_by_module = self.FlagsByModuleDict()
    flags_by_module.setdefault(module_name, []).append(flag)

  def _RegisterKeyFlagForModule(self, module_name, flag):
    """Specifies that a flag is a key flag for a module.

    Args:
      module_name: A string, the name of a Python module.
      flag: A Flag object, a flag that is key to the module.
    """
    key_flags_by_module = self.KeyFlagsByModuleDict()
    # The list of key flags for the module named module_name.
    key_flags = key_flags_by_module.setdefault(module_name, [])
    # Add flag, but avoid duplicates.
    if flag not in key_flags:
      key_flags.append(flag)

  def _GetFlagsDefinedByModule(self, module):
    """Returns the list of flags defined by a module.

    Args:
      module: A module object or a module name (a string).

    Returns:
      A new list of Flag objects.  Caller may update this list as he
      wishes: none of those changes will affect the internals of this
      FlagValue object.
    """
    # Anything that is not a string is treated as a module object.
    if not isinstance(module, str):
      module = module.__name__

    return list(self.FlagsByModuleDict().get(module, []))

  def _GetKeyFlagsForModule(self, module):
    """Returns the list of key flags for a module.

    Args:
      module: A module object or a module name (a string)

    Returns:
      A new list of Flag objects.  Caller may update this list as he
      wishes: none of those changes will affect the internals of this
      FlagValue object.
    """
    # Anything that is not a string is treated as a module object.
    if not isinstance(module, str):
      module = module.__name__

    # Any flag is a key flag for the module that defined it.  NOTE:
    # key_flags is a fresh list: we can update it without affecting the
    # internals of this FlagValues object.
key_flags = self._GetFlagsDefinedByModule(module) # Take into account flags explicitly declared as key for a module. for flag in self.KeyFlagsByModuleDict().get(module, []): if flag not in key_flags: key_flags.append(flag) return key_flags def AppendFlagValues(self, flag_values): """Appends flags registered in another FlagValues instance. Args: flag_values: registry to copy from """ for flag_name, flag in flag_values.FlagDict().iteritems(): # Each flags with shortname appears here twice (once under its # normal name, and again with its short name). To prevent # problems (DuplicateFlagError) with double flag registration, we # perform a check to make sure that the entry we're looking at is # for its normal name. if flag_name == flag.name: self[flag_name] = flag def __setitem__(self, name, flag): """Registers a new flag variable.""" fl = self.FlagDict() if not isinstance(flag, Flag): raise IllegalFlagValue(flag) if not isinstance(name, type("")): raise FlagsError("Flag name must be a string") if len(name) == 0: raise FlagsError("Flag name cannot be empty") # If running under pychecker, duplicate keys are likely to be # defined. Disable check for duplicate keys when pycheck'ing. 
# __setitem__, continued: a name clash is an error unless either flag allows override.
    if (fl.has_key(name) and not flag.allow_override and
        not fl[name].allow_override and not _RUNNING_PYCHECKER):
      raise DuplicateFlagError(name, self)
    short_name = flag.short_name
    if short_name is not None:
      if (fl.has_key(short_name) and not flag.allow_override and
          not fl[short_name].allow_override and not _RUNNING_PYCHECKER):
        raise DuplicateFlagError(short_name, self)
      # The short name is an alias: it maps to the same Flag object.
      fl[short_name] = flag
    fl[name] = flag
    # The flag is also recorded, under its long name only, in the
    # module-level _exported_flags dictionary.
    global _exported_flags
    _exported_flags[name] = flag

  def __getitem__(self, name):
    """Retrieves the Flag object for the flag --name."""
    return self.FlagDict()[name]

  def __getattr__(self, name):
    """Retrieves the 'value' attribute of the flag --name.

    Raises:
      AttributeError: if no flag is registered under name.
    """
    fl = self.FlagDict()
    if not fl.has_key(name):
      raise AttributeError(name)
    return fl[name].value

  def __setattr__(self, name, value):
    """Sets the 'value' attribute of the flag --name."""
    fl = self.FlagDict()
    # NOTE: an unregistered name surfaces as KeyError from this lookup.
    fl[name].value = value
    return value

  def _FlagIsRegistered(self, flag_obj):
    """Checks whether a Flag object is registered under some name.

    Note: this is non trivial: in addition to its normal name, a flag
    may have a short name too.  In self.FlagDict(), both the normal and
    the short name are mapped to the same flag object.  E.g., calling
    only "del FLAGS.short_name" is not unregistering the corresponding
    Flag object (it is still registered under the longer name).

    Args:
      flag_obj: A Flag object.

    Returns:
      A boolean: True iff flag_obj is registered under some name.
    """
    flag_dict = self.FlagDict()
    # Check whether flag_obj is registered under its long name.
    name = flag_obj.name
    if flag_dict.get(name, None) == flag_obj:
      return True
    # Check whether flag_obj is registered under its short name.
    short_name = flag_obj.short_name
    if (short_name is not None and
        flag_dict.get(short_name, None) == flag_obj):
      return True
    # The flag cannot be registered under any other name, so we do not
    # need to do a full search through the values of self.FlagDict().
return False def __delattr__(self, flag_name): """Deletes a previously-defined flag from a flag object. This method makes sure we can delete a flag by using del flag_values_object. E.g., flags.DEFINE_integer('foo', 1, 'Integer flag.') del flags.FLAGS.foo Args: flag_name: A string, the name of the flag to be deleted. Raises: AttributeError: When there is no registered flag named flag_name. """ fl = self.FlagDict() if flag_name not in fl: raise AttributeError(flag_name) flag_obj = fl[flag_name] del fl[flag_name] if not self._FlagIsRegistered(flag_obj): # If the Flag object indicated by flag_name is no longer # registered (please see the docstring of _FlagIsRegistered), then # we delete the occurences of the flag object in all our internal # dictionaries. self.__RemoveFlagFromDictByModule(self.FlagsByModuleDict(), flag_obj) self.__RemoveFlagFromDictByModule(self.KeyFlagsByModuleDict(), flag_obj) def __RemoveFlagFromDictByModule(self, flags_by_module_dict, flag_obj): """Removes a flag object from a module -> list of flags dictionary. Args: flags_by_module_dict: A dictionary that maps module names to lists of flags. flag_obj: A flag object. """ for unused_module, flags_in_module in flags_by_module_dict.iteritems(): # while (as opposed to if) takes care of multiple occurences of a # flag in the list for the same module. while flag_obj in flags_in_module: flags_in_module.remove(flag_obj) def SetDefault(self, name, value): """Changes the default value of the named flag object.""" fl = self.FlagDict() if not fl.has_key(name): raise AttributeError(name) fl[name].SetDefault(value) def __contains__(self, name): """Returns True if name is a value (flag) in the dict.""" return name in self.FlagDict() has_key = __contains__ # a synonym for __contains__() def __iter__(self): return self.FlagDict().iterkeys() # lhuang: my stealthy entry point def __call__(self, argv): try: # N.B.: return the rest of the command-line! 
(non-flag arguments) return self.__call2__(argv) except FlagsError, e: # lhuang: to 2> instead of > import sys print >> sys.stderr, 'Error: %s\nUsage: %s [flags]\n%s' % (e, list(argv)[0], FLAGS) sys.exit(1) # lhuang: external entry FLAGS(sys.argv) here def __call2__(self, argv): """Parses flags from argv; stores parsed flags into this FlagValues object. All unparsed arguments are returned. Flags are parsed using the GNU Program Argument Syntax Conventions, using getopt: http://www.gnu.org/software/libc/manual/html_mono/libc.html#Getopt Args: argv: argument list. Can be of any type that may be converted to a list. Returns: The list of arguments not parsed as options, including argv[0] Raises: FlagsError: on any parsing error """ # Support any sequence type that can be converted to a list argv = list(argv) shortopts = "" longopts = [] fl = self.FlagDict() # This pre parses the argv list for --flagfile=<> options. argv = self.ReadFlagsFromFiles(argv) # Correct the argv to support the google style of passing boolean # parameters. Boolean parameters may be passed by using --mybool, # --nomybool, --mybool=(true|false|1|0). getopt does not support # having options that may or may not have a parameter. We replace # instances of the short form --mybool and --nomybool with their # full forms: --mybool=(true|false). 
original_argv = list(argv) # list() makes a copy shortest_matches = None for name, flag in fl.items(): if not flag.boolean: continue if shortest_matches is None: # Determine the smallest allowable prefix for all flag names shortest_matches = self.ShortestUniquePrefixes(fl) no_name = 'no' + name prefix = shortest_matches[name] no_prefix = shortest_matches[no_name] # Replace all occurences of this boolean with extended forms for arg_idx in range(1, len(argv)): arg = argv[arg_idx] if arg.find('=') >= 0: continue if arg.startswith('--'+prefix) and ('--'+name).startswith(arg): argv[arg_idx] = ('--%s=true' % name) elif arg.startswith('--'+no_prefix) and ('--'+no_name).startswith(arg): argv[arg_idx] = ('--%s=false' % name) # Loop over all of the flags, building up the lists of short options # and long options that will be passed to getopt. Short options are # specified as a string of letters, each letter followed by a colon # if it takes an argument. Long options are stored in an array of # strings. Each string ends with an '=' if it takes an argument. for name, flag in fl.items(): longopts.append(name + "=") if len(name) == 1: # one-letter option: allow short flag type also shortopts += name if not flag.boolean: shortopts += ":" longopts.append('undefok=') undefok_flags = [] # In case --undefok is specified, loop to pick up unrecognized # options one by one. unrecognized_opts = [] args = argv[1:] while True: try: optlist, unparsed_args = getopt.gnu_getopt(args, shortopts, longopts) break except getopt.GetoptError, e: if not e.opt or e.opt in fl: # Not an unrecognized option, reraise the exception as a FlagsError raise FlagsError(e) # Handle an unrecognized option. 
unrecognized_opts.append(e.opt) # Remove offender from args and try again for arg_index in range(len(args)): if ((args[arg_index] == '--' + e.opt) or (args[arg_index] == '-' + e.opt) or args[arg_index].startswith('--' + e.opt + '=')): args = args[0:arg_index] + args[arg_index+1:] break else: # We should have found the option, so we don't expect to get # here. We could assert, but raising the original exception # might work better. raise FlagsError(e) for name, arg in optlist: if name == '--undefok': flag_names = arg.split(',') undefok_flags.extend(flag_names) # For boolean flags, if --undefok=boolflag is specified, then we should # also accept --noboolflag, in addition to --boolflag. # Since we don't know the type of the undefok'd flag, this will affect # non-boolean flags as well. # NOTE: You shouldn't use --undefok=noboolflag, because then we will # accept --nonoboolflag here. We are choosing not to do the conversion # from noboolflag -> boolflag because of the ambiguity that flag names # can start with 'no'. undefok_flags.extend('no' + name for name in flag_names) continue if name.startswith('--'): # long option name = name[2:] short_option = 0 else: # short option name = name[1:] short_option = 1 if fl.has_key(name): flag = fl[name] if flag.boolean and short_option: arg = 1 flag.Parse(arg) # If there were unrecognized options, raise an exception unless # the options were named via --undefok. for opt in unrecognized_opts: if opt not in undefok_flags: raise UnrecognizedFlagError(opt) if unparsed_args: # unparsed_args becomes the first non-flag detected by getopt to # the end of argv. Because argv may have been modified above, # return original_argv for this region. 
# __call2__, continued: argv[0] followed by the original, unrewritten non-flag tail.
      return argv[:1] + original_argv[-len(unparsed_args):]
    else:
      return argv[:1]

  def Reset(self):
    """Resets the values to the point before FLAGS(argv) was called."""
    for f in self.FlagDict().values():
      f.Unparse()

  def RegisteredFlags(self):
    """Returns: a list of the names and short names of all registered flags."""
    return self.FlagDict().keys()

  def FlagValuesDict(self):
    """Returns: a dictionary that maps flag names to flag values."""
    flag_values = {}
    for flag_name in self.RegisteredFlags():
      flag = self.FlagDict()[flag_name]
      flag_values[flag_name] = flag.value
    return flag_values

  def __str__(self):
    """Generates a help string for all known flags."""
    return self.GetHelp()

  def GetHelp(self, prefix=''):
    """Generates a help string for all known flags.

    Args:
      prefix: A string passed on to the flag-list renderer; it is only
        used when no per-module information has been recorded.

    Returns:
      The generated help lines joined with newlines.
    """
    helplist = []

    flags_by_module = self.FlagsByModuleDict()
    if flags_by_module:

      modules = flags_by_module.keys()
      modules.sort()

      # Print the help for the main module first, if possible.
      main_module = _GetMainModule()
      if main_module in modules:
        modules.remove(main_module)
        modules = [main_module] + modules

      for module in modules:
        self.__RenderOurModuleFlags(module, helplist)

      # The special flags are rendered last, under the module name 'gflags'.
      self.__RenderModuleFlags('gflags',
                               _SPECIAL_FLAGS.FlagDict().values(),
                               helplist)

    else:
      # Just print one long list of flags.
      self.__RenderFlagList(
          self.FlagDict().values() + _SPECIAL_FLAGS.FlagDict().values(),
          helplist, prefix)

    return '\n'.join(helplist)

  def __RenderModuleFlags(self, module, flags, output_lines, prefix=""):
    """Generates a help string for a given module.

    The per-module header line is disabled below, so the module argument
    does not currently influence the output.
    """
    # output_lines.append('\n%s%s:' % (prefix, module))
    self.__RenderFlagList(flags, output_lines, prefix + " ")

  def __RenderOurModuleFlags(self, module, output_lines, prefix=""):
    """Generates a help string for a given module."""
    flags = self._GetFlagsDefinedByModule(module)
    if flags:
      self.__RenderModuleFlags(module, flags, output_lines, prefix)

  def __RenderOurModuleKeyFlags(self, module, output_lines, prefix=""):
    """Generates a help string for the key flags of a given module.
Args:
      module: A module object or a module name (a string).
      output_lines: A list of strings.  The generated help message
        lines will be appended to this list.
      prefix: A string that is prepended to each generated help line.
    """
    key_flags = self._GetKeyFlagsForModule(module)
    if key_flags:
      self.__RenderModuleFlags(module, key_flags, output_lines, prefix)

  def MainModuleHelp(self):
    """Returns: A string describing the key flags of the main module."""
    helplist = []
    self.__RenderOurModuleKeyFlags(_GetMainModule(), helplist)
    return '\n'.join(helplist)

  def __RenderFlagList(self, flaglist, output_lines, prefix=" "):
    """Appends one wrapped help entry per flag to output_lines.

    Args:
      flaglist: An iterable of Flag objects; they are rendered sorted by
        flag name.
      output_lines: A list of strings that the entries are appended to.
      prefix: A string used as the indent of the first line of each entry.
    """
    fl = self.FlagDict()
    special_fl = _SPECIAL_FLAGS.FlagDict()
    flaglist = [(flag.name, flag) for flag in flaglist]
    flaglist.sort()
    flagset = {}
    for (name, flag) in flaglist:
      # It's possible this flag got deleted or overridden since being
      # registered in the per-module flaglist.  Check now against the
      # canonical source of current flag information, the FlagDict.
      if fl.get(name, None) != flag and special_fl.get(name, None) != flag:
        # a different flag is using this name now
        continue
      # only print help once
      if flagset.has_key(flag): continue
      flagset[flag] = 1
      flaghelp = ""
      # lhuang:
      if flag.name in ["help", "helpshort"]:
        continue
      if flag.short_name:
        flaghelp += "-" if len(flag.short_name) == 1 else "--" # lhuang: shortname can be long
        flaghelp += "%s," % flag.short_name
      if flag.boolean:
        flaghelp += "--[no]%s" % flag.name + ":"
      else:
        flaghelp += "--%s" % flag.name + ":"
      flaghelp += " "
      if flag.help:
        flaghelp += flag.help
      flaghelp = TextWrap(flaghelp, indent=prefix+" ",
                          firstline_indent=prefix)
      if flag.default_as_str:
        flaghelp += "\n" # lhuang
        flaghelp += TextWrap("(default: %s)" % flag.default_as_str,
                             indent=prefix+" ")
      if flag.parser.syntactic_help:
        flaghelp += "\t" # lhuang
        flaghelp += TextWrap("(%s)" % flag.parser.syntactic_help,
                             indent=prefix+" ")
      output_lines.append(flaghelp)

  def get(self, name, default):
    """Returns the value of a flag (if not None) or a default value.
Args:
      name: A string, the name of a flag.
      default: Default value to use if the flag value is None.
    """

    value = self.__getattr__(name)
    if value is not None:  # Can't do if not value, b/c value might be '0' or ""
      return value
    else:
      return default

  def ShortestUniquePrefixes(self, fl):
    """Returns: dictionary; maps flag names to their shortest unique prefix.

    Args:
      fl: A dictionary of flag name -> Flag object.  For every boolean
        flag the name 'no<name>' takes part in the computation as well.
    """
    # Sort the list of flag names
    sorted_flags = []
    for name, flag in fl.items():
      sorted_flags.append(name)
      if flag.boolean:
        sorted_flags.append('no%s' % name)
    sorted_flags.sort()

    # For each name in the sorted list, determine the shortest unique
    # prefix by comparing itself to the next name and to the previous
    # name (the latter check uses cached info from the previous loop).
    shortest_matches = {}
    prev_idx = 0
    for flag_idx in range(len(sorted_flags)):
      curr = sorted_flags[flag_idx]
      if flag_idx == (len(sorted_flags) - 1):
        next = None
      else:
        next = sorted_flags[flag_idx+1]
        next_len = len(next)
      for curr_idx in range(len(curr)):
        if (next is None
            or curr_idx >= next_len
            or curr[curr_idx] != next[curr_idx]):
          # curr longer than next or no more chars in common
          shortest_matches[curr] = curr[:max(prev_idx, curr_idx) + 1]
          prev_idx = curr_idx
          break
      else:
        # curr shorter than (or equal to) next
        shortest_matches[curr] = curr
        prev_idx = curr_idx + 1  # next will need at least one more char
    return shortest_matches

  def __IsFlagFileDirective(self, flag_string):
    """Checks whether flag_string contain a --flagfile=<foo> directive.

    Returns:
      1 if flag_string is a string of the form -[-]flagfile or
      -[-]flagfile=..., 0 otherwise (including for non-string input).
    """
    if isinstance(flag_string, type("")):
      if flag_string.startswith('--flagfile='):
        return 1
      elif flag_string == '--flagfile':
        return 1
      elif flag_string.startswith('-flagfile='):
        return 1
      elif flag_string == '-flagfile':
        return 1
      else:
        return 0
    return 0

  def ExtractFilename(self, flagfile_str):
    """Returns filename from a flagfile_str of form -[-]flagfile=filename.

    The cases of --flagfile foo and -flagfile foo shouldn't be hitting
    this function, as they are dealt with in the level above this
    function.
""" if flagfile_str.startswith('--flagfile='): return os.path.expanduser((flagfile_str[(len('--flagfile=')):]).strip()) elif flagfile_str.startswith('-flagfile='): return os.path.expanduser((flagfile_str[(len('-flagfile=')):]).strip()) else: raise FlagsError('Hit illegal --flagfile type: %s' % flagfile_str) def __GetFlagFileLines(self, filename, parsed_file_list): """Returns the useful (!=comments, etc) lines from a file with flags. Args: filename: A string, the name of the flag file. parsed_file_list: A list of the names of the files we have already read. MUTATED BY THIS FUNCTION. Returns: List of strings. See the note below. NOTE(springer): This function checks for a nested --flagfile= tag and handles the lower file recursively. It returns a list of all the lines that _could_ contain command flags. This is EVERYTHING except whitespace lines and comments (lines starting with '#' or '//'). """ line_list = [] # All line from flagfile. flag_line_list = [] # Subset of lines w/o comments, blanks, flagfile= tags. try: file_obj = open(filename, 'r') except IOError, e_msg: print e_msg print 'ERROR:: Unable to open flagfile: %s' % (filename) return flag_line_list line_list = file_obj.readlines() file_obj.close() parsed_file_list.append(filename) # This is where we check each line in the file we just read. for line in line_list: if line.isspace(): pass # Checks for comment (a line that starts with '#'). elif line.startswith('#') or line.startswith('//'): pass # Checks for a nested "--flagfile=" flag in the current file. # If we find one, recursively parse down into that file. elif self.__IsFlagFileDirective(line): sub_filename = self.ExtractFilename(line) # We do a little safety check for reparsing a file we've already done. if not sub_filename in parsed_file_list: included_flags = self.__GetFlagFileLines(sub_filename, parsed_file_list) flag_line_list.extend(included_flags) else: # Case of hitting a circularly included file. 
print >>sys.stderr, ('Warning: Hit circular flagfile dependency: %s' % sub_filename) else: # Any line that's not a comment or a nested flagfile should get # copied into 2nd position. This leaves earlier arguements # further back in the list, thus giving them higher priority. flag_line_list.append(line.strip()) return flag_line_list def ReadFlagsFromFiles(self, argv): """Processes command line args, but also allow args to be read from file. Args: argv: A list of strings, usually sys.argv, which may contain one or more flagfile directives of the form --flagfile="./filename". Returns: A new list which has the original list combined with what we read from any flagfile(s). References: Global gflags.FLAG class instance. This function should be called before the normal FLAGS(argv) call. This function scans the input list for a flag that looks like: --flagfile=. Then it opens , reads all valid key and value pairs and inserts them into the input list between the first item of the list and any subsequent items in the list. Note that your application's flags are still defined the usual way using gflags DEFINE_flag() type functions. Notes (assuming we're getting a commandline of some sort as our input): --> Flags from the command line argv _should_ always take precedence! --> A further "--flagfile=" CAN be nested in a flagfile. It will be processed after the parent flag file is done. --> For duplicate flags, first one we hit should "win". --> In a flagfile, a line beginning with # or // is a comment. --> Entirely blank lines _should_ be ignored. """ parsed_file_list = [] rest_of_args = argv new_argv = [] while rest_of_args: current_arg = rest_of_args[0] rest_of_args = rest_of_args[1:] if self.__IsFlagFileDirective(current_arg): # This handles the case of -(-)flagfile foo. In this case the # next arg really is part of this one. 
if current_arg == '--flagfile' or current_arg == '-flagfile': if not rest_of_args: raise IllegalFlagValue('--flagfile with no argument') flag_filename = os.path.expanduser(rest_of_args[0]) rest_of_args = rest_of_args[1:] else: # This handles the case of (-)-flagfile=foo. flag_filename = self.ExtractFilename(current_arg) new_argv = (new_argv[:1] + self.__GetFlagFileLines(flag_filename, parsed_file_list) + new_argv[1:]) else: new_argv.append(current_arg) return new_argv def FlagsIntoString(self): """Returns a string with the flags assignments from this FlagValues object. This function ignores flags whose value is None. Each flag assignment is separated by a newline. NOTE: MUST mirror the behavior of the C++ function CommandlineFlagsIntoString from google3/base/commandlineflags.cc. """ s = '' for flag in self.FlagDict().values(): if flag.value is not None: s += flag.Serialize() + '\n' return s def AppendFlagsIntoFile(self, filename): """Appends all flags assignments from this FlagInfo object to a file. Output will be in the format of a flagfile. NOTE: MUST mirror the behavior of the C++ version of AppendFlagsIntoFile from google3/base/commandlineflags.cc. """ out_file = open(filename, 'a') out_file.write(self.FlagsIntoString()) out_file.close() def WriteHelpInXMLFormat(self, outfile=None): """Outputs flag documentation in XML format. NOTE: We use element names that are consistent with those used by the C++ command-line flag library, from google3/base/commandlineflags_reporting.cc. We also use a few new elements (e.g., ), but we do not interfere / overlap with existing XML elements used by the C++ library. Please maintain this consistency. Args: outfile: File object we write to. Default None means sys.stdout. 
""" outfile = outfile or sys.stdout outfile.write('\n') outfile.write('\n') indent = ' ' _WriteSimpleXMLElement(outfile, 'program', os.path.basename(sys.argv[0]), indent) usage_doc = sys.modules['__main__'].__doc__ if not usage_doc: usage_doc = '\nUSAGE: %s [flags]\n' % sys.argv[0] else: usage_doc = usage_doc.replace('%s', sys.argv[0]) _WriteSimpleXMLElement(outfile, 'usage', usage_doc, indent) # Get list of key flags for the main module. key_flags = self._GetKeyFlagsForModule(_GetMainModule()) # Sort flags by declaring module name and next by flag name. flags_by_module = self.FlagsByModuleDict() all_module_names = list(flags_by_module.keys()) all_module_names.sort() for module_name in all_module_names: flag_list = [(f.name, f) for f in flags_by_module[module_name]] flag_list.sort() for unused_flag_name, flag in flag_list: is_key = flag in key_flags flag.WriteInfoInXMLFormat(outfile, module_name, is_key=is_key, indent=indent) outfile.write('\n') outfile.flush() # end of FlagValues definition # The global FlagValues instance //lhuang FLAGS = FlagValues() def _MakeXMLSafe(s): """Escapes <, >, and & from s, and removes XML 1.0-illegal chars.""" # lhuang: avoid _md5 s = s #cgi.escape(s) # Escape <, >, and & # Remove characters that cannot appear in an XML 1.0 document # (http://www.w3.org/TR/REC-xml/#charsets). # # NOTE: if there are problems with current solution, one may move to # XML 1.1, which allows such chars, if they're entity-escaped (&#xHH;). s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s) return s def _WriteSimpleXMLElement(outfile, name, value, indent): """Writes a simple XML element. Args: outfile: File object we write the XML element to. name: A string, the name of XML element. value: A Python object, whose string representation will be used as the value of the XML element. indent: A string, prepended to each line of generated output. """ value_str = str(value) if isinstance(value, bool): # Display boolean values as the C++ flag library does: no caps. 
value_str = value_str.lower() outfile.write('%s<%s>%s\n' % (indent, name, _MakeXMLSafe(value_str), name)) class Flag: """Information about a command-line flag. 'Flag' objects define the following fields: .name - the name for this flag .default - the default value for this flag .default_as_str - default value as repr'd string, e.g., "'true'" (or None) .value - the most recent parsed value of this flag; set by Parse() .help - a help string or None if no help is available .short_name - the single letter alias for this flag (or None) .boolean - if 'true', this flag does not accept arguments .present - true if this flag was parsed from command line flags. .parser - an ArgumentParser object .serializer - an ArgumentSerializer object .allow_override - the flag may be redefined without raising an error The only public method of a 'Flag' object is Parse(), but it is typically only called by a 'FlagValues' object. The Parse() method is a thin wrapper around the 'ArgumentParser' Parse() method. The parsed value is saved in .value, and the .present attribute is updated. If this flag was already present, a FlagsError is raised. Parse() is also called during __init__ to parse the default value and initialize the .value attribute. This enables other python modules to safely use flags even if the __main__ module neglects to parse the command line arguments. The .present attribute is cleared after __init__ parsing. If the default value is set to None, then the __init__ parsing step is skipped and the .value attribute is initialized to None. Note: The default value is also presented to the user in the help string, so it is important that it be a legal value for this flag. 
"""

  def __init__(self, parser, serializer, name, default, help_string,
               short_name=None, boolean=0, allow_override=0):
    """Stores the flag's attributes and applies the default via SetDefault().

    An empty help_string is replaced by '(no help available)'.
    """
    self.name = name

    if not help_string:
      help_string = '(no help available)'

    self.help = help_string
    self.short_name = short_name
    self.boolean = boolean
    self.present = 0
    self.parser = parser
    self.serializer = serializer
    self.allow_override = allow_override
    self.value = None

    self.SetDefault(default)

  def __GetParsedValueAsString(self, value):
    """Returns the repr'd string form of value, or None if value is None.

    The serializer is used when there is one; otherwise booleans map to
    'true'/'false' and everything else goes through str().
    """
    if value is None:
      return None
    if self.serializer:
      return repr(self.serializer.Serialize(value))
    if self.boolean:
      if value:
        return repr('true')
      else:
        return repr('false')
    return repr(str(value))

  def Parse(self, argument):
    """Parses argument with the flag's parser and stores the result in .value.

    Each successful call increments .present.

    Raises:
      IllegalFlagValue: if the parser raises ValueError.
    """
    try:
      self.value = self.parser.Parse(argument)
    except ValueError, e:  # recast ValueError as IllegalFlagValue
      raise IllegalFlagValue("flag --%s: %s" % (self.name, e))
    self.present += 1

  def Unparse(self):
    """Resets .value to the (re-parsed) default and clears .present."""
    if self.default is None:
      self.value = None
    else:
      self.Parse(self.default)
    self.present = 0

  def Serialize(self):
    """Returns the flag as a command-line string, or '' if its value is None.

    Booleans become --name / --noname; other flags become --name=value.

    Raises:
      FlagsError: for a non-boolean flag that has no serializer.
    """
    if self.value is None:
      return ''
    if self.boolean:
      if self.value:
        return "--%s" % self.name
      else:
        return "--no%s" % self.name
    else:
      if not self.serializer:
        raise FlagsError("Serializer not present for flag %s" % self.name)
      return "--%s=%s" % (self.name, self.serializer.Serialize(self.value))

  def SetDefault(self, value):
    """Changes the default value (and current value too) for this Flag."""
    # We can't allow a None override because it may end up not being
    # passed to C++ code when we're overriding C++ flags.  So we
    # cowardly bail out until someone fixes the semantics of trying to
    # pass None to a C++ flag.  See swig_flags.Init() for details on
    # this behavior.
if value is None and self.allow_override: raise DuplicateFlag(self.name) self.default = value self.Unparse() self.default_as_str = self.__GetParsedValueAsString(self.value) def Type(self): """Returns: a string that describes the type of this Flag.""" # NOTE: we use strings, and not the types.*Type constants because # our flags can have more exotic types, e.g., 'comma separated list # of strings', 'whitespace separated list of strings', etc. return self.parser.Type() def WriteInfoInXMLFormat(self, outfile, module_name, is_key=False, indent=''): """Writes common info about this flag, in XML format. This is information that is relevant to all flags (e.g., name, meaning, etc.). If you defined a flag that has some other pieces of info, then please override _WriteCustomInfoInXMLFormat. Please do NOT override this method. Args: outfile: File object we write to. module_name: A string, the name of the module that defines this flag. is_key: A boolean, True iff this flag is key for main module. indent: A string that is prepended to each generated line. """ outfile.write(indent + '\n') inner_indent = indent + ' ' if is_key: _WriteSimpleXMLElement(outfile, 'key', 'yes', inner_indent) _WriteSimpleXMLElement(outfile, 'file', module_name, inner_indent) # Print flag features that are relevant for all flags. _WriteSimpleXMLElement(outfile, 'name', self.name, inner_indent) if self.short_name: _WriteSimpleXMLElement(outfile, 'short_name', self.short_name, inner_indent) if self.help: _WriteSimpleXMLElement(outfile, 'meaning', self.help, inner_indent) _WriteSimpleXMLElement(outfile, 'default', self.default, inner_indent) _WriteSimpleXMLElement(outfile, 'current', self.value, inner_indent) _WriteSimpleXMLElement(outfile, 'type', self.Type(), inner_indent) # Print extra flag features this flag may have. 
self._WriteCustomInfoInXMLFormat(outfile, inner_indent) outfile.write(indent + '\n') def _WriteCustomInfoInXMLFormat(self, outfile, indent): """Writes extra info about this flag, in XML format. "Extra" means "not already printed by WriteInfoInXMLFormat above." Args: outfile: File object we write to. indent: A string that is prepended to each generated line. """ # Usually, the parser knows the extra details about the flag, so # we just forward the call to it. self.parser.WriteCustomInfoInXMLFormat(outfile, indent) # End of Flag definition class ArgumentParser: """Base class used to parse and convert arguments. The Parse() method checks to make sure that the string argument is a legal value and convert it to a native type. If the value cannot be converted, it should throw a 'ValueError' exception with a human readable explanation of why the value is illegal. Subclasses should also define a syntactic_help string which may be presented to the user to describe the form of the legal values. """ syntactic_help = "" def Parse(self, argument): """Default implementation: always returns its argument unmodified.""" return argument def Type(self): return 'string' def WriteCustomInfoInXMLFormat(self, outfile, indent): pass class ArgumentSerializer: """Base class for generating string representations of a flag value.""" def Serialize(self, value): return str(value) class ListSerializer(ArgumentSerializer): def __init__(self, list_sep): self.list_sep = list_sep def Serialize(self, value): return self.list_sep.join([str(x) for x in value]) # The DEFINE functions are explained in mode details in the module doc string. def DEFINE(parser, name, default, help, flag_values=FLAGS, serializer=None, **args): """Registers a generic Flag object. NOTE: in the docstrings of all DEFINE* functions, "registers" is short for "creates a new flag and registers it". Auxiliary function: clients should use the specialized DEFINE_ function instead. 
Args:
    parser: ArgumentParser that is used to parse the flag arguments.
    name: A string, the flag name.
    default: The default value of the flag.
    help: A help string.
    flag_values: FlagValues object the flag will be registered with.
    serializer: ArgumentSerializer that serializes the flag value.
    args: Dictionary with extra keyword args that are passed to the
        Flag __init__.
  """
  DEFINE_flag(Flag(parser, serializer, name, default, help, **args),
              flag_values)


def DEFINE_flag(flag, flag_values=FLAGS):
  """Registers a 'Flag' object with a 'FlagValues' object.

  By default, the global FLAGS 'FlagValue' object is used.

  Typical users will use one of the more specialized DEFINE_xxx
  functions, such as DEFINE_string or DEFINE_integer.  But developers
  who need to create Flag objects themselves should use this function
  to register their flags.

  Args:
    flag: The Flag object to register; it is stored under flag.name.
    flag_values: The container to register with (item assignment is the
        only operation required of it).
  """
  # copying the reference to flag_values prevents pychecker warnings
  fv = flag_values
  fv[flag.name] = flag
  # Tell flag_values who's defining the flag.
  if isinstance(flag_values, FlagValues):
    # Regarding the above isinstance test: some users pass funny
    # values of flag_values (e.g., {}) in order to avoid the flag
    # registration (in the past, there used to be a flag_values ==
    # FLAGS test here) and redefine flags with the same name (e.g.,
    # debug).  To avoid breaking their code, we perform the
    # registration only if flag_values is a real FlagValues object.
    flag_values._RegisterFlagByModule(_GetCallingModule(), flag)


def _InternalDeclareKeyFlags(flag_names, flag_values=FLAGS):
  """Declares a flag as key for the calling module.

  Internal function.  User code should call DECLARE_key_flag or
  ADOPT_module_key_flags instead.

  Args:
    flag_names: A list of strings that are names of already-registered
      Flag objects.
    flag_values: A FlagValue object.  This should almost never need
      to be overridden.

  Raises:
    UnrecognizedFlagError: when we refer to a flag that was not
      defined yet.
  """
  # The calling module is looked up once and reused for every flag name.
  module = _GetCallingModule()

  for flag_name in flag_names:
    if flag_name not in flag_values:
      raise UnrecognizedFlagError(flag_name)
    flag = flag_values.FlagDict()[flag_name]
    flag_values._RegisterKeyFlagForModule(module, flag)


def DECLARE_key_flag(flag_name, flag_values=FLAGS):
  """Declares one flag as key to the current module.

  Key flags are flags that are deemed really important for a module.
  They are important when listing help messages; e.g., if the
  --helpshort command-line flag is used, then only the key flags of the
  main module are listed (instead of all flags, as in the case of
  --help).

  Sample usage:

    flags.DECLARE_key_flag('flag_1')

  Args:
    flag_name: A string, the name of an already declared flag.
      (Redeclaring flags as key, including flags implicitly key
      because they were declared in this module, is a no-op.)
    flag_values: A FlagValues object.  This should almost never
      need to be overridden.
  """
  _InternalDeclareKeyFlags([flag_name], flag_values=flag_values)


def ADOPT_module_key_flags(module, flag_values=FLAGS):
  """Declares that all flags key to a module are key to the current module.

  Args:
    module: A module object.
    flag_values: A FlagValues object.  This should almost never need
      to be overridden.

  Raises:
    FlagsError: When given an argument that is a module name (a
    string), instead of a module object.
  """
  # NOTE(salcianu): an even better test would be if not
  # isinstance(module, types.ModuleType) but I didn't want to import
  # types for such a tiny use.
  if isinstance(module, str):
    raise FlagsError('Received module name %s; expected a module object.'
% module) _InternalDeclareKeyFlags( [f.name for f in flag_values._GetKeyFlagsForModule(module.__name__)], flag_values=flag_values) # # STRING FLAGS # def DEFINE_string(name, default, help, flag_values=FLAGS, **args): """Registers a flag whose value can be any string.""" parser = ArgumentParser() serializer = ArgumentSerializer() DEFINE(parser, name, default, help, flag_values, serializer, **args) # # BOOLEAN FLAGS # # and the special HELP flags. class BooleanParser(ArgumentParser): """Parser of boolean values.""" def Convert(self, argument): """Converts the argument to a boolean; raise ValueError on errors.""" if type(argument) == str: if argument.lower() in ['true', 't', '1']: return True elif argument.lower() in ['false', 'f', '0']: return False bool_argument = bool(argument) if argument == bool_argument: # The argument is a valid boolean (True, False, 0, or 1), and not just # something that always converts to bool (list, string, int, etc.). return bool_argument raise ValueError('Non-boolean argument to boolean flag', argument) def Parse(self, argument): val = self.Convert(argument) return val def Type(self): return 'bool' class BooleanFlag(Flag): """Basic boolean flag. Boolean flags do not take any arguments, and their value is either True (1) or False (0). The false value is specified on the command line by prepending the word 'no' to either the long or the short flag name. For example, if a Boolean flag was created whose long name was 'update' and whose short name was 'x', then this flag could be explicitly unset through either --noupdate or --nox. """ def __init__(self, name, default, help, short_name=None, **args): p = BooleanParser() Flag.__init__(self, p, None, name, default, help, short_name, 1, **args) if not self.help: self.help = "a boolean value" def DEFINE_boolean(name, default, help, flag_values=FLAGS, **args): """Registers a boolean flag. Such a boolean flag does not take an argument. 
If a user wants to specify a false value explicitly, the long option beginning with 'no' must be used: i.e. --noflag This flag will have a value of None, True or False. None is possible if default=None and the user does not specify the flag on the command line. """ DEFINE_flag(BooleanFlag(name, default, help, **args), flag_values) # Match C++ API to unconfuse C++ people. DEFINE_bool = DEFINE_boolean class HelpFlag(BooleanFlag): """ HelpFlag is a special boolean flag that prints usage information and raises a SystemExit exception if it is ever found in the command line arguments. Note this is called with allow_override=1, so other apps can define their own --help flag, replacing this one, if they want. """ def __init__(self): BooleanFlag.__init__(self, "help", 0, "show this help", short_name="h", allow_override=1) def Parse(self, arg): if arg: doc = sys.modules["__main__"].__doc__ flags = str(FLAGS) print doc or ("\nUSAGE: echo SEQUENCE | %s [flags]\n or\n echo FASTA_FILE | %s [flags]\n" % (sys.argv[0], sys.argv[0])) if flags: print "flags:" print flags print "" sys.exit(1) class HelpXMLFlag(BooleanFlag): """Similar to HelpFlag, but generates output in XML format.""" def __init__(self): BooleanFlag.__init__(self, 'helpxml', False, 'like --help, but generates XML output', allow_override=1) def Parse(self, arg): if arg: FLAGS.WriteHelpInXMLFormat(sys.stdout) sys.exit(1) class HelpshortFlag(BooleanFlag): """ HelpshortFlag is a special boolean flag that prints usage information for the "main" module, and rasies a SystemExit exception if it is ever found in the command line arguments. Note this is called with allow_override=1, so other apps can define their own --helpshort flag, replacing this one, if they want. 
""" def __init__(self): BooleanFlag.__init__(self, "helpshort", 0, "show usage only for this module", allow_override=1) def Parse(self, arg): if arg: doc = sys.modules["__main__"].__doc__ flags = FLAGS.MainModuleHelp() print doc or ("\nUSAGE: %s [flags]\n" % sys.argv[0]) if flags: print "flags:" print flags sys.exit(1) # # FLOAT FLAGS # class FloatParser(ArgumentParser): """Parser of floating point values. Parsed value may be bounded to a given upper and lower bound. """ number_article = "a" number_name = "number" syntactic_help = " ".join((number_article, number_name)) def __init__(self, lower_bound=None, upper_bound=None): self.lower_bound = lower_bound self.upper_bound = upper_bound sh = self.syntactic_help if lower_bound != None and upper_bound != None: sh = ("%s in the range [%s, %s]" % (sh, lower_bound, upper_bound)) elif lower_bound == 1: sh = "a positive %s" % self.number_name elif upper_bound == -1: sh = "a negative %s" % self.number_name elif lower_bound == 0: sh = "a non-negative %s" % self.number_name elif upper_bound != None: sh = "%s <= %s" % (self.number_name, upper_bound) elif lower_bound != None: sh = "%s >= %s" % (self.number_name, lower_bound) self.syntactic_help = sh def Convert(self, argument): """Converts argument to a float; raises ValueError on errors.""" return float(argument) def Parse(self, argument): val = self.Convert(argument) if ((self.lower_bound != None and val < self.lower_bound) or (self.upper_bound != None and val > self.upper_bound)): raise ValueError("%s is not %s" % (val, self.syntactic_help)) return val def Type(self): return 'float' def WriteCustomInfoInXMLFormat(self, outfile, indent): if self.lower_bound is not None: _WriteSimpleXMLElement(outfile, 'lower_bound', self.lower_bound, indent) if self.upper_bound is not None: _WriteSimpleXMLElement(outfile, 'upper_bound', self.upper_bound, indent) # End of FloatParser def DEFINE_float(name, default, help, lower_bound=None, upper_bound=None, flag_values=FLAGS, **args): 
"""Registers a flag whose value must be a float. If lower_bound or upper_bound are set, then this flag must be within the given range. """ parser = FloatParser(lower_bound, upper_bound) serializer = ArgumentSerializer() DEFINE(parser, name, default, help, flag_values, serializer, **args) # # INTEGER FLAGS # class IntegerParser(FloatParser): """Parser of an integer value. Parsed value may be bounded to a given upper and lower bound. """ number_article = "an" number_name = "integer" syntactic_help = " ".join((number_article, number_name)) def Convert(self, argument): __pychecker__ = 'no-returnvalues' if type(argument) == str: base = 10 if len(argument) > 2 and argument[0] == "0" and argument[1] == "x": base = 16 try: return int(argument, base) # ValueError is thrown when argument is a string, and overflows an int. except ValueError: return long(argument, base) else: try: return int(argument) # OverflowError is thrown when argument is numeric, and overflows an int. except OverflowError: return long(argument) def Type(self): return 'int' def DEFINE_integer(name, default, help, lower_bound=None, upper_bound=None, flag_values=FLAGS, **args): """Registers a flag whose value must be an integer. If lower_bound, or upper_bound are set, then this flag must be within the given range. """ parser = IntegerParser(lower_bound, upper_bound) serializer = ArgumentSerializer() DEFINE(parser, name, default, help, flag_values, serializer, **args) # # ENUM FLAGS # class EnumParser(ArgumentParser): """Parser of a string enum value (a string value from a given set). If enum_values (see below) is not specified, any string is allowed. 
""" def __init__(self, enum_values=None): self.enum_values = enum_values def Parse(self, argument): if self.enum_values and argument not in self.enum_values: raise ValueError("value should be one of <%s>" % "|".join(self.enum_values)) return argument def Type(self): return 'string enum' class EnumFlag(Flag): """Basic enum flag; its value can be any string from list of enum_values.""" def __init__(self, name, default, help, enum_values=None, short_name=None, **args): enum_values = enum_values or [] p = EnumParser(enum_values) g = ArgumentSerializer() Flag.__init__(self, p, g, name, default, help, short_name, **args) if not self.help: self.help = "an enum string" self.help = "<%s>: %s" % ("|".join(enum_values), self.help) def _WriteCustomInfoInXMLFormat(self, outfile, indent): for enum_value in self.parser.enum_values: _WriteSimpleXMLElement(outfile, 'enum_value', enum_value, indent) def DEFINE_enum(name, default, enum_values, help, flag_values=FLAGS, **args): """Registers a flag whose value can be any string from enum_values.""" DEFINE_flag(EnumFlag(name, default, help, enum_values, ** args), flag_values) # # LIST FLAGS # class BaseListParser(ArgumentParser): """Base class for a parser of lists of strings. To extend, inherit from this class; from the subclass __init__, call BaseListParser.__init__(self, token, name) where token is a character used to tokenize, and name is a description of the separator. 
""" def __init__(self, token=None, name=None): assert name self._token = token self._name = name self.syntactic_help = "a %s separated list" % self._name def Parse(self, argument): if argument == '': return [] else: return [s.strip() for s in argument.split(self._token)] def Type(self): return '%s separated list of strings' % self._name class ListParser(BaseListParser): """Parser for a comma-separated list of strings.""" def __init__(self): BaseListParser.__init__(self, ',', 'comma') def WriteCustomInfoInXMLFormat(self, outfile, indent): BaseListParser.WriteCustomInfoInXMLFormat(self, outfile, indent) _WriteSimpleXMLElement(outfile, 'list_separator', repr(','), indent) class WhitespaceSeparatedListParser(BaseListParser): """Parser for a whitespace-separated list of strings.""" def __init__(self): BaseListParser.__init__(self, None, 'whitespace') def WriteCustomInfoInXMLFormat(self, outfile, indent): BaseListParser.WriteCustomInfoInXMLFormat(self, outfile, indent) separators = list(string.whitespace) separators.sort() for ws_char in string.whitespace: _WriteSimpleXMLElement(outfile, 'list_separator', repr(ws_char), indent) def DEFINE_list(name, default, help, flag_values=FLAGS, **args): """Registers a flag whose value is a comma-separated list of strings.""" parser = ListParser() serializer = ListSerializer(',') DEFINE(parser, name, default, help, flag_values, serializer, **args) def DEFINE_spaceseplist(name, default, help, flag_values=FLAGS, **args): """Registers a flag whose value is a whitespace-separated list of strings. Any whitespace can be used as a separator. """ parser = WhitespaceSeparatedListParser() serializer = ListSerializer(' ') DEFINE(parser, name, default, help, flag_values, serializer, **args) # # MULTI FLAGS # class MultiFlag(Flag): """A flag that can appear multiple time on the command-line. The value of such a flag is a list that contains the individual values from all the appearances of that flag on the command-line. 
See the __doc__ for Flag for most behavior of this class.  Only
  differences in behavior are described here:

    * The default value may be either a single value or a list of values.
      A single value is interpreted as the [value] singleton list.

    * The value of the flag is always a list, even if the option was
      only supplied once, and even if the default value is a single
      value
  """

  def __init__(self, *args, **kwargs):
    Flag.__init__(self, *args, **kwargs)
    # Tell the user that the option may be repeated.
    self.help += ';\n repeat this option to specify a list of values'

  def Parse(self, arguments):
    """Parses one or more arguments with the installed parser.

    Args:
      arguments: a single argument or a list of arguments (typically a
        list of default values); a single argument is converted
        internally into a list containing one item.
    """
    if not isinstance(arguments, list):
      # Default value may be a list of values.  Most other arguments
      # will not be, so convert them into a single-item list to make
      # processing simpler below.
      arguments = [arguments]

    if self.present:
      # keep a backup reference to list of previously supplied option values
      values = self.value
    else:
      # "erase" the defaults with an empty list
      values = []

    for item in arguments:
      # have Flag superclass parse argument, overwriting self.value reference
      Flag.Parse(self, item)  # also increments self.present
      values.append(self.value)

    # put list of option values back in the 'value' attribute
    self.value = values

  def Serialize(self):
    """Serializes every value in the list, joined by single spaces.

    Returns '' when the value is None.

    Raises:
      FlagsError: if no serializer is installed for this flag.
    """
    if not self.serializer:
      raise FlagsError("Serializer not present for flag %s" % self.name)
    if self.value is None:
      return ''

    s = ''

    multi_value = self.value

    # self.value is used as the loop variable so that Flag.Serialize,
    # which reads self.value, serializes one item at a time.
    for self.value in multi_value:
      if s: s += ' '
      s += Flag.Serialize(self)

    # Restore the full list that the loop above overwrote.
    self.value = multi_value

    return s

  def Type(self):
    """Returns 'multi ' followed by the installed parser's type description."""
    return 'multi ' + self.parser.Type()


def DEFINE_multi(parser, serializer, name, default, help, flag_values=FLAGS,
                 **args):
  """Registers a generic MultiFlag that parses its args with a given parser.

  Auxiliary function.  Normal users should NOT use it directly.
Developers who need to create their own 'Parser' classes for options
  which can appear multiple times can call this module function to
  register their flags.
  """
  DEFINE_flag(MultiFlag(parser, serializer, name, default, help, **args),
              flag_values)


def DEFINE_multistring(name, default, help, flag_values=FLAGS, **args):
  """Registers a flag whose value can be a list of any strings.

  Use the flag on the command line multiple times to place multiple
  string values into the list.  The 'default' may be a single string
  (which will be converted into a single-element list) or a list of
  strings.
  """
  parser = ArgumentParser()
  serializer = ArgumentSerializer()
  DEFINE_multi(parser, serializer, name, default, help, flag_values, **args)


def DEFINE_multi_int(name, default, help, lower_bound=None, upper_bound=None,
                     flag_values=FLAGS, **args):
  """Registers a flag whose value can be a list of arbitrary integers.

  Use the flag on the command line multiple times to place multiple
  integer values into the list.  The 'default' may be a single integer
  (which will be converted into a single-element list) or a list of
  integers.

  Each value is parsed by an IntegerParser built from lower_bound and
  upper_bound.
  """
  parser = IntegerParser(lower_bound, upper_bound)
  serializer = ArgumentSerializer()
  DEFINE_multi(parser, serializer, name, default, help, flag_values, **args)


# Now register the flags that we want to exist in all applications.
# These are all defined with allow_override=1, so user-apps can use
# these flagnames for their own purposes, if they want.
# (Only the help and helpshort flags are registered; the helpxml
# registration below is commented out.)
DEFINE_flag(HelpFlag())
DEFINE_flag(HelpshortFlag())
# lhuang
#DEFINE_flag(HelpXMLFlag())

# Define special flags here so that help may be generated for them.
_SPECIAL_FLAGS = FlagValues()

# lhuang
##DEFINE_string(
##    'flagfile', "",
##    "Insert flag definitions from the given file into the command line.",
##    _SPECIAL_FLAGS)
##DEFINE_string(
##    'undefok', "",
##    "comma-separated list of flag names that it is okay to specify "
##    "on the command line even if the program does not define a flag "
##    "with that name.
IMPORTANT: flags in this list that have " ## "arguments MUST use the --flag=value format.", _SPECIAL_FLAGS)gf ================================================ FILE: license.txt ================================================ The LinearDesign code is freely accessible to all interested parties. It is free for academic, non-profit, and research use, and can be licensed for commercial use. To use this software for the development of a commercial product, including but not limited to software, service, or pharmaceuticals, please contact the lead corresponding author. Redistribution of the code with or without modification is not permitted without explicit written permission by the lead corresponding author. ================================================ FILE: lineardesign ================================================ #!/usr/bin/env python2 import gflags as flags import subprocess import sys import os FLAGS = flags.FLAGS def setgflags(): flags.DEFINE_float('lambda', 0.0, "set lambda", short_name='l') flags.DEFINE_boolean('verbose', False, "print out more details", short_name='v') flags.DEFINE_string('codonusage', 'codon_usage_freq_table_human.csv', "import a Codon Usage Frequency Table", short_name='c') argv = FLAGS(sys.argv) def main(): lambda_ = str(FLAGS.l) verbose_ = '1' if FLAGS.verbose else '0' codon_usage = str(FLAGS.codonusage) path = os.path.dirname(os.path.abspath(__file__)) cmd = ["%s/%s" % (path, ('bin/LinearDesign_2D')), lambda_, verbose_, codon_usage] subprocess.call(cmd, stdin=sys.stdin) if __name__ == '__main__': setgflags() main() ================================================ FILE: src/Utils/base.h ================================================ #ifndef base_h #define base_h #include #include #include #include #include #include #if defined(__GNUC__) || defined(__clang__) #define LINEAR_DESIGN_DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) #define LINEAR_DESIGN_DEPRECATED __declspec(deprecated) #else #pragma message("WARNING: 
function deprecated")
#define LINEAR_DESIGN_DEPRECATED
#endif

// LINEAR_DESIGN_INLINE: forced inlining on GCC/Clang, plain `inline` elsewhere.
#if defined(__GNUC__) || defined(__clang__)
#define LINEAR_DESIGN_INLINE inline __attribute__((always_inline))
#else
#define LINEAR_DESIGN_INLINE inline
#endif

#define LINEAR_DESIGN_CACHELINE 64

// NOTE(review): the template parameter/argument lists in this header read
// as empty (e.g. bare `template`, `std::pair&`, `std::hash{}`), which is not
// valid C++ -- confirm against the original base.h before building.
template using enable_if_t = typename std::enable_if::type;

// Streams a pair as "(first, second)".
template ::value, int> = 0>
std::ostream& operator<< (std::ostream& out, const std::pair& rhs) {
    out << "(" << rhs.first << ", " << rhs.second << ")";
    return out;
}

// Streams a vector as "[e0,e1,...]" with no space after the commas.
template ::value, int> = 0>
std::ostream& operator<< (std::ostream& out, const std::vector>& rhs) {
    out << "[";
    for (size_t i = 0; i < rhs.size(); ++i) {
        out << rhs[i];
        // No separator after the last element.
        if (i < rhs.size() - 1)
            out << ",";
    }
    out << "]";
    return out;
}

namespace LinearDesign {

namespace util {

// Splits `s` at every `delim` using getline on a stringstream and returns
// the pieces in order.  An empty `s` yields an empty result.
std::vector split(const std::string &s, char delim) {
    std::vector result;
    std::stringstream ss(s);
    std::string item;
    while (getline(ss, item, delim))
        result.push_back(item);
    return result;
}

// Lowest representable value of T (numeric_limits::lowest); T must be an
// integral or floating-point type.
template constexpr T value_min() {
    static_assert(std::is_integral::value || std::is_floating_point::value, "Int or float required.");
    return std::numeric_limits::lowest();
}

// Largest representable value of T (numeric_limits::max); T must be an
// integral or floating-point type.
template constexpr T value_max() {
    static_assert(std::is_integral::value || std::is_floating_point::value, "Int or float required.");
    return std::numeric_limits::max();
}

} /* util */

// template struct is_any;
// template <> struct is_any<> : std::false_type {};
// template struct is_any {
//     constexpr static bool value = First || is_any::value;
// };

// Hash functor for a pair whose first member is itself a pair: XORs the
// std::hash of the three components.
struct hash_pair_pair {
    template
    size_t operator()(const std::pair, T3>& p) const {
        auto hash1 = std::hash{}(p.first.first);
        auto hash2 = std::hash{}(p.first.second);
        auto hash3 = std::hash{}(p.second);
        return hash1 ^ hash2 ^ hash3;
    }
};

// Hash functor for a pair: XOR of the std::hash of both members.
struct hash_pair {
    template
    size_t operator()(const std::pair& p) const {
        auto hash1 = std::hash{}(p.first);
        auto hash2 = std::hash{}(p.second);
        return hash1 ^ hash2;
    }
};

}

namespace Hash {

// Folds the hash of `right` (shifted left by one bit) into `left_seed`
// with XOR.
template
LINEAR_DESIGN_INLINE size_t hash_combine(size_t left_seed, const T& right) {
    return 
left_seed ^ (std::hash{}(right) << 1); } template ::value - 1> struct TupleHashImpl { static size_t impl(size_t seed, const Tuple& tuple) { size_t h = hash_combine(seed, std::get(tuple)); return TupleHashImpl::impl(h, tuple); } }; template struct TupleHashImpl { static size_t impl(size_t seed, const Tuple& tuple) { return hash_combine(seed, std::get<0>(tuple)); } }; } template struct std::hash> { size_t operator()(const std::tuple& ts) const { return Hash::TupleHashImpl>::impl(0, ts); } }; template struct std::hash> { size_t operator()(const std::pair& p) const { size_t h = std::hash{}(p.first); return Hash::hash_combine(h, p.second); } }; #endif ================================================ FILE: src/Utils/codon.h ================================================ #ifndef codon_h #define codon_h #include #include #include #include #include #include #include #include #include #include "base.h" #include "constants.h" namespace LinearDesign { // trim from end (in place) static inline void rtrim(std::string &s) { s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base(), s.end()); } class Codon { public: Codon(const std::string& path) : codon_table_(), aa_table_() { std::ifstream codon_file; codon_file.open(path); if (codon_file.is_open()) { int index = 0; for (std::string line; getline(codon_file, line);){ rtrim(line); if(line.size() == 0 or line.empty()) continue; if (index++ == 0) continue; const auto line_split = util::split(line, ','); if(line_split.size() != 3){ std::cerr << "Wrong format of codon frequency file!" 
<< std::endl; exit(1); } const std::string codon = line_split[0]; const std::string aa = line_split[1]; const float fraction = std::stof(line_split[2]); codon_table_[codon] = make_pair(aa, fraction); aa_table_[aa].push_back(make_pair(codon, fraction)); if (!max_aa_table_.count(aa)) max_aa_table_[aa] = fraction; else max_aa_table_[aa] = fmax(max_aa_table_[aa], fraction); } codon_file.close(); if (codon_table_.size() != 64){ std::cerr << "Codon frequency file needs to contain 64 codons!" << std::endl; exit(1); } } else { std::cerr << "The input codon frequency file does not exist!" << std::endl; exit(1); } } float calc_cai(const std::string& rna_seq) const { if (rna_seq.length() % 3) throw std::runtime_error("invalid rna seq"); const int protein_length = static_cast(rna_seq.length() / 3); float cai = 0.0f; for (int index = 3; index < rna_seq.length() + 1; index += 3) { const std::string tri_letter = rna_seq.substr(index - 3, 3); const auto f_ci_aa = codon_table_.at(tri_letter); const auto f_c_max = max_aa_table_.at(f_ci_aa.first); float w_i = f_ci_aa.second / f_c_max; cai += log2f(w_i); } return exp2f(cai / protein_length); } std::string find_max_codon(const char aa, const std::string& match) const { auto candidate_condons = aa_table_.at(std::string(1, aa)); float max_score = 0; std::string max_condon; for (auto& candidate : candidate_condons) { if (std::regex_match(candidate.first, std::regex(match)) && candidate.second > max_score) { max_condon = candidate.first; max_score = candidate.second; } } if (max_condon.empty()) throw std::runtime_error("invald search"); // assert(codon_table_.at(max_condon).first == std::string(1, aa)); return max_condon; } std::string cvt_rna_seq_to_aa_seq(const std::string& rna_seq) const { if (rna_seq.length() % 3) throw std::runtime_error("invalid rna seq"); std::string aa_seq; aa_seq.reserve(rna_seq.length()); for (int index = 3; index < rna_seq.length() + 1; index += 3) { const std::string tri_letter = rna_seq.substr(index - 3, 3); 
auto aa = codon_table_.at(tri_letter).first;
            // Translation ends at the first "STOP" entry, recorded as '*'.
            if (aa == "STOP") {
                aa_seq.append("*");
                return aa_seq;
            }
            aa_seq.append(codon_table_.at(tri_letter).first);
        }
        return aa_seq;
    }

    // Returns the fraction stored for `codon`.
    //  - When `aa_tri` is a key of k_map_3_1, the codon is searched among
    //    the codons of the mapped one-letter amino acid; throws
    //    std::runtime_error("invalid codon") when it is not found.
    //  - Otherwise, when `aa_tri` is a key of three_prime_aa_table_, that
    //    entry's second member is returned.
    //  - Otherwise 0.0f.
    float get_weight(const std::string& aa_tri, const std::string& codon) const {
        if (k_map_3_1.count(aa_tri)) {
            auto codons = aa_table_.at(std::string(1, k_map_3_1[aa_tri]));
            auto it = std::find_if(codons.begin(), codons.end(), [codon](const std::pair& e){
                // std::cout << typeid(e).name() << '\n';
                return e.first == codon;
            });
            if (it == codons.end()) {
                throw std::runtime_error("invalid codon");
            }
            return it->second;
        } else if (three_prime_aa_table_.count(aa_tri)) {
            return three_prime_aa_table_.at(aa_tri).second;
        }
        return 0.0f;
    }

// private:
    // NOTE(review): the template arguments of the member types below are
    // missing in this copy -- confirm against the original codon.h.
    std::vector aux_aa_;
    std::map> three_prime_codon_table_;
    std::map> three_prime_aa_table_;
    std::map max_aa_table_;     // amino acid -> largest fraction among its codons
    std::map> codon_table_;     // codon -> (amino acid, fraction)
    std::map>> aa_table_;       // amino acid -> list of (codon, fraction)
};

}

#endif

================================================
FILE: src/Utils/common.h
================================================
#ifndef common_h
#define common_h

#include
#include
#include
#include
#include
#include
#include
#include
#include "base.h"

namespace LinearDesign {

using SizeType = size_t;
using ScoreType = int32_t;
using IndexType = int32_t; //if less than 10000, only int16_t is needed here
using NucType = int8_t;
using NumType = int32_t;
using NucPairType = int8_t;
using PairType = int8_t;
using FinalScoreType = double;
using NodeType = std::pair;
using NodeNucType = std::pair;
using NodeNucWType = std::tuple;

// Repeats the PairType alias declared above with the same type.
using PairType = int8_t;

enum class Manner : std::uint8_t {
    NONE = 0, // 0: empty
    H, // 1: hairpin candidate
    HAIRPIN, // 2: hairpin
    SINGLE, // 3: single
    HELIX, // 4: helix
    MULTI, // 5: multi = ..M2. 
// [30 restriction on the left and jump on the right]
    MULTI_eq_MULTI_plus_U, // 6: multi = multi + U
    P_eq_MULTI, // 7: P = (multi)
    M2_eq_M_plus_P, // 8: M2 = M + P
    M_eq_M2, // 9: M = M2
    M_eq_M_plus_U, // 10: M = M + U
    M_eq_P, // 11: M = P
    C_eq_C_plus_U, // 12: C = C + U
    C_eq_C_plus_P, // 13: C = C + P
};

// Identifies one of the five beams (C, P, MULTI, M2, M1).
enum class Beam_type : std::uint8_t {
    BEAM_C = 0,
    BEAM_P,
    BEAM_MULTI,
    BEAM_M2,
    BEAM_M1
};

// NOTE(review): the template parameter lists of State, DecoderResult and
// ScoreInnerDate are missing in this copy -- confirm against the original
// common.h.

// One search state.  `score` and `cai_score` start at util::value_min();
// `pre_node` and `pre_left_cai` have no initializer.
template
struct State {
    ScoreType score = util::value_min();
    double cai_score = util::value_min();
    NodeType pre_node;
    double pre_left_cai;
};

// A sequence together with its structure string.
struct BacktraceResult {
    std::string seq;
    std::string structure;
};

template >
struct DecoderResult {
    std::string sequence;
    std::string structure;
    ScoreType score;
    ScoreType cai;
    ScoreType old_cai;
    IndexType num_states;
};

template >
struct ScoreInnerDate {
    ScoreType newscore;
    NodeType j_node;
    NodeType i_node;
    int nuc_pair;
};

// Flat form of a node (first, second) together with a nucleotide-pair code.
struct NodeNucpair {
    IndexType node_first;
    NumType node_second;
    NucPairType nucpair;
};

}

#endif /* common_h */

================================================
FILE: src/Utils/constants.h
================================================
#ifndef constants_h
#define constants_h

#include

namespace LinearDesign {

constexpr uint8_t k_void_nuc = 127;

// static std::map k_map_1_3 = {
//     {'F',"Phe"},
//     {'L',"Leu"},
//     {'S',"Ser"},
//     {'Y',"Tyr"},
//     {'*',"STOP"},
//     {'C',"Cys"},
//     {'W',"Trp"},
//     {'P',"Pro"},
//     {'H',"His"},
//     {'Q',"Gln"},
//     {'R',"Arg"},
//     {'I',"Ile"},
//     {'M',"Met"},
//     {'T',"Thr"},
//     {'N',"Asn"},
//     {'K',"Lys"},
//     {'V',"Val"},
//     {'D',"Asp"},
//     {'E',"Glu"},
//     {'G',"Gly"},
//     {'A',"Ala"}
// };

// Three-letter amino-acid name (plus "STOP") -> one-letter code.
// Declared `static` and non-const, so lookups with operator[] are possible
// and would insert missing keys.
static std::map k_map_3_1 = {
    {"Phe", 'F'},
    {"Leu", 'L'},
    {"Ser", 'S'},
    {"Tyr", 'Y'},
    {"STOP", '*'},
    {"Cys", 'C'},
    {"Trp", 'W'},
    {"Pro", 'P'},
    {"His", 'H'},
    {"Gln", 'Q'},
    {"Arg", 'R'},
    {"Ile", 'I'},
    {"Met", 'M'},
    {"Thr", 'T'},
    {"Asn", 'N'},
    {"Lys", 'K'},
    {"Val", 'V'},
    {"Asp", 'D'},
    {"Glu", 'E'},
    {"Gly", 'G'},
    {"Ala", 'A'}
};

}

#endif /* constants_h */

================================================
FILE: 
src/Utils/flat.h
================================================
#ifndef flat_h
#define flat_h

#include
#include
#include "base.h"

// NOTE(review): template parameter/argument lists read as empty in this
// copy (e.g. `template >`, `std::vector;`, `static_cast(key)`) -- confirm
// against the original flat.h.

namespace detail {

// Default key-to-index mapping: a static_cast of the key.
template
struct DefaultIndex {
    inline size_t operator()(const Key key) const {
        return static_cast(key);
    }
};

}

// Vector-backed table addressed either by raw index or by a key that
// IndexFn turns into an index.  No bounds checking is performed.
template >
class Flat {
public:
    using self_type = Flat;
    using storage_type = std::vector;
    using key_type = Key;
    using reference = T&;
    using const_reference = const T&;
    using iterator = typename storage_type::iterator;

    LINEAR_DESIGN_INLINE iterator begin() { return data_.begin(); }
    LINEAR_DESIGN_INLINE iterator end() { return data_.end(); }
    // NOTE(review): always returns false, independent of data_ -- confirm
    // this is intended.
    LINEAR_DESIGN_INLINE bool empty() const { return false; }
    LINEAR_DESIGN_INLINE void reserve(const size_t n) { data_.reserve(n); }
    LINEAR_DESIGN_INLINE void resize(const size_t n) { data_.resize(n); }

    // Access by raw position in the underlying vector.
    template ::value, int> = 0>
    LINEAR_DESIGN_INLINE reference operator[](size_t index) {
        return data_[index];
    }

    template ::value, int> = 0>
    LINEAR_DESIGN_INLINE const_reference operator[](size_t index) const {
        return data_[index];
    }

    // Access by key: the key is mapped to a position through index_.
    LINEAR_DESIGN_INLINE reference operator[](key_type key) {
        return data_[index_(key)];
    }

    LINEAR_DESIGN_INLINE const_reference operator[](key_type key) const {
        return data_[index_(key)];
    }

    LINEAR_DESIGN_INLINE size_t size() const { return data_.size(); }

private:
    IndexFn index_;
    storage_type data_;
};

#endif /* flat_h */

================================================
FILE: src/Utils/network.h
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "utility_v.h"
#include "common.h"
#include "codon.h"

using namespace std;

// #define is_verbose

namespace LinearDesign {

// Graph whose nodes are grouped by the first member of the node pair and
// whose edges are stored twice: outgoing (right_edges) and incoming
// (left_edges).
template , typename NodeType = pair, typename NodeNucWType = tuple>
class Lattice {
public:
    unordered_map> nodes;
    unordered_map, hash_pair> left_edges;
    unordered_map, hash_pair> right_edges;

    Lattice(): nodes(), left_edges(), right_edges() {};

    void 
add_edge(NodeType n1, NodeType n2, NucType nuc, double weight = 0.0f){ right_edges[n1].push_back(make_tuple(n2, nuc, weight)); left_edges[n2].push_back(make_tuple(n1, nuc, weight)); } void add_node(NodeType n1){ IndexType pos = get<0>(n1); nodes[pos].push_back(n1); } }; template , typename NodeType = pair, typename NodeNucWType = tuple> class DFA { public: unordered_map> nodes; unordered_map, hash_pair> left_edges; unordered_map, hash_pair> right_edges; unordered_map, hash_pair>, hash_pair> auxiliary_left_edges; unordered_map, hash_pair>, hash_pair> auxiliary_right_edges; unordered_map, hash_pair> node_rightedge_weights; DFA(): nodes(), left_edges(), right_edges(), auxiliary_left_edges(), auxiliary_right_edges() {}; void add_edge(NodeType n1, NodeType n2, IndexType nuc, double weight = 0.0f){ right_edges[n1].push_back(make_tuple(n2, nuc, weight)); left_edges[n2].push_back(make_tuple(n1, nuc, weight)); auxiliary_right_edges[n1][n2].push_back(make_pair(nuc, weight)); auxiliary_left_edges[n2][n1].push_back(make_pair(nuc, weight)); node_rightedge_weights[n1][nuc] = weight; } void add_node(NodeType n1){ IndexType pos = get<0>(n1); nodes[pos].push_back(n1); } }; template , typename LatticeType = Lattice> unordered_map read_wheel(string const &filename) { unordered_map aa_graphs; ifstream inFile; inFile.open(filename); if (!inFile) { printf("Unable to open coding_wheel file\n"); exit(1); // call system to stop } vector stuff; vector option_splited; string aa; IndexType i; for (string line; getline(inFile, line);) { stuff = util::split(line, '\t'); aa = stuff[0]; LatticeType graph = LatticeType(); graph.add_node(make_pair(0,0)); // always initialize with node (0,0) char last_first = 0; vector::iterator iter = stuff.begin(); ++iter; // position 0 is aa name i = 0; while(iter != stuff.end()){ string option = *iter; option_splited = util::split(option, ' '); char first = option_splited[0][0]; char second = option_splited[1][0]; string thirds = option_splited[2]; NodeType n2 = 
make_pair(2, i); graph.add_node(n2); NodeType n1; if (first != last_first) { n1 = make_pair(1, i); graph.add_node(n1); graph.add_edge(make_pair(0, 0), n1, GET_ACGU_NUC(first)); } else { n1 = make_pair(1, i-1); } last_first = first; graph.add_edge(n1, n2, GET_ACGU_NUC(second)); for (auto& third : thirds) { graph.add_edge(n2, make_pair(0,0), GET_ACGU_NUC(third)); } i++; iter++; } aa_graphs[aa] = graph; #ifdef is_verbose printf("-----------------Lattice------------------------\n"); for(IndexType pos = 0; pos <= 2; pos++){ for(auto &node : graph.nodes[pos]){ IndexType p = get<0>(node); IndexType num = get<1>(node); printf("node, (%d, %d)\n", p, num); for(auto &item : graph.right_edges[node]){ NodeType n2 = get<0>(item); IndexType p2 = get<0>(n2); IndexType num2 = get<1>(n2); IndexType nuc = get<1>(item); double weight = get<2>(item); printf(" (%d, %d) -(%d,%lf)-> (%d, %d)\n", p, num, nuc, weight, p2, num2); } for(auto &item : graph.left_edges[node]){ NodeType n1 = get<0>(item); IndexType p1 = get<0>(n1); IndexType num1 = get<1>(n1); IndexType nuc = get<1>(item); double weight = get<2>(item); printf(" (%d, %d) <-(%d,%lf)- (%d, %d)\n", p1, num1, nuc, weight, p, num); } } } #endif } inFile.close(); return aa_graphs; }

// Reads the coding-wheel file and builds one weighted lattice per line.
//
// Each line is split on tabs: field 0 is used as the amino-acid key, every
// further field is an "option" that is split on spaces into a first
// nucleotide, a second nucleotide and a string of third nucleotides.
//
//   filename                - file opened for reading.
//   nodes_with_best_weight  - per amino acid, per node normaliser. Looked up
//                             with operator[] on the outer map, so an (empty)
//                             entry is created for every amino acid read.
//   edges_with_best_weight  - per amino acid, per (from, nuc, to) numerator;
//                             also accessed with operator[].
//   codon                   - get_weight(aa, three_nums) supplies the weight
//                             of the third-position edges.
// Returns the map from amino-acid key to its lattice.
// Throws std::runtime_error when the file cannot be opened.
//
// NOTE(review): the template parameter list and several container types in
// this block appear with their `<...>` arguments missing (e.g. `template ,`,
// `vector stuff`). This looks like extraction damage -- restore from the
// upstream header before compiling.
template , typename LatticeType = Lattice, typename NucType = IndexType, typename NodeNucNodeType = std::tuple>
unordered_map read_wheel_with_weights(const std::string& filename, std::unordered_map>& nodes_with_best_weight, std::unordered_map>>& edges_with_best_weight, const Codon& codon) {
    unordered_map aa_graphs;
    ifstream inFile;
    inFile.open(filename);
    if (!inFile)
        throw std::runtime_error("Unable to open coding_wheel file\n");

    vector stuff;
    vector option_splited;
    string aa;
    IndexType i;
    for (string line; getline(inFile, line);) {
        stuff = util::split(line, '\t');
        // NOTE(review): assumes util::split returns at least one field even
        // for an empty line -- confirm against util::split.
        aa = stuff[0];
        LatticeType graph = LatticeType();
        graph.add_node(make_pair(0,0)); // always initialize with node (0,0)
        char last_first = 0;
        vector::iterator iter = stuff.begin();
        ++iter; // position 0 is aa name
        i = 0;
        while(iter != stuff.end()){
            string option = *iter;
            option_splited = util::split(option, ' ');
            // No size checks: an option with fewer than three space-separated
            // fields, or an empty first/second field, indexes out of range.
            char first = option_splited[0][0];
            char second = option_splited[1][0];
            string thirds = option_splited[2];
            NodeType n2 = make_pair(2, i);
            graph.add_node(n2);
            NodeType n1;
            if (first != last_first) {
                // New first nucleotide: create node (1, i) and the edge from
                // the start node.
                n1 = make_pair(1, i);
                graph.add_node(n1);
                auto first_num = GET_ACGU_NUC(first);
                double weight = 0.0f;
                if (nodes_with_best_weight[aa].count(make_pair(0, 0))) {
                    weight = edges_with_best_weight[aa][make_tuple(make_pair(0, 0), first_num, n1)] / nodes_with_best_weight[aa][make_pair(0, 0)];
                }
                graph.add_edge(make_pair(0, 0), n1, first_num, weight);
            }
            else {
                // Same first nucleotide as the previous option: reuse (1, i-1).
                // NOTE(review): node (1, k) is only added in the branch above,
                // so (1, i-1) exists only if the previous option introduced a
                // new first nucleotide; three consecutive options sharing one
                // would reference a node that was never added.
                n1 = make_pair(1, i-1);
            }

            last_first = first;
            auto second_num = GET_ACGU_NUC(second);
            // Weight stays 0 when no normaliser is known for n1.
            double weight = 0.0f;
            if (nodes_with_best_weight[aa].count(n1)) {
                weight = edges_with_best_weight[aa][make_tuple(n1, second_num, n2)] / nodes_with_best_weight[aa][n1];
            }
            graph.add_edge(n1, n2, second_num, weight);

            for (auto& third : thirds) {
                // Full codon string used to query the codon weight.
                std::string three_nums = std::string(1, first) + std::string(1, second) + std::string(1, third);
                double weight = 0.0f;
                if (nodes_with_best_weight[aa].count(n2)) {
                    weight = codon.get_weight(aa, three_nums) / nodes_with_best_weight[aa][n2];
                } else {
                    // No normaliser for n2: use the raw codon weight.
                    weight = codon.get_weight(aa, three_nums);
                }
                // Third-position edges lead back to node (0,0).
                graph.add_edge(n2, make_pair(0,0), GET_ACGU_NUC(third), weight);
            }
            i++; iter++;
        }
        aa_graphs[aa] = graph;
    }
    inFile.close();
    return aa_graphs;
}

template , typename LatticeType = Lattice, typename NucType = IndexType, typename NodeNucNodeType = std::tuple> unordered_map read_wheel_with_weights_log(const std::string& filename, std::unordered_map>& nodes_with_best_weight, std::unordered_map>>& edges_with_best_weight, const Codon& codon, double lambda_) { unordered_map aa_graphs; ifstream inFile; inFile.open(filename); if (!inFile) throw std::runtime_error("Unable to open coding_wheel file\n"); vector stuff; vector option_splited; string aa; IndexType i; for (string line; getline(inFile, line);) {
stuff = util::split(line, '\t'); aa = stuff[0]; LatticeType graph = LatticeType(); graph.add_node(make_pair(0,0)); // always initialize with node (0,0) char last_first = 0; vector::iterator iter = stuff.begin(); ++iter; // position 0 is aa name i = 0; while(iter != stuff.end()){ string option = *iter; option_splited = util::split(option, ' '); char first = option_splited[0][0]; char second = option_splited[1][0]; string thirds = option_splited[2]; NodeType n2 = make_pair(2, i); graph.add_node(n2); NodeType n1; if (first != last_first) { n1 = make_pair(1, i); graph.add_node(n1); auto first_num = GET_ACGU_NUC(first); double weight = 1.0f; if (nodes_with_best_weight[aa].count(make_pair(0, 0))) { weight = lambda_ * log(edges_with_best_weight[aa][make_tuple(make_pair(0, 0), first_num, n1)] / nodes_with_best_weight[aa][make_pair(0, 0)]); } graph.add_edge(make_pair(0, 0), n1, first_num, weight); } else { n1 = make_pair(1, i-1); } last_first = first; auto second_num = GET_ACGU_NUC(second); double weight = 1.0f; if (nodes_with_best_weight[aa].count(n1)) { weight = lambda_ * log(edges_with_best_weight[aa][make_tuple(n1, second_num, n2)] / nodes_with_best_weight[aa][n1]); } graph.add_edge(n1, n2, second_num, weight); for (auto& third : thirds) { std::string three_nums = std::string(1, first) + std::string(1, second) + std::string(1, third); double weight = 1.0f; if (nodes_with_best_weight[aa].count(n2)) { weight = lambda_ * log(codon.get_weight(aa, three_nums) / nodes_with_best_weight[aa][n2]); } else { weight = lambda_ * log(codon.get_weight(aa, three_nums)); } graph.add_edge(n2, make_pair(0,0), GET_ACGU_NUC(third), weight); } i++; iter++; } aa_graphs[aa] = graph; } inFile.close(); return aa_graphs; }

// Builds the per-amino-acid codon lattices with log-scaled edge weights and
// derives best-path lookup tables from them. The three results are copied
// into the *_ret output parameters at the very end.
//
//   wheel_path - forwarded to read_wheel_with_weights and
//                read_wheel_with_weights_log (the file is read twice).
//   codon      - forwarded to both readers; its aa_table_ is iterated below.
//   lambda_    - forwarded unchanged to read_wheel_with_weights_log.
//
// NOTE(review): the template parameter list and most container types in this
// block appear with their `<...>` arguments missing (e.g. `template ,`,
// `std::unordered_map> nodes_with_best_weight`, `util::value_min()`). This
// looks like extraction damage -- restore from the upstream header before
// compiling.
template , typename NodeNucNodeType = std::tuple, typename WeightType = double, typename LatticeType = Lattice>
void prepare_codon_unit_lattice(const std::string& wheel_path, const Codon& codon,
        std::unordered_map& aa_graphs_with_ln_weights_ret,
        std::unordered_map, std::tuple, std::hash>>>& best_path_in_one_codon_unit_ret,
        std::unordered_map& aa_best_path_in_a_whole_codon_ret,
        double lambda_) {

    std::unordered_map> nodes_with_best_weight;
    std::unordered_map>> edges_with_best_weight;

    unordered_map aa_graphs_with_ln_weights;
    // First pass over the wheel: both best-weight maps are still empty here.
    unordered_map aa_graphs_with_weights = read_wheel_with_weights(wheel_path, nodes_with_best_weight, edges_with_best_weight, codon);

    // Fill the best-weight maps from the last lattice position back to the
    // first. The order of the three inner loops matters: positions 1 and 0
    // read the values written for their successors.
    for (auto& aa_aa_elem : aa_graphs_with_weights) {
        auto& aa = aa_aa_elem.first;
        auto& aa_elem = aa_aa_elem.second;
        // Position 2: best weight of a node is the max over its outgoing edge
        // weights; each edge is recorded with its own weight.
        // (operator[] default-inserts the node entry on first access.)
        for (auto& node_at_2 : aa_elem.nodes[2]) {
            for (auto& node_at_3_nuc_weight : aa_elem.right_edges[node_at_2]) {
                auto node_at_3 = std::get<0>(node_at_3_nuc_weight);
                auto nuc = std::get<1>(node_at_3_nuc_weight);
                auto weight = std::get<2>(node_at_3_nuc_weight);
                nodes_with_best_weight[aa][node_at_2] = max(nodes_with_best_weight[aa][node_at_2], weight);
                edges_with_best_weight[aa][make_tuple(node_at_2,nuc,node_at_3)] = weight;
            }
        }

        // Position 1: best weight is the max over the successors' best
        // weights; each edge is recorded with its target's best weight.
        for (auto& node_at_1 : aa_elem.nodes[1]) {
            for (auto& node_at_2_nuc_weight : aa_elem.right_edges[node_at_1]) {
                auto node_at_2 = std::get<0>(node_at_2_nuc_weight);
                auto nuc = std::get<1>(node_at_2_nuc_weight);
                nodes_with_best_weight[aa][node_at_1] = max(nodes_with_best_weight[aa][node_at_1], nodes_with_best_weight[aa][node_at_2]);
                edges_with_best_weight[aa][make_tuple(node_at_1,nuc,node_at_2)] = nodes_with_best_weight[aa][node_at_2];
            }
        }

        // Position 0: same rule as position 1, one step further back.
        for (auto& node_at_0 : aa_elem.nodes[0]) {
            for (auto& node_at_1_nuc_weight : aa_elem.right_edges[node_at_0]) {
                auto node_at_1 = std::get<0>(node_at_1_nuc_weight);
                auto nuc = std::get<1>(node_at_1_nuc_weight);
                nodes_with_best_weight[aa][node_at_0] = max(nodes_with_best_weight[aa][node_at_0], nodes_with_best_weight[aa][node_at_1]);
                edges_with_best_weight[aa][make_tuple(node_at_0,nuc,node_at_1)] = nodes_with_best_weight[aa][node_at_1];
            }
        }
    }

    // Second pass over the wheel, now with the best-weight maps populated.
    aa_graphs_with_ln_weights = read_wheel_with_weights_log(wheel_path, nodes_with_best_weight, edges_with_best_weight, codon, lambda_);

    // Per amino acid: (from node, to node) -> (log weight, nuc, nuc).
    // Unused nucleotide slots hold k_void_nuc.
    std::unordered_map, std::tuple, std::hash>>> best_path_in_one_codon_unit;

    for (auto& aa_graph : aa_graphs_with_ln_weights) {
        auto& aa = aa_graph.first;
        auto& graph = aa_graph.second;
        // Single-edge paths 0 -> 1: keep the edge with the largest log weight
        // for each node pair. The entry is seeded with util::value_min() so
        // the first edge seen always replaces it.
        for (auto& node_0 : graph.nodes[0]) {
            for (auto& node_1_nuc_log_w : graph.right_edges[node_0]) {
                auto node_1 = std::get<0>(node_1_nuc_log_w);
                auto nuc = std::get<1>(node_1_nuc_log_w);
                auto log_weight = std::get<2>(node_1_nuc_log_w);
                if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_0,node_1)))
                    best_path_in_one_codon_unit[aa][make_tuple(node_0,node_1)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc);
                double current_log_weight = std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_0,node_1)]);
                if (current_log_weight < log_weight) {
                    best_path_in_one_codon_unit[aa][make_tuple(node_0,node_1)] = make_tuple(log_weight,nuc,k_void_nuc);
                }
            }
        }

        // Single-edge paths 1 -> 2, same rule.
        for (auto& node_1 : graph.nodes[1]) {
            for (auto& node_2_nuc_log_w : graph.right_edges[node_1]) {
                auto node_2 = std::get<0>(node_2_nuc_log_w);
                auto nuc = std::get<1>(node_2_nuc_log_w);
                auto log_weight = std::get<2>(node_2_nuc_log_w);
                if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_1,node_2)))
                    best_path_in_one_codon_unit[aa][make_tuple(node_1,node_2)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc);
                double current_log_weight = std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_1,node_2)]);
                if (current_log_weight < log_weight) {
                    best_path_in_one_codon_unit[aa][make_tuple(node_1,node_2)] = make_tuple(log_weight,nuc,k_void_nuc);
                }
                // temp is never read afterwards.
                auto temp = best_path_in_one_codon_unit[aa][make_tuple(node_1,node_2)];
            }
        }

        // Single-edge paths 2 -> next node, same rule.
        for (auto& node_2 : graph.nodes[2]) {
            for (auto& node_3_nuc_log_w : graph.right_edges[node_2]) {
                auto node_3 = std::get<0>(node_3_nuc_log_w);
                auto nuc = std::get<1>(node_3_nuc_log_w);
                auto log_weight = std::get<2>(node_3_nuc_log_w);
                if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_2,node_3)))
                    best_path_in_one_codon_unit[aa][make_tuple(node_2,node_3)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc);
                double current_log_weight = std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_2,node_3)]);
                if (current_log_weight < log_weight) {
                    best_path_in_one_codon_unit[aa][make_tuple(node_2,node_3)] = make_tuple(log_weight,nuc,k_void_nuc);
                }
            }
        }

        // Two-edge paths 0 -> 1 -> 2: keep the pair of edges with the largest
        // summed log weight; both nucleotides are stored.
        for (auto& node_0 : graph.nodes[0]) {
            for (auto& node_1_nuc_0_log_weight_0 : graph.right_edges[node_0]) {
                auto& node_1 = std::get<0>(node_1_nuc_0_log_weight_0);
                auto& nuc_0 = std::get<1>(node_1_nuc_0_log_weight_0);
                auto log_weight_0 = std::get<2>(node_1_nuc_0_log_weight_0);
                for (auto& node_2_nuc_1_log_weight_1 : graph.right_edges[node_1]) {
                    auto& node_2 = std::get<0>(node_2_nuc_1_log_weight_1);
                    auto& nuc_1 = std::get<1>(node_2_nuc_1_log_weight_1);
                    auto log_weight_1 = std::get<2>(node_2_nuc_1_log_weight_1);
                    if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_0,node_2)))
                        best_path_in_one_codon_unit[aa][make_tuple(node_0,node_2)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc);
                    if (std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_0,node_2)]) < log_weight_0 + log_weight_1)
                        best_path_in_one_codon_unit[aa][make_tuple(node_0,node_2)] = make_tuple(log_weight_0 + log_weight_1, nuc_0, nuc_1);
                }
            }
        }

        // Two-edge paths 1 -> 2 -> next node, same rule.
        for (auto& node_1 : graph.nodes[1]) {
            for (auto& node_2_nuc_1_log_weight_1 : graph.right_edges[node_1]) {
                auto& node_2 = std::get<0>(node_2_nuc_1_log_weight_1);
                auto& nuc_1 = std::get<1>(node_2_nuc_1_log_weight_1);
                auto log_weight_1 = std::get<2>(node_2_nuc_1_log_weight_1);
                for (auto& node_3_nuc_2_log_weight_2 : graph.right_edges[node_2]) {
                    auto& node_3 = std::get<0>(node_3_nuc_2_log_weight_2);
                    auto& nuc_2 = std::get<1>(node_3_nuc_2_log_weight_2);
                    auto log_weight_2 = std::get<2>(node_3_nuc_2_log_weight_2);
                    if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_1,node_3)))
                        best_path_in_one_codon_unit[aa][make_tuple(node_1,node_3)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc);
                    if (std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_1,node_3)]) < log_weight_1 + log_weight_2)
                        best_path_in_one_codon_unit[aa][make_tuple(node_1,node_3)] = make_tuple(log_weight_1 + log_weight_2, nuc_1, nuc_2);
                }
            }
        }
    }

    // For every amino acid in codon.aa_table_, remember the entry whose
    // .second is largest.
    // max_path[aa] is default-inserted on first access, so an entry is only
    // selected if its .second exceeds that default (presumably 0 -- the mapped
    // type is not visible here).
    std::unordered_map max_path;
    std::unordered_map aa_best_path_in_a_whole_codon;
    for (auto& aa_path_weight : codon.aa_table_) {
        auto& aa = aa_path_weight.first; // char
        for (auto& path_weight : aa_path_weight.second) {
            if (max_path[aa] < path_weight.second) {
                max_path[aa] = path_weight.second;
                aa_best_path_in_a_whole_codon[aa] = path_weight.first;
            }
        }
    }

    // Publish the results through the output parameters (copy assignment).
    aa_graphs_with_ln_weights_ret = aa_graphs_with_ln_weights;
    best_path_in_one_codon_unit_ret = best_path_in_one_codon_unit;
    aa_best_path_in_a_whole_codon_ret = aa_best_path_in_a_whole_codon;
}

template , typename LatticeType = Lattice, typename DFAType = DFA> DFAType get_dfa(unordered_map aa_graphs, vector aa_seq) { DFAType dfa = DFAType(); NodeType newnode = make_pair(3 * static_cast(aa_seq.size()), 0); dfa.add_node(newnode); IndexType i = 0; IndexType i3; string aa; LatticeType graph; for(auto& item : aa_seq) { i3 = i * 3; aa = aa_seq[i]; graph = aa_graphs[aa]; for (IndexType pos = 0; pos <= 2; pos++) { for(auto& node : graph.nodes[pos]) { IndexType num = get<1>(node); newnode = make_pair(i3 + pos, num); dfa.add_node(newnode); for (auto& edge : graph.right_edges[node]) { NodeType n2 = get<0>(edge); IndexType nuc = get<1>(edge); num = get<1>(n2); NodeType newn2 = make_pair(i3 + pos + 1, num); dfa.add_edge(newnode, newn2, nuc, get<2>(edge)); } } } i++; } #ifdef is_verbose printf("-----------------DFA------------------------\n"); for(IndexType pos = 0; pos < 3 * static_cast(aa_seq.size()) + 1; pos++){ for(auto& node : dfa.nodes[pos]) { IndexType p = get<0>(node); IndexType num = get<1>(node); printf("node, (%d, %d)\n", p, num); for(auto &n2 : dfa.auxiliary_right_edges[node]){ IndexType p2 = get<0>(n2.first); IndexType num2 = get<1>(n2.first); for(auto nuc : n2.second){ printf(" (%d, %d) -(%d,%lf)-> (%d, %d)\n", p, num, get<0>(nuc),get<1>(nuc), p2, num2); } } for(auto &n1 :
dfa.auxiliary_left_edges[node]){ IndexType p1 = get<0>(n1.first); IndexType num1 = get<1>(n1.first); for(auto nuc : n1.second){ printf(" (%d, %d) <-(%d,%lf)- (%d, %d)\n", p1, num1, get<0>(nuc),get<1>(nuc), p, num); } } } } #endif return dfa; } } ================================================ FILE: src/Utils/reader.h ================================================ #ifndef fasta_h #define fasta_h #include #include #include #include #include "base.h" namespace LinearDesign { struct Reader { static bool cvt_to_seq(const string& from, string& to) { return false; } }; struct Fasta : public Reader { static map map_fasta; static bool cvt_to_seq(const string& fasta, string& nucs) { nucs.reserve(4 * fasta.length()); for(auto aa : fasta) { if (map_fasta.count(aa)) { nucs.append(map_fasta[aa] + " "); } else { cerr << "invalid protein sequence!\n" << endl; return false; } } nucs.pop_back(); return true; } }; map Fasta::map_fasta = { {'F',"Phe"}, {'L',"Leu"}, {'S',"Ser"}, {'Y',"Tyr"}, {'*',"STOP"}, {'C',"Cys"}, {'W',"Trp"}, {'P',"Pro"}, {'H',"His"}, {'Q',"Gln"}, {'R',"Arg"}, {'I',"Ile"}, {'M',"Met"}, {'T',"Thr"}, {'N',"Asn"}, {'K',"Lys"}, {'V',"Val"}, {'D',"Asp"}, {'E',"Glu"}, {'G',"Gly"}, {'A',"Ala"} }; template struct ReaderTraits { static bool cvt_to_seq(const string& from, string& to) { return T::cvt_to_seq(from, to); } }; } #endif /* fasta_h */ ================================================ FILE: src/Utils/utility_v.h ================================================ #ifndef FASTCKY_UTILITY_V_H #define FASTCKY_UTILITY_V_H #include #include #include #include #define NTP(x,y) (x==1? (y==4?5:0) : (x==2? (y==3?1:0) : (x==3 ? (y==2?2:(y==4?3:0)) : (x==4 ? (y==3?4:(y==1?6:0)) : 0)))) #define PTLN(x) (x==1? 2:((x==2 || x==3)? 3:(x==5)? 1:4)) #define PTRN(x) (x==2? 2:((x==1 || x==4)? 3:(x==6)? 
1:4)) #define NOTON 5 // NUM_OF_TYPE_OF_NUCS #define NOTOND 25 #define NOTONT 125 #define EXPLICIT_MAX_LEN 4 #define SINGLE_MIN_LEN 0 #define SINGLE_MAX_LEN 20 // NOTE: *must* <= sizeof(char), otherwise modify State::TraceInfo accordingly #define HAIRPIN_MAX_LEN 30 #define BULGE_MAX_LEN SINGLE_MAX_LEN #define INTERNAL_MAX_LEN SINGLE_MAX_LEN #define SYMMETRIC_MAX_LEN 15 #define ASYMMETRY_MAX_LEN 28 #define SPECIAL_HAIRPIN_SCORE_BASELINE -10000 extern bool _allowed_pairs[NOTON][NOTON]; #define MAXLOOP 30 #define GET_ACGU(x) ((x==1? 'A' : (x==2? 'C' : (x==3? 'G' : (x==4?'U': 'X'))))) #define GET_ACGU_NUC(x) ((x=='A'? 1 : (x=='C'? 2 : (x=='G'? 3 : (x=='U'?4: 0))))) #define HAIRPINTYPE(x) ((x==5?0 : (x==6?1 : (x==8?2 : 3)))) extern int func1(std::string& a, int8_t b); extern void func2(std::string& a, int b, std::vector& c, std::vector& d, std::vector& e); extern int func3(int a, int b, int c, int d, int e); extern int func4(int a, int b, int c, int d, int e, int f, int g); extern int func5(int a, int b, int c); extern int func6(int a, int b, int c, int d, int e, int f, int g, int h); extern int func7(int a, int b, int c, int d, int e, int h, int i); extern int func8(int a, int b); extern void func9(int a, int b); extern int func10(int a, int b, int c); extern int func11(int a, int b, int c, int d, int e, int f, int g, int h); extern int func12(int a, int b, int c, int d, int e, int f, int g = -1); extern int func13(int a, int b); extern int func14(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j, int k, int l); extern int func15(int a, int b, int c, int d, int e, int f, int g); #endif ================================================ FILE: src/backtrace_iter.cc ================================================ #include "beam_cky_parser.h" using namespace std; #define tetra_hex_tri -1 namespace LinearDesign { template BacktraceResult BeamCKYParser::backtrace(DFA_t& dfa, const State_t& state, NodeType end_node){ char sequence[seq_length+1]; 
memset(sequence, '.', seq_length); sequence[seq_length] = 0; char structure[seq_length+1]; memset(structure, '.', seq_length); structure[seq_length] = 0; bool no_backpointer; stack> stk; NodeType start_node = make_pair(0, 0); stk.push(make_tuple(start_node, end_node, state, Beam_type::BEAM_C, -1)); double epsilon = 1e-8; while(!stk.empty()) { tuple top = stk.top(); NodeType i_node = get<0>(top), j_node = get<1>(top); State_t& state = get<2>(top); Beam_type beam_type = get<3>(top); PairType curr_pair_nuc = get<4>(top); stk.pop(); IndexType i, j, p, q, hairpin_length; j = j_node.first; NucType nuci, nucj, nuci1, nucj_1; no_backpointer = true; int left_start, left_end, right_start, right_end; switch (beam_type) { case Beam_type::BEAM_C: if (j <= 0) continue; for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ auto j_1_node = std::get<0>(j_1_node_nucj_1); auto& c_state = bestC[j_1_node]; auto weight_nucj_1 = std::get<2>(j_1_node_nucj_1); auto cai_score = c_state.cai_score + weight_nucj_1; if (state.score == c_state.score && abs(state.cai_score - cai_score) < epsilon){ NucType nucj_1 = std::get<1>(j_1_node_nucj_1); stk.push(make_tuple(i_node, j_1_node, c_state, Beam_type::BEAM_C, curr_pair_nuc)); sequence[j-1] = GET_ACGU(nucj_1); no_backpointer = false; break; } } // C = C + P if(no_backpointer) { for (size_t c_node_nucpair_ = 0; c_node_nucpair_ < 16 * seq_length; ++c_node_nucpair_){ auto& p_state = bestP[j_node][c_node_nucpair_]; if (p_state.score == util::value_min()) continue; auto c_node_nucpair = reverse_index(c_node_nucpair_); auto c = c_node_nucpair.node_first; auto c_num = c_node_nucpair.node_second; auto pair_nuc = c_node_nucpair.nucpair; auto c_node = make_pair(c, c_num); auto nucc = PTLN(pair_nuc); auto nucj_1 = PTRN(pair_nuc); auto newscore = - func3(c, j-1, nucc, nucj_1, seq_length) + p_state.score; if (c > 0){ auto& c_state = bestC[c_node]; auto cai_score = c_state.cai_score + p_state.cai_score; if (state.score == c_state.score + newscore && 
abs(state.cai_score - cai_score) < epsilon){ stk.push(make_tuple(i_node, c_node, c_state, Beam_type::BEAM_C, curr_pair_nuc)); stk.push(make_tuple(c_node, j_node, p_state, Beam_type::BEAM_P, pair_nuc)); no_backpointer = false; break; } } else{ if (state.score == newscore && abs(state.cai_score - p_state.cai_score) < epsilon){ stk.push(make_tuple(c_node, j_node, p_state, Beam_type::BEAM_P, pair_nuc)); no_backpointer = false; break; } } } if (!no_backpointer) break; } assert(no_backpointer == false); // something wrong if no path matches break; case Beam_type::BEAM_P: i = i_node.first; j = j_node.first; nuci = PTLN(curr_pair_nuc); nucj_1 = PTRN(curr_pair_nuc); hairpin_length = j - i; for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ NucType new_nucj_1 = std::get<1>(j_1_node_nucj_1); if (new_nucj_1 != nucj_1) continue; auto j_1_node = std::get<0>(j_1_node_nucj_1); auto weight_nucj_1 = std::get<2>(j_1_node_nucj_1); #ifdef SPECIAL_HP if (hairpin_length == 5 or hairpin_length == 6 or hairpin_length == 8){ for(auto & seq_score_weight : hairpin_seq_score_cai[i_node][j_1_node][NTP(nuci, nucj_1)]){ auto seq = get<0>(seq_score_weight); auto pre_cal_score = get<1>(seq_score_weight); auto pre_cal_cai_score = get<2>(seq_score_weight); if (state.score == pre_cal_score && abs(state.cai_score - pre_cal_cai_score) < epsilon){ for(int c=0; c(i1_node_nuci); if (new_nuci != nuci) continue; auto i1_node = std::get<0>(i1_node_nuci); auto weight_nuci = std::get<2>(i1_node_nuci); // helix for (auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ NucType nucj_2 = std::get<1>(j_2_node_nucj_2); for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ NucType nuci1 = std::get<1>(i2_node_nuci1); auto pair_nuc = NTP(nuci1, nucj_2); NodeNucpair temp = {i1_node.first, i1_node.second, static_cast(pair_nuc)}; auto& p_state = bestP[j_1_node][temp]; auto newscore = - func14(i, j-1, i+1, j-2, nuci, nuci1, nucj_2, nucj_1, nuci, nuci1, nucj_2, nucj_1) + p_state.score; auto cai_score = p_state.cai_score + 
(weight_nuci + weight_nucj_1); if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ stk.push(make_tuple(i1_node, j_1_node, p_state, Beam_type::BEAM_P, pair_nuc)); sequence[i] = GET_ACGU(nuci); sequence[j-1] = GET_ACGU(nucj_1); structure[i] = '('; structure[j-1] = ')'; no_backpointer = false; break; } }if (!no_backpointer) break; }if (!no_backpointer) break; } if (!no_backpointer) break; // hairpin NodeNucpair temp = {i_node.first, LinearDesign::NumType(i_node.second), curr_pair_nuc}; if (state.score == bestH[j_1_node][temp].score){ //no need to check CAI score here for (auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ NucType nucj_2 = std::get<1>(j_2_node_nucj_2); auto j_2_node = std::get<0>(j_2_node_nucj_2); auto j_2 = j_2_node.first; auto weight_nucj_2 = std::get<2>(j_2_node_nucj_2); for (auto& i1_node_nuci : dfa.right_edges[i_node]){ NucType new_nuci = std::get<1>(i1_node_nuci); if (new_nuci != nuci) continue; auto i1_node = std::get<0>(i1_node_nuci); auto weight_nuci = get<2>(i1_node_nuci); for(auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ NucType nuci1 = std::get<1>(i2_node_nuci1); auto i2_node = std::get<0>(i2_node_nuci1); auto i2 = i2_node.first; auto weight_nuci1 = std::get<2>(i2_node_nuci1); if (j - 1 - i == 4 and (j_2_node.second != i2_node.second and dfa.nodes[i+2].size() == dfa.nodes[j-2].size())) continue; auto newscore = - func12(i, j-1, nuci, nuci1, nucj_2, nucj_1, tetra_hex_tri); auto cai_score = weight_nuci + weight_nuci1 + get_broken_codon_score(i2_node, j_2_node) + weight_nucj_2 + weight_nucj_1; if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ sequence[i] = GET_ACGU(nuci); sequence[i+1] = GET_ACGU(nuci1); sequence[j-2] = GET_ACGU(nucj_2); sequence[j-1] = GET_ACGU(nucj_1); structure[i] = '('; structure[j-1] = ')'; auto temp_string = get_nuc_from_dfa_cai(dfa, i2_node, j_2_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); int count = i2; for (auto & nuc : temp_string ){ 
sequence[count] = nuc; count++; } assert(count == j_2); no_backpointer = false; break; } }if (!no_backpointer) break; }if (!no_backpointer) break; }if (!no_backpointer) break; } } // single branch if (no_backpointer) { vector> right_seq; vector>, int, int, NodeType, NodeType, double, double, double, double, bool, NodeType>> q_node_nucs_list; for (IndexType q = j-1; q >= std::max(j - SINGLE_MAX_LEN - 1, i + 5); --q){ int right_start = -1; int right_end = -1; q_node_nucs_list.clear(); if (q == j-1){ for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ if (get<1>(j_1_node_nucj_1) != nucj_1) continue; auto q_node = get<0>(j_1_node_nucj_1); auto weight_nucj_1 = get<2>(j_1_node_nucj_1); for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ NodeType q_1_node = get<0>(q_1_node_nucq_1); auto nucq_1 = get<1>(q_1_node_nucq_1); double weight_nucq_1 = get<2>(q_1_node_nucq_1); right_seq.push_back(make_pair(j-1, nucj_1)); q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucj_1, nucq_1, right_seq, right_start, right_end, make_pair(-1,0), make_pair(-1,0), weight_nucq_1, 0., 0., weight_nucj_1, true, make_pair(-1,0))); right_seq.clear(); } } }else if(q == j-2){ for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ if (get<1>(j_1_node_nucj_1) != nucj_1) continue; auto j_1_node = get<0>(j_1_node_nucj_1); auto weight_nucj_1 = get<2>(j_1_node_nucj_1); for (auto& q_node_nucq : dfa.left_edges[j_1_node]){ auto q_node = get<0>(q_node_nucq); auto nucq = get<1>(q_node_nucq); auto weight_nucq = get<2>(q_node_nucq); for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ NodeType q_1_node = get<0>(q_1_node_nucq_1); auto nucq_1 = get<1>(q_1_node_nucq_1); double weight_nucq_1 = get<2>(q_1_node_nucq_1); right_seq.push_back(make_pair(q, nucq)); right_seq.push_back(make_pair(j-1, nucj_1)); q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucq, nucq, right_seq, right_start, right_end, make_pair(-1,0), make_pair(-1,0), weight_nucq_1, weight_nucq, 0., weight_nucj_1, false, 
j_1_node)); right_seq.clear(); } } } }else if(q == j-3){ for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ if (get<1>(j_1_node_nucj_1) != nucj_1) continue; auto j_1_node = get<0>(j_1_node_nucj_1); auto weight_nucj_1 = get<2>(j_1_node_nucj_1); for(auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ auto j_2_node = get<0>(j_2_node_nucj_2); auto nucj_2 = get<1>(j_2_node_nucj_2); auto weight_nucj_2 = get<2>(j_2_node_nucj_2); for (auto& q_node_nucq : dfa.left_edges[j_2_node]){ auto q_node = get<0>(q_node_nucq); auto nucq = get<1>(q_node_nucq); auto weight_nucq = get<2>(q_node_nucq); for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ NodeType q_1_node = get<0>(q_1_node_nucq_1); auto nucq_1 = get<1>(q_1_node_nucq_1); double weight_nucq_1 = get<2>(q_1_node_nucq_1); right_seq.push_back(make_pair(q, nucq)); right_seq.push_back(make_pair(j-2, nucj_2)); right_seq.push_back(make_pair(j-1, nucj_1)); q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucq, nucj_2, right_seq, right_start, right_end, make_pair(-1,0), make_pair(-1,0), weight_nucq_1, weight_nucq, weight_nucj_2, weight_nucj_1, false, j_1_node)); right_seq.clear(); } } } } } else if(q == j-4){ for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ if (get<1>(j_1_node_nucj_1) != nucj_1) continue; auto j_1_node = get<0>(j_1_node_nucj_1); auto weight_nucj_1 = get<2>(j_1_node_nucj_1); for(auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ auto j_2_node = get<0>(j_2_node_nucj_2); auto nucj_2 = get<1>(j_2_node_nucj_2); auto weight_nucj_2 = get<2>(j_2_node_nucj_2); for(auto& j_3_node_nucj_3 : dfa.left_edges[j_2_node]){ auto j_3_node = get<0>(j_3_node_nucj_3); for (auto& q_node_nucq : dfa.left_edges[j_3_node]){ auto q_node = get<0>(q_node_nucq); auto nucq = get<1>(q_node_nucq); auto weight_nucq = get<2>(q_node_nucq); for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ NodeType q_1_node = get<0>(q_1_node_nucq_1); auto nucq_1 = get<1>(q_1_node_nucq_1); double weight_nucq_1 = get<2>(q_1_node_nucq_1); 
right_seq.push_back(make_pair(q, nucq)); right_seq.push_back(make_pair(j-2, nucj_2)); right_seq.push_back(make_pair(j-1, nucj_1)); right_start = q+1; right_end = j-2; q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucq, nucj_2, right_seq, right_start, right_end, j_3_node, j_2_node, weight_nucq_1, weight_nucq, weight_nucj_2, weight_nucj_1, false, j_1_node)); right_seq.clear(); } } } } } } else{ for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ if (get<1>(j_1_node_nucj_1) != nucj_1) continue; auto j_1_node = get<0>(j_1_node_nucj_1); auto weight_nucj_1 = get<2>(j_1_node_nucj_1); for(auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ auto j_2_node = get<0>(j_2_node_nucj_2); auto nucj_2 = get<1>(j_2_node_nucj_2); auto weight_nucj_2 = get<2>(j_2_node_nucj_2); for (auto& q_node : dfa.nodes[q]){ for (auto& q1_node_nucq : dfa.right_edges[q_node]){ auto q1_node = get<0>(q1_node_nucq); auto nucq = get<1>(q1_node_nucq); auto weight_nucq = get<2>(q1_node_nucq); for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ NodeType q_1_node = get<0>(q_1_node_nucq_1); auto nucq_1 = get<1>(q_1_node_nucq_1); double weight_nucq_1 = get<2>(q_1_node_nucq_1); right_seq.push_back(make_pair(q, nucq)); right_seq.push_back(make_pair(j-2, nucj_2)); right_seq.push_back(make_pair(j-1, nucj_1)); right_start = q+1; right_end = j-2; q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucq, nucj_2, right_seq, right_start, right_end, q1_node, j_2_node, weight_nucq_1, weight_nucq, weight_nucj_2, weight_nucj_1,false, j_1_node)); right_seq.clear(); } } } } } } for (auto& q_node_nucs : q_node_nucs_list){ auto q_1_node = get<0>(q_node_nucs); auto q_node = get<1>(q_node_nucs); auto nucq_1 = get<2>(q_node_nucs); auto nucq = get<3>(q_node_nucs); auto nucj_2 = get<4>(q_node_nucs); auto right_seq = get<5>(q_node_nucs); auto right_start = get<6>(q_node_nucs); auto right_end = get<7>(q_node_nucs); auto right_start_node = get<8>(q_node_nucs); auto right_end_node = get<9>(q_node_nucs); 
auto weight_nucq_1 = get<10>(q_node_nucs); auto weight_nucq = get<11>(q_node_nucs); auto weight_nucj_2 = get<12>(q_node_nucs); auto weight_nucj_1 = get<13>(q_node_nucs); bool q_equ_j_1 = get<14>(q_node_nucs); auto j_1_node = get<15>(q_node_nucs); double weight_right = 0.0; double weight_left = 0.0; if(q_equ_j_1){ for (auto& i1_node_nuci : dfa.right_edges[i_node]){ NucType new_nuci = get<1>(i1_node_nuci); if (new_nuci != nuci) continue; auto i1_node = get<0>(i1_node_nuci); auto weight_nuci = get<2>(i1_node_nuci); auto p_list = next_list[nucq_1][i1_node]; for (auto &p_node_nucp : p_list){ auto p_node = get<0>(p_node_nucp); auto nucp = get<1>(p_node_nucp); auto p = p_node.first; PairType pair_nuc = NTP(nucp, nucq_1); if (p == i + 1) continue; // stack if (p - i + j - q - 2 > SINGLE_MAX_LEN) continue; NodeNucpair temp = {p_node.first, p_node.second, static_cast(pair_nuc)}; auto& p_state = bestP[q_node][temp]; auto newscore = - func14(i, j-1, p, q-1, nuci, nucp, nucq_1, nucj_1, nuci, nucp, nucq_1, nucj_1) + p_state.score; auto weight_left = weight_nuci + get_broken_codon_score(i1_node, p_node); auto cai_score = p_state.cai_score + (weight_left + weight_nucj_1); if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); sequence[i] = GET_ACGU(nuci); auto temp_i1_to_p_nucs = get_nuc_from_dfa_cai(dfa, i1_node, p_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); assert(temp_i1_to_p_nucs.size() == p - (i+1)); auto count = i+1; for (auto& nuc : temp_i1_to_p_nucs){ sequence[count] = nuc; count++; } assert(count == p); assert(right_seq.size() == 1); sequence[j-1] = GET_ACGU(right_seq[0].second); structure[i] = '('; structure[j-1] = ')'; no_backpointer = false; break; }if (!no_backpointer) break; }if (!no_backpointer) break; } }else{ for (auto& i1_node_nuci : dfa.right_edges[i_node]){ NucType new_nuci = get<1>(i1_node_nuci); if (new_nuci != nuci) continue; auto 
i1_node = get<0>(i1_node_nuci); auto weight_nuci = get<2>(i1_node_nuci); auto p_list = next_list[nucq_1][i1_node]; for (auto &p_node_nucp : p_list){ auto p_node = get<0>(p_node_nucp); auto nucp = get<1>(p_node_nucp); auto weight_nucp = get<2>(p_node_nucp); auto p = p_node.first; PairType pair_nuc = NTP(nucp, nucq_1); if (p - i + j - q - 2 > SINGLE_MAX_LEN) continue; NodeNucpair temp = {p_node.first, p_node.second, static_cast(pair_nuc)}; auto& p_state = bestP[q_node][temp]; auto newscore = 0; if (p == i+1){ newscore = - func14(i, j-1, p, q-1, nuci, nucp, nucj_2, nucj_1, nuci, nucp, nucq_1, nucq) + p_state.score; weight_right = get_broken_codon_score(q_node,j_1_node) + weight_nucj_1; auto cai_score = p_state.cai_score + (weight_nuci + weight_right); if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); sequence[i] = GET_ACGU(nuci); for(auto& idx_nucidx : right_seq){ IndexType idx = idx_nucidx.first; NucType nucidx = idx_nucidx.second; sequence[idx] = GET_ACGU(nucidx); } structure[i] = '('; structure[j-1] = ')'; auto temp_string = get_nuc_from_dfa_cai(dfa, q_node, j_1_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); int count = q; for (auto & nuc : temp_string ){ sequence[count] = nuc; count++; } assert(count == j-1); no_backpointer = false; break; } }else if(p == i+2){ for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ auto i2_node = get<0>(i2_node_nuci1); if (p_node != i2_node) continue; NucType nuci1 = get<1>(i2_node_nuci1); auto weight_nuci1 = get<2>(i2_node_nuci1); newscore = - func14(i, j-1, p, q-1, nuci, nuci1, nucj_2, nucj_1, nuci1, nucp, nucq_1, nucq) + p_state.score; weight_left = weight_nuci + weight_nuci1; weight_right = weight_nucq + get_broken_codon_score(right_start_node, right_end_node) + weight_nucj_2 + weight_nucj_1; auto cai_score = p_state.cai_score + (weight_left + weight_right); if (state.score == newscore && 
abs(state.cai_score - cai_score) < epsilon){ stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); sequence[i] = GET_ACGU(nuci); sequence[i+1] = GET_ACGU(nuci1); for(auto& idx_nucidx : right_seq){ IndexType idx = idx_nucidx.first; NucType nucidx = idx_nucidx.second; sequence[idx] = GET_ACGU(nucidx); } structure[i] = '('; structure[j-1] = ')'; auto temp_string = get_nuc_from_dfa_cai(dfa, right_start_node, right_end_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); int count = right_start; for (auto & nuc : temp_string ){ sequence[count] = nuc; count++; } assert(count == right_end); no_backpointer = false; break; } } }else if(p == i+3){ for (auto& p_1_node_nucp_1 : dfa.left_edges[p_node]){ auto p_1_node = get<0>(p_1_node_nucp_1); auto nucp_1 = get<1>(p_1_node_nucp_1); auto weight_nucp_1 = get<2>(p_1_node_nucp_1); for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ auto i2_node = get<0>(i2_node_nuci1); if (p_1_node != i2_node) continue; NucType nuci1 = get<1>(i2_node_nuci1); auto weight_nuci1 = get<2>(i2_node_nuci1); newscore = - func14(i, j-1, p, q-1, nuci, nuci1, nucj_2, nucj_1, nucp_1, nucp, nucq_1, nucq) + p_state.score; weight_left = weight_nuci + weight_nuci1 + weight_nucp_1; weight_right = weight_nucq + get_broken_codon_score(right_start_node,right_end_node) + weight_nucj_2 + weight_nucj_1; auto cai_score = p_state.cai_score + (weight_left + weight_right); if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); sequence[i] = GET_ACGU(nuci); sequence[i+1] = GET_ACGU(nuci1); sequence[p-1] = GET_ACGU(nucp_1); for(auto& idx_nucidx : right_seq){ IndexType idx = idx_nucidx.first; NucType nucidx = idx_nucidx.second; sequence[idx] = GET_ACGU(nucidx); } structure[i] = '('; structure[j-1] = ')'; auto temp_string = get_nuc_from_dfa_cai(dfa, right_start_node, right_end_node, protein, best_path_in_one_codon_unit, 
aa_best_path_in_a_whole_codon); int count = right_start; for (auto & nuc : temp_string ){ sequence[count] = nuc; count++; } assert(count == right_end); no_backpointer = false; break; } }if (!no_backpointer) break; } }else if(p == i+4){ for (auto& p_1_node_nucp_1 : dfa.left_edges[p_node]){ auto p_1_node = get<0>(p_1_node_nucp_1); auto nucp_1 = get<1>(p_1_node_nucp_1); auto weight_nucp_1 = get<2>(p_1_node_nucp_1); for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ auto i2_node = get<0>(i2_node_nuci1); NucType nuci1 = get<1>(i2_node_nuci1); auto weight_nuci1 = get<2>(i2_node_nuci1); for (auto& i3_node_nuci2 : dfa.right_edges[i2_node]){ auto i3_node = get<0>(i3_node_nuci2); if (i3_node != p_1_node) continue; auto nuci2 = get<1>(i3_node_nuci2); auto weight_nuci2 = get<2>(i3_node_nuci2); newscore = - func14(i, j-1, p, q-1, nuci, nuci1, nucj_2, nucj_1, nucp_1, nucp, nucq_1, nucq) + p_state.score; weight_left = weight_nuci + weight_nuci1 + weight_nuci2 + weight_nucp_1; weight_right = weight_nucq + get_broken_codon_score(right_start_node,right_end_node) + weight_nucj_2 + weight_nucj_1; auto cai_score = p_state.cai_score + (weight_left + weight_right); if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); sequence[i] = GET_ACGU(nuci); sequence[i+1] = GET_ACGU(nuci1); sequence[i+2] = GET_ACGU(nuci2); sequence[p-1] = GET_ACGU(nucp_1); for(auto& idx_nucidx : right_seq){ IndexType idx = idx_nucidx.first; NucType nucidx = idx_nucidx.second; sequence[idx] = GET_ACGU(nucidx); } structure[i] = '('; structure[j-1] = ')'; auto temp_string = get_nuc_from_dfa_cai(dfa, right_start_node, right_end_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); auto count = right_start; for (auto & nuc : temp_string ){ sequence[count] = nuc; count++; } assert(count == right_end); no_backpointer = false; break; } }if (!no_backpointer) break; }if (!no_backpointer) break; } }else{ 
for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ NucType nuci1 = get<1>(i2_node_nuci1); auto i2_node = get<0>(i2_node_nuci1); auto weight_nuci1 = get<2>(i2_node_nuci1); for (auto& p_1_node_nucp_1 : dfa.left_edges[p_node]){ auto nucp_1 = get<1>(p_1_node_nucp_1); auto p_1_node = get<0>(p_1_node_nucp_1); auto weight_nucp_1 = get<2>(p_1_node_nucp_1); newscore = - func14(i, j-1, p, q-1, nuci, nuci1, nucj_2, nucj_1, nucp_1, nucp, nucq_1, nucq) + p_state.score; weight_left = weight_nuci + weight_nuci1 + get_broken_codon_score(i2_node, p_1_node) + weight_nucp_1; weight_right = weight_nucq + get_broken_codon_score(right_start_node,right_end_node) + weight_nucj_2 + weight_nucj_1; auto cai_score = p_state.cai_score + (weight_left + weight_right); if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); sequence[i] = GET_ACGU(nuci); sequence[i+1] = GET_ACGU(nuci1); sequence[p-1] = GET_ACGU(nucp_1); auto temp_string = get_nuc_from_dfa_cai(dfa, i2_node, p_1_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); int count = i+2; for (auto & nuc : temp_string ){ sequence[count] = nuc; count++; } assert(count == p-1); for(auto& idx_nucidx : right_seq){ IndexType idx = idx_nucidx.first; NucType nucidx = idx_nucidx.second; sequence[idx] = GET_ACGU(nucidx); } structure[i] = '('; structure[j-1] = ')'; temp_string = get_nuc_from_dfa_cai(dfa, right_start_node, right_end_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); count = right_start; for (auto & nuc : temp_string ){ sequence[count] = nuc; count++; } assert(count == right_end); no_backpointer = false; break; } }if (!no_backpointer) break; } }if (!no_backpointer) break; }if (!no_backpointer) break; }if (!no_backpointer) break; } }if (!no_backpointer) break; } } // Multi if (no_backpointer){ NodeNucpair temp = {i_node.first, i_node.second, static_cast(curr_pair_nuc)}; auto& multi_state = 
bestMulti[j_node][temp]; auto newscore = multi_state.score - func15(i, j, nuci, -1, -1, nucj_1, seq_length); if (state.score == newscore && abs(state.cai_score - multi_state.cai_score) < epsilon){ stk.push(make_tuple(i_node, j_node, multi_state, Beam_type::BEAM_MULTI, curr_pair_nuc)); sequence[i] = GET_ACGU(nuci); sequence[j-1] = GET_ACGU(nucj_1); structure[i] = '('; structure[j-1] = ')'; no_backpointer = false; } } assert(no_backpointer == false); break; case Beam_type::BEAM_MULTI: nuci = PTLN(curr_pair_nuc); nucj_1 = PTRN(curr_pair_nuc); j = j_node.first; i = i_node.first; for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ auto j_1_node = get<0>(j_1_node_nucj_1); auto weight_nucj_1 = get<2>(j_1_node_nucj_1); NodeType q_node = state.pre_node; q = q_node.first; if(q == j - 1 and q_node != j_1_node) continue; if(q == j - 2 and dfa.nodes[q].size() == dfa.nodes[j-1].size() and q_node.second != j_1_node.second) continue; for (size_t p_node_ = 0; p_node_ < 2 * q; ++p_node_) { auto& temp_state = bestM2[q_node][p_node_]; if (temp_state.score == util::value_min()) continue; auto p_node = reverse_index2(p_node_); auto p = p_node.first; if(p <= i) continue; for (auto& i1_node_nuci : dfa.right_edges[i_node]){ auto i1_node = get<0>(i1_node_nuci); if(p == i + 1 and p_node != i1_node) continue; if(p == i + 2 and dfa.nodes[p].size() == dfa.nodes[i+1].size() and p_node.second != i1_node.second) continue; double weight_nuci = double(get<2>(i1_node_nuci)); auto& m2_state = bestM2[q_node][p_node]; auto cai_score = m2_state.cai_score + (weight_nuci + get_broken_codon_score(i1_node, p_node) + get_broken_codon_score(q_node, j_1_node) + weight_nucj_1); if (state.score == m2_state.score && abs(state.cai_score - cai_score) < epsilon){ stk.push(make_tuple(p_node, q_node, m2_state, Beam_type::BEAM_M2, -1)); auto temp_string = get_nuc_from_dfa_cai(dfa, i1_node, p_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); auto count = i+1; for (auto & nuc : temp_string ){ 
sequence[count] = nuc; count++; } assert(count == p); temp_string.clear(); temp_string = get_nuc_from_dfa_cai(dfa, q_node, j_1_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); count = q; for (auto & nuc : temp_string ){ sequence[count] = nuc; count++; } assert(count == j-1); no_backpointer = false; }if (!no_backpointer) break; }if (!no_backpointer) break; }if (!no_backpointer) break; } assert(no_backpointer == false); break; case Beam_type::BEAM_M2: // M2 = M + P i = i_node.first; j = j_node.first; for (size_t m_node_nucpair_ = 0; m_node_nucpair_ < 16 * j; ++m_node_nucpair_){ auto& p_state = bestP[j_node][m_node_nucpair_]; if (p_state.score == util::value_min()) continue; auto m_node_nucpair = reverse_index(m_node_nucpair_); auto m = m_node_nucpair.node_first; auto m_num = m_node_nucpair.node_second; auto m_node = make_pair(m, m_num); auto pair_nuc = m_node_nucpair.nucpair; if (m <= i+4) continue; // no sharpturn auto nucm = PTLN(pair_nuc); auto nucj_1 = PTRN(pair_nuc); auto newscore = - func6(-1, -1, -1, -1, nucm, nucj_1, -1, seq_length) + p_state.score; auto& m_state = bestM[m_node][i_node]; auto cai_score = m_state.cai_score + p_state.cai_score; if (state.score == m_state.score + newscore && state.cai_score == cai_score){ stk.push(make_tuple(i_node, m_node, m_state, Beam_type::BEAM_M1, -1)); stk.push(make_tuple(m_node, j_node, p_state, Beam_type::BEAM_P, pair_nuc)); no_backpointer = false; break; } if (!no_backpointer) break; } assert(no_backpointer == false); break; case Beam_type::BEAM_M1: // M = M + U for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ auto j_1_node = std::get<0>(j_1_node_nucj_1); auto weight_nucj_1 = std::get<2>(j_1_node_nucj_1); auto& m_state = bestM[j_1_node][i_node]; auto cai_score = m_state.cai_score + weight_nucj_1; if (state.score == m_state.score && abs(state.cai_score - cai_score) < epsilon) { NucType nucj_1 = std::get<1>(j_1_node_nucj_1); stk.push(make_tuple(i_node, j_1_node, m_state, Beam_type::BEAM_M1, 
-1)); sequence[j-1] = GET_ACGU(nucj_1); no_backpointer = false; break; } } // M = P if(no_backpointer){ for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ NucType nucj_1 = std::get<1>(j_1_node_nucj_1); for (auto& i1_node_nuci : dfa.right_edges[i_node]){ NucType nuci = std::get<1>(i1_node_nuci); PairType pair_nuc = NTP(nuci, nucj_1); NodeNucpair temp = {i_node.first, i_node.second, static_cast(pair_nuc)}; auto& p_state = bestP[j_node][temp]; auto newscore = - func6(-1, -1, -1, -1, nuci, nucj_1, -1, seq_length) + p_state.score; if (state.score == newscore && abs(state.cai_score - p_state.cai_score) < epsilon) { stk.push(make_tuple(i_node, j_node, p_state, Beam_type::BEAM_P, pair_nuc)); no_backpointer = false; break; } }if(!no_backpointer) break; } } // M = M2 if(no_backpointer){ auto& m2_state = bestM2[j_node][i_node]; if (state.score == m2_state.score && state.cai_score == m2_state.cai_score) { stk.push(make_tuple(i_node, j_node, m2_state, Beam_type::BEAM_M2, -1)); no_backpointer = false; } } assert(no_backpointer == false); break; default: // MANNER_NONE or other cases printf("wrong beam_type at %d, %d\n", i, j); fflush(stdout); assert(false); } } assert(string(sequence).size() == string(structure).size()); return {string(sequence), string(structure)}; } } ================================================ FILE: src/beam_cky_parser.cc ================================================ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "beam_cky_parser.h" #include "Utils/utility_v.h" #include "backtrace_iter.cc" #include "Utils/common.h" using namespace std; using NodeType = std::pair; #define tetra_hex_tri -1 namespace LinearDesign { template double BeamCKYParser::get_broken_codon_score( const NodeType& start_node, const NodeType& end_node) { IndexType s_index = start_node.first; IndexType t_index = end_node.first; if (s_index >= t_index) return 0.0; auto aa_left = protein[s_index / 
3]; // tri letter auto aa_right = protein[(int)(s_index / 3)]; if (t_index / 3 < protein.size()){ aa_right = protein[(int)(t_index / 3)]; } auto start_node_re_index = make_pair(s_index % 3, start_node.second); auto end_node_re_index = make_pair(t_index % 3, end_node.second); double ret = 0.0; if (t_index - s_index < 3) { if (s_index / 3 == t_index / 3) { ret = std::get<0>(best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index,end_node_re_index)]); }else{ double left_ln_cai = 0.0, right_ln_cai = 0.0; if (s_index % 3 != 0) left_ln_cai = std::get<0>(best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index,make_pair(0, 0))]); if (t_index % 3 != 0) right_ln_cai = std::get<0>(best_path_in_one_codon_unit[aa_right][make_tuple(make_pair(0, 0), end_node_re_index)]); ret = left_ln_cai + right_ln_cai; } }else{ double left_ln_cai = 0.0, right_ln_cai = 0.0; if (s_index % 3 != 0) left_ln_cai = std::get<0>(best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index,make_pair(0, 0))]); if (t_index % 3 != 0) right_ln_cai = std::get<0>(best_path_in_one_codon_unit[aa_right][make_tuple(make_pair(0, 0), end_node_re_index)]); ret = left_ln_cai + right_ln_cai; } return ret; } template template void BeamCKYParser::hairpin_beam(IndexType j, DFA_t& dfa) { auto j_node = make_pair(j,j_num); for (auto &j1_node_nucj : dfa.right_edges[j_node]) { // right_edges[j][j_num][j1_num][nuc]: false/true auto j1_node = std::get<0>(j1_node_nucj); auto nucj = std::get<1>(j1_node_nucj); auto weight_nucj = std::get<2>(j1_node_nucj); for (auto &j4_node : dfa.nodes[j+4]){ const auto& jnext_list = next_pair[nucj][j4_node]; if (jnext_list.empty()) continue; for (auto &jnext_node_nucjnext : jnext_list){ auto jnext_node = std::get<0>(jnext_node_nucjnext); auto nucjnext = std::get<1>(jnext_node_nucjnext); auto weight_nucjnext = std::get<2>(jnext_node_nucjnext); auto jnext = jnext_node.first; IndexType hairpin_length = jnext + 1 - j; //special hairpin NodeNucpair temp = {j, j_num, 
static_cast(NTP(nucj, nucjnext))}; #ifdef SPECIAL_HP if (hairpin_length == 5 or hairpin_length == 6 or hairpin_length == 8){ for(auto & seq_score_weight : hairpin_seq_score_cai[j_node][jnext_node][NTP(nucj, nucjnext)]){ auto seq = get<0>(seq_score_weight); auto pre_cal_score = get<1>(seq_score_weight); auto pre_cal_cai_score = get<2>(seq_score_weight); update_if_better(bestH[jnext_node][temp], pre_cal_score, pre_cal_cai_score); } continue; } #endif for (auto &j2_node_nucj1 : dfa.right_edges[j1_node]) { auto j2_node = std::get<0>(j2_node_nucj1); auto j2_num = j2_node.second; auto nucj1 = std::get<1>(j2_node_nucj1); auto weight_nucj1 = std::get<2>(j2_node_nucj1); for (auto& jnext_1_node_list : dfa.auxiliary_left_edges[jnext_node]){ NodeType jnext_1_node = jnext_1_node_list.first; NumType jnext_1_num = jnext_1_node.second; if (jnext - j == 4 and (jnext_1_num != j2_num and dfa.nodes[j+2].size() == dfa.nodes[jnext-1].size())) continue; for (auto& nucjnext_1_weight : jnext_1_node_list.second){ IndexType nucjnext_1 = get<0>(nucjnext_1_weight); auto weight_nucjnext_1 = get<1>(nucjnext_1_weight); auto newscore = - func12(j, jnext, nucj, nucj1, nucjnext_1, nucjnext, tetra_hex_tri); FinalScoreType cai_score = weight_nucj + weight_nucj1 + weight_nucjnext_1 + weight_nucjnext; //ZL need to add weight_nucjnext if ((jnext_1_node.first - j2_node.first) <= SINGLE_MAX_LEN) cai_score += get_broken_codon_score_map[j2_node][jnext_1_node]; else cai_score += get_broken_codon_score(j2_node,jnext_1_node); update_if_better(bestH[jnext_node][temp], newscore, cai_score); } } } } } } // for every state h in H[j] // 1. extend h(i, j) to h(i, jnext) // 2. 
generate p(i, j) for (size_t i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_) { if (bestH[j_node][i_node_nucpair_].score == util::value_min()) continue; auto i_node_nucpair = reverse_index(i_node_nucpair_); auto i = i_node_nucpair.node_first; auto i_num = i_node_nucpair.node_second; auto pair_nuc = i_node_nucpair.nucpair; auto i_node = make_pair(i,i_num); auto nuci = PTLN(pair_nuc); auto nucj = PTRN(pair_nuc); for (const auto& item : dfa.auxiliary_right_edges[j_node]){ auto j1_node = item.first; auto jnext_list = next_pair[nuci][j1_node]; if (jnext_list.empty()) continue; for (auto &jnext_node_nucjnext : jnext_list){ auto jnext_node = std::get<0>(jnext_node_nucjnext); auto nucjnext = std::get<1>(jnext_node_nucjnext); auto jnext = jnext_node.first; auto weight_nucjnext = std::get<2>(jnext_node_nucjnext); auto hairpin_length = jnext + 1 - i; NodeNucpair temp = {i, i_num, static_cast(NTP(nuci, nucjnext))}; #ifdef SPECIAL_HP if (hairpin_length == 5 or hairpin_length == 6 or hairpin_length == 8){ for(auto & seq_score_weight : hairpin_seq_score_cai[i_node][jnext_node][NTP(nuci, nucjnext)]){ auto seq = get<0>(seq_score_weight); auto pre_cal_score = get<1>(seq_score_weight); auto pre_cal_cai_score = get<2>(seq_score_weight); update_if_better(bestH[jnext_node][temp], pre_cal_score, pre_cal_cai_score); } continue; } #endif for (auto &i1_node_newnuci : dfa.right_edges[i_node]){ NucType newnuci = get<1>(i1_node_newnuci); if (nuci != newnuci) continue; NodeType i1_node = get<0>(i1_node_newnuci); double weight_newnuci = get<2>(i1_node_newnuci); for (auto &i2_node_nuci1 : dfa.right_edges[i1_node]) { auto i2_node = get<0>(i2_node_nuci1); auto nuci1 = get<1>(i2_node_nuci1); auto weight_nuci1 = get<2>(i2_node_nuci1); for (auto &jnext_1_node_nucjnext_1 : dfa.left_edges[jnext_node]) { auto jnext_1_node = get<0>(jnext_1_node_nucjnext_1); auto nucjnext_1 = get<1>(jnext_1_node_nucjnext_1); auto weight_nucjnext_1 = get<2>(jnext_1_node_nucjnext_1); auto newscore = - 
func12(i, jnext, nuci, nuci1, nucjnext_1, nucjnext, tetra_hex_tri); FinalScoreType cai_score = weight_newnuci + weight_nuci1 + weight_nucjnext_1 + weight_nucjnext; //move weight_nucjnext from H to P to here. Since we added SH here, so it must be here. if ((jnext_1_node.first - i2_node.first) <= SINGLE_MAX_LEN) cai_score += get_broken_codon_score_map[i2_node][jnext_1_node]; else cai_score += get_broken_codon_score(i2_node,jnext_1_node); update_if_better(bestH[jnext_node][temp], newscore, cai_score); } } } } } auto& state = bestH[j_node][i_node_nucpair_]; for (auto &j1_node_newnucj : dfa.right_edges[j_node]){ NucType newnucj = get<1>(j1_node_newnucj); if (nucj != newnucj) continue; NodeType j1_node = get<0>(j1_node_newnucj); update_if_better(bestP[j1_node][i_node_nucpair_], state.score, state.cai_score); } } } template template void BeamCKYParser::Multi_beam(IndexType j, DFA_t& dfa){ NodeType j_node = make_pair(j, j_num); for (size_t i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_){ auto& new_state_score = bestMulti[j_node][i_node_nucpair_]; if (new_state_score.score == util::value_min()) continue; auto i_node_nucpair = reverse_index(i_node_nucpair_); auto i = i_node_nucpair.node_first; auto i_num = i_node_nucpair.node_second; auto pair_nuc = i_node_nucpair.nucpair; auto nuci = PTLN(pair_nuc); auto nucj_1 = PTRN(pair_nuc); auto& jnext_list = next_pair[nuci][j_node]; if (!jnext_list.empty()){ for (auto &jnext_node_nucjnext : jnext_list){ auto jnext_node = std::get<0>(jnext_node_nucjnext); auto nucjnext = std::get<1>(jnext_node_nucjnext); auto weight_nucjnext = std::get<2>(jnext_node_nucjnext); auto jnext = jnext_node.first; for (auto &jnext1_node_newnucjnext : dfa.right_edges[jnext_node]){ auto jnext1_node = std::get<0>(jnext1_node_newnucjnext); auto newnucjnext = std::get<1>(jnext1_node_newnucjnext); if (newnucjnext == nucjnext){ double cai_score; if ((jnext_node.first - new_state_score.pre_node.first) <= SINGLE_MAX_LEN) cai_score = 
new_state_score.pre_left_cai + (get_broken_codon_score_map[new_state_score.pre_node][jnext_node] + weight_nucjnext); else cai_score = new_state_score.pre_left_cai + (get_broken_codon_score(new_state_score.pre_node, jnext_node) + weight_nucjnext); NodeNucpair temp = {i, i_num, static_cast(NTP(nuci, nucjnext))}; update_if_better(bestMulti[jnext1_node][temp], new_state_score.score, cai_score, new_state_score.pre_node, new_state_score.pre_left_cai); } } } } // 2. generate multi(i, j) -> p(i, j) auto newscore = new_state_score.score - func15(i, j, nuci, -1, -1, nucj_1, seq_length); // hzhang: TODO update_if_better(bestP[j_node][i_node_nucpair_], newscore, new_state_score.cai_score); } } template template void BeamCKYParser::P_beam(IndexType j, DFA_t& dfa){ auto j_node = make_pair(j, j_num); if (j < seq_length){ for (size_t i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_){ auto& state = bestP[j_node][i_node_nucpair_]; if (state.score == util::value_min()) continue; auto i_node_nucpair = reverse_index(i_node_nucpair_); auto i = i_node_nucpair.node_first; if (i <= 0) continue; auto i_num = i_node_nucpair.node_second; auto pair_nuc = i_node_nucpair.nucpair; auto i_node = make_pair(i, i_num); auto nuci = PTLN(pair_nuc); auto nucj_1 = PTRN(pair_nuc); // stacking for (auto &j1_node_nucj : dfa.right_edges[j_node]){ auto j1_node = std::get<0>(j1_node_nucj); auto nucj = std::get<1>(j1_node_nucj); auto weight_nucj = std::get<2>(j1_node_nucj); for (auto &i_1_node_nuci_1 : dfa.left_edges[i_node]){ auto i_1_node = std::get<0>(i_1_node_nuci_1); auto nuci_1 = std::get<1>(i_1_node_nuci_1); auto weight_nuci_1 = std::get<2>(i_1_node_nuci_1); auto outer_pair = NTP(nuci_1, nucj); if (_allowed_pairs[nuci_1][nucj]){ auto newscore = stacking_score[outer_pair-1][pair_nuc-1] + state.score; double cai_score = state.cai_score + (weight_nuci_1 + weight_nucj); NodeNucpair temp = {i_1_node.first, i_1_node.second, static_cast(NTP(nuci_1, nucj))}; update_if_better(bestP[j1_node][temp], 
newscore, cai_score); } } } // right bulge: ((...)..) for (auto &j1_node_list : dfa.auxiliary_right_edges[j_node]){ auto j1_node = j1_node_list.first; for (auto &i_1_node_nuci_1 : dfa.left_edges[i_node]){ auto i_1_node = std::get<0>(i_1_node_nuci_1); auto nuci_1 = std::get<1>(i_1_node_nuci_1); auto weight_nuci_1 = std::get<2>(i_1_node_nuci_1); auto q_list = next_list[nuci_1][j1_node]; for (auto& q_node_nucq : q_list){ auto q_node = std::get<0>(q_node_nucq); auto q_num = q_node.second; auto q = q_node.first; if (q-j > SINGLE_MAX_LEN) break; auto nucq = std::get<1>(q_node_nucq); auto weight_nucq = std::get<2>(q_node_nucq); auto outer_pair = NTP(nuci_1, nucq); for(auto& q1_node_list : dfa.auxiliary_right_edges[q_node]){ NodeType q1_node = q1_node_list.first; if(dfa.nodes[q].size() == 1 and dfa.nodes[q+1].size() == 2 and ((q1_node_list.second)[0]).first != nucq) continue; auto newscore = bulge_score[outer_pair-1][pair_nuc-1][q-j-1] + state.score; double cai_score; if ((q_node.first - j_node.first) <= SINGLE_MAX_LEN) cai_score = state.cai_score + (weight_nuci_1 + get_broken_codon_score_map[j_node][q_node] + weight_nucq); else cai_score = state.cai_score + (weight_nuci_1 + get_broken_codon_score(j_node, q_node) + weight_nucq); NodeNucpair temp = {i_1_node.first, i_1_node.second, static_cast(outer_pair)}; update_if_better(bestP[q1_node][temp], newscore, cai_score); break; } } } } // left bulge: (..(...)) for (auto &j1_node_nucj : dfa.right_edges[j_node]){ auto j1_node = std::get<0>(j1_node_nucj); auto nucj = std::get<1>(j1_node_nucj); auto weight_nucj = std::get<2>(j1_node_nucj); for (auto &i_1_node_list : dfa.auxiliary_left_edges[i_node]){ auto i_1_node = i_1_node_list.first; auto p_list = prev_list[nucj][i_1_node]; for (auto &p_node_nucp_1 : p_list){ auto p_node = std::get<0>(p_node_nucp_1); auto p_num = p_node.second; auto p = p_node.first; if (i-p > SINGLE_MAX_LEN) break; auto nucp_1 = std::get<1>(p_node_nucp_1); auto outer_pair = NTP(nucp_1, nucj); for(auto& 
p_1_node_new_nucp_1 : dfa.left_edges[p_node]){ NucType new_nucp_1 = std::get<1>(p_1_node_new_nucp_1); if(nucp_1 != new_nucp_1) continue; NodeType p_1_node = std::get<0>(p_1_node_new_nucp_1); auto weight_nucp_1 = std::get<2>(p_1_node_new_nucp_1); auto newscore = bulge_score[outer_pair-1][pair_nuc-1][i-p-1] + state.score; double cai_score; if ((i_node.first - p_node.first) <= SINGLE_MAX_LEN) cai_score = state.cai_score + (weight_nucp_1 + get_broken_codon_score_map[p_node][i_node] + weight_nucj); else cai_score = state.cai_score + (weight_nucp_1 + get_broken_codon_score(p_node, i_node) + weight_nucj); NodeNucpair temp = {p_1_node.first, (NumType)p_1_node.second, static_cast(outer_pair)}; update_if_better(bestP[j1_node][temp], newscore, cai_score); } } } } // internal loop for (auto &j1_node_dict : dfa.auxiliary_right_edges[j_node]){ auto j1_node = j1_node_dict.first; auto j1_num = j1_node.second; for (auto &i_1_node_nuci_1 : dfa.left_edges[i_node]){ auto i_1_node = std::get<0>(i_1_node_nuci_1); auto i_1_num = i_1_node.second; auto nuci_1 = std::get<1>(i_1_node_nuci_1); auto weight_nuci_1 = std::get<2>(i_1_node_nuci_1); for (IndexType p = i-1; p > max(i - SINGLE_MAX_LEN, 0); --p) {//ZL, i-(p-1)<=len => i - len < p vector> p_node_list; if (p == i - 1) p_node_list.push_back(i_1_node); else if (p == i - 2) // hzhang: N.B. 
add this p, i-1, i o--o--o for (auto &p_node_dict : dfa.auxiliary_left_edges[i_1_node]) p_node_list.push_back(p_node_dict.first); else p_node_list = dfa.nodes[p]; for (auto &p_node : p_node_list){ for (auto &p1_node_nucp : dfa.right_edges[p_node]){ auto p1_node = std::get<0>(p1_node_nucp); auto p1_num = p1_node.second; auto nucp = std::get<1>(p1_node_nucp); auto weight_nucp = std::get<2>(p1_node_nucp); if (p == i - 1 and nucp != nuci_1) continue; else if (p == i - 2 and p1_num != i_1_num) continue; else if (p == i - 3 and p1_num != i_1_num and dfa.nodes[p+1].size() == dfa.nodes[i-1].size()) continue; for (auto &p_1_node_nucp_1 : dfa.left_edges[p_node]){ auto p_1_node = std::get<0>(p_1_node_nucp_1); auto nucp_1 = std::get<1>(p_1_node_nucp_1); auto weight_nucp_1 = std::get<2>(p_1_node_nucp_1); auto q_list = next_list[nucp_1][j1_node]; for (auto &q_node_nucq : q_list){ auto q_node = std::get<0>(q_node_nucq); auto q_num = q_node.second; auto q = q_node.first; if (i-p+q-j > SINGLE_MAX_LEN) //check if q is still in the internal loop limit boundary. 
break; auto nucq = std::get<1>(q_node_nucq); auto weight_nucq = std::get<2>(q_node_nucq); for(auto& q1_node_list : dfa.auxiliary_right_edges[q_node]){ NodeType q1_node = q1_node_list.first; if(dfa.nodes[q].size() == 1 and dfa.nodes[q+1].size() == 2 and ((q1_node_list.second)[0]).first != nucq) continue; NodeNucpair temp = {p_1_node.first, p_1_node.second, static_cast(NTP(nucp_1, nucq))}; auto& BestP_val = bestP[q1_node][temp]; for(auto & nucj_weightj: j1_node_dict.second){ auto nucj = nucj_weightj.first; auto weight_nucj = nucj_weightj.second; if (q == j+1){ auto newscore = - func14(p-1, q, i, j-1, nucp_1, nucp, nucj, nucq, nuci_1, nuci, nucj_1, nucj) + state.score; double weight_left; if (p == i-1){ weight_left = weight_nucp_1 + weight_nucp; } else{ if (i_1_node.first - p1_node.first <= SINGLE_MAX_LEN) weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score_map[p1_node][i_1_node] + weight_nuci_1; else weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score(p1_node, i_1_node) + weight_nuci_1; } double cai_score = state.cai_score + (weight_left + weight_nucj + weight_nucq); //j+1 == q update_if_better(BestP_val, newscore, cai_score); }else if (q == j+2){ for(auto& q_1_node_list : dfa.auxiliary_left_edges[q_node]){ auto q_1_node = q_1_node_list.first; NumType q_1_num = q_1_node.second; if (q_1_num != j1_num) continue; for(auto & nucq_1_weight : q_1_node_list.second){ auto nucq_1 = nucq_1_weight.first; auto weight_nucq_1 = nucq_1_weight.second; auto newscore = - func14(p-1, q, i, j-1, nucp_1, nucp, nucq_1, nucq, nuci_1, nuci, nucj_1, nucj) + state.score; double weight_left; if (p == i-1){ weight_left = weight_nucp_1 + weight_nucp; } else{ // assert(p < i-1); if (i_1_node.first - p1_node.first <= SINGLE_MAX_LEN) weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score_map[p1_node][i_1_node] + weight_nuci_1; else weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score(p1_node, i_1_node) + weight_nuci_1; } auto cai_score = 
state.cai_score + (weight_left + weight_nucj + weight_nucq_1 + weight_nucq); update_if_better(BestP_val, newscore, cai_score); } if(dfa.nodes[q-1].size() == 2) break; } }else if (q == j + 3){ for(auto& q_1_node_list : dfa.auxiliary_left_edges[q_node]){ auto q_1_node = q_1_node_list.first; NumType q_1_num = q_1_node.second; if (q_1_num != j1_num and dfa.nodes[q-1].size() == dfa.nodes[j+1].size()) continue; for(auto & nucq_1_weight : q_1_node_list.second){ auto nucq_1 = nucq_1_weight.first; auto weight_nucq_1 = nucq_1_weight.second; auto newscore = - func14(p-1, q, i, j-1, nucp_1, nucp, nucq_1, nucq, nuci_1, nuci, nucj_1, nucj) + state.score; double weight_left; if (p == i-1){ weight_left = weight_nucp_1 + weight_nucp; } else{ // assert(p < i-1); if (i_1_node.first - p1_node.first <= SINGLE_MAX_LEN) weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score_map[p1_node][i_1_node] + weight_nuci_1; else weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score(p1_node, i_1_node) + weight_nuci_1; } double cai_score; if (q_1_node.first - j1_node.first <= SINGLE_MAX_LEN) cai_score = state.cai_score + (weight_left + weight_nucj + get_broken_codon_score_map[j1_node][q_1_node] + weight_nucq_1 + weight_nucq); else cai_score = state.cai_score + (weight_left + weight_nucj + get_broken_codon_score(j1_node, q_1_node) + weight_nucq_1 + weight_nucq); update_if_better(BestP_val, newscore, cai_score); } if(dfa.nodes[q-1].size() == 2) break; } }else{ for(auto& q_1_node_list : dfa.auxiliary_left_edges[q_node]){ auto q_1_node = q_1_node_list.first; for(auto & nucq_1_weight : q_1_node_list.second){ auto nucq_1 = nucq_1_weight.first; auto weight_nucq_1 = nucq_1_weight.second; auto newscore = - func14(p-1, q, i, j-1, nucp_1, nucp, nucq_1, nucq, nuci_1, nuci, nucj_1, nucj) + state.score; double weight_left; if (p == i-1){ weight_left = weight_nucp_1 + weight_nucp; } else{ // assert(p < i-1); if (i_1_node.first - p1_node.first <= SINGLE_MAX_LEN){ weight_left = weight_nucp_1 
+ weight_nucp + get_broken_codon_score_map[p1_node][i_1_node] + weight_nuci_1; } else weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score(p1_node, i_1_node) + weight_nuci_1; } double cai_score; if (q_1_node.first - j1_node.first <= SINGLE_MAX_LEN){ cai_score = state.cai_score + (weight_left + weight_nucj + get_broken_codon_score_map[j1_node][q_1_node] + weight_nucq_1 + weight_nucq); } else cai_score = state.cai_score + (weight_left + weight_nucj + get_broken_codon_score(j1_node, q_1_node) + weight_nucq_1 + weight_nucq); update_if_better(BestP_val, newscore, cai_score); } if(dfa.nodes[q-1].size() == 2) break; } } } } } } } } } } } } } // M = P and M_P = P for (size_t i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_){ auto& state = bestP[j_node][i_node_nucpair_]; if (state.score == util::value_min()) continue; auto i_node_nucpair = reverse_index(i_node_nucpair_); auto i = i_node_nucpair.node_first; auto i_num = i_node_nucpair.node_second; auto pair_nuc = i_node_nucpair.nucpair; auto i_node = make_pair(i, i_num); auto nuci = PTLN(pair_nuc); auto nucj_1 = PTRN(pair_nuc); if (i > 0 and j < seq_length){ auto M1_score = - func6(i, j-1, j-1, -1, nuci, nucj_1, -1, seq_length) + state.score; update_if_better(bestM[j_node][i_node], M1_score, state.cai_score); update_if_better(bestM_P[j_node][i_node], M1_score, state.cai_score); } } // M2 = M + M_P for (size_t i_node_ = 0; i_node_ < 2 * j; ++i_node_) { auto& state = bestM_P[j_node][i_node_]; auto i_node = reverse_index2(i_node_); auto i = i_node.first; if (state.score == util::value_min()) continue; if (i > 0 and j < seq_length){ for (size_t m_node = 0; m_node < 2 * i; ++m_node){ auto& m_new_state_score = bestM[i_node][m_node]; if (m_new_state_score.score == util::value_min()) continue; auto newscore = m_new_state_score.score + state.score; auto cai_score = m_new_state_score.cai_score + state.cai_score; update_if_better(bestM2[j_node][m_node], newscore, cai_score); } } } // C = C + P for (size_t 
i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_){ auto& state = bestP[j_node][i_node_nucpair_]; if (state.score == util::value_min()) continue; auto i_node_nucpair = reverse_index(i_node_nucpair_); auto i = i_node_nucpair.node_first; auto i_num = i_node_nucpair.node_second; auto pair_nuc = i_node_nucpair.nucpair; auto i_node = make_pair(i, i_num); auto nuci = PTLN(pair_nuc); auto nucj_1 = PTRN(pair_nuc); if (i > 0){ auto& prefix_C = bestC[i_node]; if (prefix_C.score != util::value_min()){ auto newscore = - func3(i, j-1, nuci, nucj_1, seq_length) + prefix_C.score + state.score; auto cai_score = prefix_C.cai_score + state.cai_score; update_if_better(bestC[j_node], newscore, cai_score); } } else{ auto newscore = - func3(0, j-1, nuci, nucj_1, seq_length) + state.score; update_if_better(bestC[j_node], newscore, state.cai_score); } } } template template void BeamCKYParser::M2_beam(IndexType j, DFA_t& dfa){ auto j_node = make_pair(j, j_num); for (size_t i_node_ = 0; i_node_ < 2 * j; ++i_node_) { auto& state = bestM2[j_node][i_node_]; if (state.score == util::value_min()) continue; auto i_node = reverse_index2(i_node_); auto i = i_node.first; // 1. 
multi-loop for (IndexType p = i-1; p >= max(i - SINGLE_MAX_LEN, 0); --p){ vector> p_node_list; if (p == i - 1) for(auto& p_node_dict : dfa.auxiliary_left_edges[i_node]) p_node_list.push_back(p_node_dict.first); else p_node_list = dfa.nodes[p]; for (auto &p_node : p_node_list){ for (auto &p1_node_nucp : dfa.right_edges[p_node]){ auto p1_node = std::get<0>(p1_node_nucp); auto nucp = std::get<1>(p1_node_nucp); auto weight_nucp = std::get<2>(p1_node_nucp); if(p == i - 1 and p1_node != i_node) continue; if(p == i - 2 and dfa.nodes[p+1].size() == dfa.nodes[i].size() and p1_node.second != i_node.second) continue; auto q_list = next_pair[nucp][j_node]; for (auto &q_node_nucq : q_list){ auto q_node = std::get<0>(q_node_nucq); auto nucq = std::get<1>(q_node_nucq); auto weight_nucq = std::get<2>(q_node_nucq); auto q = q_node.first; if (i - p + q - j - 1 > SINGLE_MAX_LEN) continue; //ZL, i-p-1+q-j auto outer_pair = NTP(nucp, nucq); for (auto &q1_node_newnucq : dfa.right_edges[q_node]){ auto newnucq = std::get<1>(q1_node_newnucq); if (newnucq == nucq) { auto q1_node = std::get<0>(q1_node_newnucq); double cai_score = state.cai_score + (weight_nucp + get_broken_codon_score_map[p1_node][i_node] + get_broken_codon_score_map[j_node][q_node] + weight_nucq); double temp_left_cai = state.cai_score + (weight_nucp + get_broken_codon_score_map[p1_node][i_node]); NodeNucpair temp = {p_node.first, p_node.second, static_cast(NTP(nucp, nucq))}; update_if_better(bestMulti[q1_node][temp], state.score, cai_score, j_node, temp_left_cai); break; } } } } } } // 2. 
M = M2 update_if_better(bestM[j_node][i_node], state.score, state.cai_score); } } template template void BeamCKYParser::M_beam(IndexType j, DFA_t& dfa) { auto j_node = make_pair(j, j_num); for (size_t i_node_ = 0; i_node_ < 2 * j; ++i_node_) { auto& state = bestM[j_node][i_node_]; if (state.score == util::value_min()) continue; auto i_node = reverse_index2(i_node_); for (auto &j1_node_nucj : dfa.right_edges[j_node]){ auto j1_node = std::get<0>(j1_node_nucj); auto nucj = std::get<1>(j1_node_nucj); auto weight_nucj = std::get<2>(j1_node_nucj); double cai_score = state.cai_score + weight_nucj; update_if_better(bestM[j1_node][i_node], state.score, cai_score); } } } template template void BeamCKYParser::C_beam(IndexType j, DFA_t& dfa) { // beam of C // C = C + U auto j_node = make_pair(j, j_num); auto& state = bestC[j_node]; for (auto &j1_node_nucj : dfa.right_edges[j_node]){ NodeType j1_node = std::get<0>(j1_node_nucj); IndexType nucj = std::get<1>(j1_node_nucj); auto weight_nucj = std::get<2>(j1_node_nucj); double cai_score = state.cai_score + (double)weight_nucj; update_if_better(bestC[j1_node], state.score, cai_score); } } template void BeamCKYParser::get_next_pair(DFA_t& dfa) { vector> temp_vector; for (NucType nuci = 0; nuci < NOTON; nuci++) { for (IndexType j = seq_length; j > 0; j--) { for (auto& j_node : dfa.nodes[j]) { for (auto& item : dfa.auxiliary_left_edges[j_node]) { NodeType j_1_node = item.first; temp_vector.clear(); for (auto& nuc_weight : item.second){ auto nuc = std::get<0>(nuc_weight); auto weight_nuc = std::get<1>(nuc_weight); if (_allowed_pairs[nuci][nuc]) temp_vector.push_back(make_tuple(j_1_node, nuc, weight_nuc)); } if(temp_vector.size() == 0){ if (next_pair[nuci][j_1_node].size() > 0 and next_pair[nuci][j_node].size() > 0) { // merge IndexType index1 = std::get<0>(next_pair[nuci][j_1_node][0]).first; IndexType index2 = std::get<0>(next_pair[nuci][j_node][0]).first; if(index1/3 == index2/3) 
next_pair[nuci][j_1_node].insert(next_pair[nuci][j_1_node].end(), next_pair[nuci][j_node].begin(), next_pair[nuci][j_node].end()); else if(index1 > index2){ next_pair[nuci][j_1_node].clear(); next_pair[nuci][j_1_node].insert(next_pair[nuci][j_1_node].end(), next_pair[nuci][j_node].begin(), next_pair[nuci][j_node].end()); } }else if (next_pair[nuci][j_node].size() > 0) next_pair[nuci][j_1_node].insert(next_pair[nuci][j_1_node].end(), next_pair[nuci][j_node].begin(), next_pair[nuci][j_node].end()); } else next_pair[nuci][j_1_node].insert(next_pair[nuci][j_1_node].end(), temp_vector.begin(), temp_vector.end()); } } } } } template void BeamCKYParser::get_next_pair_set() { for(NucType nuci=0; nuci<5; nuci++){ for (auto& j_node_vnuc : next_pair[nuci]) { NodeType j_node = j_node_vnuc.first; next_pair_set[nuci][j_node] = set>(j_node_vnuc.second.begin(), j_node_vnuc.second.end()); } } for(NucType nuci=0; nuci<5; nuci++){ for (auto& j_node_vnuc : next_pair_set[nuci]) { NodeType j_node = j_node_vnuc.first; next_pair[nuci][j_node].clear(); for(auto& item : next_pair_set[nuci][j_node]){ next_pair[nuci][j_node].push_back(item); } } } } template void BeamCKYParser::get_prev_pair(DFA_t& dfa) { vector> temp_vector; for (NucType nuci = 0; nuci < NOTON; nuci++) { for (IndexType j = 0; j < seq_length; j++) { for (auto& j_node : dfa.nodes[j]) { for (auto& item : dfa.auxiliary_right_edges[j_node]) { NodeType j1_node = item.first; temp_vector.clear(); for (auto& nuc_weight : item.second){ auto nuc = std::get<0>(nuc_weight); auto weight_nuc = std::get<1>(nuc_weight); if (_allowed_pairs[nuci][nuc]) temp_vector.push_back(make_tuple(j1_node, nuc, weight_nuc)); } if(temp_vector.size() == 0){ if (prev_pair[nuci][j1_node].size() > 0 and prev_pair[nuci][j_node].size() > 0) { // merge IndexType index1 = std::get<0>(prev_pair[nuci][j1_node][0]).first-1; IndexType index2 = std::get<0>(prev_pair[nuci][j_node][0]).first-1; if(index1/3 == index2/3) 
prev_pair[nuci][j1_node].insert(prev_pair[nuci][j1_node].end(), prev_pair[nuci][j_node].begin(), prev_pair[nuci][j_node].end()); else if(index1 < index2){ prev_pair[nuci][j1_node].clear(); prev_pair[nuci][j1_node].insert(prev_pair[nuci][j1_node].end(), prev_pair[nuci][j_node].begin(), prev_pair[nuci][j_node].end()); } }else if (prev_pair[nuci][j_node].size() > 0) prev_pair[nuci][j1_node].insert(prev_pair[nuci][j1_node].end(), prev_pair[nuci][j_node].begin(), prev_pair[nuci][j_node].end()); } else prev_pair[nuci][j1_node].insert(prev_pair[nuci][j1_node].end(), temp_vector.begin(), temp_vector.end()); } } } } } template void BeamCKYParser::get_prev_pair_set() { for(NucType nuci=0; nuci<5; nuci++){ for (auto& j_node_vnuc : prev_pair[nuci]) { NodeType j_node = j_node_vnuc.first; prev_pair_set[nuci][j_node] = set>(j_node_vnuc.second.begin(), j_node_vnuc.second.end()); } } for(NucType nuci=0; nuci<5; nuci++){ for (auto& j_node_vnuc : prev_pair_set[nuci]) { NodeType j_node = j_node_vnuc.first; prev_pair[nuci][j_node].clear(); for(auto& item : prev_pair_set[nuci][j_node]){ prev_pair[nuci][j_node].push_back(item); } } } } #ifdef SPECIAL_HP template void BeamCKYParser::special_hp(DFA_t& dfa, int8_t hairpin_length) { int8_t hairpin_type = HAIRPINTYPE(hairpin_length); vector> queue; vector> frontier; // vector for(IndexType i=0; i<=seq_length - hairpin_length; i++){ for(NodeType i_node : dfa.nodes[i]){ int count = hairpin_length; queue.clear(); queue.push_back(make_tuple(i_node, "", double(0.), i_node)); while(count > 0){ count --; frontier.clear(); for(auto& node_str : queue){ NodeType cur_node = std::get<0>(node_str); string cur_str = std::get<1>(node_str); double cur_lncai = std::get<2>(node_str); for(auto& node_nuc : dfa.right_edges[cur_node]){ NodeType new_node = std::get<0>(node_nuc); string new_str = cur_str + GET_ACGU(std::get<1>(node_nuc)); double new_total_lncai = cur_lncai + std::get<2>(node_nuc); frontier.push_back(make_tuple(new_node, new_str, new_total_lncai, 
cur_node)); } } queue.swap(frontier); } for(auto node_str : queue){ auto j_node = std::get<3>(node_str); auto temp_seq = std::get<1>(node_str); auto cai_score = std::get<2>(node_str); auto hairpin_length = temp_seq.size(); int8_t hairpin_type = HAIRPINTYPE(hairpin_length); NucType nuci = GET_ACGU_NUC(temp_seq[0]); NucType nucj = GET_ACGU_NUC(temp_seq[temp_seq.size() - 1]); auto temp_nucpair = NTP(nuci, nucj); ScoreType special_hairpin_score = func1(temp_seq, hairpin_type); if(special_hairpin_score == SPECIAL_HAIRPIN_SCORE_BASELINE){ auto newscore = - func12(0, hairpin_length - 1, GET_ACGU_NUC(temp_seq[0]), GET_ACGU_NUC(temp_seq[1]), GET_ACGU_NUC(temp_seq[hairpin_length-2]), GET_ACGU_NUC(temp_seq[hairpin_length-1]), tetra_hex_tri); hairpin_seq_score_cai[i_node][j_node][temp_nucpair].push_back(make_tuple(temp_seq, newscore, cai_score)); } else{ hairpin_seq_score_cai[i_node][j_node][temp_nucpair].push_back(make_tuple(temp_seq, special_hairpin_score, cai_score)); } } } } } #endif template void BeamCKYParser::preprocess(DFA_t& dfa) { vector> new_q_list, new_p_list; set visited; // next_list NodeType init_node = make_pair(0, 0); for (NucType nuci=1; nuci(q_node_nucq); auto q_num = q_node.second; auto q = q_node.first; auto nucq = std::get<1>(q_node_nucq); // q_node next_list[nuci][q_node].push_back(q_node_nucq); // q-1 is special for(auto& q_1_node_dict : dfa.auxiliary_left_edges[q_node]){ NodeType q_1_node = q_1_node_dict.first; next_list[nuci][q_1_node].push_back(q_node_nucq); } for(IndexType j=q-2; j>=max(0, q-SINGLE_MAX_LEN-1); j--) for(NodeType j_node : dfa.nodes[j]) next_list[nuci][j_node].push_back(q_node_nucq); for(auto& q1_node_list : dfa.auxiliary_right_edges[q_node]){ NodeType q1_node = q1_node_list.first; if(dfa.nodes[q].size() == 1 and dfa.nodes[q+1].size() == 2 and ((q1_node_list.second)[0]).first != nucq) continue; if(visited.find(q1_node) == visited.end()){ visited.insert(q1_node); new_q_list.insert(new_q_list.end(), next_pair[nuci][q1_node].cbegin(), 
next_pair[nuci][q1_node].cend()); } break; } } q_list.swap(new_q_list); } } // prev_list init_node = make_pair(seq_length, 0); for (NucType nucj=1; nucj(p_node_nucp_1); auto p_num = p_node.second; auto p = p_node.first; auto nucp_1 = std::get<1>(p_node_nucp_1); // p_node prev_list[nucj][p_node].push_back(p_node_nucp_1); // p+1 is special for(auto& p1_node_dict : dfa.auxiliary_right_edges[p_node]){ NodeType p1_node = p1_node_dict.first; prev_list[nucj][p1_node].push_back(p_node_nucp_1); } for(IndexType i=p+2; i<=min(seq_length, p+SINGLE_MAX_LEN+1); i++) for(NodeType i_node : dfa.nodes[i]) prev_list[nucj][i_node].push_back(p_node_nucp_1); for(auto& p_1_node_new_nucp_1 : dfa.left_edges[p_node]){ NucType new_nucp_1 = std::get<1>(p_1_node_new_nucp_1); if(nucp_1 != new_nucp_1) continue; NodeType p_1_node = std::get<0>(p_1_node_new_nucp_1); if(visited.find(p_1_node) == visited.end()){ visited.insert(p_1_node); new_p_list.insert(new_p_list.end(), prev_pair[nucj][p_1_node].cbegin(), prev_pair[nucj][p_1_node].cend()); } } } p_list.swap(new_p_list); } } // stacking energy computation int newscore; for(int8_t outer_pair=1; outer_pair<=6; outer_pair++){ auto nuci_1 = PTLN(outer_pair); auto nucq = PTRN(outer_pair); for(int8_t inner_pair=1; inner_pair<=6; inner_pair++){ auto nuci = PTLN(inner_pair); auto nucj_1 = PTRN(inner_pair); newscore = - func14(0, 1, 1, 0, nuci_1, nuci, nucj_1, nucq, nuci_1, nuci, nucj_1, nucq); stacking_score[outer_pair-1][inner_pair-1] = newscore; for (IndexType l=0; l<=SINGLE_MAX_LEN; l++){ newscore = - func14(0, l+2, 1, 0, nuci_1, nuci, nucj_1, nucq, nuci_1, nuci, nucj_1, nucq); bulge_score[outer_pair-1][inner_pair-1][l] = newscore; } } } #ifdef SPECIAL_HP // Triloops special_hp(dfa, 5); // Tetraloop37 special_hp(dfa, 6); // Hexaloops special_hp(dfa, 8); #endif } template DecoderResult BeamCKYParser::parse( DFA_t& dfa, Codon& codon, std::string& aa_seq, std::vector& p, std::unordered_map& aa_best_in_codon, std::unordered_map, std::tuple, std::hash>>>& 
best_path_in_one_codon, std::unordered_map>& aa_graphs_with_ln_weights) { protein = p; aa_graphs_with_ln_w = aa_graphs_with_ln_weights; aa_best_path_in_a_whole_codon = aa_best_in_codon; best_path_in_one_codon_unit = best_path_in_one_codon; seq_length = 3 * static_cast(aa_seq.size()); next_pair.resize(5); next_pair_set.resize(5); get_next_pair(dfa); get_next_pair_set(); prev_pair.resize(5); prev_pair_set.resize(5); get_prev_pair(dfa); get_prev_pair_set(); next_list.resize(5); prev_list.resize(5); stacking_score.resize(6, vector(6)); bulge_score.resize(6, vector>(6, vector(SINGLE_MAX_LEN+1))); preprocess(dfa); int reserved_size = (seq_length + 1) * 16; //node,nucpair int reserved_size2 = (seq_length + 1) * 2; //node bestH.resize(reserved_size2); bestP.resize(reserved_size2); bestM2.resize(reserved_size2); //slim signature, Liang Zhang bestMulti.resize(reserved_size2); bestM.resize(reserved_size2); //slim signature, Liang Zhang bestM_P.resize(reserved_size2); // hzhang: inter-state: P -> M bestC.resize(reserved_size2); //slim signature, Liang Zhang get_broken_codon_score_map.resize(reserved_size2); for (auto& e : get_broken_codon_score_map){ //slim signature, Liang Zhang e.resize(reserved_size2); for (auto& ee : e){ ee = util::value_min(); } } for (auto& ee : bestC){ ee.score = util::value_min(); ee.cai_score = util::value_min(); } for (auto& e : bestH){ e.resize(reserved_size); for (auto& ee : e){ ee.score = util::value_min(); ee.cai_score = util::value_min(); } } for (auto& e : bestP){ e.resize(reserved_size); for (auto& ee : e){ ee.score = util::value_min(); ee.cai_score = util::value_min(); } } for (auto& e : bestMulti){ //slim signature, Liang Zhang e.resize(reserved_size); for (auto& ee : e){ ee.score = util::value_min(); ee.cai_score = util::value_min(); } } for (auto& e : bestM2){ //slim signature, Liang Zhang e.resize(reserved_size2); for (auto& ee : e){ ee.score = util::value_min(); ee.cai_score = util::value_min(); } } for (auto& e : bestM){ //slim 
signature, Liang Zhang e.resize(reserved_size2); for (auto& ee : e){ ee.score = util::value_min(); ee.cai_score = util::value_min(); } } for (auto& e : bestM_P){ // hzhang e.resize(reserved_size2); for (auto& ee : e){ ee.score = util::value_min(); ee.cai_score = util::value_min(); } } for (IndexType i = 0; i <= seq_length; ++i) { for (auto & node_i : dfa.nodes[i]){ for (IndexType l = 0; l <= SINGLE_MAX_LEN; ++l){ auto j = i + l; if (j > seq_length) break; for (auto & node_j : dfa.nodes[j]){ get_broken_codon_score_map[node_i][node_j] = get_broken_codon_score(node_i, node_j); } } } } bestC[make_pair(0,0)].score = 0; bestC[make_pair(0,0)].cai_score = double(0.); for(const auto& node_nue_weight : dfa.right_edges[make_pair(0,0)]) { auto node = std::get<0>(node_nue_weight); auto weight_nue = std::get<2>(node_nue_weight); update_if_better(bestC[node], 0, weight_nue); } for (IndexType j = 0; j <= seq_length; ++j) { cout << "j=" << j << "\r" << flush; hairpin_beam<0>(j, dfa); hairpin_beam<1>(j, dfa); if (j == 0) continue; Multi_beam<0>(j, dfa); Multi_beam<1>(j, dfa); P_beam<0>(j, dfa); P_beam<1>(j, dfa); M2_beam<0>(j, dfa); M2_beam<1>(j, dfa); if (j < seq_length) { M_beam<0>(j, dfa); M_beam<1>(j, dfa); C_beam<0>(j, dfa); C_beam<1>(j, dfa); } } auto end_node = make_pair(seq_length, 0); auto viterbi = bestC[end_node]; auto backtrace_result = backtrace(dfa, viterbi, end_node); return DecoderResult{backtrace_result.seq, backtrace_result.structure, viterbi.score / -100.0, 0., viterbi.cai_score, seq_length}; } template BeamCKYParser::BeamCKYParser(const double lambda_value, const bool verbose) : lambda(lambda_value), is_verbose(verbose) { func9(0, 0); } } ================================================ FILE: src/beam_cky_parser.h ================================================ #pragma once #include #include #include #include #include #include #include #include #include "Utils/network.h" #include "Utils/codon.h" #include "Utils/flat.h" namespace LinearDesign { namespace detail { 
struct NodeNucIndex { LINEAR_DESIGN_INLINE size_t operator()(const NodeNucpair& node_nucpair) const { return (node_nucpair.node_first << 4) | (node_nucpair.node_second << 3) | node_nucpair.nucpair; } }; struct NodeNucReverseIndex { LINEAR_DESIGN_INLINE NodeNucpair operator()(const size_t index) const { NodeNucpair node_nucpair = {IndexType(index >> 4), NumType((index & 0xf) >> 3), NucPairType(index & 0x7)}; return node_nucpair; } }; struct NodeIndex { LINEAR_DESIGN_INLINE size_t operator()(const NodeType& node) const { return (node.first << 1) | node.second; } }; struct NodeNucReverseIndex2 { LINEAR_DESIGN_INLINE NodeType operator()(const size_t index) const { return {index >> 1, (index & 0x1)}; } }; } /* detail */ template , typename DFAType = DFA> string get_nuc_from_dfa_cai(DFAType& dfa, const NodeType& start_node, const NodeType& end_node, const std::vector& protein, std::unordered_map, std::tuple, std::hash>>>& best_path_in_one_codon_unit, std::unordered_map& aa_best_path_in_a_whole_codon) { IndexType s_index = start_node.first; IndexType t_index = end_node.first; if (s_index >= t_index) return ""; auto aa_left = protein[s_index / 3]; // tri letter auto aa_right = protein[t_index / 3]; auto start_node_re_index = make_pair(s_index % 3, start_node.second); auto end_node_re_index = make_pair(t_index % 3, end_node.second); if (t_index - s_index < 3) { if (s_index / 3 == t_index / 3) { std::string temp_seq = ""; auto& nucs = best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index, end_node_re_index)]; temp_seq.append(1, GET_ACGU(std::get<1>(nucs))); if (std::get<2>(nucs) != k_void_nuc) temp_seq.append(1, GET_ACGU(std::get<2>(nucs))); if (temp_seq.length() != end_node.first - start_node.first) { assert(false); } return temp_seq; } else { std::string temp_left = ""; std::string temp_right = ""; if (s_index % 3 != 0) { auto& nucs = best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index, make_pair(0, 0))]; temp_left.append(1, 
GET_ACGU(std::get<1>(nucs))); if (std::get<2>(nucs) != k_void_nuc) temp_left.append(1, GET_ACGU(std::get<2>(nucs))); } if (t_index % 3 != 0) { auto& nucs = best_path_in_one_codon_unit[aa_right][make_tuple(make_pair(0, 0), end_node_re_index)]; temp_right.append(1, GET_ACGU(std::get<1>(nucs))); if (std::get<2>(nucs) != k_void_nuc) temp_right.append(1, GET_ACGU(std::get<2>(nucs))); } assert((temp_left + temp_right).length() == end_node.first - start_node.first); return temp_left + temp_right; } } else { std::string temp_left = ""; std::string temp_mid = ""; std::string temp_right = ""; if (s_index % 3 != 0) { auto& nucs = best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index, make_pair(0, 0))]; temp_left.append(1, GET_ACGU(std::get<1>(nucs))); if (std::get<2>(nucs) != k_void_nuc) temp_left.append(1, GET_ACGU(std::get<2>(nucs))); } IndexType protein_start_index = s_index / 3; if (s_index % 3 != 0) protein_start_index++; IndexType protein_end_index = t_index / 3; if (protein_start_index != protein_end_index) { for (IndexType protein_index = protein_start_index; protein_index < protein_end_index; ++protein_index) { std::string nucs; auto aa_tri = protein[protein_index]; if (k_map_3_1.count(aa_tri)) { nucs = aa_best_path_in_a_whole_codon[std::string(1, k_map_3_1[aa_tri])]; } else if (aa_best_path_in_a_whole_codon.count(aa_tri)) { nucs = aa_best_path_in_a_whole_codon[aa_tri]; } else { assert(false); } for (auto nuc : nucs) { temp_mid.append(1, nuc); } } } if (t_index % 3 != 0) { auto& nucs = best_path_in_one_codon_unit[aa_right][make_tuple(make_pair(0, 0), end_node_re_index)]; temp_right.append(1, GET_ACGU(std::get<1>(nucs))); if (std::get<2>(nucs) != k_void_nuc) temp_right.append(1, GET_ACGU(std::get<2>(nucs))); } assert((temp_left + temp_mid + temp_right).length() == end_node.first - start_node.first); return temp_left + temp_mid + temp_right; } } template > class BeamCKYParser { public: using State_t = State; using DFA_t = DFA; using ScoreInnerDate_t = 
ScoreInnerDate; using NextPair_t = vector>, hash_pair>>; using NextPairSet_t = vector>, hash_pair>>; using PrefixScore_t = unordered_map; using BestX_t_CAI = Flat, detail::NodeIndex>; using BestM_t_CAI = Flat, detail::NodeIndex>; using BestC_t_CAI = Flat; using Broken_codon_t_CAI = Flat, detail::NodeIndex>; BeamCKYParser(const double lambda_value, const bool verbose); DecoderResult parse(DFA_t& dfa, Codon& codon, std::string& aa_seq, std::vector& p, std::unordered_map& aa_best_in_codon, std::unordered_map, std::tuple, std::hash>>>& best_path_in_one_codon, std::unordered_map>& aa_graphs_with_ln_weights); private: template void hairpin_beam(IndexType j, DFA_t& dfa); template void Multi_beam(IndexType j, DFA_t& dfa); template void P_beam(IndexType j, DFA_t& dfa); template void M2_beam(IndexType j, DFA_t& dfa); template void M_beam(IndexType j, DFA_t& dfa); template void C_beam(IndexType j, DFA_t& dfa); void update_if_better(State_t &state, const ScoreType newscore, const double cai_score) { if (state.score + state.cai_score < newscore + cai_score) { state.score = newscore; state.cai_score = cai_score; } } void update_if_better(State_t &state, const ScoreType newscore, const double cai_score, const NodeType pre_node, const double pre_left_cai) { if (state.score + state.cai_score < newscore + cai_score) { state.score = newscore; state.cai_score = cai_score; state.pre_node = pre_node; state.pre_left_cai = pre_left_cai; } } void get_next_pair(DFA_t& dfa); void get_next_pair_set(); void get_prev_pair(DFA_t& dfa); void get_prev_pair_set(); void preprocess(DFA_t& dfa); BacktraceResult backtrace(DFA_t& dfa, const State_t& state, NodeType end_node); ScoreType quickselect_partition(std::vector& scores, ScoreType lower, ScoreType upper); ScoreType quickselect(std::vector& scores, const ScoreType lower, const ScoreType upper, const IndexType k); double get_broken_codon_score(const NodeType& start_node, const NodeType& end_node); double lambda; bool is_verbose; IndexType 
seq_length; BestX_t_CAI bestH, bestP, bestMulti; BestM_t_CAI bestM2, bestM, bestM_P; // hzhang: bestM_P BestC_t_CAI bestC; detail::NodeNucReverseIndex reverse_index; detail::NodeNucReverseIndex2 reverse_index2; NextPair_t next_pair; NextPairSet_t next_pair_set; NextPair_t prev_pair; NextPairSet_t prev_pair_set; NextPair_t next_list; NextPair_t prev_list; vector>> bulge_score; vector> stacking_score; std::unordered_map> aa_graphs_with_ln_w; std::vector protein; std::unordered_map aa_best_path_in_a_whole_codon; std::unordered_map, std::tuple, std::hash>>> best_path_in_one_codon_unit; Broken_codon_t_CAI get_broken_codon_score_map; #ifdef SPECIAL_HP unordered_map>>, hash_pair>, hash_pair> hairpin_seq_score_cai; void special_hp(DFA_t& dfa, int8_t hairpin_length); #endif }; } ================================================ FILE: src/linear_design.cpp ================================================ #include #include "beam_cky_parser.h" #include "beam_cky_parser.cc" #include "Utils/reader.h" #include "Utils/common.h" #include "Utils/codon.h" // #ifndef CODON_TABLE // #define CODON_TABLE "./codon_usage_freq_table_human.csv" // #endif #ifndef CODING_WHEEL #define CODING_WHEEL "./coding_wheel.txt" #endif using namespace LinearDesign; template bool output_result(const DecoderResult& result, const double duration, const double lambda, const bool is_verbose, const Codon& codon, string& CODON_TABLE) { stringstream ss; if (is_verbose) ss << "Using lambda = " << (lambda / 100.) 
<< "; Using codon frequency table = " << CODON_TABLE << endl; ss << "mRNA sequence: " << result.sequence << endl; ss << "mRNA structure: " << result.structure << endl; ss << "mRNA folding free energy: " << std::setprecision(2) << fixed << result.score << " kcal/mol; mRNA CAI: " << std::setprecision(3) << fixed << codon.calc_cai(result.sequence) << endl; if (is_verbose) ss << "Runtime: " << duration << " seconds" << endl; cout << ss.str() << endl; return true; } void show_usage() { cerr << "echo SEQUENCE | ./lineardesign -l [LAMBDA]" << endl; cerr << "OR" << endl; cerr << "cat SEQ_FILE_OR_FASTA_FILE | ./lineardesign -l [LAMBDA]" << endl; } int main(int argc, char** argv) { // default args double lambda = 0.0f; bool is_verbose = false; string CODON_TABLE = "./codon_usage_freq_table_human.csv"; // parse args if (argc != 4) { show_usage(); return 1; }else{ lambda = atof(argv[1]); is_verbose = atoi(argv[2]) == 1; if (string(argv[3]) != ""){ CODON_TABLE = argv[3]; } } lambda *= 100.; // load codon table and coding wheel Codon codon(CODON_TABLE); std::unordered_map> aa_graphs_with_ln_weights; std::unordered_map, std::tuple, std::hash>>> best_path_in_one_codon_unit; std::unordered_map aa_best_path_in_a_whole_codon; prepare_codon_unit_lattice(CODING_WHEEL, codon, aa_graphs_with_ln_weights, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon, lambda); // main loop string aa_seq, aa_tri_seq; vector aa_seq_list, aa_name_list; // load input for (string seq; getline(cin, seq);){ if (seq.empty()) continue; if (seq[0] == '>'){ aa_name_list.push_back(seq); // sequence name if (!aa_seq.empty()) aa_seq_list.push_back(aa_seq); aa_seq.clear(); continue; }else{ rtrim(seq); aa_seq += seq; } } if (!aa_seq.empty()) aa_seq_list.push_back(aa_seq); // start design for(int i = 0; i < aa_seq_list.size(); i++){ if (aa_name_list.size() > i) cout << aa_name_list[i] << endl; auto& aa_seq = aa_seq_list[i]; // convert to uppercase transform(aa_seq.begin(), aa_seq.end(), aa_seq.begin(), 
::toupper); aa_tri_seq.clear(); if (is_verbose) cout << "Input protein: " << aa_seq << endl; if (!ReaderTraits::cvt_to_seq(aa_seq, aa_tri_seq)) continue; // init parser BeamCKYParser parser(lambda, is_verbose); auto protein = util::split(aa_tri_seq, ' '); // parse auto system_start = chrono::system_clock::now(); auto dfa = get_dfa(aa_graphs_with_ln_weights, util::split(aa_tri_seq, ' ')); auto result = parser.parse(dfa, codon, aa_seq, protein, aa_best_path_in_a_whole_codon, best_path_in_one_codon_unit, aa_graphs_with_ln_weights); auto system_diff = chrono::system_clock::now() - system_start; auto system_duration = chrono::duration(system_diff).count(); // output output_result(result, system_duration, lambda, is_verbose, codon, CODON_TABLE); #ifdef FINAL_CHECK if (codon.cvt_rna_seq_to_aa_seq(result.sequence) != aa_seq) { std::cerr << "Final Check Failed:" << std::endl; std::cerr << codon.cvt_rna_seq_to_aa_seq(result.sequence) << std::endl; std::cerr << aa_seq << std::endl; assert(false); } #endif } return 0; } ================================================ FILE: testseq ================================================ >seq1 MPNTLACP >seq2 MLDQVNKLKYPEVSLT*