Repository: microsoft/RedStone Branch: main Commit: 50b3bd9dcc6f Files: 130 Total size: 57.7 MB Directory structure: gitextract_ayw6h_qv/ ├── .github/ │ └── workflows/ │ └── codeql.yml ├── CODE_OF_CONDUCT.md ├── DomainSpecific/ │ ├── .gitignore │ ├── configs/ │ │ ├── cc_math_filter.CC-MAIN-2023-23.json │ │ ├── cc_openquestion_filter.CC-MAIN-2023-23.json │ │ ├── cc_warc_download.CC-MAIN-2023-23.json │ │ ├── cc_warc_filter.CC-MAIN-2023-23.json │ │ ├── cc_warc_to_wet.code.CC-MAIN-2023-23.json │ │ ├── cc_warc_to_wet.math.CC-MAIN-2023-23.json │ │ └── network_template.json │ ├── core/ │ │ ├── __init__.py │ │ ├── data.py │ │ ├── layer.py │ │ ├── layers/ │ │ │ ├── __init__.py │ │ │ ├── control/ │ │ │ │ ├── __init__.py │ │ │ │ ├── data_concat_layer.py │ │ │ │ ├── data_filter_layer.py │ │ │ │ ├── data_order_layer.py │ │ │ │ ├── data_partition_layer.py │ │ │ │ ├── data_sample_layer.py │ │ │ │ └── data_shuffle_layer.py │ │ │ ├── extract/ │ │ │ │ ├── __init__.py │ │ │ │ ├── build_index_layer.py │ │ │ │ ├── extract_article_layer.py │ │ │ │ └── search_index_layer.py │ │ │ ├── global_var.py │ │ │ ├── io/ │ │ │ │ ├── __init__.py │ │ │ │ ├── from_binary_file_layer.py │ │ │ │ ├── from_index_file_layer.py │ │ │ │ ├── from_jsonl_file_layer.py │ │ │ │ ├── from_line_file_layer.py │ │ │ │ ├── from_parquet_file_layer.py │ │ │ │ ├── from_warc_file_layer.py │ │ │ │ ├── from_wet_file_layer.py │ │ │ │ ├── to_binary_file_layer.py │ │ │ │ ├── to_index_file_layer.py │ │ │ │ ├── to_jsonl_file_layer.py │ │ │ │ ├── to_line_file_layer.py │ │ │ │ └── to_parquet_file_layer.py │ │ │ ├── network/ │ │ │ │ ├── __init__.py │ │ │ │ ├── download_bytes_from_blob_layer.py │ │ │ │ ├── download_bytes_from_internet_layer.py │ │ │ │ ├── download_file_from_blob_layer.py │ │ │ │ ├── download_file_from_internet_layer.py │ │ │ │ ├── download_starcoder_layer.py │ │ │ │ ├── download_url_list_layer.py │ │ │ │ ├── download_urls_from_website_layer.py │ │ │ │ ├── download_warc_file_layer.py │ │ │ │ ├── download_warc_indice_layer.py │ │ │ │ ├── upload_bytes_to_blob_layer.py │ │ │ │ └── upload_file_to_blob_layer.py │ │ │ ├── template_layer.py │ │ │ ├── transform/ │ │ │ │ ├── __init__.py │ │ │ │ ├── lsh_minhash_layer.py │ │ │ │ ├── math_filter_layer.py │ │ │ │ ├── mcq_filter_layer.py │ │ │ │ ├── minhash_tokens_layer.py │ │ │ │ ├── ngrams_layer.py │ │ │ │ ├── openquestion_filter_layer.py │ │ │ │ ├── tokenize_article_layer.py │ │ │ │ ├── warc_encode_layer.py │ │ │ │ ├── warc_filter_layer.py │ │ │ │ ├── warc_to_wet_layer.py │ │ │ │ └── wet_decode_layer.py │ │ │ └── util.py │ │ └── network.py │ ├── dependency/ │ │ ├── gpt_api.py │ │ ├── ia-hadoop-tools-jar-with-dependencies.jar │ │ ├── install.py │ │ ├── requirements.txt │ │ └── xsltml_2.0/ │ │ ├── cmarkup.xsl │ │ ├── entities.xsl │ │ ├── glayout.xsl │ │ ├── mmltex.xsl │ │ ├── scripts.xsl │ │ ├── tables.xsl │ │ └── tokens.xsl │ ├── readme.md │ ├── requirements.txt │ ├── resources/ │ │ ├── computation/ │ │ │ ├── batch_dca_eastus.yaml │ │ │ └── local.yaml │ │ ├── environment/ │ │ │ ├── amlt_sing.yaml │ │ │ └── local.yaml │ │ └── storage/ │ │ ├── llmstore.yaml │ │ └── local.yaml │ ├── sample_run.sh │ ├── submit.py │ ├── tools/ │ │ ├── __init__.py │ │ ├── submit_batch_job.py │ │ └── submit_local_job.py │ └── wrapper/ │ ├── __init__.py │ ├── interpreter.py │ ├── parser.py │ ├── runner.py │ └── utility/ │ ├── __init__.py │ ├── azure_env.py │ ├── cpu_count.py │ ├── load_yaml.py │ ├── logger.py │ └── save_yaml.py ├── GeneralDomain/ │ ├── .gitignore │ ├── README.md │ ├── pyproject.toml │ └── redstone_cc/ │ ├── __init__.py │ ├── __main__.py │ ├── algos/ │ │ ├── __init__.py │ │ ├── deduplication/ │ │ │ ├── __init__.py │ │ │ ├── minhash.py │ │ │ ├── sha1.py │ │ │ └── utils.py │ │ ├── fasttext_classifier.py │ │ ├── rule_based_filters/ │ │ │ ├── __init__.py │ │ │ ├── func/ │ │ │ │ ├── __init__.py │ │ │ │ ├── document.py │ │ │ │ ├── line.py │ │ │ │ └── repetition.py │ │ │ ├── model/ │ │ │ │ ├── __init__.py │ │ │ │ ├── document.py │ │ │ │ └── violations.py │ │ │ ├── ruleset/ │ │ │ │ ├── __init__.py │ │ │ │ ├── gopher.py │ │ │ │ └── refinedweb.py │ │ │ └── utils.py │ │ └── trafilatura_process.py │ ├── download_utils.py │ └── process.py ├── LICENSE ├── README.md ├── SECURITY.md └── SUPPORT.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/codeql.yml ================================================ # For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. # # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. # # ******** NOTE ******** # We have attempted to detect the languages in your repository. Please check # the `language` matrix defined below to confirm you have the correct set of # supported CodeQL languages. # name: "CodeQL Advanced" on: push: branches: [ "main" ] pull_request: branches: [ "main" ] schedule: - cron: '24 3 * * 5' jobs: analyze: name: Analyze (${{ matrix.language }}) # Runner size impacts CodeQL analysis time. To learn more, please see: # - https://gh.io/recommended-hardware-resources-for-running-codeql # - https://gh.io/supported-runners-and-hardware-resources # - https://gh.io/using-larger-runners (GitHub.com only) # Consider using larger runners or machines with greater resources for possible analysis time improvements. runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} permissions: # required for all workflows security-events: write # required to fetch internal or private CodeQL packs packages: read # only required for workflows in private repositories actions: read contents: read strategy: fail-fast: false matrix: include: - language: python build-mode: none # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' # Use `c-cpp` to analyze code written in C, C++ or both # Use 'java-kotlin' to analyze code written in Java, Kotlin or both # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages steps: - name: Checkout repository uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs # queries: security-extended,security-and-quality # If the analyze step fails for one of the languages you are analyzing with # "We were unable to automatically build your code", modify the matrix above # to set the build mode to "manual" for that language. Then modify this step # to build your code. # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - if: matrix.build-mode == 'manual' shell: bash run: | echo 'If you are using a "manual" build mode for one or more of the' \ 'languages you are analyzing, replace this with the commands to build' \ 'your code, for example:' echo ' make bootstrap' echo ' make release' exit 1 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Microsoft Open Source Code of Conduct This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). Resources: - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns ================================================ FILE: DomainSpecific/.gitignore ================================================ __pycache__/ dependency/models/ env_ready workspace ================================================ FILE: DomainSpecific/configs/cc_math_filter.CC-MAIN-2023-23.json ================================================ { "name": "cc_math_extraction", "description": "math extraction from cc parquet file - 202323.", "date": "20240513", "version": "1.0.0", "author": "yanghuan", "backend": "Native", "input": { "pq_name_list_file_path": { "type": "Mem_Str", "value": "{workspace_dir}/cc_pqs/raw/CC-MAIN-2023-23/pqs.CC-MAIN-2023-23.txt" }, "filtered_pq_name_list_file_path": { "type": "Mem_Str", "value": "{workspace_dir}/cc_pqs/math/CC-MAIN-2023-23/paths.{worker_id}.txt" } }, "output": { "filtered_pq_name_list_file_path": { "type": "Mem_Str" } }, "layer": { "layer01": { "type": "From_Line_File", "joint": "Default", "input": ["pq_name_list_file_path"], "output": ["pq_names"] }, "layer01_par": { "type": "Data_Partition", "joint": "Default", "input": ["pq_names"], "output": ["pq_names"] }, "layer01_sam": { "type": "Data_Sample", "joint": "Default", "param": { "N": -1 }, "input": ["pq_names"], "output": ["pq_names"] }, "layer02": { "type": "Math_Filter", "joint": "FlatMap", "param": { "INPUT_FOLDER": "{workspace_dir}/cc_pqs/raw/CC-MAIN-2023-23/", "OUTPUT_FOLDER": "{workspace_dir}/cc_pqs/math/CC-MAIN-2023-23/" }, "input": ["pq_names"], "output": ["filtered_pq_names"] }, "layer03": { "type": "To_Line_File", "joint": "Default", "input": ["filtered_pq_names", "filtered_pq_name_list_file_path"], "output": ["filtered_pq_name_list_file_path"] } } } ================================================ FILE: DomainSpecific/configs/cc_openquestion_filter.CC-MAIN-2023-23.json ================================================ { "name": "cc_openquestion_extraction", "description": "open question extraction from cc parquet file - 202323.", "date": "20240527", "version": "1.0.0", "author": "yanghuan", "backend": "Native", "input": { "pq_name_list_file_path": { "type": "Mem_Str", "value": "{workspace_dir}/cc_pqs/raw/CC-MAIN-2023-23/pqs.CC-MAIN-2023-23.txt" }, "filtered_pq_name_list_file_path": { "type": "Mem_Str", "value": "{workspace_dir}/cc_pqs/openquestion/CC-MAIN-2023-23/paths.{worker_id}.txt" } }, "output": { "filtered_pq_name_list_file_path": { "type": "Mem_Str" } }, "layer": { "layer01": { "type": "From_Line_File", "joint": "Default", "input": ["pq_name_list_file_path"], "output": ["pq_names"] }, "layer01_par": { "type": "Data_Partition", "joint": "Default", "input": ["pq_names"], "output": ["pq_names"] }, "layer01_sam": { "type": "Data_Sample", "joint": "Default", "param": { "N": -1 }, "input": ["pq_names"], "output": ["pq_names"] }, "layer02": { "type": "OpenQuestion_Filter", "joint": "FlatMap", "param": { "INPUT_FOLDER": "{workspace_dir}/cc_pqs/raw/CC-MAIN-2023-23/", "OUTPUT_FOLDER": "{workspace_dir}/cc_pqs/openquestion/CC-MAIN-2023-23/" }, "input": ["pq_names"], "output": ["filtered_pq_names"] }, "layer03": { "type": "To_Line_File", "joint": "Default", "input": ["filtered_pq_names", "filtered_pq_name_list_file_path"], "output": ["filtered_pq_name_list_file_path"] } } } ================================================ FILE: DomainSpecific/configs/cc_warc_download.CC-MAIN-2023-23.json ================================================ { "name": "cc_warc_download", "description": "download warc files for a specific cc snapshot - CC-MAIN-2023-23.", "date": "20231011", "version": "1.0.0", "author": "yanghuan", "backend": "Native", "input": { "warc_url_list_file_path": { "type": "Mem_Str", "value": "{workspace_dir}/urls.CC-MAIN-2023-23.txt" }, "success_warc_name_list_file_path": { "type": "Mem_Str", "value": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23/paths.{worker_id}.txt" }, "fail_warc_url_list_file_path": { "type": "Mem_Str", "value": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23/fail_urls.{worker_id}.txt" } }, "output": { "success_warc_name_list_file_path": { "type": "Mem_Str" }, "fail_warc_url_list_file_path": { "type": "Mem_Str" } }, "layer": { "layer01": { "type": "From_Line_File", "joint": "Default", "input": ["warc_url_list_file_path"], "output": ["warc_urls"] }, "layer01_par": { "type": "Data_Partition", "joint": "Default", "input": ["warc_urls"], "output": ["warc_urls"] }, "layer01_sam": { "type": "Data_Sample", "joint": "Default", "param": { "N": 1 }, "input": ["warc_urls"], "output": ["warc_urls"] }, "layer02": { "type": "Download_Warc_File", "joint": "Map", "param": { "DOWNLOAD_FOLDER": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23", "CONNECTS": 16, "TRIES": 3 }, "input": ["warc_urls"], "output": ["success_warc_names", "fail_warc_urls"] }, "layer03": { "type": "Data_Filter", "param": { "FILTERS": [null] }, "input": ["success_warc_names"], "output": ["success_warc_names"] }, "layer04": { "type": "To_Line_File", "joint": "Default", "input": ["success_warc_names", "success_warc_name_list_file_path"], "output": ["success_warc_name_list_file_path"] }, "layer05": { "type": "Data_Filter", "param": { "FILTERS": [null] }, "input": ["fail_warc_urls"], "output": ["fail_warc_urls"] }, "layer06": { "type": "To_Line_File", "joint": "Default", "input": ["fail_warc_urls", "fail_warc_url_list_file_path"], "output": ["fail_warc_url_list_file_path"] } } } ================================================ FILE: DomainSpecific/configs/cc_warc_filter.CC-MAIN-2023-23.json ================================================ { "name": "cc_warc_filter", "description": "filter html containing specific tags on warc files - CC-MAIN-2023-23.", "date": "20230825", "version": "1.0.0", "author": "yanghuan", "backend": "Native", "input": { "warc_name_list_file_path": { "type": "Mem_Str", "value": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23/paths.txt" }, "filtered_warc_name_list_file_path": { "type": "Mem_Str", "value": "{workspace_dir}/cc_filtered_warc/CC-MAIN-2023-23/paths.{worker_id}.txt" } }, "output": { "filtered_warc_name_list_file_path": { "type": "Mem_Str" } }, "layer": { "layer01": { "type": "From_Line_File", "joint": "Default", "input": ["warc_name_list_file_path"], "output": ["warc_names"] }, "layer01_par": { "type": "Data_Partition", "joint": "Default", "input": ["warc_names"], "output": ["warc_names"] }, "layer01_sam": { "type": "Data_Sample", "joint": "Default", "param": { "N": -1 }, "input": ["warc_names"], "output": ["warc_names"] }, "layer02": { "type": "Warc_Filter", "joint": "FlatMap", "param": { "INPUT_FOLDER": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23", "OUTPUT_FOLDER": "{workspace_dir}/cc_filtered_warc/CC-MAIN-2023-23/", "TAGS": ["= 100) class Data: """ Data class (Deprecated). """ def __init__(self, type=DataType.Mem_Any, value=None): self.type = type if isinstance(type, DataType) else DataType[type] self.value = value if __name__ == "__main__": data = Data() print(data) ================================================ FILE: DomainSpecific/core/layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback from enum import Enum from tqdm import tqdm from core.layers import LayerType, LayerType2Func class JointType(Enum): Default = 0 # Only process data as whole (frequently used in data IO and control layers). Map = 1 # Firstly split data list into data unit, then process data unit to any type, finnaly return the list of processed data unit. FlatMap = 2 # Firstly split data list into data unit, then process data unit to list type, then concat the whole processed data lists, finnally return the concated data list. class Layer: def __init__(self, type, joint=JointType.Default, repetition=1, param=dict(), input_names=list(), output_names=list()): self.type = type if isinstance(type, LayerType) else LayerType[type] self.func, self.input_types, self.output_types, self.enabled = LayerType2Func[self.type] self.joint = joint if isinstance(joint, JointType) else JointType[joint] self.repetition = repetition self.param = param self.input_names = input_names self.output_names = output_names def __call__(self, inputs, worker_id=0, worker_num=1, variables=dict()): outputs = list() try: variables["worker_id"] = worker_id variables["worker_num"] = worker_num if not isinstance(inputs, list): raise Exception(f"The inputs of layer should be list data type.") if len(inputs) != len(self.input_types): raise Exception(f"The number of inputs is not {len(self.input_types)}.") for i, (data, input_type) in enumerate(zip(inputs, self.input_types)): # TODO: add the check of input type. # check the data type of input. #if data.type != DataType[input_type]: # raise Exception(f"The {i}th data, whose type is {data.type.name}, does not match the input type {input_type}") # Condition of empty input. if data is None: outputs = [None for _ in self.output_types] return outputs # TODO: to address the situation of repetition > 1. for i in range(self.repetition): if self.joint == JointType.Default: values = list(self.func(*inputs, variables, **self.param)) else: n = min([len(data) for data in inputs]) if n != max([len(data) for data in inputs]): raise Exception(f"Element amount of input datas are not equal.") values = [[] for _ in self.output_types] for i in tqdm(range(n), desc=f"Layer: {self.type.name}, worker_id: {worker_id}/{worker_num}"): _values = self.func(*[data[i] for data in inputs], variables, **self.param) for value, _value in zip(values, _values): if _value is None: continue if self.joint == JointType.Map: value.append(_value) elif self.joint == JointType.FlatMap: if not isinstance(_value, list): raise Exception(f"The output of layer should be list data type.") value.extend(_value) else: raise Exception(f"Using unsupported joint type for {self.type.name} layer.") outputs = values except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return outputs if __name__ == "__main__": inputs = [["a", "b", "c", "d", "e"]] layer = Layer(LayerType.Data_Sample, param={"N": 2}) outputs = layer(inputs) print(layer) ================================================ FILE: DomainSpecific/core/layers/__init__.py ================================================ from enum import Enum from ..data import DataType from .template_layer import template_layer # Control layers from .control import * # Network (download/upload) layers from .network import * # IO (read/write) layers from .io import * # Extract layers from .extract import * # Transform layers from .transform import * class LayerType(Enum): Template = 0 # Control Data_Sample = 1 Data_Concat = 2 Data_Order = 3 Data_Partition = 4 Data_Filter = 5 Data_Shuffle = 6 # Network - download/upload Upload_File_To_Blob = 101 Upload_Bytes_To_Blob = 102 Download_File_From_Blob = 103 Download_Bytes_From_Blob = 104 Download_File_From_Internet = 105 Download_Bytes_From_Internet = 106 Download_Url_List = 107 Download_Warc_Indice = 108 Download_Warc_File = 109 Download_Urls_From_Website = 110 Download_Image_From_Jsonl = 111 Download_StarCoder = 112 # IO - read/write To_Binary_File = 201 To_Line_File = 202 To_Jsonl_File = 203 To_Parquet_File = 204 To_Index_File = 205 To_Warc_File = 206 From_Binary_File = 207 From_Line_File = 208 From_Jsonl_File = 209 From_Parquet_File = 210 From_Index_File = 211 From_Wet_File = 212 From_Warc_File = 213 # Extract Extract_Article = 301 Build_Index = 302 Search_Index = 303 # Transform Tokenize_Article = 401 Ngrams = 402 Minhash_Tokens = 403 LSH_Minhash = 404 Warc_Filter = 405 Warc_Encode = 406 Warc_To_Wet = 407 Wet_Decode = 408 Text_Embedding = 409 Sentence_Embedding = 410 Sentence_Filter = 411 Code_Generation = 412 Url_To_Record = 413 Extract_Link_From_Warc = 414 Wet_To_Imageinfos = 415 Warc_To_Screenshot_MD = 416 MCQ_Filter = 417 OpenQuestion_Filter = 418 Convert_PDF = 419 Extract_HTML = 420 MD_Filter = 421 Cascaded_Filter = 422 Math_Filter = 423 LayerType2Func = \ { LayerType.Template : (template_layer, [DataType.Mem_Any], [DataType.Mem_Any], True), # Control LayerType.Data_Sample : (data_sample_layer, [DataType.Mem_List], [DataType.Mem_List], True), LayerType.Data_Concat : (data_concat_layer, [DataType.Mem_List], [DataType.Mem_List], True), LayerType.Data_Order : (data_order_layer, [DataType.Mem_List], [DataType.Mem_List], True), LayerType.Data_Filter : (data_filter_layer, [DataType.Mem_List], [DataType.Mem_List], True), LayerType.Data_Partition : (data_partition_layer, [DataType.Mem_List], [DataType.Mem_List], True), LayerType.Data_Shuffle : (data_shuffle_layer, [DataType.Mem_List], [DataType.Mem_List], True), # Network - download/upload LayerType.Upload_File_To_Blob : (upload_file_to_blob_layer, [DataType.Mem_Str, DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True), LayerType.Upload_Bytes_To_Blob : (upload_bytes_to_blob_layer, [DataType.Mem_Binary, DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True), LayerType.Download_File_From_Blob : (download_file_from_blob_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True), LayerType.Download_Bytes_From_Blob : (download_bytes_from_blob_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Binary, DataType.Mem_Str], True), LayerType.Download_File_From_Internet : (download_file_from_internet_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True), LayerType.Download_Bytes_From_Internet : (download_bytes_from_internet_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Binary, DataType.Mem_Str], True), LayerType.Download_Url_List : (download_url_list_layer, [DataType.Mem_Str], [DataType.Mem_StrList, DataType.Mem_StrList], True), LayerType.Download_Warc_File : (download_warc_file_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True), LayerType.Download_Warc_Indice : (download_warc_indice_layer, [DataType.Mem_Str], [DataType.Mem_StrList, DataType.Mem_StrList], True), LayerType.Download_Urls_From_Website : (download_urls_from_website_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True), LayerType.Download_StarCoder : (download_starcoder_layer, [DataType.Mem_Str], [DataType.Mem_Int], True), # IO - read/write LayerType.To_Binary_File : (to_binary_file_layer, [DataType.Mem_Binary, DataType.Mem_Str], [DataType.Mem_Str], True), LayerType.To_Line_File : (to_line_file_layer, [DataType.Mem_StrList, DataType.Mem_Str], [DataType.Mem_Str], True), LayerType.To_Jsonl_File : (to_jsonl_file_layer, [DataType.Mem_DictList, DataType.Mem_Str], [DataType.Mem_Str], True), LayerType.To_Parquet_File : (to_parquet_file_layer, [DataType.Mem_DictList, DataType.Mem_Str], [DataType.Mem_Str], True), LayerType.To_Index_File : (to_index_file_layer, [DataType.Mem_Index, DataType.Mem_Str], [DataType.Mem_Str], True), LayerType.From_Binary_File : (from_binary_file_layer, [DataType.Mem_Str], [DataType.Mem_Binary], True), LayerType.From_Line_File : (from_line_file_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True), LayerType.From_Jsonl_File : (from_jsonl_file_layer, [DataType.Mem_Str], [DataType.Mem_DictList], True), LayerType.From_Parquet_File : (from_parquet_file_layer, [DataType.Mem_Str], [DataType.Mem_DictList], True), LayerType.From_Index_File : (from_index_file_layer, [DataType.Mem_Str], [DataType.Mem_Index], True), LayerType.From_Wet_File : (from_wet_file_layer, [DataType.Mem_Str], [DataType.Mem_DictList], True), LayerType.From_Warc_File : (from_warc_file_layer, [DataType.Mem_Str], [DataType.Mem_DictList], True), # Extract LayerType.Extract_Article : (extract_article_layer, [DataType.Mem_Warc], [DataType.Mem_Dict], True), LayerType.Build_Index : (build_index_layer, [DataType.Mem_VectorList], [DataType.Mem_Index], True), LayerType.Search_Index : (search_index_layer, [DataType.Mem_Index, DataType.Mem_VectorList], [DataType.Mem_VectorList, DataType.Mem_VectorList], True), # Transform LayerType.Tokenize_Article : (tokenize_article_layer, [DataType.Mem_Dict], [DataType.Mem_StrList], True), LayerType.Ngrams : (ngrams_layer, [DataType.Mem_StrList], [DataType.Mem_StrList], True), LayerType.Minhash_Tokens : (minhash_tokens_layer, [DataType.Mem_StrList], [DataType.Mem_StrList], True), LayerType.LSH_Minhash : (lsh_minhash_layer, [DataType.Mem_StrList], [DataType.Mem_StrList], True), LayerType.Warc_Filter : (warc_filter_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True), LayerType.Warc_Encode : (warc_encode_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True), LayerType.Warc_To_Wet : (warc_to_wet_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True), LayerType.Wet_Decode : (wet_decode_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True), LayerType.Math_Filter : (math_filter_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True), LayerType.OpenQuestion_Filter : (openquestion_filter_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True), LayerType.MCQ_Filter : (mcq_filter_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True), } __all__ = [ "LayerType", "LayerType2Func", "template_layer", "data_sample_layer", "data_concat_layer", "data_order_layer", "data_partition_layer", "data_filter_layer", "data_shuffle_layer", "upload_file_to_blob_layer", "upload_bytes_to_blob_layer", "download_file_from_blob_layer", "download_bytes_from_blob_layer", "download_file_from_internet_layer", "download_bytes_from_internet_layer", "download_url_list_layer", "download_warc_file_layer", "download_warc_indice_layer", "download_urls_from_website_layer", "download_starcoder_layer", "to_binary_file_layer", "to_line_file_layer", "to_jsonl_file_layer", "to_parquet_file_layer", "to_index_file_layer", "from_binary_file_layer", "from_line_file_layer", "from_jsonl_file_layer", "from_parquet_file_layer", "from_index_file_layer", "from_wet_file_layer", "from_warc_file_layer", "extract_article_layer", "build_index_layer", "search_index_layer", "tokenize_article_layer", "ngrams_layer", "minhash_tokens_layer", "lsh_minhash_layer", "warc_filter_layer", "warc_encode_layer", "warc_to_wet_layer", "wet_decode_layer", "math_filter_layer", "openquestion_filter_layer", "mcq_filter_layer", ] ================================================ FILE: DomainSpecific/core/layers/control/__init__.py ================================================ # Control from .data_sample_layer import data_sample_layer from .data_filter_layer import data_filter_layer from .data_order_layer import data_order_layer from .data_partition_layer import data_partition_layer from .data_shuffle_layer import data_shuffle_layer from .data_concat_layer import data_concat_layer __all__ = [ "data_sample_layer", "data_filter_layer", "data_order_layer", "data_partition_layer", "data_shuffle_layer", "data_concat_layer", ] ================================================ FILE: DomainSpecific/core/layers/control/data_concat_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback def data_concat_layer(lists, variables=dict()): ret = list() try: for a_list in lists[::-1]: if a_list is not None: ret[0:0] = a_list except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": lists = [["a"], ["b", "c"], None, ["d", "e", "f"]] lines = data_concat_layer(lists) print(lines) ================================================ FILE: DomainSpecific/core/layers/control/data_filter_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback def data_filter_layer(lines, variables=dict(), IN=False, FILTERS=(None,)): ret = list() try: ret = list(filter(lambda line: line in FILTERS if IN else line not in FILTERS, lines)) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": lines = ["a", None, "b"] FILTERS = (None,) lines = data_filter_layer(lines, FILTERS=FILTERS) print(lines) ================================================ FILE: DomainSpecific/core/layers/control/data_order_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback def data_order_layer(lines, variables=dict(), REVERSE=False): ret = list() try: ret = sorted(lines, reverse=REVERSE) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": lines = [1, 3, 2] lines = data_order_layer(lines) print(lines) ================================================ FILE: DomainSpecific/core/layers/control/data_partition_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback def data_partition_layer(lines, variables=dict(), WORKER_ID=-1): ret = list() try: worker_id = variables.get("worker_id", 0) worker_num = variables.get("worker_num", 1) n = len(lines) if WORKER_ID == -1: ret = [lines[i] for i in range(worker_id, n, worker_num)] else: ret = lines if WORKER_ID == worker_id else list() except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": lines = [0, 1, 2, 3, 4, 5, 6, 7, 8] variables = {"worker_id": 0, "worker_num": 2} lines = data_partition_layer(lines, variables=variables) print(lines) ================================================ FILE: DomainSpecific/core/layers/control/data_sample_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import random import traceback def data_sample_layer(lines, variables=dict(), N=-1, SEED=1): ret = list() try: random.seed(SEED) N = min(N, len(lines)) if N >= 0: ret = random.sample(lines, N) else: ret = lines except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": lines = ["a", "b"] N = 1 lines = data_sample_layer(lines, N=N) print(lines) ================================================ FILE: DomainSpecific/core/layers/control/data_shuffle_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import random import traceback def data_shuffle_layer(lines, variables=dict(), SEED=1): ret = list() try: random.seed(SEED) random.shuffle(lines) ret = lines except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": lines = ["a", "b"] lines = data_shuffle_layer(lines) print(lines) ================================================ FILE: DomainSpecific/core/layers/extract/__init__.py ================================================ # Extract from .extract_article_layer import extract_article_layer from .build_index_layer import build_index_layer from .search_index_layer import search_index_layer __all__ = [ "extract_article_layer", "build_index_layer", "search_index_layer", ] ================================================ FILE: DomainSpecific/core/layers/extract/build_index_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import sys import faiss import numpy as np import traceback def build_index_layer(base_vectors, variables=dict(), SEED=1, DIM=4096, CLUSTERS=100): ret = None try: np.random.seed(SEED) quantizer = faiss.IndexFlatL2(DIM) index = faiss.IndexIVFFlat(quantizer, DIM, CLUSTERS, faiss.METRIC_L2) assert not index.is_trained base_vectors = np.array(base_vectors) index.train(base_vectors) assert index.is_trained index.add(base_vectors) ret = index except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == '__main__': D = 64 base_vectors = np.random.random((100000, D)).astype('float32') base_vectors[:, 0] += np.arange(100000) / 1000. index = build_index_layer(base_vectors, D=D) print(index) ================================================ FILE: DomainSpecific/core/layers/extract/extract_article_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import re import fasttext import traceback from unittest.mock import patch from bs4 import BeautifulSoup from markdownify import MarkdownConverter, chomp from newspaper import Article import global_var def filter_tags_in_html(soup): def del_tags(soup): del_tags = ['style', 'script', 'img'] for tag in del_tags: tags = soup.find_all(tag) for tag in tags: tag.decompose() tags = soup.find_all('table') for tag in tags: if len(tag.text.strip()) == 0: for tag in tags: tag.decompose() def modify_text(soup): modify_tags = ['a'] for i in range(len(modify_tags)): for tag in soup.find_all(modify_tags[i]): tag_text = tag.text new_tag_text = tag_text.replace('\n', '') if len(new_tag_text) != len(tag_text): tag.string = new_tag_text del_tags(soup) modify_text(soup) return soup def lid(soup, model): LID_WIN_SIZE=256 text = ''.join(soup.text.split()) span_start, span_end = 0, len(text) if len(text) > LID_WIN_SIZE: mid = len(text) // 2 mid_win = LID_WIN_SIZE // 2 span_start = max(0, int(mid - mid_win)) span_end = min(len(text), int(mid + mid_win)) det_text = text[span_start: span_end] res = model.predict(det_text) la = res[0][0].replace("__label__", "") prob = float(res[1][0]) return la, prob def get_main_text_html(soup): article = Article("padding_url", fetch_images=False, keep_article_html=True) article.download(input_html=str(soup)) article.parse() # assert len(article.text.strip()) >= 128 main_html = article.article_html main_text = article.text return main_html, main_text def remove_dup_newline(text): fields = text.split('\n') for i in range(len(fields)): fields[i] = fields[i].strip() return re.sub('\n{2,}', '\n\n', '\n'.join(fields)).strip() class User_MarkdownConverter(MarkdownConverter): def convert_tr(self, el, text, convert_as_inline): cells = el.find_all(['td', 'th']) is_headrow = all([cell.name == 'th' for cell in cells]) overline = '' underline = '' if is_headrow and not el.previous_sibling: # first row and is headline: print headline underline underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' elif (not el.previous_sibling and (el.parent.name == 'table' or (el.parent.name == 'tbody' and not el.parent.previous_sibling))): # first row, not headline, and: # - the parent is table or # - the parent is tbody at the beginning of a table. # print empty headline above this row overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n' overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' if len(text.replace('|', ' ').strip()) == 0: return overline + underline else: return overline + '|' + text.replace('\n', ' ') + '\n' + underline def convert_a(self, el, text, convert_as_inline): prefix, suffix, text = chomp(text) if not text: return '' href = el.get('href') title = el.get('title') # For the replacement see #29: text nodes underscores are escaped if (self.options['autolinks'] and text.replace(r'\_', '_') == href and not title and not self.options['default_title']): # Shortcut syntax return '<%s>' % href if self.options['default_title'] and not title: title = href title_part = ' "%s"' % title.replace('"', r'\"') if title else '' # return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text return '%s %s %s' % (prefix, text.replace('\n', ' '), suffix) if href else text def convert_pre(self, el, text, convert_as_inline): if not text: return '' code_language = self.options['code_language'] if self.options['code_language_callback']: code_language = self.options['code_language_callback'](el) or code_language return '\n```%s\n%s\n```\n' % (code_language, text) def html2text(soup, **options): def clean_markdown(md): fields = md.split('\n') for i in range(len(fields)): fields[i] = fields[i].strip() new_fields = [] for i in range(len(fields)): field_set = list(set(fields[i])) if len(field_set) == 1 and field_set[0] in ['#', '*', '+', '-']: continue new_fields.append(fields[i]) fields = new_fields md = '\n'.join(fields) return re.sub('\n{2,}', '\n\n', md).strip() return clean_markdown(User_MarkdownConverter(**options).convert_soup(soup)) def trans2md(html): soup = BeautifulSoup(html, 'html5lib') markdown_text = html2text(soup) # assert len(markdown_text) > 50 and len(markdown_text.split('\n')) != 1 if markdown_text.startswith('.') and markdown_text.endswith('.'): markdown_text = markdown_text[1:-1] main_text = remove_dup_newline(soup.text) return markdown_text, main_text @classmethod def _patch_newspaper_parser_clean(cls, node): return node @patch('newspaper.parsers.Parser.clean_article_html', new=_patch_newspaper_parser_clean) def extract(soup): main_html, main_text = get_main_text_html(soup) markdown_text, _new_main_text = trans2md(main_html) return markdown_text, main_text def extract_article_layer(id_html, variables=dict()): ret = None try: LA_TIER1 = ["en", "es", "ja", "fr", "de", "pt", "it", "zh"] LA_TIER2 = ["nl", "sv", "da", "fi", "ru", "no", "ko", "zh", "pl", "tr", "ar", "he", "pt", "cs", "hu", "th", "hi"] LA_TIER = LA_TIER1 + LA_TIER2 article_id, html = id_html soup = BeautifulSoup(html, 'html5lib') soup = filter_tags_in_html(soup) la, la_prob = lid(soup, global_var.lid_model) if la in LA_TIER: main_md, main_text = extract(soup) if len(main_text) >= 128: ret = {"id": article_id, "text": main_text, "lang": la, "lang_prob": la_prob} except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == '__main__': id_html = (None, None) id_text_la = extract_article_layer(id_html) print(id_text_la) ================================================ FILE: DomainSpecific/core/layers/extract/search_index_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys import faiss import numpy as np import traceback def search_index_layer(index, query_vectors, variables=dict(), TOPK=1): ret = (None, None) try: query_vectors = np.array(query_vectors) D, I = index.search(query_vectors, TOPK) ret = (I, D) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == '__main__': DIM = 4096 CLUSTERS = 2 base_vectors = np.random.random((100000, DIM)).astype('float32') base_vectors[:, 0] += np.arange(100000) / 1000. quantizer = faiss.IndexFlatL2(DIM) index = faiss.IndexIVFFlat(quantizer, DIM, CLUSTERS, faiss.METRIC_L2) assert not index.is_trained index.train(base_vectors) assert index.is_trained index.add(base_vectors) query_vectors = np.random.random((10000, DIM)).astype('float32') query_vectors[:, 0] += np.arange(10000) / 1000. I, D = search_index_layer(index, query_vectors, D=D) print(D[:1]) ================================================ FILE: DomainSpecific/core/layers/global_var.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys import traceback #import torch import fasttext from transformers import AutoTokenizer, RobertaForSequenceClassification from dependency.gpt_api import GPTAPI try: # silences warnings as the package does not properly use the python 'warnings' package # see https://github.com/facebookresearch/fastText/issues/1056 fasttext.FastText.eprint = lambda *args,**kwargs: None except: pass """ class OpenQuestionModel: def __init__(self, pretrained_model_path, token_model_path="cardiffnlp/twitter-roberta-base-emotion", local_files_only=False): # load tokenizer model. self.tokenizer = AutoTokenizer.from_pretrained(token_model_path) # load trained model. self.model = RobertaForSequenceClassification.from_pretrained(pretrained_model_path, local_files_only=local_files_only) def run(self, text, thred=0.5, max_length=512): # tokenization. inputs = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length) # inference. with torch.no_grad(): logits = self.model(**inputs).logits logits = logits.softmax(dim=1)[0] predicted_idx = logits.argmax().item() predicted_label = self.model.config.id2label[predicted_idx] predicted_conf = logits[predicted_idx].item() if predicted_label == "LABEL_0" and predicted_conf < thred: predicted_idx = 1 predicted_label = "LABEL_1" #return predicted_idx, predicted_label, predicted_conf return predicted_label """ # language detection by fasttext. LID_MODEL_PATH = "./dependency/models/lid.176.bin" if os.path.exists(LID_MODEL_PATH): lid_model = fasttext.load_model(LID_MODEL_PATH) else: lid_model = None # math detection by fasttext. MATH_FT_MODEL_PATH = "./dependency/models/math.bin" if os.path.exists(MATH_FT_MODEL_PATH): ft_math_model = fasttext.load_model(MATH_FT_MODEL_PATH) else: ft_math_model = None # openquestion detection by fasttext. OPENQUESTION_MODEL_PATH = "./dependency/models/openquestion.bin" if os.path.exists(OPENQUESTION_MODEL_PATH): ft_openquestion_model = fasttext.load_model(OPENQUESTION_MODEL_PATH) else: ft_openquestion_model = None # multiple-choice question detection by fasttext. MCQ_MODEL_PATH = "./dependency/models/mcq.bin" if os.path.exists(MCQ_MODEL_PATH): ft_mcq_model = fasttext.load_model(MCQ_MODEL_PATH) else: ft_mcq_model = None """ # multiple-choice question detection by pytorch. MCQ_PT_MODEL_PATH = "./dependency/models/mcq.pytorch" if os.path.exists(MCQ_PT_MODEL_PATH): py_mcq_model = OpenQuestionModel(MCQ_PT_MODEL_PATH, local_files_only=True) else: py_mcq_model = None """ # gpt agent. gpt_api = GPTAPI() ================================================ FILE: DomainSpecific/core/layers/io/__init__.py ================================================ # IO - read/write from .to_binary_file_layer import to_binary_file_layer from .to_line_file_layer import to_line_file_layer from .to_jsonl_file_layer import to_jsonl_file_layer from .to_parquet_file_layer import to_parquet_file_layer from .to_index_file_layer import to_index_file_layer from .from_binary_file_layer import from_binary_file_layer from .from_line_file_layer import from_line_file_layer from .from_jsonl_file_layer import from_jsonl_file_layer from .from_parquet_file_layer import from_parquet_file_layer from .from_index_file_layer import from_index_file_layer from .from_wet_file_layer import from_wet_file_layer from .from_warc_file_layer import from_warc_file_layer __all__ = [ "to_binary_file_layer", "to_line_file_layer", "to_jsonl_file_layer", "to_parquet_file_layer", "to_index_file_layer", "from_binary_file_layer", "from_line_file_layer", "from_jsonl_file_layer", "from_parquet_file_layer", "from_index_file_layer", "from_wet_file_layer", "from_warc_file_layer", ] ================================================ FILE: DomainSpecific/core/layers/io/from_binary_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import util def from_binary_file_layer(file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) if STORAGE_PATH is not None: util.download_file_from_blob(STORAGE_PATH, file_path, file_path) with open(file_path, "rb") as f: data = f.read() ret = data except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": file_path = "test.binary" data = from_binary_file_layer(file_path) print(data) ================================================ FILE: DomainSpecific/core/layers/io/from_index_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import faiss import traceback import util def from_index_file_layer(file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) if STORAGE_PATH is not None: util.download_file_from_blob(STORAGE_PATH, file_path, file_path) index = faiss.read_index(file_path) ret = index except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == '__main__': file_path = "index.faiss" index = from_index_file_layer(file_path) print(index) ================================================ FILE: DomainSpecific/core/layers/io/from_jsonl_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import jsonlines import util def from_jsonl_file_layer(file_path, variables=dict(), STORAGE_PATH=None): ret = list() try: file_path = util.to_real_path(file_path, variables) if STORAGE_PATH is not None: util.download_file_from_blob(STORAGE_PATH, file_path, file_path) with jsonlines.open(file_path) as reader: for line in reader: ret.append(line) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": file_path = "test.jsonl" data = from_jsonl_file_layer(file_path) print(data) ================================================ FILE: DomainSpecific/core/layers/io/from_line_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import util def from_line_file_layer(file_path, variables=dict(), STORAGE_PATH=None): ret = list() try: file_path = util.to_real_path(file_path, variables) if STORAGE_PATH is not None: util.download_file_from_blob(STORAGE_PATH, file_path, file_path) for line in open(file_path, "r"): line = line.strip() ret.append(line) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": file_path = "test.line" lines = from_line_file_layer(file_path) print(lines) ================================================ FILE: DomainSpecific/core/layers/io/from_parquet_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import pyarrow as pa import pyarrow.parquet as pq import util def from_parquet_file_layer(file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) if STORAGE_PATH is not None: util.download_file_from_blob(STORAGE_PATH, file_path, file_path) table = pq.read_table(file_path) ret = table.to_pylist() except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": file_path = "test.parquet" data = from_parquet_file_layer(file_path) print(data) ================================================ FILE: DomainSpecific/core/layers/io/from_warc_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback from warcio.archiveiterator import ArchiveIterator import util def from_warc_file_layer(file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) if STORAGE_PATH is not None: util.download_file_from_blob(STORAGE_PATH, file_path, file_path) if os.path.exists(file_path): items = list() with open(file_path, "rb") as input: records = ArchiveIterator(input, arc2warc=True) for idx, record in enumerate(records): if record.rec_type == "response" and record.http_headers.get_header("Content-Type", "").startswith("text/html"): item = dict() item["uri"] = record.rec_headers.get("WARC-Target-URI") item["lang"] = record.rec_headers.get("Detected-Language") item["content_length"] = record.rec_headers["Content-Length"] item["html"] = record.content_stream().read() items.append(item) ret = items except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": file_path = "test.warc.gz" data = from_warc_file_layer(file_path) print(data) ================================================ FILE: DomainSpecific/core/layers/io/from_wet_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback from warcio.archiveiterator import ArchiveIterator import util def from_wet_file_layer(file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) if STORAGE_PATH is not None: util.download_file_from_blob(STORAGE_PATH, file_path, file_path) if os.path.exists(file_path): items = list() with open(file_path, "rb") as input: records = ArchiveIterator(input, arc2warc=False) for idx, record in enumerate(records): if record.rec_type == "conversion": item = dict() item["uri"] = record.rec_headers.get("WARC-Target-URI") item["lang"] = record.rec_headers.get("Detected-Language") item["content_length"] = record.rec_headers["Content-Length"] item["text"] = record.content_stream().read() items.append(item) ret = items except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": file_path = "test.warc.wet.gz" data = from_wet_file_layer(file_path) print(data) ================================================ FILE: DomainSpecific/core/layers/io/to_binary_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import util def to_binary_file_layer(bytes, file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) util.create_folder_by_file_path(file_path) with open(file_path, "wb") as f: f.write(bytes) if STORAGE_PATH is not None: util.upload_file_to_blob(STORAGE_PATH, file_path, file_path) ret = file_path except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": bytes = b"hello" file_path = "test.binary" file_path = to_binary_file_layer(bytes, file_path) print(file_path) ================================================ FILE: DomainSpecific/core/layers/io/to_index_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import faiss import traceback import util def to_index_file_layer(index, file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) util.create_folder_by_file_path(file_path) faiss.write_index(index, file_path) if STORAGE_PATH is not None: util.upload_file_to_blob(STORAGE_PATH, file_path, file_path) ret = file_path except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == '__main__': D = 64 NLIST = 100 base_vectors = np.random.random((100000, D)).astype('float32') base_vectors[:, 0] += np.arange(100000) / 1000. quantizer = faiss.IndexFlatL2(D) index = faiss.IndexIVFFlat(quantizer, D, NLIST, faiss.METRIC_L2) assert not index.is_trained index.train(base_vectors) assert index.is_trained index.add(base_vectors) file_path = "index.faiss" file_path = to_index_file_layer(index, file_path) print(file_path) ================================================ FILE: DomainSpecific/core/layers/io/to_jsonl_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import jsonlines import util def to_jsonl_file_layer(data, file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) util.create_folder_by_file_path(file_path) with jsonlines.open(file_path, "w") as writer: writer.write_all(data) if STORAGE_PATH is not None: util.upload_file_to_blob(STORAGE_PATH, file_path, file_path) ret = file_path except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": data = [{'id': "1", 'html': "hello"}, {'id': "2", 'html': "hi"}] file_path = "test.jsonl" file_path = to_jsonl_file_layer(data, file_path) print(file_path) ================================================ FILE: DomainSpecific/core/layers/io/to_line_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import util def to_line_file_layer(lines, file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) util.create_folder_by_file_path(file_path) with open(file_path, "w") as f: for line in lines: f.write(line + "\n") if STORAGE_PATH is not None: util.upload_file_to_blob(STORAGE_PATH, file_path, file_path) ret = file_path except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": lines = ["line1", "line2"] file_path = "test.line" file_path = to_line_file_layer(lines, file_path) print(file_path) ================================================ FILE: DomainSpecific/core/layers/io/to_parquet_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import pyarrow as pa import pyarrow.parquet as pq import util def to_parquet_file_layer(data, file_path, variables=dict(), STORAGE_PATH=None): ret = None try: file_path = util.to_real_path(file_path, variables) util.create_folder_by_file_path(file_path) table = pa.Table.from_pylist(data) pq.write_table(table, file_path) if STORAGE_PATH is not None: util.upload_file_to_blob(STORAGE_PATH, file_path, file_path) ret = file_path except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": data = [{'id': "1", 'html': "hello"}, {'id': "2", 'html': "hi"}] file_path = "test.parquet" file_path = to_parquet_file_layer(data, file_path) print(file_path) ================================================ FILE: DomainSpecific/core/layers/network/__init__.py ================================================ # Network - download/upload from .upload_file_to_blob_layer import upload_file_to_blob_layer from .upload_bytes_to_blob_layer import upload_bytes_to_blob_layer from .download_file_from_blob_layer import download_file_from_blob_layer from .download_bytes_from_blob_layer import download_bytes_from_blob_layer from .download_file_from_internet_layer import download_file_from_internet_layer from .download_bytes_from_internet_layer import download_bytes_from_internet_layer from .download_url_list_layer import download_url_list_layer from .download_warc_file_layer import download_warc_file_layer from .download_warc_indice_layer import download_warc_indice_layer from .download_urls_from_website_layer import download_urls_from_website_layer from .download_starcoder_layer import download_starcoder_layer __all__ = [ "upload_file_to_blob_layer", "upload_bytes_to_blob_layer", "download_file_from_blob_layer", "download_bytes_from_blob_layer", "download_file_from_internet_layer", "download_bytes_from_internet_layer", "download_url_list_layer", "download_warc_file_layer", "download_warc_indice_layer", "download_urls_from_website_layer", "download_starcoder_layer", ] ================================================ FILE: DomainSpecific/core/layers/network/download_bytes_from_blob_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import traceback import util def download_bytes_from_blob_layer(blob_path, variables=dict(), STORAGE_PATH=None, TRIES=1): ret = (None, None, blob_path) try: for _ in range(TRIES): try: assert STORAGE_PATH is not None and os.path.exists(STORAGE_PATH) storage_config = util.load_yaml(STORAGE_PATH) blob_path = util.to_real_path(blob_path, variables) file_name = util.md5(blob_path) + util.suffix(blob_path) bytes = util.download_bytes_from_blob(storage_config, blob_path) ret = (file_name, bytes, None) break except: time.sleep(1) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == '__main__': blob_path = "$(azure_blob_path)" STORAGE_PATH = "resources/environment/llmstore.yaml" bytes = download_bytes_from_blob_layer(blob_path, STORAGE_PATH=STORAGE_PATH) print(bytes) ================================================ FILE: DomainSpecific/core/layers/network/download_bytes_from_internet_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import traceback import util def download_bytes_from_internet_layer(url, variables=dict(), TRIES=1): ret = (None, None, url) try: for _ in range(TRIES): try: url = util.to_real_path(url, variables) file_name = util.md5(url) + util.suffix(url) bytes = util.download_bytes_from_internet(url) ret = (file_name, bytes, None) break except: time.sleep(1) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == '__main__': url = "https://upload.wikimedia.org/wikipedia/commons/4/4f/SVG_Logo.svg" bytes = download_bytes_from_internet_layer(url) print(bytes) ================================================ FILE: DomainSpecific/core/layers/network/download_file_from_blob_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import traceback import util def download_file_from_blob_layer(blob_path, variables=dict(), DOWNLOAD_PATH=".", STORAGE_PATH=None, TRIES=1): ret = (None, blob_path) try: for _ in range(TRIES): try: assert STORAGE_PATH is not None and os.path.exists(STORAGE_PATH) storage_config = util.load_yaml(STORAGE_PATH) blob_path = util.to_real_path(blob_path, variables) file_name = util.md5(blob_path) + util.suffix(blob_path) file_path = os.path.join(DOWNLOAD_PATH, file_name) file_path = util.to_real_path(file_path, variables) util.download_file_from_blob(storage_config, blob_path, file_path) ret = (file_path, None) break except: time.sleep(1) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == '__main__': blob_path = "$(azure_blob_path)" DOWNLOAD_PATH = "$(local_folder_path)" STORAGE_PATH = "resources/environment/llmstore.yaml" path = download_file_from_blob_layer(blob_path, DOWNLOAD_PATH=DOWNLOAD_PATH, STORAGE_PATH=STORAGE_PATH) print(path) ================================================ FILE: DomainSpecific/core/layers/network/download_file_from_internet_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import traceback import util def download_file_from_internet_layer(url, variables=dict(), DOWNLOAD_PATH=".", TRIES=1): ret = (None, url) try: for _ in range(TRIES): try: url = util.to_real_path(url, variables) file_name = util.md5(url) + util.suffix(url) file_path = os.path.join(DOWNLOAD_PATH, file_name) file_path = util.to_real_path(file_path, variables) util.download_file_from_internet(url, file_path) #bytes = util.download_bytes_from_internet(url) #util.upload_bytes_to_blob(variables["storage_config"], bytes, file_path) ret = (file_path, None) break except: time.sleep(1) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == '__main__': url = "https://upload.wikimedia.org/wikipedia/commons/4/4f/SVG_Logo.svg" DOWNLOAD_PATH = "$(local_folder_path)" path = download_file_from_internet_layer(url, DOWNLOAD_PATH=DOWNLOAD_PATH) print(path) ================================================ FILE: DomainSpecific/core/layers/network/download_starcoder_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import json from datetime import datetime import boto3 from botocore import UNSIGNED from botocore.config import Config import smart_open from datasets import load_dataset import util s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED)) def download_contents(blob_id, src_encoding): s3_url = f"s3://softwareheritage/content/{blob_id}" with smart_open.open(s3_url, "rb", compression=".gz", transport_params={"client": s3}) as fin: content = fin.read().decode(src_encoding) return content def download_starcoder_layer(data_repo, variables=dict(), OUTPUT_FOLDER="./", STORAGE_PATH=None, HUGGINGFACE_TOKEN=None): ret = 0 try: worker_id = variables["worker_id"] worker_num = variables["worker_num"] data_repo = util.to_real_path(data_repo, variables) output_folder = util.to_real_path(OUTPUT_FOLDER, variables) if STORAGE_PATH is not None: storage_config = util.load_yaml(STORAGE_PATH) ds = load_dataset(data_repo, split="train", streaming=True, token=HUGGINGFACE_TOKEN, cache_dir=f"./cache.{worker_id}/") ds = ds.filter(lambda row, idx: idx % worker_num == worker_id, with_indices=True) item_count = 0 for i, row in enumerate(ds): for key in row.keys(): if isinstance(row[key], datetime): row[key] = datetime.timestamp(row[key]) blob_id = row["blob_id"] src_encoding = row["src_encoding"] snapshot_prefix = row["snapshot_id"][:4] repo_name = row["repo_name"].replace("/", "@") branch_name = row["branch_name"].replace("/", "@") language = row["language"].replace(" ", "_") path = row["path"].lstrip("/") filename = row["filename"].strip() filename = path extension = row["extension"].strip() content = download_contents(blob_id, src_encoding) code_path = os.path.join(output_folder, snapshot_prefix, repo_name, branch_name, blob_id) metadata_path = os.path.join(output_folder, snapshot_prefix, repo_name, branch_name, blob_id + ".json") try: util.create_folder_by_file_path(code_path) with open(code_path, "w") as f: f.write(content) if STORAGE_PATH is not None: util.upload_file_to_blob(storage_config, code_path, code_path) util.create_folder_by_file_path(metadata_path) with open(metadata_path, "w") as f: f.write(json.dumps(row, indent=4) + "\n") if STORAGE_PATH is not None: util.upload_file_to_blob(storage_config, metadata_path, metadata_path) if STORAGE_PATH is not None: try: os.remove(code_path) os.remove(metadata_path) except OSError: pass except: traceback.print_exc() item_count += 1 ret = item_count except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == '__main__': data_repo = "$(local_the_stack_v2_dedup_metadata_path)" variables = {"workspace_dir": r"workspace", "worker_id": 0, "worker_num": 1} OUTPUT_FOLDER = "$(local_the_stack_v2_dedup_data_path)" STORAGE_PATH = "resources/storage/llmstore.yaml" HUGGINGFACE_TOKEN = None item_count = download_starcoder_layer(data_repo, variables=variables, OUTPUT_FOLDER=OUTPUT_FOLDER, STORAGE_PATH=STORAGE_PATH, HUGGINGFACE_TOKEN=HUGGINGFACE_TOKEN) print(item_count) ================================================ FILE: DomainSpecific/core/layers/network/download_url_list_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import gzip import json import requests import traceback def download_url_list_layer(index_url, variables=dict(), FILTER_SUFFIXES=(), TRIES=1): ret = list() try: for _ in range(TRIES): try: resp = requests.get(index_url, stream=True) urls = list() with gzip.open(resp.raw, 'rt') as f: for line in f.readlines(): text = "{" + line.strip().split(" {")[1] item = json.loads(text) url = item["url"] suffix = os.path.splitext(url)[1] if suffix in FILTER_SUFFIXES: urls.append(url) ret[0:0] = urls break except: time.sleep(1) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret, [index_url] if len(ret) == 0 else []) if __name__ == '__main__': index_url = "https://data.commoncrawl.org/cc-index/collections/CC-MAIN-2023-23/indexes/cdx-00000.gz" FILTER_SUFFIXES = (".svg",) urls = download_url_list_layer(index_url, FILTER_SUFFIXES=FILTER_SUFFIXES) print(urls) ================================================ FILE: DomainSpecific/core/layers/network/download_urls_from_website_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import requests import logging import traceback import xml.etree.ElementTree as ET def download_urls_from_website_layer(website_url, variables=dict(), FILTER=None): ret = list() try: robot_url = website_url + "/robots.txt" logging.disable(logging.WARNING) # get sitemap. xml_urls = list() whilte_url_prefixs = list() black_url_prefixs = list() resp = requests.get(robot_url) crawler = None for line in resp.text.split("\n"): line = line.strip() if len(line) == 0: continue if line.startswith("#"): continue if line.startswith("User-agent:"): crawler = line.split(":")[-1].strip() continue if crawler != "*": continue if crawler == "*" and line.startswith("Disallow:"): url_prefix = line.replace("Disallow:", "").strip() black_url_prefixs.append(url_prefix) continue if crawler == "*" and line.startswith("Allow:"): url_prefix = line.replace("Allow:", "").strip() whilte_url_prefixs.append(url_prefix) continue if crawler == "*" and line.startswith("Sitemap:"): xml_url = line.replace("Sitemap:", "").strip() if (FILTER is None or FILTER in xml_url) and xml_url.endswith(".xml"): xml_urls.append(xml_url) continue # get urls. html_urls = set() for xml_url in xml_urls: try: resp = requests.get(xml_url) root = ET.fromstring(resp.content) for sitemap in root: html_url = list(sitemap)[0].text html_urls.add(html_url) #nodes = tree.xpath('//a/@href') #nodes = tree.xpath("//loc") except: pass ret = list(html_urls) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == '__main__': website_url = "https://byjus.com/" FILTER = "math" urls = download_urls_from_website_layer(website_url, FILTER=FILTER) print(urls[0][0]) ================================================ FILE: DomainSpecific/core/layers/network/download_warc_file_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import traceback import util def download_warc_file_layer(warc_url, variables=dict(), DOWNLOAD_FOLDER="./", CONNECTS=16, TRIES=1, OVERWRITE=False): ret = (None, warc_url) try: if not warc_url.startswith("https://"): warc_url = "https://data.commoncrawl.org/" + warc_url #warc_url = warc_url.replace("https://data.commoncrawl.org/", "https://ds5q9oxwqwsfj.cloudfront.net/")# debug warc_name = warc_url.split("/")[-3] + "_" + os.path.basename(warc_url) warc_path = os.path.join(DOWNLOAD_FOLDER, warc_name) warc_path = util.to_real_path(warc_path, variables) for _ in range(TRIES): if OVERWRITE or not os.path.exists(warc_path): util.create_folder_by_file_path(warc_path) commandline = f"axel -q -n {CONNECTS} -o {warc_path} {warc_url}" exit_status = os.system(commandline) else: exit_status = 0 if exit_status == 0: break time.sleep(1) if exit_status == 0: ret = (warc_name, None) else: ret = (None, warc_url) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == '__main__': warc_url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-49/segments/1669446706285.92/warc/CC-MAIN-20221126080725-20221126110725-00000.warc.gz" DOWNLOAD_FOLDER = "$(local_folder_path)" (success_warc_url, failed_warc_url) = download_warc_file_layer(warc_url, DOWNLOAD_FOLDER=DOWNLOAD_FOLDER) print(success_warc_url, failed_warc_url) ================================================ FILE: DomainSpecific/core/layers/network/download_warc_indice_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import gzip import requests import traceback def download_warc_indice_layer(index_url, variables=dict(), TRIES=1, URL_PREFIX="https://data.commoncrawl.org/"): ret = list() try: for _ in range(TRIES): try: resp = requests.get(index_url, stream=True) urls = list() with gzip.open(resp.raw, 'rt') as f: for line in f.readlines(): warc_url = URL_PREFIX + line.strip() urls.append(warc_url) ret = urls break except: time.sleep(1) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret, [index_url] if len(ret) == 0 else []) if __name__ == '__main__': index_url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-49/warc.paths.gz" warc_urls = download_warc_indice_layer(index_url) print(warc_urls[0][0]) ================================================ FILE: DomainSpecific/core/layers/network/upload_bytes_to_blob_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import traceback import util def upload_bytes_to_blob_layer(bytes, blob_path, variables=dict(), STORAGE_PATH=None, BLOB_PREFIX="", TRIES=1): ret = (None, blob_path) try: for _ in range(TRIES): try: assert STORAGE_PATH is not None and os.path.exists(STORAGE_PATH) storage_config = util.load_yaml(STORAGE_PATH) blob_path = util.to_real_path(os.path.join(BLOB_PREFIX, blob_path), variables) util.upload_bytes_to_blob(storage_config, bytes, blob_path) ret = (blob_path, None) break except: time.sleep(1) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == '__main__': bytes = b"hello" blob_path = "$(azure_blob_path)" STORAGE_PATH = "resources/environment/llmstore.yaml" path = upload_bytes_to_blob_layer(bytes, blob_path, STORAGE_PATH=STORAGE_PATH) print(path) ================================================ FILE: DomainSpecific/core/layers/network/upload_file_to_blob_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import traceback import util def upload_file_to_blob_layer(file_path, blob_path, variables=dict(), STORAGE_PATH=None, BLOB_PREFIX="", TRIES=1): ret = (None, blob_path) try: for _ in range(TRIES): try: assert STORAGE_PATH is not None and os.path.exists(STORAGE_PATH) storage_config = util.load_yaml(STORAGE_PATH) file_path = util.to_real_path(file_path, variables) blob_path = util.to_real_path(os.path.join(BLOB_PREFIX, blob_path), variables) util.upload_file_to_blob(storage_config, file_path, blob_path) ret = (blob_path, None) break except: time.sleep(1) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == '__main__': blob_path = "$(azure_blob_path)" file_path = "$(local_file_path)" STORAGE_PATH = "resources/environment/llmstore.yaml" path = upload_file_to_blob_layer(file_path, blob_path, STORAGE_PATH=STORAGE_PATH) print(path) ================================================ FILE: DomainSpecific/core/layers/template_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import sys import traceback # Spec of adding a new layer: # 1. the layer function should be registered in __init__.py file of current folder. # 2. the layer function should return tuple value, even though the return value is empty. # 3. the layer function should contain a "variables" variable in dictionary type for the access of global shared variables. # 4. It's better to implement the unit test and put it to the "__main__" function. # 5. It's better to have exception handling for the function logic. # 6. It's better to end with "_layer" for the name of function. # 7. It's better to write comments for the function of purpose, input and output. # 8. It's better to be lowercase for the name of input datas. # 9. It's better to be uppercase for the name of input parameters. def template_layer(input, variables=dict(), PARAM=None): ret = None try: ret = input except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret,) if __name__ == "__main__": input = None output = template_layer(input) ================================================ FILE: DomainSpecific/core/layers/transform/__init__.py ================================================ # Transform from .tokenize_article_layer import tokenize_article_layer from .ngrams_layer import ngrams_layer from .minhash_tokens_layer import minhash_tokens_layer from .lsh_minhash_layer import lsh_minhash_layer from .warc_filter_layer import warc_filter_layer from .warc_encode_layer import warc_encode_layer from .warc_to_wet_layer import warc_to_wet_layer from .wet_decode_layer import wet_decode_layer from .math_filter_layer import math_filter_layer from .openquestion_filter_layer import openquestion_filter_layer from .mcq_filter_layer import mcq_filter_layer __all__ = [ "tokenize_article_layer", "ngrams_layer", "minhash_tokens_layer", "lsh_minhash_layer", "warc_filter_layer", "warc_encode_layer", "warc_to_wet_layer", "wet_decode_layer", "math_filter_layer", "openquestion_filter_layer", "mcq_filter_layer", ] ================================================ FILE: DomainSpecific/core/layers/transform/lsh_minhash_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import time import traceback import numpy as np from scipy.integrate import quad as integrate # different from datasketch's implementation, will use 2^61-1 as the maximum hash value instead of 2^32-1 NUM_PERM = 256 LSH_THRESHOLD = 0.8 class LSH: def __init__(self): # gen lsh range b, r = self.optimal_param(LSH_THRESHOLD, NUM_PERM, 0.5, 0.5) self.hashranges = [(i*r, (i+1)*r) for i in range(b)] # gen lsh param # https://github.com/ekzhu/datasketch/blob/44077457d32887a91297f15c3efee2c1982f690e/datasketch/lsh.py def false_positive_probability(self, threshold, b, r): _probability = lambda s : 1 - (1 - s**float(r))**float(b) a, err = integrate(_probability, 0.0, threshold) return a def false_negative_probability(self, threshold, b, r): _probability = lambda s : 1 - (1 - (1 - s**float(r))**float(b)) a, err = integrate(_probability, threshold, 1.0) return a def optimal_param(self, threshold, num_perm, false_positive_weight, false_negative_weight): ''' Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum of probabilities of false positive and false negative. ''' min_error = float("inf") opt = (0, 0) for b in range(1, num_perm+1): max_r = int(num_perm / b) for r in range(1, max_r+1): fp = self.false_positive_probability(threshold, b, r) fn = self.false_negative_probability(threshold, b, r) error = fp*false_positive_weight + fn*false_negative_weight if error < min_error: min_error = error opt = (b, r) return opt def gen_lsh(self, minhash): return [bytearray(minhash[start:end]) for start, end in self.hashranges] lsh = LSH() def lsh_minhash_layer(minhash, variables=dict()): ret = list() try: minhash = np.array(minhash, dtype=np.uint64) #assert minhash.dtype == np.uint64 and minhash.shape == (NUM_PERM,) lshvalues = lsh.gen_lsh(minhash) for i, value in enumerate(lshvalues): key = f'{i}_'.encode() + value ret.append(key) except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == "__main__": minhash = [2170239837623632,1287605064391826,7877338491737559,1522708576701298,1959803855170230,136353893425081,3067530819312822,19822079906565762,14191953696745176,371933081470560,2359093478290026,24211742396711177,5207401883495830,3386445753675098,6482843287028185,14956790165792002,7760994632330526,3801562091963312,654119844389846,6118541550243605,1058268864309841,19648312785892006,5519054639081138,17769255728697304,1326859272534844,6541616202650748,11131462447891679,11540424367241221,6416091255362971,1178274890175074,9516296843449206,5019313649584786,556043434180166,3170749841321737,788403856226243,16256424180717928,11536645058081246,13331271075979702,5603975614240490,11332978618315755,49833277925775,28529817665769800,5399529123965416,5804862109442032,10516842515700528,1383775130067327,9593857895450592,344120332429946,3650720026287843,4927677784872807,3114522307389328,1054088699310940,11453703275676121,17145094372333782,11943406601641085,429519913626747,3559765888081715,6380853683568781,13142954055708448,1122751140539670,7679037943867431,23532369906879837,4460946791673399,6284691595180437,5534632051525650,4326069154983305,6645880540672905,1199004738171304,2741143312089611,3315947713975755,33325056362165,17905224452748795,11081894870845940,2429362824597352,8796539339687473,17606225237179401,2406479086961618,25285711888782525,1847958183256316,4198878926995358,5057832224878357,10146090240130753,2413082792037196,3530471135853536,7672611456084586,2230458118023706,9790058494528486,3351632677682193,6902744571969727,4063006572456150,2761280786272613,6242978327908865,26924233559187524,2214283527827093,951652422014210,1577851399523074,282734099627651,4284321096276342,1571021659718705,2064444079057042,25995837896147107,3642452037001290,615591136529782,2579917399379439,10350113780305730,141093940432428,9292013714641581,16926413460125,4351013271280123,4492914008491347,3885988895709230,3643655265951773,4028855757933683,10480484972551973,2399277677842610,391439629014342,4511050103292841,13930059233224697,10142483490268814,10209387364437517,10291028774837120,1963510243393060,6698235608219585,10249974506598137,2090329927024291,19452257405817527,5395347850501660,1466647506773938,18271233688875585,17909487123073655,22732716574954981,28208124344155426,16118266291737203,6436198404802809,935143955767639,4692764892567773,8853071216371112,1600664618209927,39702070969452097,7552579352900360,2729546584440357,12309935356310386,426760114692333,1297488733224877,153415463561661,18948566290952420,8432980683248649,21321844297374743,8265174613176795,905258690673816,705406607744747,9105597597214747,517772088040257,1591136193162784,27511729624229236,3634922285407283,1831578225426174,13255266977668852,15312685554649660,722931468693513,1049089865098577,3498618026981595,4820015824926872,21126162808808528,27814106051492575,4822875592156961,14999120736412943,10825146296544249,6314954554132894,937945964737656,5459760788750366,3819227047549912,6591064604768721,7907494363943122,3486632627636937,9384132089104933,22104346516322826,6658745931891482,34093012584282609,4995951742943174,3517485897161771,135044219482780,7630383357514628,5162177136386332,10728488430543051,5828055747100055,6893511170015442,11011121196423559,2528283999013590,5080079240873515,19593423843180365,6822359610856040,191087978655560,8846708703413576,33146998994366094,3940701969864300,3507581990705859,6201879648552385,27956522101531374,10178358282977630,2205391899838384,2614926987404300,1090899715885363,6945147978151211,5432157012678156,1250518799355535,3948407147690489,10306927288370802,4580562167416191,8475303907451120,2243101892749971,2451601302451002,2180238663422921,3834240093757495,12119880871693653,12134080723101916,1805202361835209,31781168568203930,42987808989068825,41914343122681270,7985132073155851,16763654385115268,1387995454655588,2351466328427087,3139781779642664,27792958762616566,11961004800461011,6612181571493100,22715857059525182,689087660337260,244785061275028,11511948953811059,8237401627755449,8214914423544509,5470929524034644,9110614658125771,17166417582628999,18571246019891132,3766276759071421,1226388404627669,9965671498507403,1214978610204088,7808074359603991,1313444080667563,9031456783378283,3783393382666945,34163041205217466,3314866608200743,3451870308271748,11716681494447625,1667361573332888,13859255454740261,7299000064706400,6085019581018810,4996856251238621,5666642298303467] lsh_values = lsh_minhash_layer(minhash) print(lsh_values) ================================================ FILE: DomainSpecific/core/layers/transform/math_filter_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import re import requests import fasttext from gensim.utils import simple_preprocess import pyarrow as pa import pyarrow.parquet as pq import util import global_var whilte_list = {r"\\displaystyle", r"\\alpha", r"\\beta", r"\\gamma", r"\\delta", r"\\zeta", r"\\eta", r"\\iota", r"\\kappa", r"\\mu", r"\\nu", r"\\xi", r"\\rho", r"\\tau", r"\\phi", r"\\chi", r"\\psi", r"\\omicron", r"\\epsilon", r"\\pi", r"\\lambda", r"\\omega", r"\\sigma", r"\\theta", r"\\vartheta", r"\\times", r"\\cdot", r"\\dot", r"\\div", r"\\frac", r"\\log", r"\\exp", r"\\poly", r"\\eq", r"\\neq", r"\\leq", r"\\geq", r"\\approx", r"\\infty", r"\\int", r"\\sum", r"\\lim", r"\\begin", r"\\subset", r"\\supset", r"\\top", r"\\star", r"\\sim", r"\\simeq", r"\\ne", r"\\ll", r"\\gg", r"\\pm", r"\\mp", r"\\triangleleft", r"\\triangleright", r"\\ast", r"\\circ", r"\\bullet", r"\\oplus", r"\\odot", r"\\otimes", r"\\ominus", r"\\oslash", r"\\bigcirc", r"\\wr", r"\\dagger", r"\\bigtriangleup", r"\\bigtriangledown", r"\\setminus", r"\\sqcup", r"\\wedge", r"\\dotplus", r"\\centerdot", r"\\ltimes", r"\\rtimes", r"\\prod", r"\\coprod", r"\\iint", r"\\iiint", r"\\iiiint", r"\\idotsint", r"\\bigoplus", r"\\big", r"\\oint", r"\\rightarrow", r"\\to", r"\\leftarrow", r"\\gets", r"\\uparrow", r"\\downarrow", r"\\forall", r"\\exists", r"\\pmod", r"\\cup", r"\\cap", r"\\hat", r"\\acute", r"\\check", r"\\grave", r"\\vec", r"\\ddot", r"\\tilde", r"\\breve", r"\\mathring", r"\\land", r"\\lor", r"\\lnot", r"\\in", r"\\smile", r"\\frown", r"\\infty", r"\\mid", r"\\sin", r"\\cos", r"\\tan", r"\\equiv", r"\\circ", r"\\dfrac", r"\\prec", r"\\preccurlyeq", r"\\sqrt",} black_list = {r"\\text", r"\\if", r"\\local", r"\\usr", r"\\include", r"\\lib", r"\\bin", r"\\url", r"\\program", r"\\microsoft", r"\\temp", r"\\windows", r"\\documents", r"\\users", r"\\my", r"\\the",} keywords1 = whilte_list - black_list keywords1 = set(map(lambda x: x + "[^a-zA-Z]", keywords1)) keywords2 = {r"\+", r"\-", r"\*", r"\/", r"\%", r"\=", r"\!\=", r"\<", r"\>", r"\^", r"\_", r"\(", r"\)", r"\[", r"\]", r"\{", r"\}", r"\|\|", r"\&\&", r"sqrt", r"sum", r"int", r"\$", r"\", r"\[math\]", } pattern0 = re.compile(r"\\[A-Z]{0,9}[a-z]{2,9}") pattern1 = re.compile("|".join(keywords1)) pattern2 = re.compile("|".join(keywords2)) def ismath_by_model(text, model, thred=0.5): if model is None: return False if not isinstance(text, str) or len(text.strip()) == 0: return False try: x = " ".join(simple_preprocess(text)) ret = model.predict(x) label, prob = ret[0][0], ret[1][0] return label != "__label__0" except: traceback.print_exc() return False def math_filter_layer(pq_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", OVERWRITE=False): ret = list() try: in_pq_path = os.path.join(INPUT_FOLDER, pq_name) in_pq_path = util.to_real_path(in_pq_path, variables) out_pq_path = os.path.join(OUTPUT_FOLDER, pq_name) out_pq_path = util.to_real_path(out_pq_path, variables) if os.path.exists(in_pq_path) and (OVERWRITE or not os.path.exists(out_pq_path)): util.create_folder_by_file_path(out_pq_path) # read parquet file. try: table = pq.read_table(in_pq_path) except: traceback.print_exc() # filter records containing math. records = list() for record in table.to_pylist(): try: text = record["text"] if record["la"] != "en": continue #if item["la_prob"] < 0.65: # continue #if text is None or len(text) < 64: # continue #if text.count("\\u") >= 10: # continue #if not check_quality(record): # continue symbols0 = set(pattern0.findall(text)) if len(symbols0) <= 0: continue symbols1 = set(pattern1.findall(text.lower())) symbols1 = set(map(lambda sym: sym[:-1], symbols1)) if len(symbols1) <= 0: continue symbols2 = set(pattern2.findall(text.lower())) if len(symbols1) == 1 and len(symbols2) <= 0: continue ismath = len(symbols1) >= 5 or ismath_by_model(text, global_var.ft_math_model) if not ismath: continue records.append(record) except: traceback.print_exc() # write parquet file. try: table = pa.Table.from_pylist(records) pq.write_table(table, out_pq_path) except: traceback.print_exc() ret = [out_pq_path] except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret, ) if __name__ == '__main__': snapshot = "CC-MAIN-2022-49" variables = {"workspace_dir": r"workspace", "worker_id": 0, "worker_num": 1} INPUT_FOLDER = "$(input_data_folder)" OUTPUT_FOLDER = "$(output_data_folder)" STORAGE_PATH = "resources/storage/llmstore.yaml" ret = math_filter_layer(snapshot, variables=variables, INPUT_FOLDER=INPUT_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER, STORAGE_PATH=STORAGE_PATH) print(ret) ================================================ FILE: DomainSpecific/core/layers/transform/mcq_filter_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import re import json import fasttext import requests from io import BytesIO from gensim.utils import simple_preprocess from warcio.limitreader import LimitReader from warcio.warcwriter import WARCWriter from warcio.archiveiterator import ArchiveIterator import util import global_var def detect_lang(text): try: LID_WIN_SIZE = 256 text = ''.join(text.split()) span_start, span_end = 0, len(text) if len(text) > LID_WIN_SIZE: mid = len(text) // 2 mid_win = LID_WIN_SIZE // 2 span_start = max(0, int(mid - mid_win)) span_end = min(len(text), int(mid + mid_win)) det_text = text[span_start: span_end] res = global_var.lid_model.predict(det_text) lang = res[0][0].replace("__label__", "") prob = float(res[1][0]) return lang except: return "unkown" def detect_choice_exercise_by_rule(uri, html): uri = uri.lower() html = html.lower() contain_cnt = 0 keywords_in_text = [b"choice question"] for keyword in keywords_in_text: if keyword in html: contain_cnt += 1 break combo_keywords_in_text = [ (b"a.", b"b.", b"c.", b"d."), (b"a)", b"b)", b"c)", b"d)"), (b"\na ", b"\nb ", b"\nc ", b"\nd "), (b">a<", b">b<", b">c<", b">d<"), (b"1.", b"2.", b"3.", b"4."), (b"1)", b"2)", b"3)", b"4)"), (b"\n1 ", b"\n2 ", b"\n3 ", b"\n4 "), (b">1<", b">2<", b">3<", b">4<"), (b"i.", b"ii.", b"iii.", b"iv."), (b"i)", b"ii)", b"iii)", b"iv)"), (b"\ni ", b"\nii ", b"\niii ", b"\niv "), (b">i<", b">ii<", b">iii<", b">iv<"), ] for combo_keyword in combo_keywords_in_text: if combo_keyword[0] in html and combo_keyword[1] in html and combo_keyword[2] in html and combo_keyword[3] in html: contain_cnt += 1 break return contain_cnt == 2 def detect_choice_exercise_by_ft_model(uri, text, thred=0.5): try: if not isinstance(text, str) or len(text.strip()) == 0: return False x = " ".join(simple_preprocess(text)) ret = global_var.ft_mcq_model.predict(x) label, prob = ret[0][0], ret[1][0] if label == "__label__0" and prob < thred: return True return label == "__label__1" except: return False """ def detect_choice_exercise_by_pt_model(uri, text, thred=0.5): try: if not isinstance(text, str) or len(text.strip()) == 0: return False label = global_var.py_mcq_model.run(text, thred) return label == "LABEL_1" except: return False """ def detect_choice_exercise_by_LLM(text, engine=None): system = ''' You will be given a text converted from a webpage. Your task is to detect whether it contains choice question by responding with 'yes' or 'no'. ''' answer = global_var.gpt_api.run(system=system, question=text, engine=engine) answer = answer.lower().strip() if answer.startswith("yes"): return True elif answer.startswith("no"): return False else: return False def LCS(str1, str2): m = len(str1) n = len(str2) dp = [[0 for _ in range(n+1)] for _ in range(m+1)] for i in range(1, m+1): for j in range(1, n+1): if str1[i-1] == str2[j-1]: dp[i][j] = dp[i-1][j-1] + 1 else: dp[i][j] = max(dp[i-1][j], dp[i][j-1]) return round(1.0 * dp[m][n] / n, 6) def localize_choice_exercise_by_LLM(text, engine=None): system = ''' Purpose: Create a multiple-choice question dataset. Task: Extract all multiple-choice questions from the provided text. Requirements: 1. If the given text does not contain multiple-choice questions, respond only with "No multiple-choice questions found". 2. Do not modify the original multiple-choice questions. 3. Ensure all multiple-choice questions are copied without omissions. 4. Ensure all multiple-choice questions are copied in order. 5. Ensure all multiple-choice questions are copied under the original layout. 6. Copy the questions along with their options. 7. If answers and explanations are provided, copy them as well. 8. If source materials or reading passage is provided, copy it as well. 9. Don't add content not from original given text. Please strictly adhere to these requirements while performing the task. ''' exercises = global_var.gpt_api.run(system=system, question=text, engine=engine) exercises = exercises.strip() if len(exercises) == 0 or "no multiple-choice question" in exercises.lower(): return None else: exercises = exercises.replace("Multiple Choice Questions\n", "") exercises = exercises.replace("Multiple-choice questions:\n", "") exercises = exercises.replace("No other multiple-choice questions found.", "") exercises = exercises.replace("No other multiple-choice questions found in the text.", "") exercises = exercises.replace("No multiple-choice questions found.", "") exercises = exercises.replace("No more multiple-choice questions found.", "") sim = LCS(text, exercises) if sim < 0.9: return None else: return exercises # rule + model + GPT3.5 turbor. def mcq_filter_layer(wet_file_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", OVERWRITE=False): ret = list() try: src_wet_file_path = os.path.join(INPUT_FOLDER, wet_file_name) src_wet_file_path = util.to_real_path(src_wet_file_path, variables) jsonl_file_name = wet_file_name.replace(".warc.wet.gz", ".jsonl") dst_jsonl_file_path = os.path.join(OUTPUT_FOLDER, jsonl_file_name) dst_jsonl_file_path = util.to_real_path(dst_jsonl_file_path, variables) if os.path.exists(src_wet_file_path) and (OVERWRITE or not os.path.exists(dst_jsonl_file_path)): items = list() with open(src_wet_file_path, "rb") as input: records = ArchiveIterator(input, arc2warc=False) for id, record in enumerate(records): if record.rec_type == "conversion": try: # read raw html. uri = record.rec_headers["WARC-Target-URI"] bs = record.content_stream().read() if bs is None: continue text = str(bs, "utf-8") if text is None: continue # 1st round filter. round1_contain_exercise = detect_choice_exercise_by_rule(uri, bs) if not round1_contain_exercise: continue # 2nd round filter. round2_contain_exercise = detect_choice_exercise_by_ft_model(uri, text, thred=0.825) if not round2_contain_exercise: continue #round2_contain_exercise = detect_choice_exercise_by_pt_model(uri, text, thred=0.99) #if not round2_contain_exercise: # continue """ # 3rd round filter. round3_contain_exercise = detect_choice_exercise_by_LLM(text, "gpt-35-turbo") if not round3_contain_exercise: continue """ item = dict() item["uri"] = uri item["text"] = text lang = detect_lang(text) item["lang"] = lang #exercises = localize_choice_exercise_by_LLM(text, "gpt-35-turbo") #item["exercises"] = exercises items.append(item) except: traceback.print_exc() pass with open(dst_jsonl_file_path, "w") as output: for item in items: output.write(json.dumps(item) + "\n") ret = [dst_jsonl_file_path] except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret, ) if __name__ == '__main__': wet_file_name = "CC-MAIN-20210115134101-20210115164101-00005_5.warc.wet.gz" variables = {"workspace_dir": r"workspace", "worker_id": 0, "worker_num": 1} INPUT_FOLDER = "$(input_data_folder)" OUTPUT_FOLDER = "$(output_data_folder)" ret = mcq_filter_layer(wet_file_name, variables=variables, INPUT_FOLDER=INPUT_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER, OVERWRITE=True) print(ret) ================================================ FILE: DomainSpecific/core/layers/transform/minhash_tokens_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import hashlib import traceback import numpy as np from itertools import tee MER = 2**61 - 1 NUM_PERM = 256 SEED = 1 class MinHasher: def __init__(self): np.random.seed(1) self.gen = np.random.RandomState(SEED) self.a = self.gen.randint(1, MER, (NUM_PERM,), dtype='u8') self.b = self.gen.randint(0, MER, (NUM_PERM,), dtype='u8') def _sha1_hash(self, val): val = int.from_bytes(hashlib.sha1(val).digest()[:8], 'little') val &= MER return np.uint64(val) def hash(self, sequence): res = np.ones(NUM_PERM, dtype='u8') * MER for token in sequence: hash0 = self._sha1_hash(token.encode('utf8')) hash_vec = hash0 * self.a + self.b hash_vec %= MER res = np.minimum(res, hash_vec) return res minhasher = MinHasher() def minhash_tokens_layer(tokens, variables=dict()): ret = None try: minhash = minhasher.hash(tokens) ret = minhash except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == "__main__": tokens = {'产权 份额 为 土地 出让', '商品 住房 市场 价格 合理', '确定 , 在 售 房', ', 可 向 代 持', '住房 , 划 拨 土地', '增 购 政府 份额 的', '向社会公布 。 划 拨 土地', '为 商品 住房 , 划', '▁来源 : 中国 网 地产', '出让 土地 共有 产权 保障', '的 , 可 向 代', '售 房 阶段 向社会公布 。', '商品 住房 , 划 拨', '以及 累计 缴纳 社保 或', '性质 转 为 商品 住房', '的 非 市区 户籍 家庭', '购房 款 。 ▁在 使用', '地产 ▁ 杭州市 1 日', '《 杭州市 共有 产权 保障', '住房 享有 与 购买 商品', '类型 商品 住房 市场 价格', '的 申请 , 增 购', '价 按 同 地段 、', '款 。 ▁在 使用 管理', '可根据 支付 能力 在 50%', '按照 单 套 销售 价格', '方可 通过 买卖 等方式 上市', '年限 等相关 条件 。 ▁', '10 年后 , 方可 通过', '市场 价格 合理 优惠 后', '拨 土地 共有 产权 保障', '杭州市 共有 产权 保障 住房', '销售 基准 价 按 同', '能力 在 50% 至 80%', '等相关 条件 。 ▁ 办法', '年 的 , 可 向', '至 80% 范围内 选择 产权', '共有 产权 保障 住房 销售', '符合 限购 政策 前提 下', '购房 家庭 可根据 支付 能力', '提出 共有 产权 保障 住房', '住房 , 购房 家庭 可根据', '。 ▁在 使用 管理 方面', '-12- 03 ▁记者 : ▁来源', '保障 住房 面向 符合条件的 市区', '住房 以及 累计 缴纳 社保', '。 ▁ 办法 明确 ,', '购房 家庭 产权 份额 为', '社保 或 个 税 年限', '价 及其 浮动 幅度 确定', '非 市区 户籍 家庭 供应', '购房 款 。 出让 土地', ', 购房 家庭 可根据 支付', '单 套 销售 价格 对应的', '权利 性质 调整为 出让 。', '03 ▁记者 : ▁来源 :', '▁2021 -12- 03 ▁记者 :', '产权 保障 住房 面向 符合条件的', '日 对外 发布 《 杭州市', '就业 的 非 市区 户籍', '增 购 后 住房 性质', ', 购买 共有 产权 保障', '、 同 类型 商品 住房', '同等 的 公共服务 权益 。', '对应的 不同 比例 支付 购房', '的 公共服务 权益 。 ▁根据', '网 地产 ▁ 杭州市 1', '款 。 出让 土地 共有', '套 销售 价格 对应的 产权', '管理 方面 , 杭州 提出', '住房 , 购房 家庭 产权', '和 稳定 就业 的 非', '土地 权利 性质 调整为 出让', '浮动 幅度 确定 , 在', '不动产 权 证 满 10', '▁ 办法 明确 , 共有', '机构 提出 一次性 增 购', '》 , 其中 明确 ,', '权 证 满 10 年后', '在 50% 至 80% 范围内', '方面 , 杭州 提出 共有', '满 10 年后 , 方可', '基准 价 按 同 地段', '产权 份额 比例 , 按照', '保障 住房 管理办法 》 ,', '居住证 、 住房 以及 累计', '销售 价格 对应的 产权 比例', '住房 面向 符合条件的 市区 户籍', '。 ▁根据 办法 , 市区', '单 套 销售 价格 按照', '销售 基准 价 及其 浮动', ': 中国 网 地产 ▁', '持 机构 提出 一次性 增', '价格 按照 销售 基准 价', '家庭 供应 , 购买 共有', '购买 共有 产权 保障 住房', '稳定 就业 的 非 市区', '购买 商品 住房 同等 的', '其中 明确 , 共有 产权', '▁记者 : ▁来源 : 中国', '价格 对应的 不同 比例 支付', '与 购买 商品 住房 同等', '、 住房 等相关 条件 ,', '条件 。 ▁ 办法 明确', '证 满 5 年 的', '满 5 年 的 ,', '管理办法 》 , 其中 明确', '市区 户籍 家庭 需 满足', '份额 的 申请 , 增', '商品 住房 同等 的 公共服务', '支付 能力 在 50% 至', '权 证 满 5 年', '户籍 家庭 需 满足 居住证', ', 方可 通过 买卖 等方式', ', 在 售 房 阶段', '对应的 产权 比例 支付 购房', '产权 保障 住房 购房 家庭', '家庭 需 满足 居住证 、', '杭州 提出 共有 产权 保障', '1 日 对外 发布 《', ', 其中 明确 , 共有', '满足 居住证 、 住房 以及', '选择 产权 份额 比例 ,', '同时 满足 户籍 、 住房', ', 市区 户籍 家庭 要在', '销售 价格 对应的 不同 比例', '个 税 年限 等相关 条件', '住房 市场 价格 合理 优惠', '产权 保障 住房 , 购房', '、 住房 以及 累计 缴纳', '产权 保障 住房 销售 基准', '后 住房 性质 转 为', '土地 出让 时 已 确定的', '比例 , 按照 单 套', '发布 《 杭州市 共有 产权', '住房 性质 转 为 商品', '累计 缴纳 社保 或 个', '份额 比例 , 按照 单', '时 已 确定的 份额 比例', '划 拨 土地 权利 性质', '基准 价 及其 浮动 幅度', '。 出让 土地 共有 产权', '为 土地 出让 时 已', ', 购房 家庭 产权 份额', '等相关 条件 , 非 市区', '按 同 地段 、 同', '按照 销售 基准 价 及其', '不同 比例 支付 购房 款', '住房 销售 基准 价 按', '家庭 产权 份额 为 土地', '可 向 代 持 机构', '▁在 使用 管理 方面 ,', '家庭 取得 不动产 权 证', '性质 调整为 出让 。 取得', '取得 不动产 权 证 满', '市区 户籍 家庭 要在 符合', ', 杭州 提出 共有 产权', '政策 前提 下 同时 满足', '▁根据 办法 , 市区 户籍', '办法 , 市区 户籍 家庭', '缴纳 社保 或 个 税', '。 划 拨 土地 共有', '家庭 可根据 支付 能力 在', '满足 户籍 、 住房 等相关', '一次性 增 购 政府 份额', '购 政府 份额 的 申请', '需 满足 居住证 、 住房', '同 地段 、 同 类型', '供应 , 购买 共有 产权', '使用 管理 方面 , 杭州', '保障 住房 享有 与 购买', '共有 产权 保障 住房 享有', '限购 政策 前提 下 同时', '套 销售 价格 按照 销售', '户籍 和 稳定 就业 的', '优惠 后 确定 。 单', '住房 管理办法 》 , 其中', '市区 户籍 和 稳定 就业', '支付 购房 款 。 ▁在', '户籍 家庭 供应 , 购买', '同 类型 商品 住房 市场', '保障 住房 购房 家庭 取得', '及其 浮动 幅度 确定 ,', '共有 产权 保障 住房 管理办法', '共有 产权 保障 住房 面向', '在 售 房 阶段 向社会公布', '共有 产权 保障 住房 ,', '政府 份额 的 申请 ,', '买卖 等方式 上市 交易 。', '市区 户籍 家庭 供应 ,', '出让 时 已 确定的 份额', '家庭 要在 符合 限购 政策', '申请 , 增 购 后', ', 非 市区 户籍 家庭', '前提 下 同时 满足 户籍', '划 拨 土地 共有 产权', ', 划 拨 土地 权利', '产权 保障 住房 管理办法 》', '阶段 向社会公布 。 划 拨', '明确 , 共有 产权 保障', '确定的 份额 比例 , 按照', '证 满 10 年后 ,', '通过 买卖 等方式 上市 交易', '已 确定的 份额 比例 ,', '不动产 权 证 满 5', '提出 一次性 增 购 政府', '对外 发布 《 杭州市 共有', '价格 合理 优惠 后 确定', '。 取得 不动产 权 证', '范围内 选择 产权 份额 比例', '房 阶段 向社会公布 。 划', '▁ 杭州市 1 日 对外', '份额 为 土地 出让 时', ', 增 购 后 住房', '地段 、 同 类型 商品', '杭州市 1 日 对外 发布', '户籍 家庭 要在 符合 限购', '保障 住房 销售 基准 价', '调整为 出让 。 取得 不动产', ', 共有 产权 保障 住房', '权益 。 ▁根据 办法 ,', '比例 支付 购房 款 。', '保障 住房 , 购房 家庭', '或 个 税 年限 等相关', '年后 , 方可 通过 买卖', '出让 。 取得 不动产 权', '价格 对应的 产权 比例 支付', '购 后 住房 性质 转', '确定 。 单 套 销售', '支付 购房 款 。 出让', '要在 符合 限购 政策 前提', '拨 土地 权利 性质 调整为', '转 为 商品 住房 ,', '享有 与 购买 商品 住房', '公共服务 权益 。 ▁根据 办法', '中国 网 地产 ▁ 杭州市', '5 年 的 , 可', '合理 优惠 后 确定 。', '办法 明确 , 共有 产权', '共有 产权 保障 住房 购房', '套 销售 价格 对应的 不同', '户籍 、 住房 等相关 条件', '下 同时 满足 户籍 、', '产权 保障 住房 享有 与', '面向 符合条件的 市区 户籍 和', '购房 家庭 取得 不动产 权', '条件 , 非 市区 户籍', '幅度 确定 , 在 售', ': ▁来源 : 中国 网', '代 持 机构 提出 一次性', '产权 比例 支付 购房 款', '80% 范围内 选择 产权 份额', '向 代 持 机构 提出', '住房 同等 的 公共服务 权益', '税 年限 等相关 条件 。', '土地 共有 产权 保障 住房', ', 按照 单 套 销售', '非 市区 户籍 家庭 需', '。 单 套 销售 价格', '符合条件的 市区 户籍 和 稳定', '住房 等相关 条件 , 非', '50% 至 80% 范围内 选择', '后 确定 。 单 套', '住房 购房 家庭 取得 不动产', '销售 价格 按照 销售 基准'} minhash = minhash_tokens_layer(tokens) print(minhash) ================================================ FILE: DomainSpecific/core/layers/transform/ngrams_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback from itertools import tee NGRAM_SIZE = 5 def ngrams_layer(sequence, variables=dict()): ret = None try: # https://github.com/ChenghaoMou/text-dedup/blob/main/text_dedup/utils/tokenization.py if len(sequence) < NGRAM_SIZE: return iter([sequence]) iterables = tee(iter(sequence), NGRAM_SIZE) for i, sub_iterable in enumerate(iterables): for _ in range(i): next(sub_iterable, None) tokens = zip(*iterables) tokens = {" ".join(t).strip() for t in tokens} #tokens = list(tokens) ret = tokens except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == "__main__": tokens = ['▁2021', '-12-', '03', '▁记者', ':', '▁来源', ':', '中国', '网', '地产', '▁', '杭州市', '1', '日', '对外', '发布', '《', '杭州市', '共有', '产权', '保障', '住房', '管理办法', '》', ',', '其中', '明确', ',', '共有', '产权', '保障', '住房', '面向', '符合条件的', '市区', '户籍', '和', '稳定', '就业', '的', '非', '市区', '户籍', '家庭', '供应', ',', '购买', '共有', '产权', '保障', '住房', '享有', '与', '购买', '商品', '住房', '同等', '的', '公共服务', '权益', '。', '▁根据', '办法', ',', '市区', '户籍', '家庭', '要在', '符合', '限购', '政策', '前提', '下', '同时', '满足', '户籍', '、', '住房', '等相关', '条件', ',', '非', '市区', '户籍', '家庭', '需', '满足', '居住证', '、', '住房', '以及', '累计', '缴纳', '社保', '或', '个', '税', '年限', '等相关', '条件', '。', '▁', '办法', '明确', ',', '共有', '产权', '保障', '住房', '销售', '基准', '价', '按', '同', '地段', '、', '同', '类型', '商品', '住房', '市场', '价格', '合理', '优惠', '后', '确定', '。', '单', '套', '销售', '价格', '按照', '销售', '基准', '价', '及其', '浮动', '幅度', '确定', ',', '在', '售', '房', '阶段', '向社会公布', '。', '划', '拨', '土地', '共有', '产权', '保障', '住房', ',', '购房', '家庭', '可根据', '支付', '能力', '在', '50%', '至', '80%', '范围内', '选择', '产权', '份额', '比例', ',', '按照', '单', '套', '销售', '价格', '对应的', '不同', '比例', '支付', '购房', '款', '。', '出让', '土地', '共有', '产权', '保障', '住房', ',', '购房', '家庭', '产权', '份额', '为', '土地', '出让', '时', '已', '确定的', '份额', '比例', ',', '按照', '单', '套', '销售', '价格', '对应的', '产权', '比例', '支付', '购房', '款', '。', '▁在', '使用', '管理', '方面', ',', '杭州', '提出', '共有', '产权', '保障', '住房', '购房', '家庭', '取得', '不动产', '权', '证', '满', '5', '年', '的', ',', '可', '向', '代', '持', '机构', '提出', '一次性', '增', '购', '政府', '份额', '的', '申请', ',', '增', '购', '后', '住房', '性质', '转', '为', '商品', '住房', ',', '划', '拨', '土地', '权利', '性质', '调整为', '出让', '。', '取得', '不动产', '权', '证', '满', '10', '年后', ',', '方可', '通过', '买卖', '等方式', '上市', '交易', '。'] tokens = ngrams_layer(tokens) print(tokens) ================================================ FILE: DomainSpecific/core/layers/transform/openquestion_filter_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import re import gc import requests import fasttext from gensim.utils import simple_preprocess import pyarrow as pa import pyarrow.parquet as pq sys.path.append(".") import util import global_var question_keywords = ("q&a", "q & a", "q:", "que:", "question:", "quiz:", "exam:", "examination:", "probe:", "request:", "challenge:", "test:", "query:", "survey:") #question_keywords2 = ("what ", "where ", "why ", "when ", "who ", "whoes ", "how ", "\?") question_keywords2 = ("what", "where", "why", "when", "who", "whoes", "how") question_keywords += question_keywords2 question_keywords = set(map(lambda x: "[^a-zA-Z]" + x + "[^a-zA-Z]", question_keywords)) question_pattern = re.compile("|".join(question_keywords)) answer_keywords = ("q&a", "q & a", "a:", "ans:", "answer:", "solution:", "reply:", "response:", "result:", "outcome:", "explanation:", "conclusion:", "finding:", "assertion:", "statement:", "clarification:") answer_keywords = set(map(lambda x: "[^a-zA-Z]" + x + "[^a-zA-Z]", answer_keywords)) answer_pattern = re.compile("|".join(answer_keywords)) def is_openquestion_by_model(text, model, thred=0.5): if model is None: return False if not isinstance(text, str) or len(text.strip()) == 0: return False try: x = " ".join(simple_preprocess(text)) ret = model.predict(x) label, prob = ret[0][0], ret[1][0] return label != "__label__0" except: traceback.print_exc() return False def check_yes_no_question(text_before, text_after): text_after = text_after.lower().strip() keywords = ("yes", "y", "no", "n") for keyword in keywords: if text_after.startswith(keyword) and not text_after[len(keyword)].isalnum(): return True return False def check_multiple_choise_question(text_before, text_after): combo_keywords_list = [ ("a.", "b.", "c.", "d."), ("a)", "b)", "c)", "d)"), ("\na ", "\nb ", "\nc ", "\nd "), (">a<", ">b<", ">c<", ">d<"), ("1.", "2.", "3.", "4."), ("1)", "2)", "3)", "4)"), ("\n1 ", "\n2 ", "\n3 ", "\n4 "), (">1<", ">2<", ">3<", ">4<"), ("i.", "ii.", "iii.", "iv."), ("i)", "ii)", "iii)", "iv)"), ("\ni ", "\nii ", "\niii ", "\niv "), (">i<", ">ii<", ">iii<", ">iv<"), ] text_before = text_before.lower().strip() for combo_keywords in combo_keywords_list: t = 0 for combo_keyword in combo_keywords: t = text_before.find(combo_keyword, t) if t == -1: break if t != -1: return True #if combo_keywords[0] in text_before and combo_keywords[1] in text_before and combo_keywords[2] in text_before: # return True return False def check_fill_in_question(text_before, text_after): text_before = text_before.lower().strip() if "___" in text_before or "()" in text_before or "..." in text_before: return True return False def check_quality(item): text = item["text"] lines = text.split("\n") lens = list(map(lambda l: len(l.strip()), lines)) max_len = max(lens) #if max_len > 1024: if max_len > 2048: return False if max_len <= 128: return False if len(lens) <= 3: return False if len(lens) > 256: return False if len(text) < 256: return False if len(text) > 1024 * 16: return False if 1.0 * text.count(" ") / len(text) > 0.33: return False if 1.0 * text.count(" ") / len(text) > 0.1: return False if 1.0 * text.count("\t") / len(text) > 0.1: return False if 1.0 * text.count(".") / len(text) > 0.1: return False if 1.0 * text.count("-") / len(text) > 0.1: return False if 1.0 * text.count("#") / len(text) > 0.1: return False if 1.0 * text.count("|") / len(text) > 0.1: return False if 1.0 * text.count(",") / len(text) > 0.1: return False sl_cnt = 1.0 * len(list(filter(lambda x: len(x.strip()) <= 32, lines))) / len(lines) if sl_cnt > 0.67: return False return True def openquestion_filter_layer(pq_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", OVERWRITE=False): ret = list() try: in_pq_path = os.path.join(INPUT_FOLDER, pq_name) in_pq_path = util.to_real_path(in_pq_path, variables) out_pq_path = os.path.join(OUTPUT_FOLDER, pq_name) out_pq_path = util.to_real_path(out_pq_path, variables) if os.path.exists(in_pq_path) and (OVERWRITE or not os.path.exists(out_pq_path)): util.create_folder_by_file_path(out_pq_path) # read parquet file. try: table = pq.read_table(in_pq_path) records = table.to_pylist() except: traceback.print_exc() # filter records containing open question. openquestion_records = list() for record_idx, record in enumerate(records): try: text = record["text"] text_low = text.lower() if record["la"] != "en": continue #if item["la_prob"] < 0.65: # continue #if text is None or len(text) < 64: # continue #if text.count("\\u") >= 10: # continue #if not check_quality(record): # continue contain_question = len(question_pattern.findall(text_low)) >= 2 if not contain_question: continue contain_answer = len(answer_pattern.findall(text_low)) >= 2 if not contain_answer: continue contain_openquestion = is_openquestion_by_model(text, global_var.ft_openquestion_model) if not contain_openquestion: continue openquestion_records.append(record) except: traceback.print_exc() # write parquet file. try: openquestion_table = pa.Table.from_pylist(openquestion_records) pq.write_table(openquestion_table, out_pq_path) except: traceback.print_exc() ret = [out_pq_path] except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return (ret, ) if __name__ == '__main__': snapshot = "CC-MAIN-2022-49" variables = {"workspace_dir": r"workspace", "worker_id": 0, "worker_num": 1} INPUT_FOLDER = "$(input_data_folder)" OUTPUT_FOLDER = "$(output_data_folder)" STORAGE_PATH = "resources/storage/llmstore.yaml" ret = openquestion_filter_layer(snapshot, variables=variables, INPUT_FOLDER=INPUT_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER, STORAGE_PATH=STORAGE_PATH) print(ret) ================================================ FILE: DomainSpecific/core/layers/transform/tokenize_article_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import traceback import sentencepiece as spm tokenizer = None def tokenize_article_layer(article, variables=dict(), SPM_MODEL_PATH="./dependency/models/sentencepiece.bpe.model"): ret = None try: global tokenizer if tokenizer is None: tokenizer = spm.SentencePieceProcessor(SPM_MODEL_PATH) tokens = tokenizer.encode(article, out_type=str) ret = tokens except KeyboardInterrupt: sys.exit() except Exception as ex: traceback.print_exc() return ret if __name__ == "__main__": article = "2021-12-03 记者: 来源:中国网地产\n\n杭州市1日对外发布《杭州市共有产权保障住房管理办法》,其中明确,共有产权保障住房面向符合条件的市区户籍和稳定就业的非市区户籍家庭供应,购买共有产权保障住房享有与购买商品住房同等的公共服务权益。\n\n根据办法,市区户籍家庭要在符合限购政策前提下同时满足户籍、住房等相关条件,非市区户籍家庭需满足居住证、住房以及累计缴纳社保或个税年限等相关条件。\n\n办法明确,共有产权保障住房销售基准价按同地段、同类型商品住房市场价格合理优惠后确定。单套销售价格按照销售基准价及其浮动幅度确定,在售房阶段向社会公布。划拨土地共有产权保障住房,购房家庭可根据支付能力在50%至80%范围内选择产权份额比例,按照单套销售价格对应的不同比例支付购房款。出让土地共有产权保障住房,购房家庭产权份额为土地出让时已确定的份额比例,按照单套销售价格对应的产权比例支付购房款。\n\n在使用管理方面,杭州提出共有产权保障住房购房家庭取得不动产权证满5年的,可向代持机构提出一次性增购政府份额的申请,增购后住房性质转为商品住房,划拨土地权利性质调整为出让。取得不动产权证满10年后,方可通过买卖等方式上市交易。" tokens = tokenize_article_layer(article) print(tokens) ================================================ FILE: DomainSpecific/core/layers/transform/warc_encode_layer.py ================================================ # # Copyright (c) Microsoft Corporation. All rights reserved. # # coding=utf-8 import os import sys os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..") import re import codecs import logging import traceback import requests from pathlib import Path from urllib.parse import urlparse from io import BytesIO from warcio.limitreader import LimitReader from warcio.warcwriter import WARCWriter from warcio.archiveiterator import ArchiveIterator import lxml.etree as ET import lxml.html as HT from py_asciimath.translator.translator import MathML2Tex from pylatexenc.latexwalker import LatexWalker from charset_normalizer import detect import util def tex_in_script_tag(text): return text.startswith(''), (b"'), (b"'), (b"'), (b"