Full Code of microsoft/RedStone for AI

main 50b3bd9dcc6f cached

130 files

57.7 MB

120.2k tokens

272 symbols

1 requests

Download .txt

Showing preview only (448K chars total). Download the full file or copy to clipboard to get everything.

Repository: microsoft/RedStone
Branch: main
Commit: 50b3bd9dcc6f
Files: 130
Total size: 57.7 MB

Directory structure:
gitextract_ayw6h_qv/

├── .github/
│   └── workflows/
│       └── codeql.yml
├── CODE_OF_CONDUCT.md
├── DomainSpecific/
│   ├── .gitignore
│   ├── configs/
│   │   ├── cc_math_filter.CC-MAIN-2023-23.json
│   │   ├── cc_openquestion_filter.CC-MAIN-2023-23.json
│   │   ├── cc_warc_download.CC-MAIN-2023-23.json
│   │   ├── cc_warc_filter.CC-MAIN-2023-23.json
│   │   ├── cc_warc_to_wet.code.CC-MAIN-2023-23.json
│   │   ├── cc_warc_to_wet.math.CC-MAIN-2023-23.json
│   │   └── network_template.json
│   ├── core/
│   │   ├── __init__.py
│   │   ├── data.py
│   │   ├── layer.py
│   │   ├── layers/
│   │   │   ├── __init__.py
│   │   │   ├── control/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── data_concat_layer.py
│   │   │   │   ├── data_filter_layer.py
│   │   │   │   ├── data_order_layer.py
│   │   │   │   ├── data_partition_layer.py
│   │   │   │   ├── data_sample_layer.py
│   │   │   │   └── data_shuffle_layer.py
│   │   │   ├── extract/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── build_index_layer.py
│   │   │   │   ├── extract_article_layer.py
│   │   │   │   └── search_index_layer.py
│   │   │   ├── global_var.py
│   │   │   ├── io/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── from_binary_file_layer.py
│   │   │   │   ├── from_index_file_layer.py
│   │   │   │   ├── from_jsonl_file_layer.py
│   │   │   │   ├── from_line_file_layer.py
│   │   │   │   ├── from_parquet_file_layer.py
│   │   │   │   ├── from_warc_file_layer.py
│   │   │   │   ├── from_wet_file_layer.py
│   │   │   │   ├── to_binary_file_layer.py
│   │   │   │   ├── to_index_file_layer.py
│   │   │   │   ├── to_jsonl_file_layer.py
│   │   │   │   ├── to_line_file_layer.py
│   │   │   │   └── to_parquet_file_layer.py
│   │   │   ├── network/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── download_bytes_from_blob_layer.py
│   │   │   │   ├── download_bytes_from_internet_layer.py
│   │   │   │   ├── download_file_from_blob_layer.py
│   │   │   │   ├── download_file_from_internet_layer.py
│   │   │   │   ├── download_starcoder_layer.py
│   │   │   │   ├── download_url_list_layer.py
│   │   │   │   ├── download_urls_from_website_layer.py
│   │   │   │   ├── download_warc_file_layer.py
│   │   │   │   ├── download_warc_indice_layer.py
│   │   │   │   ├── upload_bytes_to_blob_layer.py
│   │   │   │   └── upload_file_to_blob_layer.py
│   │   │   ├── template_layer.py
│   │   │   ├── transform/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── lsh_minhash_layer.py
│   │   │   │   ├── math_filter_layer.py
│   │   │   │   ├── mcq_filter_layer.py
│   │   │   │   ├── minhash_tokens_layer.py
│   │   │   │   ├── ngrams_layer.py
│   │   │   │   ├── openquestion_filter_layer.py
│   │   │   │   ├── tokenize_article_layer.py
│   │   │   │   ├── warc_encode_layer.py
│   │   │   │   ├── warc_filter_layer.py
│   │   │   │   ├── warc_to_wet_layer.py
│   │   │   │   └── wet_decode_layer.py
│   │   │   └── util.py
│   │   └── network.py
│   ├── dependency/
│   │   ├── gpt_api.py
│   │   ├── ia-hadoop-tools-jar-with-dependencies.jar
│   │   ├── install.py
│   │   ├── requirements.txt
│   │   └── xsltml_2.0/
│   │       ├── cmarkup.xsl
│   │       ├── entities.xsl
│   │       ├── glayout.xsl
│   │       ├── mmltex.xsl
│   │       ├── scripts.xsl
│   │       ├── tables.xsl
│   │       └── tokens.xsl
│   ├── readme.md
│   ├── requirements.txt
│   ├── resources/
│   │   ├── computation/
│   │   │   ├── batch_dca_eastus.yaml
│   │   │   └── local.yaml
│   │   ├── environment/
│   │   │   ├── amlt_sing.yaml
│   │   │   └── local.yaml
│   │   └── storage/
│   │       ├── llmstore.yaml
│   │       └── local.yaml
│   ├── sample_run.sh
│   ├── submit.py
│   ├── tools/
│   │   ├── __init__.py
│   │   ├── submit_batch_job.py
│   │   └── submit_local_job.py
│   └── wrapper/
│       ├── __init__.py
│       ├── interpreter.py
│       ├── parser.py
│       ├── runner.py
│       └── utility/
│           ├── __init__.py
│           ├── azure_env.py
│           ├── cpu_count.py
│           ├── load_yaml.py
│           ├── logger.py
│           └── save_yaml.py
├── GeneralDomain/
│   ├── .gitignore
│   ├── README.md
│   ├── pyproject.toml
│   └── redstone_cc/
│       ├── __init__.py
│       ├── __main__.py
│       ├── algos/
│       │   ├── __init__.py
│       │   ├── deduplication/
│       │   │   ├── __init__.py
│       │   │   ├── minhash.py
│       │   │   ├── sha1.py
│       │   │   └── utils.py
│       │   ├── fasttext_classifier.py
│       │   ├── rule_based_filters/
│       │   │   ├── __init__.py
│       │   │   ├── func/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── document.py
│       │   │   │   ├── line.py
│       │   │   │   └── repetition.py
│       │   │   ├── model/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── document.py
│       │   │   │   └── violations.py
│       │   │   ├── ruleset/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── gopher.py
│       │   │   │   └── refinedweb.py
│       │   │   └── utils.py
│       │   └── trafilatura_process.py
│       ├── download_utils.py
│       └── process.py
├── LICENSE
├── README.md
├── SECURITY.md
└── SUPPORT.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/codeql.yml
================================================
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL Advanced"

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '24 3 * * 5'

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
        - language: python
          build-mode: none
        # CodeQL supports the following keyword values for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v3
      with:
        languages: ${{ matrix.language }}
        build-mode: ${{ matrix.build-mode }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.

        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
        # queries: security-extended,security-and-quality

    # If the analyze step fails for one of the languages you are analyzing with
    # "We were unable to automatically build your code", modify the matrix above
    # to set the build mode to "manual" for that language. Then modify this step
    # to build your code.
    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
    - if: matrix.build-mode == 'manual'
      shell: bash
      run: |
        echo 'If you are using a "manual" build mode for one or more of the' \
          'languages you are analyzing, replace this with the commands to build' \
          'your code, for example:'
        echo '  make bootstrap'
        echo '  make release'
        exit 1

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v3
      with:
        category: "/language:${{matrix.language}}"


================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns


================================================
FILE: DomainSpecific/.gitignore
================================================
__pycache__/
dependency/models/
env_ready
workspace


================================================
FILE: DomainSpecific/configs/cc_math_filter.CC-MAIN-2023-23.json
================================================
{
    "name": "cc_math_extraction",
    "description": "math extraction from cc parquet file - 202323.",
    "date": "20240513",
    "version": "1.0.0",
    "author": "yanghuan",
    "backend": "Native",
    
    "input":
    {
        "pq_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_pqs/raw/CC-MAIN-2023-23/pqs.CC-MAIN-2023-23.txt"
        },
        "filtered_pq_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_pqs/math/CC-MAIN-2023-23/paths.{worker_id}.txt"
        }
    },
    
    "output":
    {
        "filtered_pq_name_list_file_path":
        {
            "type": "Mem_Str"
        }
    },
    
    "layer":
    {
        "layer01":
        {
            "type": "From_Line_File",
            "joint": "Default",
            "input": ["pq_name_list_file_path"],
            "output": ["pq_names"]
        },
        "layer01_par":
        {
            "type": "Data_Partition",
            "joint": "Default",
            "input": ["pq_names"],
            "output": ["pq_names"]
        },
        "layer01_sam":
        {
            "type": "Data_Sample",
            "joint": "Default",
            "param":
            {
                "N": -1
            },
            "input": ["pq_names"],
            "output": ["pq_names"]
        },
        "layer02":
        {
            "type": "Math_Filter",
            "joint": "FlatMap",
            "param":
            {
                "INPUT_FOLDER": "{workspace_dir}/cc_pqs/raw/CC-MAIN-2023-23/",
                "OUTPUT_FOLDER": "{workspace_dir}/cc_pqs/math/CC-MAIN-2023-23/"
            },
            "input": ["pq_names"],
            "output": ["filtered_pq_names"]
        },
        "layer03":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["filtered_pq_names", "filtered_pq_name_list_file_path"],
            "output": ["filtered_pq_name_list_file_path"]
        }
    }
}


================================================
FILE: DomainSpecific/configs/cc_openquestion_filter.CC-MAIN-2023-23.json
================================================
{
    "name": "cc_openquestion_extraction",
    "description": "open question extraction from cc parquet file - 202323.",
    "date": "20240527",
    "version": "1.0.0",
    "author": "yanghuan",
    "backend": "Native",
    
    "input":
    {
        "pq_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_pqs/raw/CC-MAIN-2023-23/pqs.CC-MAIN-2023-23.txt"
        },
        "filtered_pq_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_pqs/openquestion/CC-MAIN-2023-23/paths.{worker_id}.txt"
        }
    },
    
    "output":
    {
        "filtered_pq_name_list_file_path":
        {
            "type": "Mem_Str"
        }
    },
    
    "layer":
    {
        "layer01":
        {
            "type": "From_Line_File",
            "joint": "Default",
            "input": ["pq_name_list_file_path"],
            "output": ["pq_names"]
        },
        "layer01_par":
        {
            "type": "Data_Partition",
            "joint": "Default",
            "input": ["pq_names"],
            "output": ["pq_names"]
        },
        "layer01_sam":
        {
            "type": "Data_Sample",
            "joint": "Default",
            "param":
            {
                "N": -1
            },
            "input": ["pq_names"],
            "output": ["pq_names"]
        },
        "layer02":
        {
            "type": "OpenQuestion_Filter",
            "joint": "FlatMap",
            "param":
            {
                "INPUT_FOLDER": "{workspace_dir}/cc_pqs/raw/CC-MAIN-2023-23/",
                "OUTPUT_FOLDER": "{workspace_dir}/cc_pqs/openquestion/CC-MAIN-2023-23/"
            },
            "input": ["pq_names"],
            "output": ["filtered_pq_names"]
        },
        "layer03":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["filtered_pq_names", "filtered_pq_name_list_file_path"],
            "output": ["filtered_pq_name_list_file_path"]
        }
    }
}


================================================
FILE: DomainSpecific/configs/cc_warc_download.CC-MAIN-2023-23.json
================================================
{
    "name": "cc_warc_download",
    "description": "download warc files for a specific cc snapshot - CC-MAIN-2023-23.",
    "date": "20231011",
    "version": "1.0.0",
    "author": "yanghuan",
    "backend": "Native",
    
    "input":
    {
        "warc_url_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/urls.CC-MAIN-2023-23.txt"
        },
        "success_warc_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23/paths.{worker_id}.txt"
        },
        "fail_warc_url_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23/fail_urls.{worker_id}.txt"
        }
    },
    
    "output":
    {
        "success_warc_name_list_file_path":
        {
            "type": "Mem_Str"
        },
        "fail_warc_url_list_file_path":
        {
            "type": "Mem_Str"
        }
    },
    
    "layer":
    {
        "layer01":
        {
            "type": "From_Line_File",
            "joint": "Default",
            "input": ["warc_url_list_file_path"],
            "output": ["warc_urls"]
        },
        "layer01_par":
        {
            "type": "Data_Partition",
            "joint": "Default",
            "input": ["warc_urls"],
            "output": ["warc_urls"]
        },
        "layer01_sam":
        {
            "type": "Data_Sample",
            "joint": "Default",
            "param":
            {
                "N": 1
            },
            "input": ["warc_urls"],
            "output": ["warc_urls"]
        },
        "layer02":
        {
            "type": "Download_Warc_File",
            "joint": "Map",
            "param":
            {
                "DOWNLOAD_FOLDER": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23",
                "CONNECTS": 16,
                "TRIES": 3
            },
            "input": ["warc_urls"],
            "output": ["success_warc_names", "fail_warc_urls"]
        },
        "layer03":
        {
            "type": "Data_Filter",
            "param":
            {
                "FILTERS": [null]
            },
            "input": ["success_warc_names"],
            "output": ["success_warc_names"]
        },
        "layer04":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["success_warc_names", "success_warc_name_list_file_path"],
            "output": ["success_warc_name_list_file_path"]
        },
        "layer05":
        {
            "type": "Data_Filter",
            "param":
            {
                "FILTERS": [null]
            },
            "input": ["fail_warc_urls"],
            "output": ["fail_warc_urls"]
        },
        "layer06":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["fail_warc_urls", "fail_warc_url_list_file_path"],
            "output": ["fail_warc_url_list_file_path"]
        }
    }
}


================================================
FILE: DomainSpecific/configs/cc_warc_filter.CC-MAIN-2023-23.json
================================================
{
    "name": "cc_warc_filter",
    "description": "filter html containing specific tags on warc files - CC-MAIN-2023-23.",
    "date": "20230825",
    "version": "1.0.0",
    "author": "yanghuan",
    "backend": "Native",
    
    "input":
    {
        "warc_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23/paths.txt"
        },
        "filtered_warc_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_filtered_warc/CC-MAIN-2023-23/paths.{worker_id}.txt"
        }
    },
    
    "output":
    {
        "filtered_warc_name_list_file_path":
        {
            "type": "Mem_Str"
        }
    },
    
    "layer":
    {
        "layer01":
        {
            "type": "From_Line_File",
            "joint": "Default",
            "input": ["warc_name_list_file_path"],
            "output": ["warc_names"]
        },
        "layer01_par":
        {
            "type": "Data_Partition",
            "joint": "Default",
            "input": ["warc_names"],
            "output": ["warc_names"]
        },
        "layer01_sam":
        {
            "type": "Data_Sample",
            "joint": "Default",
            "param":
            {
                "N": -1
            },
            "input": ["warc_names"],
            "output": ["warc_names"]
        },
        "layer02":
        {
            "type": "Warc_Filter",
            "joint": "FlatMap",
            "param":
            {
                "INPUT_FOLDER": "{workspace_dir}/cc_warcs/CC-MAIN-2023-23",
                "OUTPUT_FOLDER": "{workspace_dir}/cc_filtered_warc/CC-MAIN-2023-23/",
                "TAGS": ["<math", "<annotation", "=\"math", "athjax", "math-container", "class=\"tex\"", "tex.cgi", "latex.php", "katex.min.css", "\\frac", "codecogs", "<code", "<pre"]
            },
            "input": ["warc_names"],
            "output": ["filtered_warc_names"]
        },
        "layer03":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["filtered_warc_names", "filtered_warc_name_list_file_path"],
            "output": ["filtered_warc_name_list_file_path"]
        }
    }
}


================================================
FILE: DomainSpecific/configs/cc_warc_to_wet.code.CC-MAIN-2023-23.json
================================================
{
    "name": "cc_warc_to_wet",
    "description": "convert cc warc to wet and keep code blocks - CC-MAIN-2023-23.",
    "date": "20230825",
    "version": "1.0.0",
    "author": "yanghuan",
    "backend": "Native",
    
    "input":
    {
        "filter_warc_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_filtered_warc/CC-MAIN-2023-23/paths.txt"
        },
        "encode_warc_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_wets/encode_warc_code/CC-MAIN-2023-23/paths.{worker_id}.txt"
        },
        "filter_wet_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_wets/filter_wet_code/CC-MAIN-2023-23/paths.{worker_id}.txt"
        },
        "decode_wet_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_wets/decode_wet_code/CC-MAIN-2023-23/paths.{worker_id}.txt"
        }
    },
    
    "output":
    {
        "decode_wet_name_list_file_path":
        {
            "type": "Mem_Str"
        }
    },
    
    "layer":
    {
        "layer01":
        {
            "type": "From_Line_File",
            "joint": "Default",
            "input": ["filter_warc_name_list_file_path"],
            "output": ["filter_warc_names"]
        },
        "layer01_par":
        {
            "type": "Data_Partition",
            "joint": "Default",
            "input": ["filter_warc_names"],
            "output": ["filter_warc_names"]
        },
        "layer01_sam":
        {
            "type": "Data_Sample",
            "joint": "Default",
            "param":
            {
                "N": -1
            },
            "input": ["filter_warc_names"],
            "output": ["filter_warc_names"]
        },
        "layer02":
        {
            "type": "Warc_Encode",
            "joint": "FlatMap",
            "param":
            {
                "INPUT_FOLDER": "{workspace_dir}/cc_filtered_warc/CC-MAIN-2023-23",
                "OUTPUT_FOLDER": "{workspace_dir}/cc_wets/encode_warc_code/CC-MAIN-2023-23",
                "TAG": "code"
            },
            "input": ["filter_warc_names"],
            "output": ["encode_warc_names"]
        },
        "layer02_out":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["encode_warc_names", "encode_warc_name_list_file_path"],
            "output": ["encode_warc_name_list_file_path"]
        },
        "layer03":
        {
            "type": "Warc_To_Wet",
            "joint": "FlatMap",
            "param":
            {
                "INPUT_FOLDER": "{workspace_dir}/cc_wets/encode_warc_code/CC-MAIN-2023-23",
                "OUTPUT_FOLDER": "{workspace_dir}/cc_wets/filter_wet_code/CC-MAIN-2023-23"
            },
            "input": ["encode_warc_names"],
            "output": ["filter_wet_names"]
        },
        "layer03_out":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["filter_wet_names", "filter_wet_name_list_file_path"],
            "output": ["filter_wet_name_list_file_path"]
        },
        "layer04":
        {
            "type": "Wet_Decode",
            "joint": "FlatMap",
            "param":
            {
                "INPUT_FOLDER": "{workspace_dir}/cc_wets/filter_wet_code/CC-MAIN-2023-23",
                "OUTPUT_FOLDER": "{workspace_dir}/cc_wets/decode_wet_code/CC-MAIN-2023-23",
                "TAG": "code"
            },
            "input": ["filter_wet_names"],
            "output": ["decode_wet_names"]
        },
        "layer04_out":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["decode_wet_names", "decode_wet_name_list_file_path"],
            "output": ["decode_wet_name_list_file_path"]
        }
    }
}


================================================
FILE: DomainSpecific/configs/cc_warc_to_wet.math.CC-MAIN-2023-23.json
================================================
{
    "name": "cc_warc_to_wet",
    "description": "convert cc warc to wet and keep math formula - CC-MAIN-2023-23.",
    "date": "20230825",
    "version": "1.0.0",
    "author": "yanghuan",
    "backend": "Native",
    
    "input":
    {
        "filter_warc_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_filtered_warc/CC-MAIN-2023-23/paths.txt"
        },
        "encode_warc_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_wets/encode_warc_math/CC-MAIN-2023-23/paths.{worker_id}.txt"
        },
        "filter_wet_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_wets/filter_wet_math/CC-MAIN-2023-23/paths.{worker_id}.txt"
        },
        "decode_wet_name_list_file_path":
        {
            "type": "Mem_Str",
            "value": "{workspace_dir}/cc_wets/decode_wet_math/CC-MAIN-2023-23/paths.{worker_id}.txt"
        }
    },
    
    "output":
    {
        "decode_wet_name_list_file_path":
        {
            "type": "Mem_Str"
        }
    },
    
    "layer":
    {
        "layer01":
        {
            "type": "From_Line_File",
            "joint": "Default",
            "input": ["filter_warc_name_list_file_path"],
            "output": ["filter_warc_names"]
        },
        "layer01_par":
        {
            "type": "Data_Partition",
            "joint": "Default",
            "input": ["filter_warc_names"],
            "output": ["filter_warc_names"]
        },
        "layer01_sam":
        {
            "type": "Data_Sample",
            "joint": "Default",
            "param":
            {
                "N": -1
            },
            "input": ["filter_warc_names"],
            "output": ["filter_warc_names"]
        },
        "layer02":
        {
            "type": "Warc_Encode",
            "joint": "FlatMap",
            "param":
            {
                "INPUT_FOLDER": "{workspace_dir}/cc_filtered_warc/CC-MAIN-2023-23",
                "OUTPUT_FOLDER": "{workspace_dir}/cc_wets/encode_warc_math/CC-MAIN-2023-23",
                "TAG": "math"
            },
            "input": ["filter_warc_names"],
            "output": ["encode_warc_names"]
        },
        "layer02_out":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["encode_warc_names", "encode_warc_name_list_file_path"],
            "output": ["encode_warc_name_list_file_path"]
        },
        "layer03":
        {
            "type": "Warc_To_Wet",
            "joint": "FlatMap",
            "param":
            {
                "INPUT_FOLDER": "{workspace_dir}/cc_wets/encode_warc_math/CC-MAIN-2023-23",
                "OUTPUT_FOLDER": "{workspace_dir}/cc_wets/filter_wet_math/CC-MAIN-2023-23"
            },
            "input": ["encode_warc_names"],
            "output": ["filter_wet_names"]
        },
        "layer03_out":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["filter_wet_names", "filter_wet_name_list_file_path"],
            "output": ["filter_wet_name_list_file_path"]
        },
        "layer04":
        {
            "type": "Wet_Decode",
            "joint": "FlatMap",
            "param":
            {
                "INPUT_FOLDER": "{workspace_dir}/cc_wets/filter_wet_math/CC-MAIN-2023-23",
                "OUTPUT_FOLDER": "{workspace_dir}/cc_wets/decode_wet_math/CC-MAIN-2023-23",
                "TAG": "math"
            },
            "input": ["filter_wet_names"],
            "output": ["decode_wet_names"]
        },
        "layer04_out":
        {
            "type": "To_Line_File",
            "joint": "Default",
            "input": ["decode_wet_names", "decode_wet_name_list_file_path"],
            "output": ["decode_wet_name_list_file_path"]
        }
    }
}


================================================
FILE: DomainSpecific/configs/network_template.json
================================================
{
    "name": "template_network",
    "description": "Toy example of network.",
    "date": "20230713",
    "version": "1.0.0",
    "author": "yanghuan",
    "backend": "Native",
    
    "input":
    {
        "data1":
        {
            "type": "Mem_StrList",
            "value": ["1", "2", "3", "4", "5"]
        }
    },
    
    "output":
    {
        "data2":
        {
            "type": "Mem_StrList"
        }
    },
    
    "layer":
    {
        "layer1":
        {
            "type": "Data_Sample",
            "joint": "Default",
            "param":
            {
                "N": 2
            },
            "input": ["data1"],
            "output": ["data2"]
        }
    }
}


================================================
FILE: DomainSpecific/core/__init__.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
from .data import DataType
from .layer import Layer, JointType
from .layers import LayerType, LayerType2Func
from .network import Network

__all__ = ["DataType", "Layer", "JointType", "LayerType", "LayerType2Func", "Network"]


================================================
FILE: DomainSpecific/core/data.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
from enum import Enum

class DataType(Enum):
    """
    Type tags for the data exchanged between layers.

    Members with a value below 100 are in-memory types; members with a value
    of 100 or above are on-disk types (deprecated). Members whose value is a
    multiple of ten (Mem_Any, Mem_List, File_Any, File_AnyLines) are treated
    by `belong` as the head of the ten-value range that starts at them.
    """
    # Memory Data
    Mem_Any = 0
    Mem_Binary = 1
    Mem_Int = 2
    Mem_Float = 3
    Mem_Str = 4
    Mem_Warc = 5
    Mem_Dict = 6
    Mem_Index = 7
    Mem_Vector = 8
    Mem_Record = 9
    # Memory Data, list variants
    Mem_List = 10
    Mem_BinaryList = 11
    Mem_IntList = 12
    Mem_FloatList = 13
    Mem_StrList = 14
    Mem_WarcList = 15
    Mem_DictList = 16
    Mem_IndexList = 17
    Mem_VectorList = 18
    Mem_RecordList = 19

    # Disk Data (Deprecated)
    File_Any = 100
    File_Binary = 101
    File_Text = 102
    File_Warc = 103
    File_Parquet = 104
    File_Json = 105
    File_Index = 106
    File_Vector = 107
    File_AnyLines = 110
    File_TextLines = 111
    File_JsonLines = 112
    File_VectorLines = 113

    @staticmethod
    def belong(a, b):
        """
        Tell whether type `a` is accepted where type `b` is expected.

        Returns True when both are DataType members and either:
          * `a` is `b`;
          * `b` is a multiple-of-ten head and `a` lies in the ten values
            starting at `b`;
          * `b` is Mem_Any and `a` is any memory type (value < 100);
          * `b` is File_Any and `a` is any disk type (value >= 100).
        Returns False otherwise, including when either argument is not a
        DataType member.
        """
        if not (isinstance(a, DataType) and isinstance(b, DataType)):
            return False
        if a == b:
            return True

        # A head (value divisible by ten) covers the ten values from itself.
        offset = a.value - b.value
        if b.value % 10 == 0 and 0 <= offset < 10:
            return True

        # The two *_Any heads additionally cover their whole family.
        if b == DataType.Mem_Any:
            return a.value < 100
        if b == DataType.File_Any:
            return a.value >= 100
        return False

class Data:
    """
    Typed value holder (Deprecated).

    `type` may be a DataType member or the name of one; a name is resolved
    with `DataType[name]`, which raises KeyError for an unknown name.
    """
    def __init__(self, type=DataType.Mem_Any, value=None):
        if isinstance(type, DataType):
            resolved_type = type
        else:
            resolved_type = DataType[type]
        self.type = resolved_type
        self.value = value


if __name__ == "__main__":
    # Smoke test: construct a default Data object and print it.
    print(Data())


================================================
FILE: DomainSpecific/core/layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
from enum import Enum
from tqdm import tqdm
from core.layers import LayerType, LayerType2Func

class JointType(Enum):
    """How a Layer applies its function to its input data."""
    Default = 0 # Process the inputs as a whole (frequently used in data IO and control layers).
    Map     = 1 # Split the input lists into units, process each unit into a value of any type, and finally return the list of processed units.
    FlatMap = 2 # Split the input lists into units, process each unit into a list, and finally return the concatenation of those lists.

class Layer:
    """One processing step: a registered layer function plus its configuration.

    Attributes:
        type: LayerType of this layer (a member, or resolved from its name).
        func, input_types, output_types, enabled: the tuple registered for
            `type` in LayerType2Func.
        joint: JointType selecting whole-input, Map or FlatMap application.
        repetition: number of times the function is applied in `__call__`.
        param: keyword arguments forwarded to `func` on every invocation.
        input_names, output_names: stored as given; not used by this class.
    """
    def __init__(self, type, joint=JointType.Default, repetition=1, param=None, input_names=None, output_names=None):
        self.type = type if isinstance(type, LayerType) else LayerType[type]
        self.func, self.input_types, self.output_types, self.enabled = LayerType2Func[self.type]
        self.joint = joint if isinstance(joint, JointType) else JointType[joint]
        self.repetition = repetition
        # Create fresh containers per instance; a mutable default would be
        # one object shared by every Layer built without these arguments.
        self.param = dict() if param is None else param
        self.input_names = list() if input_names is None else input_names
        self.output_names = list() if output_names is None else output_names

    def __call__(self, inputs, worker_id=0, worker_num=1, variables=None):
        """Run the layer function on `inputs`.

        Args:
            inputs: list with one entry per registered input type.
            worker_id: index of the calling worker, stored in `variables`.
            worker_num: total number of workers, stored in `variables`.
            variables: dict handed to the layer function; "worker_id" and
                "worker_num" are written into it. A caller-supplied dict is
                updated in place; when omitted a new dict is used per call.

        Returns:
            A list of outputs. If any input is None, a list holding one None
            per registered output type. If an exception occurs it is printed
            and the outputs gathered so far (an empty list when the first
            repetition fails) are returned. KeyboardInterrupt exits the
            process.
        """
        outputs = list()
        try:
            if variables is None:
                # A shared default dict would leak the worker fields (and
                # anything a layer function stores) from one call to the next.
                variables = dict()
            variables["worker_id"] = worker_id
            variables["worker_num"] = worker_num

            if not isinstance(inputs, list):
                raise Exception(f"The inputs of layer should be list data type.")
            if len(inputs) != len(self.input_types):
                raise Exception(f"The number of inputs is not {len(self.input_types)}.")
            # TODO: add the check of input type against self.input_types.
            # Condition of empty input: propagate None for every output.
            if any(data is None for data in inputs):
                outputs = [None for _ in self.output_types]
                return outputs

            # TODO: to address the situation of repetition > 1.
            # Every repetition currently re-runs on the same inputs and
            # overwrites the previous outputs.
            for _ in range(self.repetition):
                if self.joint == JointType.Default:
                    values = list(self.func(*inputs, variables, **self.param))
                else:
                    n = min([len(data) for data in inputs])
                    if n != max([len(data) for data in inputs]):
                        raise Exception(f"Element amount of input datas are not equal.")

                    # One accumulator list per registered output.
                    values = [[] for _ in self.output_types]
                    for idx in tqdm(range(n), desc=f"Layer: {self.type.name}, worker_id: {worker_id}/{worker_num}"):
                        _values = self.func(*[data[idx] for data in inputs], variables, **self.param)
                        for value, _value in zip(values, _values):
                            # A None result for a unit is dropped from the output.
                            if _value is None:
                                continue
                            if self.joint == JointType.Map:
                                value.append(_value)
                            elif self.joint == JointType.FlatMap:
                                if not isinstance(_value, list):
                                    raise Exception(f"The output of layer should be list data type.")
                                value.extend(_value)
                            else:
                                raise Exception(f"Using unsupported joint type for {self.type.name} layer.")

                outputs = values
        except KeyboardInterrupt:
            sys.exit()
        except Exception:
            traceback.print_exc()
        return outputs


if __name__ == "__main__":
    # Smoke test: run a Data_Sample layer (N=2) over a five-item list.
    sample_layer = Layer(LayerType.Data_Sample, param={"N": 2})
    sampled = sample_layer([["a", "b", "c", "d", "e"]])
    # NOTE(review): this prints the Layer object, not `sampled` - confirm intent.
    print(sample_layer)


================================================
FILE: DomainSpecific/core/layers/__init__.py
================================================
from enum import Enum
from ..data import DataType

from .template_layer import template_layer

# Control layers
from .control import *

# Network (download/upload) layers
from .network import *

# IO (read/write) layers
from .io import *

# Extract layers
from .extract import *

# Transform layers
from .transform import *

class LayerType(Enum):
    """Identifiers of the available layer kinds.

    Values are grouped by category: 0 template, 1-6 control, 101-112
    network (download/upload), 201-213 IO (read/write), 301-303 extract,
    401-423 transform.
    """
    Template                     = 0

    # Control
    Data_Sample                  = 1
    Data_Concat                  = 2
    Data_Order                   = 3
    Data_Partition               = 4
    Data_Filter                  = 5
    Data_Shuffle                 = 6

    # Network - download/upload
    Upload_File_To_Blob          = 101
    Upload_Bytes_To_Blob         = 102
    Download_File_From_Blob      = 103
    Download_Bytes_From_Blob     = 104
    Download_File_From_Internet  = 105
    Download_Bytes_From_Internet = 106
    Download_Url_List            = 107
    Download_Warc_Indice         = 108
    Download_Warc_File           = 109
    Download_Urls_From_Website   = 110
    Download_Image_From_Jsonl    = 111
    Download_StarCoder           = 112

    # IO - read/write
    To_Binary_File               = 201
    To_Line_File                 = 202
    To_Jsonl_File                = 203
    To_Parquet_File              = 204
    To_Index_File                = 205
    To_Warc_File                 = 206
    From_Binary_File             = 207
    From_Line_File               = 208
    From_Jsonl_File              = 209
    From_Parquet_File            = 210
    From_Index_File              = 211
    From_Wet_File                = 212
    From_Warc_File               = 213

    # Extract
    Extract_Article              = 301
    Build_Index                  = 302
    Search_Index                 = 303
    
    # Transform
    Tokenize_Article             = 401
    Ngrams                       = 402
    Minhash_Tokens               = 403
    LSH_Minhash                  = 404
    Warc_Filter                  = 405
    Warc_Encode                  = 406
    Warc_To_Wet                  = 407
    Wet_Decode                   = 408
    Text_Embedding               = 409
    Sentence_Embedding           = 410
    Sentence_Filter              = 411
    Code_Generation              = 412
    Url_To_Record                = 413
    Extract_Link_From_Warc       = 414
    Wet_To_Imageinfos            = 415
    Warc_To_Screenshot_MD        = 416
    MCQ_Filter                   = 417
    OpenQuestion_Filter          = 418
    Convert_PDF                  = 419
    Extract_HTML                 = 420
    MD_Filter                    = 421
    Cascaded_Filter              = 422
    Math_Filter                  = 423


# Registry of layer implementations. Each value is a 4-tuple:
#   (layer function, list of input DataTypes, list of output DataTypes, enabled flag).
# NOTE: not every LayerType member has an entry here (for example
# To_Warc_File, Download_Image_From_Jsonl and Text_Embedding are absent),
# so indexing this dict with such a member raises KeyError.
LayerType2Func = \
{
    LayerType.Template                     : (template_layer, [DataType.Mem_Any], [DataType.Mem_Any], True),

    # Control
    LayerType.Data_Sample                  : (data_sample_layer, [DataType.Mem_List], [DataType.Mem_List], True),
    LayerType.Data_Concat                  : (data_concat_layer, [DataType.Mem_List], [DataType.Mem_List], True),
    LayerType.Data_Order                   : (data_order_layer, [DataType.Mem_List], [DataType.Mem_List], True),
    LayerType.Data_Filter                  : (data_filter_layer, [DataType.Mem_List], [DataType.Mem_List], True),
    LayerType.Data_Partition               : (data_partition_layer, [DataType.Mem_List], [DataType.Mem_List], True),
    LayerType.Data_Shuffle                 : (data_shuffle_layer, [DataType.Mem_List], [DataType.Mem_List], True),

    # Network - download/upload
    LayerType.Upload_File_To_Blob          : (upload_file_to_blob_layer, [DataType.Mem_Str, DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True),
    LayerType.Upload_Bytes_To_Blob         : (upload_bytes_to_blob_layer, [DataType.Mem_Binary, DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True),
    LayerType.Download_File_From_Blob      : (download_file_from_blob_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True),
    LayerType.Download_Bytes_From_Blob     : (download_bytes_from_blob_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Binary, DataType.Mem_Str], True),
    LayerType.Download_File_From_Internet  : (download_file_from_internet_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True),
    LayerType.Download_Bytes_From_Internet : (download_bytes_from_internet_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Binary, DataType.Mem_Str], True),
    LayerType.Download_Url_List            : (download_url_list_layer, [DataType.Mem_Str], [DataType.Mem_StrList, DataType.Mem_StrList], True),
    LayerType.Download_Warc_File           : (download_warc_file_layer, [DataType.Mem_Str], [DataType.Mem_Str, DataType.Mem_Str], True),
    LayerType.Download_Warc_Indice         : (download_warc_indice_layer, [DataType.Mem_Str], [DataType.Mem_StrList, DataType.Mem_StrList], True),
    LayerType.Download_Urls_From_Website   : (download_urls_from_website_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True),
    LayerType.Download_StarCoder           : (download_starcoder_layer, [DataType.Mem_Str], [DataType.Mem_Int], True),

    # IO - read/write
    LayerType.To_Binary_File               : (to_binary_file_layer, [DataType.Mem_Binary, DataType.Mem_Str], [DataType.Mem_Str], True),
    LayerType.To_Line_File                 : (to_line_file_layer, [DataType.Mem_StrList, DataType.Mem_Str], [DataType.Mem_Str], True),
    LayerType.To_Jsonl_File                : (to_jsonl_file_layer, [DataType.Mem_DictList, DataType.Mem_Str], [DataType.Mem_Str], True),
    LayerType.To_Parquet_File              : (to_parquet_file_layer, [DataType.Mem_DictList, DataType.Mem_Str], [DataType.Mem_Str], True),
    LayerType.To_Index_File                : (to_index_file_layer, [DataType.Mem_Index, DataType.Mem_Str], [DataType.Mem_Str], True),
    LayerType.From_Binary_File             : (from_binary_file_layer, [DataType.Mem_Str], [DataType.Mem_Binary], True),
    LayerType.From_Line_File               : (from_line_file_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True),
    LayerType.From_Jsonl_File              : (from_jsonl_file_layer, [DataType.Mem_Str], [DataType.Mem_DictList], True),
    LayerType.From_Parquet_File            : (from_parquet_file_layer, [DataType.Mem_Str], [DataType.Mem_DictList], True),
    LayerType.From_Index_File              : (from_index_file_layer, [DataType.Mem_Str], [DataType.Mem_Index], True),
    LayerType.From_Wet_File                : (from_wet_file_layer, [DataType.Mem_Str], [DataType.Mem_DictList], True),
    LayerType.From_Warc_File               : (from_warc_file_layer, [DataType.Mem_Str], [DataType.Mem_DictList], True),

    # Extract
    LayerType.Extract_Article              : (extract_article_layer, [DataType.Mem_Warc], [DataType.Mem_Dict], True),
    LayerType.Build_Index                  : (build_index_layer, [DataType.Mem_VectorList], [DataType.Mem_Index], True),
    LayerType.Search_Index                 : (search_index_layer, [DataType.Mem_Index, DataType.Mem_VectorList], [DataType.Mem_VectorList, DataType.Mem_VectorList], True),
    
    # Transform
    LayerType.Tokenize_Article             : (tokenize_article_layer, [DataType.Mem_Dict], [DataType.Mem_StrList], True),
    LayerType.Ngrams                       : (ngrams_layer, [DataType.Mem_StrList], [DataType.Mem_StrList], True),
    LayerType.Minhash_Tokens               : (minhash_tokens_layer, [DataType.Mem_StrList], [DataType.Mem_StrList], True),
    LayerType.LSH_Minhash                  : (lsh_minhash_layer, [DataType.Mem_StrList], [DataType.Mem_StrList], True),
    LayerType.Warc_Filter                  : (warc_filter_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True),
    LayerType.Warc_Encode                  : (warc_encode_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True),
    LayerType.Warc_To_Wet                  : (warc_to_wet_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True),
    LayerType.Wet_Decode                   : (wet_decode_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True),
    LayerType.Math_Filter                  : (math_filter_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True),
    LayerType.OpenQuestion_Filter          : (openquestion_filter_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True),
    LayerType.MCQ_Filter                   : (mcq_filter_layer, [DataType.Mem_Str], [DataType.Mem_StrList], True),
}


__all__ = [
    "LayerType", 
    "LayerType2Func", 
    "template_layer", 
    "data_sample_layer", 
    "data_concat_layer", 
    "data_order_layer", 
    "data_partition_layer", 
    "data_filter_layer", 
    "data_shuffle_layer", 
    "upload_file_to_blob_layer", 
    "upload_bytes_to_blob_layer", 
    "download_file_from_blob_layer", 
    "download_bytes_from_blob_layer", 
    "download_file_from_internet_layer", 
    "download_bytes_from_internet_layer", 
    "download_url_list_layer", 
    "download_warc_file_layer", 
    "download_warc_indice_layer", 
    "download_urls_from_website_layer", 
    "download_starcoder_layer", 
    "to_binary_file_layer", 
    "to_line_file_layer", 
    "to_jsonl_file_layer", 
    "to_parquet_file_layer", 
    "to_index_file_layer", 
    "from_binary_file_layer", 
    "from_line_file_layer", 
    "from_jsonl_file_layer", 
    "from_parquet_file_layer", 
    "from_index_file_layer", 
    "from_wet_file_layer", 
    "from_warc_file_layer", 
    "extract_article_layer", 
    "build_index_layer", 
    "search_index_layer", 
    "tokenize_article_layer", 
    "ngrams_layer", 
    "minhash_tokens_layer", 
    "lsh_minhash_layer", 
    "warc_filter_layer", 
    "warc_encode_layer", 
    "warc_to_wet_layer", 
    "wet_decode_layer", 
    "math_filter_layer", 
    "openquestion_filter_layer", 
    "mcq_filter_layer", 
]


================================================
FILE: DomainSpecific/core/layers/control/__init__.py
================================================
# Control
from .data_sample_layer import data_sample_layer
from .data_filter_layer import data_filter_layer
from .data_order_layer import data_order_layer
from .data_partition_layer import data_partition_layer
from .data_shuffle_layer import data_shuffle_layer
from .data_concat_layer import data_concat_layer

__all__ = [
    "data_sample_layer", 
    "data_filter_layer",
    "data_order_layer",
    "data_partition_layer",
    "data_shuffle_layer", 
    "data_concat_layer", 
]


================================================
FILE: DomainSpecific/core/layers/control/data_concat_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback

def data_concat_layer(lists, variables=dict()):
    """Concatenate several lists into one, skipping ``None`` entries.

    Args:
        lists: iterable of lists (or ``None`` placeholders), joined in order.
        variables: unused here; accepted so the function has the same call
            shape as the other layer functions.

    Returns:
        A 1-tuple holding the concatenated list. If an error occurs it is
        printed and the items collected up to that point are returned.
        KeyboardInterrupt exits the process.
    """
    ret = list()
    try:
        # Extend in forward order so every element is copied once; inserting
        # each list at the front of the result re-shifts everything already
        # collected for every input list.
        for a_list in lists:
            if a_list is not None:
                ret.extend(a_list)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret,)


if __name__ == "__main__":
    # Smoke test: the None entry is skipped during concatenation.
    print(data_concat_layer([["a"], ["b", "c"], None, ["d", "e", "f"]]))


================================================
FILE: DomainSpecific/core/layers/control/data_filter_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback

def data_filter_layer(lines, variables=dict(), IN=False, FILTERS=(None,)):
    """Keep or drop the items of `lines` according to membership in FILTERS.

    Args:
        lines: iterable of items to filter.
        variables: unused here.
        IN: when true keep only items found in FILTERS; otherwise keep only
            items not found in FILTERS.
        FILTERS: container tested with ``in``; defaults to ``(None,)`` so the
            default call removes None entries.

    Returns:
        A 1-tuple holding the filtered list (an empty list if an error was
        caught and printed).
    """
    kept = list()
    try:
        if IN:
            kept = [item for item in lines if item in FILTERS]
        else:
            kept = [item for item in lines if item not in FILTERS]
    except KeyboardInterrupt:
        sys.exit()
    except Exception as ex:
        traceback.print_exc()
    return (kept,)


if __name__ == "__main__":
    # Smoke test: drop the None entry from a small list.
    print(data_filter_layer(["a", None, "b"], FILTERS=(None,)))


================================================
FILE: DomainSpecific/core/layers/control/data_order_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback

def data_order_layer(lines, variables=dict(), REVERSE=False):
    """Return the items of `lines` in sorted order.

    Args:
        lines: iterable of mutually comparable items.
        variables: unused here.
        REVERSE: sort in descending order when true.

    Returns:
        A 1-tuple holding a new sorted list; the input is not modified. An
        empty list is returned if sorting fails (the error is printed).
    """
    result = list()
    try:
        # Sort a private copy and publish it only once sorting succeeded.
        ordered = list(lines)
        ordered.sort(reverse=REVERSE)
        result = ordered
    except KeyboardInterrupt:
        sys.exit()
    except Exception as ex:
        traceback.print_exc()
    return (result,)


if __name__ == "__main__":
    # Smoke test: sort three integers in ascending order.
    print(data_order_layer([1, 3, 2]))


================================================
FILE: DomainSpecific/core/layers/control/data_partition_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback

def data_partition_layer(lines, variables=dict(), WORKER_ID=-1):
    """Select the share of `lines` that belongs to the current worker.

    Args:
        lines: sequence to partition.
        variables: dict read for "worker_id" (default 0) and "worker_num"
            (default 1).
        WORKER_ID: -1 deals the items out round-robin, so this worker gets
            indices worker_id, worker_id + worker_num, ...; any other value
            gives the whole of `lines` to the worker whose id equals
            WORKER_ID and an empty list to every other worker.

    Returns:
        A 1-tuple holding this worker's items (an empty list if an error was
        caught and printed).
    """
    share = list()
    try:
        rank = variables.get("worker_id", 0)
        world_size = variables.get("worker_num", 1)
        total = len(lines)
        if WORKER_ID != -1:
            # Single-owner mode: one designated worker receives everything.
            if WORKER_ID == rank:
                share = lines
            else:
                share = list()
        else:
            # Round-robin mode.
            share = [lines[idx] for idx in range(rank, total, world_size)]
    except KeyboardInterrupt:
        sys.exit()
    except Exception as ex:
        traceback.print_exc()
    return (share,)


if __name__ == "__main__":
    # Smoke test: worker 0 of 2 receives the even-indexed items.
    demo_variables = {"worker_id": 0, "worker_num": 2}
    print(data_partition_layer([0, 1, 2, 3, 4, 5, 6, 7, 8], variables=demo_variables))


================================================
FILE: DomainSpecific/core/layers/control/data_sample_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import random
import traceback

def data_sample_layer(lines, variables=dict(), N=-1, SEED=1):
    """Draw a seeded random sample of at most N items from `lines`.

    Args:
        lines: sequence to sample from.
        variables: unused here.
        N: requested sample size, capped at ``len(lines)``. A negative value
            returns `lines` itself, unsampled.
        SEED: value passed to ``random.seed``; note this reseeds the
            module-level generator of ``random`` on every call.

    Returns:
        A 1-tuple holding the sampled list (an empty list if an error was
        caught and printed).
    """
    picked = list()
    try:
        random.seed(SEED)
        count = min(N, len(lines))
        picked = random.sample(lines, count) if count >= 0 else lines
    except KeyboardInterrupt:
        sys.exit()
    except Exception as ex:
        traceback.print_exc()
    return (picked,)


if __name__ == "__main__":
    # Smoke test: sample one item out of two.
    print(data_sample_layer(["a", "b"], N=1))


================================================
FILE: DomainSpecific/core/layers/control/data_shuffle_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import random
import traceback

def data_shuffle_layer(lines, variables=dict(), SEED=1):
    """Shuffle `lines` in place with a fixed seed and return it.

    Args:
        lines: mutable sequence; it is reordered in place and the same
            object is returned.
        variables: unused here.
        SEED: value passed to ``random.seed``; note this reseeds the
            module-level generator of ``random`` on every call.

    Returns:
        A 1-tuple holding the shuffled `lines` object (an empty list if an
        error was caught and printed).
    """
    shuffled = list()
    try:
        random.seed(SEED)
        random.shuffle(lines)
        # Publish the input only after the in-place shuffle succeeded.
        shuffled = lines
    except KeyboardInterrupt:
        sys.exit()
    except Exception as ex:
        traceback.print_exc()
    return (shuffled,)


if __name__ == "__main__":
    # Smoke test: shuffle a two-item list.
    print(data_shuffle_layer(["a", "b"]))


================================================
FILE: DomainSpecific/core/layers/extract/__init__.py
================================================
# Extract
from .extract_article_layer import extract_article_layer
from .build_index_layer import build_index_layer
from .search_index_layer import search_index_layer

__all__ = [
    "extract_article_layer", 
    "build_index_layer", 
    "search_index_layer", 
]


================================================
FILE: DomainSpecific/core/layers/extract/build_index_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import sys
import faiss
import numpy as np
import traceback

def build_index_layer(base_vectors, variables=dict(), SEED=1, DIM=4096, CLUSTERS=100):
    """Train a faiss IndexIVFFlat (L2 metric) on `base_vectors` and add them to it.

    Args:
        base_vectors: vectors converted with ``np.array`` before training.
            # assumes a 2-D float32 array of width DIM - TODO confirm
        variables: unused here.
        SEED: value passed to ``np.random.seed``.
        DIM: dimensionality used for the quantizer and the index.
        CLUSTERS: cluster count passed to IndexIVFFlat.

    Returns:
        A 1-tuple holding the populated index, or ``(None,)`` if an error
        (including a failed assertion) was caught and printed.
    """
    built_index = None
    try:
        np.random.seed(SEED)

        coarse_quantizer = faiss.IndexFlatL2(DIM)
        ivf_index = faiss.IndexIVFFlat(coarse_quantizer, DIM, CLUSTERS, faiss.METRIC_L2)

        # The index must start untrained and report trained afterwards.
        assert not ivf_index.is_trained
        vectors = np.array(base_vectors)
        ivf_index.train(vectors)
        assert ivf_index.is_trained

        ivf_index.add(vectors)
        built_index = ivf_index
    except KeyboardInterrupt:
        sys.exit()
    except Exception as ex:
        traceback.print_exc()
    return (built_index,)


if __name__ == '__main__':
    # Smoke test: build an index over 100000 random 64-dimensional vectors.
    D = 64
    base_vectors = np.random.random((100000, D)).astype('float32')
    base_vectors[:, 0] += np.arange(100000) / 1000.
    # The dimensionality keyword of build_index_layer is DIM; passing D=...
    # raised TypeError (unexpected keyword argument).
    index = build_index_layer(base_vectors, DIM=D)
    print(index)


================================================
FILE: DomainSpecific/core/layers/extract/extract_article_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import re
import fasttext
import traceback
from unittest.mock import patch
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter, chomp
from newspaper import Article
import global_var

def filter_tags_in_html(soup):
    """Strip non-content markup from a parsed HTML tree, in place.

    Removes every <style>, <script> and <img> element, removes each <table>
    whose text is empty after stripping whitespace, and removes newline
    characters from the text of <a> elements.

    Args:
        soup: parsed document exposing ``find_all``; the returned tags must
            provide ``text``, ``string`` and ``decompose()`` (the
            BeautifulSoup tag interface).

    Returns:
        The same `soup` object, after modification.
    """
    def del_tags(soup):
        for tag_name in ('style', 'script', 'img'):
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Drop only the tables that are themselves empty. The previous code
        # re-iterated the whole table list inside the emptiness check, so a
        # single empty table removed every table in the document.
        for table in soup.find_all('table'):
            if not table.text.strip():
                table.decompose()

    def modify_text(soup):
        for tag in soup.find_all('a'):
            tag_text = tag.text
            new_tag_text = tag_text.replace('\n', '')
            # Only rewrite the tag when a newline was actually removed.
            if len(new_tag_text) != len(tag_text):
                tag.string = new_tag_text

    del_tags(soup)
    modify_text(soup)

    return soup

def lid(soup, model):
    """Identify the language of a document from a window of its text.

    All whitespace is removed from ``soup.text``. If the result is longer
    than 256 characters only the 256 characters centred on its midpoint are
    passed to the model; otherwise the whole text is used.

    Args:
        soup: object whose ``text`` attribute is the document text.
        model: object whose ``predict(text)`` returns a pair of sequences,
            labels first and probabilities second.

    Returns:
        ``(language, probability)``: the first label with the
        ``"__label__"`` prefix removed, and the first probability as float.
    """
    window = 256
    compact = ''.join(soup.text.split())
    total = len(compact)
    if total > window:
        centre = total // 2
        half = window // 2
        lo = max(0, int(centre - half))
        hi = min(total, int(centre + half))
        sample = compact[lo: hi]
    else:
        sample = compact

    prediction = model.predict(sample)
    language = prediction[0][0].replace("__label__", "")
    return language, float(prediction[1][0])

def get_main_text_html(soup):
    """Run newspaper's Article parser over `soup` to get the main content.

    The document is supplied as a string through ``download(input_html=...)``
    with a placeholder URL.

    Returns:
        ``(article_html, text)`` as exposed by the parsed Article.
    """
    parsed = Article("padding_url", fetch_images=False, keep_article_html=True)
    parsed.download(input_html=str(soup))
    parsed.parse()
    # No minimum-length check is applied to the extracted text here.
    return parsed.article_html, parsed.text

def remove_dup_newline(text):
    """Trim every line of `text` and collapse runs of blank lines.

    Each line is stripped of surrounding whitespace, any run of two or more
    newlines is reduced to exactly two, and the result is stripped.
    """
    trimmed_lines = [line.strip() for line in text.split('\n')]
    rejoined = '\n'.join(trimmed_lines)
    return re.sub('\n{2,}', '\n\n', rejoined).strip()

class User_MarkdownConverter(MarkdownConverter):
    """MarkdownConverter with customised handling of <tr>, <a> and <pre>."""
    def convert_tr(self, el, text, convert_as_inline):
        """Render a table row, adding header separator lines where needed.

        A row whose text is empty once '|' characters and whitespace are
        removed produces only the separator lines (possibly nothing).
        Newlines inside the row text are replaced by spaces.
        """
        cells = el.find_all(['td', 'th'])
        is_headrow = all([cell.name == 'th' for cell in cells])
        overline = ''
        underline = ''
        if is_headrow and not el.previous_sibling:
            # first row and is headline: print headline underline
            underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        elif (not el.previous_sibling
            and (el.parent.name == 'table'
                or (el.parent.name == 'tbody'
                    and not el.parent.previous_sibling))):
            # first row, not headline, and:
            # - the parent is table or
            # - the parent is tbody at the beginning of a table.
            # print empty headline above this row
            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
            overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        if len(text.replace('|', ' ').strip()) == 0:
            return overline + underline
        else:
            return overline + '|' + text.replace('\n', ' ') + '\n' + underline

    def convert_a(self, el, text, convert_as_inline):
        """Render a link as its text only; the URL is not emitted.

        Returns '' for a link without text, '<href>' for an autolink whose
        text equals its href, and otherwise the link text (newlines replaced
        by spaces, wrapped in the chomped prefix/suffix) when an href is
        present, or the bare text when it is not.
        """
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')
        # For the replacement see #29: text nodes underscores are escaped
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax
            return '<%s>' % href
        if self.options['default_title'] and not title:
            title = href
        # NOTE(review): title_part is computed but not used by the return
        # statement below; only the commented-out variant referenced it.
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        # return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
        return '%s %s %s' % (prefix, text.replace('\n', ' '), suffix) if href else text

    def convert_pre(self, el, text, convert_as_inline):
        """Render preformatted text as a fenced code block.

        The fence language is the 'code_language' option, unless the
        'code_language_callback' option is set and returns a truthy value
        for `el`. Empty text yields ''.
        """
        if not text:
            return ''
        code_language = self.options['code_language']

        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language

        return '\n```%s\n%s\n```\n' % (code_language, text)

def html2text(soup, **options):
    """Convert a parsed HTML tree to cleaned-up markdown.

    Args:
        soup: parsed document passed to ``convert_soup``.
        **options: forwarded to the User_MarkdownConverter constructor.

    Returns:
        The markdown text with every line stripped, marker-only lines
        removed, blank-line runs collapsed and outer whitespace stripped.
    """
    def clean_markdown(md):
        marker_chars = ['#', '*', '+', '-']
        kept_lines = []
        for raw_line in md.split('\n'):
            line = raw_line.strip()
            distinct_chars = list(set(line))
            # Skip lines made of one repeated markdown marker, e.g. "---".
            if len(distinct_chars) == 1 and distinct_chars[0] in marker_chars:
                continue
            kept_lines.append(line)

        collapsed = re.sub('\n{2,}', '\n\n', '\n'.join(kept_lines))
        return collapsed.strip()

    raw_markdown = User_MarkdownConverter(**options).convert_soup(soup)
    return clean_markdown(raw_markdown)

def trans2md(html):
    """Convert an HTML string to markdown and to plain text.

    Args:
        html: HTML source, parsed with the 'html5lib' parser.

    Returns:
        ``(markdown_text, main_text)``. When the markdown both starts and
        ends with '.', its first and last characters are removed. The plain
        text is the parsed document's text passed through
        ``remove_dup_newline``.
    """
    parsed = BeautifulSoup(html, 'html5lib')
    md = html2text(parsed)
    # No minimum-length or line-count check is applied to the markdown.
    wrapped_in_dots = md.startswith('.') and md.endswith('.')
    if wrapped_in_dots:
        md = md[1:-1]
    return md, remove_dup_newline(parsed.text)

@classmethod
def _patch_newspaper_parser_clean(cls, node):
    """Stand-in for newspaper's Parser.clean_article_html: return `node` unchanged."""
    return node

@patch('newspaper.parsers.Parser.clean_article_html', new=_patch_newspaper_parser_clean)
def extract(soup):
    """Extract the main article of `soup` as markdown and as plain text.

    While this function runs, ``Parser.clean_article_html`` is patched with
    a pass-through replacement.

    Returns:
        ``(markdown_text, main_text)``: the markdown is converted from the
        article HTML; the plain text is the one reported by the article
        parser (the plain text produced during markdown conversion is
        discarded).
    """
    article_html, article_text = get_main_text_html(soup)
    markdown, _unused_text = trans2md(article_html)
    return markdown, article_text

def extract_article_layer(id_html, variables=dict()):
    """Extract the main text of an HTML page if its language is accepted.

    Args:
        id_html: pair ``(article_id, html)``.
        variables: unused here.

    Returns:
        A 1-tuple holding a dict with keys "id", "text", "lang" and
        "lang_prob" when the detected language is in the accepted list and
        the main text has at least 128 characters; otherwise ``(None,)``.
        Errors are printed and also yield ``(None,)``.
    """
    record = None
    try:
        tier1_langs = ["en", "es", "ja", "fr", "de", "pt", "it", "zh"]
        tier2_langs = ["nl", "sv", "da", "fi", "ru", "no", "ko", "zh", "pl", "tr", "ar", "he", "pt", "cs", "hu", "th", "hi"]
        accepted_langs = tier1_langs + tier2_langs
        article_id, html = id_html

        soup = filter_tags_in_html(BeautifulSoup(html, 'html5lib'))
        lang, lang_prob = lid(soup, global_var.lid_model)
        if lang in accepted_langs:
            # The markdown rendering is produced by extract() but not kept.
            _markdown, body_text = extract(soup)
            if len(body_text) >= 128:
                record = {"id": article_id, "text": body_text, "lang": lang, "lang_prob": lang_prob}
    except KeyboardInterrupt:
        sys.exit()
    except Exception as ex:
        traceback.print_exc()
    return (record,)


if __name__ == '__main__':
    # Smoke test with an empty (None, None) id/html pair.
    print(extract_article_layer((None, None)))


================================================
FILE: DomainSpecific/core/layers/extract/search_index_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
import faiss
import numpy as np
import traceback

def search_index_layer(index, query_vectors, variables=dict(), TOPK=1):
    """Query `index` for the TOPK nearest entries of each query vector.

    Args:
        index: object whose ``search(queries, k)`` returns
            ``(distances, ids)``.
        query_vectors: queries, converted with ``np.array`` before the call.
        variables: unused here.
        TOPK: number of results requested per query.

    Returns:
        ``(ids, distances)`` - note the order is swapped relative to what
        ``search`` returns - or ``(None, None)`` if an error was caught and
        printed.
    """
    result = (None, None)
    try:
        queries = np.array(query_vectors)
        distances, ids = index.search(queries, TOPK)
        result = (ids, distances)
    except KeyboardInterrupt:
        sys.exit()
    except Exception as ex:
        traceback.print_exc()
    return result


if __name__ == '__main__':
    # Smoke test: build an IVF-Flat index over random vectors and query it.
    DIM = 4096
    CLUSTERS = 2
    base_vectors = np.random.random((100000, DIM)).astype('float32')
    base_vectors[:, 0] += np.arange(100000) / 1000.
    
    quantizer = faiss.IndexFlatL2(DIM)
    index = faiss.IndexIVFFlat(quantizer, DIM, CLUSTERS, faiss.METRIC_L2)

    assert not index.is_trained
    index.train(base_vectors)
    assert index.is_trained
    index.add(base_vectors)

    query_vectors = np.random.random((10000, DIM)).astype('float32')
    query_vectors[:, 0] += np.arange(10000) / 1000.

    # search_index_layer has no `D` parameter, and `D` is not defined until
    # this call returns, so the former `D=D` argument raised NameError.
    I, D = search_index_layer(index, query_vectors)
    print(D[:1])


================================================
FILE: DomainSpecific/core/layers/global_var.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
import traceback
#import torch
import fasttext
from transformers import AutoTokenizer, RobertaForSequenceClassification
from dependency.gpt_api import GPTAPI

try:
    # silences warnings as the package does not properly use the python 'warnings' package
    # see https://github.com/facebookresearch/fastText/issues/1056
    fasttext.FastText.eprint = lambda *args,**kwargs: None
except Exception:
    # Best effort: if the attribute cannot be replaced, fastText just keeps
    # printing its warnings. Catching Exception instead of using a bare
    # except lets KeyboardInterrupt and SystemExit propagate.
    pass

"""
class OpenQuestionModel:
    def __init__(self, pretrained_model_path, token_model_path="cardiffnlp/twitter-roberta-base-emotion", local_files_only=False):
        # load tokenizer model.
        self.tokenizer = AutoTokenizer.from_pretrained(token_model_path)

        # load trained model.
        self.model = RobertaForSequenceClassification.from_pretrained(pretrained_model_path, local_files_only=local_files_only)

    def run(self, text, thred=0.5, max_length=512):
        # tokenization.
        inputs = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)

        # inference.
        with torch.no_grad():
            logits = self.model(**inputs).logits
        logits = logits.softmax(dim=1)[0]
        predicted_idx = logits.argmax().item()
        predicted_label = self.model.config.id2label[predicted_idx]
        predicted_conf = logits[predicted_idx].item()
        if predicted_label == "LABEL_0" and predicted_conf < thred:
            predicted_idx = 1
            predicted_label = "LABEL_1"
        #return predicted_idx, predicted_label, predicted_conf
        return predicted_label
"""

def _load_fasttext_if_present(model_path):
    """Load the fastText model at `model_path`, or return None if the file does not exist."""
    if not os.path.exists(model_path):
        return None
    return fasttext.load_model(model_path)


# Model paths are relative to the current working directory; a missing file
# leaves the corresponding model set to None.

# language detection by fasttext.
LID_MODEL_PATH = "./dependency/models/lid.176.bin"
lid_model = _load_fasttext_if_present(LID_MODEL_PATH)

# math detection by fasttext.
MATH_FT_MODEL_PATH = "./dependency/models/math.bin"
ft_math_model = _load_fasttext_if_present(MATH_FT_MODEL_PATH)

# openquestion detection by fasttext.
OPENQUESTION_MODEL_PATH = "./dependency/models/openquestion.bin"
ft_openquestion_model = _load_fasttext_if_present(OPENQUESTION_MODEL_PATH)

# multiple-choice question detection by fasttext.
MCQ_MODEL_PATH = "./dependency/models/mcq.bin"
ft_mcq_model = _load_fasttext_if_present(MCQ_MODEL_PATH)

"""
# multiple-choice question detection by pytorch.
MCQ_PT_MODEL_PATH = "./dependency/models/mcq.pytorch"
if os.path.exists(MCQ_PT_MODEL_PATH):
    py_mcq_model = OpenQuestionModel(MCQ_PT_MODEL_PATH, local_files_only=True)
else:
    py_mcq_model = None
"""

# GPT agent: a single GPTAPI instance, created when this module is imported.
gpt_api = GPTAPI()


================================================
FILE: DomainSpecific/core/layers/io/__init__.py
================================================
# IO - read/write
from .to_binary_file_layer import to_binary_file_layer
from .to_line_file_layer import to_line_file_layer
from .to_jsonl_file_layer import to_jsonl_file_layer
from .to_parquet_file_layer import to_parquet_file_layer
from .to_index_file_layer import to_index_file_layer
from .from_binary_file_layer import from_binary_file_layer
from .from_line_file_layer import from_line_file_layer
from .from_jsonl_file_layer import from_jsonl_file_layer
from .from_parquet_file_layer import from_parquet_file_layer
from .from_index_file_layer import from_index_file_layer
from .from_wet_file_layer import from_wet_file_layer
from .from_warc_file_layer import from_warc_file_layer

# Names re-exported by this package: the to_* writers and from_* readers
# imported above.
__all__ = [
    "to_binary_file_layer", 
    "to_line_file_layer", 
    "to_jsonl_file_layer", 
    "to_parquet_file_layer", 
    "to_index_file_layer",
    "from_binary_file_layer", 
    "from_line_file_layer", 
    "from_jsonl_file_layer", 
    "from_parquet_file_layer",
    "from_index_file_layer",
    "from_wet_file_layer", 
    "from_warc_file_layer",
]


================================================
FILE: DomainSpecific/core/layers/io/from_binary_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import util

def from_binary_file_layer(file_path, variables=dict(), STORAGE_PATH=None):
    """Read an entire file into memory as raw bytes.

    Args:
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.download_file_from_blob`` is called
            first with the resolved path as both source and destination
            (presumably fetching the file from blob storage -- confirm in util).

    Returns:
        A 1-tuple ``(data,)``; ``data`` is None if any step failed (the
        traceback is printed). Ctrl-C terminates the process.
    """
    payload = None
    try:
        real_path = util.to_real_path(file_path, variables)
        if STORAGE_PATH is not None:
            util.download_file_from_blob(STORAGE_PATH, real_path, real_path)

        with open(real_path, "rb") as stream:
            payload = stream.read()
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (payload,)


if __name__ == "__main__":
    # Manual smoke test: read a local sample file and print the returned 1-tuple.
    print(from_binary_file_layer("test.binary"))


================================================
FILE: DomainSpecific/core/layers/io/from_index_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import faiss
import traceback
import util

def from_index_file_layer(file_path, variables=dict(), STORAGE_PATH=None):
    """Load a faiss index from disk with ``faiss.read_index``.

    Args:
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.download_file_from_blob`` is called
            first with the resolved path as both source and destination.

    Returns:
        A 1-tuple ``(index,)``; ``index`` is None if any step failed (the
        traceback is printed). Ctrl-C terminates the process.
    """
    loaded = None
    try:
        real_path = util.to_real_path(file_path, variables)
        if STORAGE_PATH is not None:
            util.download_file_from_blob(STORAGE_PATH, real_path, real_path)

        loaded = faiss.read_index(real_path)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (loaded,)


if __name__ == '__main__':
    # Manual smoke test: load a local index file and print the returned 1-tuple.
    print(from_index_file_layer("index.faiss"))


================================================
FILE: DomainSpecific/core/layers/io/from_jsonl_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import jsonlines
import util

def from_jsonl_file_layer(file_path, variables=dict(), STORAGE_PATH=None):
    """Read every record of a JSON Lines file into a list.

    Args:
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.download_file_from_blob`` is called
            first with the resolved path as both source and destination.

    Returns:
        A 1-tuple ``(records,)``. If reading fails part-way, the records
        collected so far are returned and the traceback is printed.
        Ctrl-C terminates the process.
    """
    records = []
    try:
        real_path = util.to_real_path(file_path, variables)
        if STORAGE_PATH is not None:
            util.download_file_from_blob(STORAGE_PATH, real_path, real_path)

        with jsonlines.open(real_path) as reader:
            # Appends record by record, so a mid-file failure keeps the prefix.
            records.extend(reader)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (records,)


if __name__ == "__main__":
    # Manual smoke test: read a local sample file and print the returned 1-tuple.
    print(from_jsonl_file_layer("test.jsonl"))


================================================
FILE: DomainSpecific/core/layers/io/from_line_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import util

def from_line_file_layer(file_path, variables=dict(), STORAGE_PATH=None):
    """Read a text file into a list of whitespace-stripped lines.

    Args:
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.download_file_from_blob`` is called
            first with the resolved path as both source and destination.

    Returns:
        A 1-tuple ``(lines,)``. If reading fails part-way, the lines collected
        so far are returned and the traceback is printed. Ctrl-C terminates
        the process.
    """
    ret = list()
    try:
        file_path = util.to_real_path(file_path, variables)
        if STORAGE_PATH is not None:
            util.download_file_from_blob(STORAGE_PATH, file_path, file_path)

        # Use a context manager so the handle is closed deterministically,
        # including when iteration raises (e.g. on a decoding error).
        with open(file_path, "r") as f:
            for line in f:
                ret.append(line.strip())
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret,)


if __name__ == "__main__":
    # Manual smoke test: read a local sample file and print the returned 1-tuple.
    print(from_line_file_layer("test.line"))


================================================
FILE: DomainSpecific/core/layers/io/from_parquet_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import pyarrow as pa
import pyarrow.parquet as pq
import util

def from_parquet_file_layer(file_path, variables=dict(), STORAGE_PATH=None):
    """Read a Parquet file and convert it to a list of row dicts.

    Args:
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.download_file_from_blob`` is called
            first with the resolved path as both source and destination.

    Returns:
        A 1-tuple ``(rows,)`` where ``rows`` is ``Table.to_pylist()`` of the
        file, or None if any step failed (the traceback is printed).
        Ctrl-C terminates the process.
    """
    rows = None
    try:
        real_path = util.to_real_path(file_path, variables)
        if STORAGE_PATH is not None:
            util.download_file_from_blob(STORAGE_PATH, real_path, real_path)

        rows = pq.read_table(real_path).to_pylist()
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (rows,)


if __name__ == "__main__":
    # Manual smoke test: read a local sample file and print the returned 1-tuple.
    print(from_parquet_file_layer("test.parquet"))


================================================
FILE: DomainSpecific/core/layers/io/from_warc_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
from warcio.archiveiterator import ArchiveIterator
import util

def from_warc_file_layer(file_path, variables=dict(), STORAGE_PATH=None):
    """Extract the HTML responses stored in a WARC archive.

    Only records whose type is ``"response"`` and whose HTTP ``Content-Type``
    starts with ``text/html`` are kept.

    Args:
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.download_file_from_blob`` is called
            first with the resolved path as both source and destination.

    Returns:
        A 1-tuple ``(items,)``. Each item is a dict with the keys ``uri``,
        ``lang``, ``content_length`` and ``html`` (the bytes read from the
        record's content stream). ``items`` is None when the file does not
        exist or any step failed. Ctrl-C terminates the process.
    """
    result = None
    try:
        real_path = util.to_real_path(file_path, variables)
        if STORAGE_PATH is not None:
            util.download_file_from_blob(STORAGE_PATH, real_path, real_path)

        if os.path.exists(real_path):
            pages = []
            with open(real_path, "rb") as stream:
                for record in ArchiveIterator(stream, arc2warc=True):
                    if record.rec_type != "response":
                        continue
                    content_type = record.http_headers.get_header("Content-Type", "")
                    if not content_type.startswith("text/html"):
                        continue
                    headers = record.rec_headers
                    pages.append({
                        "uri": headers.get("WARC-Target-URI"),
                        "lang": headers.get("Detected-Language"),
                        "content_length": headers["Content-Length"],
                        "html": record.content_stream().read(),
                    })
            # Published only after the whole archive was read successfully.
            result = pages
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (result,)


if __name__ == "__main__":
    # Manual smoke test: parse a local sample archive and print the returned 1-tuple.
    print(from_warc_file_layer("test.warc.gz"))


================================================
FILE: DomainSpecific/core/layers/io/from_wet_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
from warcio.archiveiterator import ArchiveIterator
import util

def from_wet_file_layer(file_path, variables=dict(), STORAGE_PATH=None):
    """Extract the text records stored in a WET archive.

    Only records whose type is ``"conversion"`` are kept.

    Args:
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.download_file_from_blob`` is called
            first with the resolved path as both source and destination.

    Returns:
        A 1-tuple ``(items,)``. Each item is a dict with the keys ``uri``,
        ``lang``, ``content_length`` and ``text`` (the bytes read from the
        record's content stream). ``items`` is None when the file does not
        exist or any step failed. Ctrl-C terminates the process.
    """
    result = None
    try:
        real_path = util.to_real_path(file_path, variables)
        if STORAGE_PATH is not None:
            util.download_file_from_blob(STORAGE_PATH, real_path, real_path)

        if os.path.exists(real_path):
            documents = []
            with open(real_path, "rb") as stream:
                for record in ArchiveIterator(stream, arc2warc=False):
                    if record.rec_type != "conversion":
                        continue
                    headers = record.rec_headers
                    documents.append({
                        "uri": headers.get("WARC-Target-URI"),
                        "lang": headers.get("Detected-Language"),
                        "content_length": headers["Content-Length"],
                        "text": record.content_stream().read(),
                    })
            # Published only after the whole archive was read successfully.
            result = documents
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (result,)


if __name__ == "__main__":
    # Manual smoke test: parse a local sample archive and print the returned 1-tuple.
    print(from_wet_file_layer("test.warc.wet.gz"))


================================================
FILE: DomainSpecific/core/layers/io/to_binary_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import util

def to_binary_file_layer(bytes, file_path, variables=dict(), STORAGE_PATH=None):
    """Write a bytes payload to a file.

    Args:
        bytes: The payload to write (the name shadows the builtin but is part
            of the public signature).
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.upload_file_to_blob`` is called
            after writing, with the resolved path as both source and
            destination.

    Returns:
        A 1-tuple ``(path,)`` with the resolved path, or ``(None,)`` if any
        step failed (the traceback is printed). Ctrl-C terminates the process.
    """
    written_path = None
    try:
        real_path = util.to_real_path(file_path, variables)
        util.create_folder_by_file_path(real_path)

        with open(real_path, "wb") as sink:
            sink.write(bytes)

        if STORAGE_PATH is not None:
            util.upload_file_to_blob(STORAGE_PATH, real_path, real_path)

        written_path = real_path
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (written_path,)


if __name__ == "__main__":
    # Manual smoke test: write a tiny payload and print the returned 1-tuple.
    print(to_binary_file_layer(b"hello", "test.binary"))


================================================
FILE: DomainSpecific/core/layers/io/to_index_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import faiss
import traceback
import util

def to_index_file_layer(index, file_path, variables=dict(), STORAGE_PATH=None):
    """Persist a faiss index to disk with ``faiss.write_index``.

    Args:
        index: The index object to serialize.
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.upload_file_to_blob`` is called
            after writing, with the resolved path as both source and
            destination.

    Returns:
        A 1-tuple ``(path,)`` with the resolved path, or ``(None,)`` if any
        step failed (the traceback is printed). Ctrl-C terminates the process.
    """
    written_path = None
    try:
        real_path = util.to_real_path(file_path, variables)
        util.create_folder_by_file_path(real_path)

        faiss.write_index(index, real_path)

        if STORAGE_PATH is not None:
            util.upload_file_to_blob(STORAGE_PATH, real_path, real_path)

        written_path = real_path
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (written_path,)


if __name__ == '__main__':
    # Manual smoke test: build a small IVF index over synthetic vectors, save
    # it and print the returned 1-tuple.
    # numpy is needed only by this demo, so it is imported here rather than at
    # module level.
    import numpy as np

    D = 64
    NLIST = 100
    base_vectors = np.random.random((100000, D)).astype('float32')
    base_vectors[:, 0] += np.arange(100000) / 1000.

    quantizer = faiss.IndexFlatL2(D)
    index = faiss.IndexIVFFlat(quantizer, D, NLIST, faiss.METRIC_L2)

    # An IVF index has to be trained before vectors can be added.
    assert not index.is_trained
    index.train(base_vectors)
    assert index.is_trained
    index.add(base_vectors)

    file_path = "index.faiss"
    file_path = to_index_file_layer(index, file_path)
    print(file_path)


================================================
FILE: DomainSpecific/core/layers/io/to_jsonl_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import jsonlines
import util

def to_jsonl_file_layer(data, file_path, variables=dict(), STORAGE_PATH=None):
    """Write an iterable of records to a JSON Lines file.

    Args:
        data: Records passed to ``jsonlines`` ``Writer.write_all``.
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.upload_file_to_blob`` is called
            after writing, with the resolved path as both source and
            destination.

    Returns:
        A 1-tuple ``(path,)`` with the resolved path, or ``(None,)`` if any
        step failed (the traceback is printed). Ctrl-C terminates the process.
    """
    written_path = None
    try:
        real_path = util.to_real_path(file_path, variables)
        util.create_folder_by_file_path(real_path)

        with jsonlines.open(real_path, "w") as sink:
            sink.write_all(data)

        if STORAGE_PATH is not None:
            util.upload_file_to_blob(STORAGE_PATH, real_path, real_path)

        written_path = real_path
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (written_path,)


if __name__ == "__main__":
    # Manual smoke test: write two sample records and print the returned 1-tuple.
    sample_rows = [{'id': "1", 'html': "hello"}, {'id': "2", 'html': "hi"}]
    print(to_jsonl_file_layer(sample_rows, "test.jsonl"))


================================================
FILE: DomainSpecific/core/layers/io/to_line_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import util

def to_line_file_layer(lines, file_path, variables=dict(), STORAGE_PATH=None):
    """Write strings to a text file, one per line.

    Args:
        lines: Iterable of strings; a newline is appended to each.
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.upload_file_to_blob`` is called
            after writing, with the resolved path as both source and
            destination.

    Returns:
        A 1-tuple ``(path,)`` with the resolved path, or ``(None,)`` if any
        step failed (the traceback is printed). Ctrl-C terminates the process.
    """
    written_path = None
    try:
        real_path = util.to_real_path(file_path, variables)
        util.create_folder_by_file_path(real_path)

        with open(real_path, "w") as sink:
            sink.writelines(entry + "\n" for entry in lines)

        if STORAGE_PATH is not None:
            util.upload_file_to_blob(STORAGE_PATH, real_path, real_path)

        written_path = real_path
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (written_path,)


if __name__ == "__main__":
    # Manual smoke test: write two sample lines and print the returned 1-tuple.
    print(to_line_file_layer(["line1", "line2"], "test.line"))


================================================
FILE: DomainSpecific/core/layers/io/to_parquet_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import pyarrow as pa
import pyarrow.parquet as pq
import util

def to_parquet_file_layer(data, file_path, variables=dict(), STORAGE_PATH=None):
    """Write a list of row dicts to a Parquet file.

    Args:
        data: Rows passed to ``pyarrow.Table.from_pylist``.
        file_path: Path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: When not None, ``util.upload_file_to_blob`` is called
            after writing, with the resolved path as both source and
            destination.

    Returns:
        A 1-tuple ``(path,)`` with the resolved path, or ``(None,)`` if any
        step failed (the traceback is printed). Ctrl-C terminates the process.
    """
    written_path = None
    try:
        real_path = util.to_real_path(file_path, variables)
        util.create_folder_by_file_path(real_path)

        pq.write_table(pa.Table.from_pylist(data), real_path)

        if STORAGE_PATH is not None:
            util.upload_file_to_blob(STORAGE_PATH, real_path, real_path)

        written_path = real_path
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (written_path,)


if __name__ == "__main__":
    # Manual smoke test: write two sample rows and print the returned 1-tuple.
    sample_rows = [{'id': "1", 'html': "hello"}, {'id': "2", 'html': "hi"}]
    print(to_parquet_file_layer(sample_rows, "test.parquet"))


================================================
FILE: DomainSpecific/core/layers/network/__init__.py
================================================
# Network - download/upload
from .upload_file_to_blob_layer import upload_file_to_blob_layer
from .upload_bytes_to_blob_layer import upload_bytes_to_blob_layer
from .download_file_from_blob_layer import download_file_from_blob_layer
from .download_bytes_from_blob_layer import download_bytes_from_blob_layer
from .download_file_from_internet_layer import download_file_from_internet_layer
from .download_bytes_from_internet_layer import download_bytes_from_internet_layer
from .download_url_list_layer import download_url_list_layer
from .download_warc_file_layer import download_warc_file_layer
from .download_warc_indice_layer import download_warc_indice_layer
from .download_urls_from_website_layer import download_urls_from_website_layer
from .download_starcoder_layer import download_starcoder_layer

# Names re-exported by this package: the upload/download layers imported above.
__all__ = [
    "upload_file_to_blob_layer",
    "upload_bytes_to_blob_layer",
    "download_file_from_blob_layer", 
    "download_bytes_from_blob_layer", 
    "download_file_from_internet_layer", 
    "download_bytes_from_internet_layer", 
    "download_url_list_layer", 
    "download_warc_file_layer", 
    "download_warc_indice_layer", 
    "download_urls_from_website_layer", 
    "download_starcoder_layer", 
]


================================================
FILE: DomainSpecific/core/layers/network/download_bytes_from_blob_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import traceback
import util

def download_bytes_from_blob_layer(blob_path, variables=dict(), STORAGE_PATH=None, TRIES=1):
    """Download a blob's content into memory, retrying on failure.

    Args:
        blob_path: Blob path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        STORAGE_PATH: Path of an existing YAML file loaded with
            ``util.load_yaml`` and passed to ``util.download_bytes_from_blob``.
        TRIES: Maximum number of attempts; one second is slept after each
            failed attempt.

    Returns:
        ``(file_name, bytes, None)`` on success, where ``file_name`` is
        ``util.md5(blob_path) + util.suffix(blob_path)``; otherwise
        ``(None, None, blob_path)`` with the blob path as given by the caller.
        Ctrl-C terminates the process.
    """
    ret = (None, None, blob_path)
    try:
        for _ in range(TRIES):
            try:
                assert STORAGE_PATH is not None and os.path.exists(STORAGE_PATH)
                storage_config = util.load_yaml(STORAGE_PATH)
                blob_path = util.to_real_path(blob_path, variables)
                file_name = util.md5(blob_path) + util.suffix(blob_path)
                bytes = util.download_bytes_from_blob(storage_config, blob_path)
                ret = (file_name, bytes, None)
                break
            except Exception:
                # Best-effort retry. Catching Exception (not a bare except)
                # lets KeyboardInterrupt/SystemExit reach the outer handler
                # instead of being swallowed and retried.
                time.sleep(1)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return ret


if __name__ == '__main__':
    # Manual smoke test: fetch one blob and print the returned tuple.
    result = download_bytes_from_blob_layer(
        "$(azure_blob_path)",
        STORAGE_PATH="resources/environment/llmstore.yaml",
    )
    print(result)


================================================
FILE: DomainSpecific/core/layers/network/download_bytes_from_internet_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import traceback
import util

def download_bytes_from_internet_layer(url, variables=dict(), TRIES=1):
    """Download a URL's content into memory, retrying on failure.

    Args:
        url: URL template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        TRIES: Maximum number of attempts; one second is slept after each
            failed attempt.

    Returns:
        ``(file_name, bytes, None)`` on success, where ``file_name`` is
        ``util.md5(url) + util.suffix(url)``; otherwise ``(None, None, url)``
        with the URL as given by the caller. Ctrl-C terminates the process.
    """
    ret = (None, None, url)
    try:
        for _ in range(TRIES):
            try:
                url = util.to_real_path(url, variables)
                file_name = util.md5(url) + util.suffix(url)
                bytes = util.download_bytes_from_internet(url)
                ret = (file_name, bytes, None)
                break
            except Exception:
                # Best-effort retry. Catching Exception (not a bare except)
                # lets KeyboardInterrupt/SystemExit reach the outer handler
                # instead of being swallowed and retried.
                time.sleep(1)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return ret


if __name__ == '__main__':
    # Manual smoke test: fetch one public file and print the returned tuple.
    sample_url = "https://upload.wikimedia.org/wikipedia/commons/4/4f/SVG_Logo.svg"
    print(download_bytes_from_internet_layer(sample_url))


================================================
FILE: DomainSpecific/core/layers/network/download_file_from_blob_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import traceback
import util

def download_file_from_blob_layer(blob_path, variables=dict(), DOWNLOAD_PATH=".", STORAGE_PATH=None, TRIES=1):
    """Download a blob to a local file, retrying on failure.

    The local file is named ``util.md5(blob_path) + util.suffix(blob_path)``
    and placed under ``DOWNLOAD_PATH``.

    Args:
        blob_path: Blob path template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        DOWNLOAD_PATH: Destination folder template (also resolved with
            ``util.to_real_path`` once joined with the file name).
        STORAGE_PATH: Path of an existing YAML file loaded with
            ``util.load_yaml`` and passed to ``util.download_file_from_blob``.
        TRIES: Maximum number of attempts; one second is slept after each
            failed attempt.

    Returns:
        ``(file_path, None)`` on success, otherwise ``(None, blob_path)`` with
        the blob path as given by the caller. Ctrl-C terminates the process.
    """
    ret = (None, blob_path)
    try:
        for _ in range(TRIES):
            try:
                assert STORAGE_PATH is not None and os.path.exists(STORAGE_PATH)
                storage_config = util.load_yaml(STORAGE_PATH)
                blob_path = util.to_real_path(blob_path, variables)
                file_name = util.md5(blob_path) + util.suffix(blob_path)
                file_path = os.path.join(DOWNLOAD_PATH, file_name)
                file_path = util.to_real_path(file_path, variables)
                util.download_file_from_blob(storage_config, blob_path, file_path)
                ret = (file_path, None)
                break
            except Exception:
                # Best-effort retry. Catching Exception (not a bare except)
                # lets KeyboardInterrupt/SystemExit reach the outer handler
                # instead of being swallowed and retried.
                time.sleep(1)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return ret


if __name__ == '__main__':
    # Manual smoke test: fetch one blob to a local folder and print the returned tuple.
    result = download_file_from_blob_layer(
        "$(azure_blob_path)",
        DOWNLOAD_PATH="$(local_folder_path)",
        STORAGE_PATH="resources/environment/llmstore.yaml",
    )
    print(result)


================================================
FILE: DomainSpecific/core/layers/network/download_file_from_internet_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import traceback
import util

def download_file_from_internet_layer(url, variables=dict(), DOWNLOAD_PATH=".", TRIES=1):
    """Download a URL to a local file, retrying on failure.

    The local file is named ``util.md5(url) + util.suffix(url)`` and placed
    under ``DOWNLOAD_PATH``.

    Args:
        url: URL template, resolved with ``util.to_real_path``.
        variables: Mapping handed to ``util.to_real_path``.
        DOWNLOAD_PATH: Destination folder template (also resolved with
            ``util.to_real_path`` once joined with the file name).
        TRIES: Maximum number of attempts; one second is slept after each
            failed attempt.

    Returns:
        ``(file_path, None)`` on success, otherwise ``(None, url)`` with the
        URL as given by the caller. Ctrl-C terminates the process.
    """
    ret = (None, url)
    try:
        for _ in range(TRIES):
            try:
                url = util.to_real_path(url, variables)
                file_name = util.md5(url) + util.suffix(url)
                file_path = os.path.join(DOWNLOAD_PATH, file_name)
                file_path = util.to_real_path(file_path, variables)
                util.download_file_from_internet(url, file_path)
                ret = (file_path, None)
                break
            except Exception:
                # Best-effort retry. Catching Exception (not a bare except)
                # lets KeyboardInterrupt/SystemExit reach the outer handler
                # instead of being swallowed and retried.
                time.sleep(1)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return ret


if __name__ == '__main__':
    # Manual smoke test: fetch one public file to a local folder and print the returned tuple.
    sample_url = "https://upload.wikimedia.org/wikipedia/commons/4/4f/SVG_Logo.svg"
    print(download_file_from_internet_layer(sample_url, DOWNLOAD_PATH="$(local_folder_path)"))


================================================
FILE: DomainSpecific/core/layers/network/download_starcoder_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import json
from datetime import datetime
import boto3
from botocore import UNSIGNED
from botocore.config import Config
import smart_open
from datasets import load_dataset
import util

# Module-wide S3 client configured for unsigned (anonymous) requests.
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

def download_contents(blob_id, src_encoding):
    """Fetch one gzip-compressed blob from the ``softwareheritage`` bucket.

    Args:
        blob_id: Key suffix of the object under ``content/``.
        src_encoding: Text encoding used to decode the decompressed bytes.

    Returns:
        The decoded file content as a string.
    """
    location = f"s3://softwareheritage/content/{blob_id}"
    transport = {"client": s3}
    with smart_open.open(location, "rb", compression=".gz", transport_params=transport) as handle:
        raw = handle.read()
    return raw.decode(src_encoding)

def download_starcoder_layer(data_repo, variables=dict(), OUTPUT_FOLDER="./", STORAGE_PATH=None, HUGGINGFACE_TOKEN=None):
    """Materialize this worker's share of a metadata dataset as code files.

    The dataset is streamed and sharded by row index
    (``idx % worker_num == worker_id``). For every row the blob content is
    fetched with ``download_contents`` and written, together with a JSON dump
    of the row, under
    ``<output>/<snapshot_id[:4]>/<repo_name>/<branch_name>/<blob_id>[.json]``
    (``/`` in repo and branch names is replaced by ``@``).

    Args:
        data_repo: Dataset path/repo template, resolved with
            ``util.to_real_path`` and passed to ``datasets.load_dataset``.
        variables: Must contain ``worker_id`` and ``worker_num``; also handed
            to ``util.to_real_path``.
        OUTPUT_FOLDER: Output folder template.
        STORAGE_PATH: When not None, a YAML file loaded with ``util.load_yaml``;
            each written file is then uploaded with ``util.upload_file_to_blob``
            and the local copies are removed.
        HUGGINGFACE_TOKEN: Token forwarded to ``load_dataset``.

    Returns:
        A 1-tuple with the number of rows processed (rows whose write/upload
        failed are still counted), or ``(0,)`` if the run aborted.
        Ctrl-C terminates the process.
    """
    ret = 0
    try:
        worker_id = variables["worker_id"]
        worker_num = variables["worker_num"]
        data_repo = util.to_real_path(data_repo, variables)
        output_folder = util.to_real_path(OUTPUT_FOLDER, variables)
        if STORAGE_PATH is not None:
            storage_config = util.load_yaml(STORAGE_PATH)

        ds = load_dataset(data_repo, split="train", streaming=True, token=HUGGINGFACE_TOKEN, cache_dir=f"./cache.{worker_id}/")
        ds = ds.filter(lambda row, idx: idx % worker_num == worker_id, with_indices=True)

        item_count = 0
        for row in ds:
            # datetime values are not JSON serializable; store them as POSIX
            # timestamps so the row can be dumped below.
            for key in row.keys():
                if isinstance(row[key], datetime):
                    row[key] = datetime.timestamp(row[key])

            blob_id = row["blob_id"]
            src_encoding = row["src_encoding"]
            snapshot_prefix = row["snapshot_id"][:4]
            repo_name = row["repo_name"].replace("/", "@")
            branch_name = row["branch_name"].replace("/", "@")

            # A failure here is not caught per row: it aborts the whole run.
            content = download_contents(blob_id, src_encoding)

            code_path = os.path.join(output_folder, snapshot_prefix, repo_name, branch_name, blob_id)
            metadata_path = os.path.join(output_folder, snapshot_prefix, repo_name, branch_name, blob_id + ".json")

            try:
                util.create_folder_by_file_path(code_path)
                # NOTE(review): written with the platform default encoding,
                # not src_encoding -- confirm this is intended.
                with open(code_path, "w") as f:
                    f.write(content)
                if STORAGE_PATH is not None:
                    util.upload_file_to_blob(storage_config, code_path, code_path)

                util.create_folder_by_file_path(metadata_path)
                with open(metadata_path, "w") as f:
                    f.write(json.dumps(row, indent=4) + "\n")
                if STORAGE_PATH is not None:
                    util.upload_file_to_blob(storage_config, metadata_path, metadata_path)

                if STORAGE_PATH is not None:
                    # Local copies are only staging files once uploaded.
                    try:
                        os.remove(code_path)
                        os.remove(metadata_path)
                    except OSError:
                        pass
            except Exception:
                # Per-row failures are logged and skipped. Catching Exception
                # (not a bare except) keeps Ctrl-C able to stop the run.
                traceback.print_exc()

            item_count += 1

        ret = item_count
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret,)


if __name__ == '__main__':
    # Manual smoke test: run as the only worker and print the returned 1-tuple.
    result = download_starcoder_layer(
        "$(local_the_stack_v2_dedup_metadata_path)",
        variables={"workspace_dir": r"workspace", "worker_id": 0, "worker_num": 1},
        OUTPUT_FOLDER="$(local_the_stack_v2_dedup_data_path)",
        STORAGE_PATH="resources/storage/llmstore.yaml",
        HUGGINGFACE_TOKEN=None,
    )
    print(result)


================================================
FILE: DomainSpecific/core/layers/network/download_url_list_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import gzip
import json
import requests
import traceback

def download_url_list_layer(index_url, variables=dict(), FILTER_SUFFIXES=(), TRIES=1):
    """Extract URLs with selected file suffixes from a gzip-compressed index.

    Each line of the index is expected to carry a JSON object after the first
    ``" {"``; the object's ``url`` field is kept when its extension
    (``os.path.splitext``) is in ``FILTER_SUFFIXES``.

    Args:
        index_url: URL of the gzip-compressed index file.
        variables: Accepted but not used by this function.
        FILTER_SUFFIXES: Collection of extensions to keep, e.g. ``(".svg",)``.
        TRIES: Maximum number of attempts; one second is slept after each
            failed attempt.

    Returns:
        ``(urls, failed)`` where ``failed`` is ``[index_url]`` whenever
        ``urls`` is empty (download failure or no matching URL) and ``[]``
        otherwise. Ctrl-C terminates the process.
    """
    ret = list()
    try:
        for _ in range(TRIES):
            try:
                resp = requests.get(index_url, stream=True)
                urls = list()
                with gzip.open(resp.raw, 'rt') as f:
                    # Stream line by line instead of loading the whole index.
                    for line in f:
                        # Split only once: a " {" inside the JSON payload must
                        # not truncate the object.
                        text = "{" + line.strip().split(" {", 1)[1]
                        item = json.loads(text)
                        url = item["url"]
                        suffix = os.path.splitext(url)[1]
                        if suffix in FILTER_SUFFIXES:
                            urls.append(url)
                # Published only after the whole index parsed successfully.
                ret = urls
                break
            except Exception:
                # Best-effort retry. Catching Exception (not a bare except)
                # lets KeyboardInterrupt/SystemExit reach the outer handler.
                time.sleep(1)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret, [index_url] if len(ret) == 0 else [])


if __name__ == '__main__':
    # Manual smoke test: list the .svg URLs of one index shard and print the returned tuple.
    shard_url = "https://data.commoncrawl.org/cc-index/collections/CC-MAIN-2023-23/indexes/cdx-00000.gz"
    print(download_url_list_layer(shard_url, FILTER_SUFFIXES=(".svg",)))


================================================
FILE: DomainSpecific/core/layers/network/download_urls_from_website_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import requests
import logging
import traceback
import xml.etree.ElementTree as ET

def download_urls_from_website_layer(website_url, variables=dict(), FILTER=None):
    """Collect the URLs listed in a site's XML sitemaps.

    Fetches ``<website_url>/robots.txt`` and keeps the ``Sitemap:`` entries
    that appear while the current ``User-agent`` group is ``*``, end with
    ``.xml`` and (if given) contain ``FILTER``. Each sitemap is then downloaded
    and the text of the first child of every top-level element is collected.

    Args:
        website_url: Site root; a trailing slash is tolerated.
        variables: Accepted but not used by this function.
        FILTER: Optional substring a sitemap URL must contain to be used.

    Returns:
        A 1-tuple holding the list of distinct URLs found (empty on failure;
        the traceback is printed). Ctrl-C terminates the process.
    """
    ret = list()
    try:
        # Avoid "//robots.txt" when the caller passes a root URL ending in "/".
        robot_url = website_url.rstrip("/") + "/robots.txt"
        # NOTE: process-wide effect -- suppresses log records at WARNING level
        # and below for every logger.
        logging.disable(logging.WARNING)

        # get sitemap.
        xml_urls = list()
        resp = requests.get(robot_url)
        crawler = None
        for line in resp.text.split("\n"):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            if line.startswith("User-agent:"):
                crawler = line.split(":")[-1].strip()
                continue

            # Only the wildcard group is honoured, and only its Sitemap lines
            # are used; Allow/Disallow rules are ignored.
            if crawler != "*" or not line.startswith("Sitemap:"):
                continue
            xml_url = line.replace("Sitemap:", "").strip()
            if (FILTER is None or FILTER in xml_url) and xml_url.endswith(".xml"):
                xml_urls.append(xml_url)

        # get urls.
        html_urls = set()
        for xml_url in xml_urls:
            try:
                resp = requests.get(xml_url)
                root = ET.fromstring(resp.content)
                for sitemap in root:
                    html_url = list(sitemap)[0].text
                    html_urls.add(html_url)
            except Exception:
                # Best effort: an unreachable or malformed sitemap is skipped;
                # URLs already collected from it are kept. Catching Exception
                # (not a bare except) keeps Ctrl-C able to stop the run.
                pass

        ret = list(html_urls)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret,)


if __name__ == '__main__':
    # Manual smoke test: print the first URL found in the "math" sitemaps.
    (found_urls,) = download_urls_from_website_layer("https://byjus.com/", FILTER="math")
    print(found_urls[0])


================================================
FILE: DomainSpecific/core/layers/network/download_warc_file_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import traceback
import util

def download_warc_file_layer(warc_url, variables=dict(), DOWNLOAD_FOLDER="./", CONNECTS=16, TRIES=1, OVERWRITE=False):
    """Download a Common Crawl WARC file with the external ``axel`` tool.

    The local file name is ``<third-from-last URL segment>_<basename>`` and is
    placed under ``DOWNLOAD_FOLDER``.

    Args:
        warc_url: Full ``https://`` URL, or a path that is prefixed with
            ``https://data.commoncrawl.org/``.
        variables: Mapping handed to ``util.to_real_path``.
        DOWNLOAD_FOLDER: Destination folder template.
        CONNECTS: Number of connections passed to ``axel -n``.
        TRIES: Maximum number of download attempts; one second is slept after
            each failed attempt.
        OVERWRITE: When False, a file that already exists before the first
            attempt is treated as downloaded and ``axel`` is not run.

    Returns:
        ``(warc_name, None)`` on success (the bare file name, not the full
        path), otherwise ``(None, warc_url)``. Ctrl-C terminates the process.
    """
    # Local import: only this function needs it.
    import subprocess

    ret = (None, warc_url)
    try:
        if not warc_url.startswith("https://"):
            warc_url = "https://data.commoncrawl.org/" + warc_url
        warc_name = warc_url.split("/")[-3] + "_" + os.path.basename(warc_url)
        warc_path = os.path.join(DOWNLOAD_FOLDER, warc_name)
        warc_path = util.to_real_path(warc_path, variables)

        # Decide once, before any attempt: a partial file left behind by a
        # failed attempt must not make a later retry look like a success.
        need_download = OVERWRITE or not os.path.exists(warc_path)
        exit_status = 1 if need_download else 0

        if need_download:
            for _ in range(TRIES):
                util.create_folder_by_file_path(warc_path)
                # Argument list instead of a shell string, so paths and URLs
                # with spaces or shell metacharacters are passed verbatim.
                command = ["axel", "-q", "-n", str(CONNECTS), "-o", warc_path, warc_url]
                exit_status = subprocess.run(command).returncode
                if exit_status == 0:
                    break
                time.sleep(1)

        if exit_status == 0:
            ret = (warc_name, None)
        else:
            ret = (None, warc_url)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return ret


if __name__ == '__main__':
    # Manual smoke test: download one WARC file and print the (success, failed) pair.
    sample_url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-49/segments/1669446706285.92/warc/CC-MAIN-20221126080725-20221126110725-00000.warc.gz"
    downloaded, failed = download_warc_file_layer(sample_url, DOWNLOAD_FOLDER="$(local_folder_path)")
    print(downloaded, failed)


================================================
FILE: DomainSpecific/core/layers/network/download_warc_indice_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import gzip
import requests
import traceback

def download_warc_indice_layer(index_url, variables=dict(), TRIES=1, URL_PREFIX="https://data.commoncrawl.org/", TIMEOUT=None):
    """Download a gzip-compressed path listing and expand it into URLs.

    Args:
        index_url: URL of the gzip file; each non-blank line is one path.
        variables: Shared variables dictionary (unused here).
        TRIES: Maximum number of download attempts.
        URL_PREFIX: String prepended to every stripped line.
        TIMEOUT: Forwarded to ``requests.get(timeout=...)``; ``None`` keeps
            the previous behaviour of waiting indefinitely.

    Returns:
        ``(urls, failed)``: ``failed`` is ``[index_url]`` when no URL was
        obtained (including an empty listing), otherwise ``[]``.
    """
    ret = list()
    try:
        for attempt in range(TRIES):
            try:
                # The context manager releases the streamed connection even
                # when decompression fails half-way.
                with requests.get(index_url, stream=True, timeout=TIMEOUT) as resp:
                    # Fail fast on HTTP errors instead of gunzipping an error page.
                    resp.raise_for_status()
                    with gzip.open(resp.raw, 'rt') as f:
                        # Skip blank lines so they do not turn into a bare prefix URL.
                        urls = [URL_PREFIX + line.strip() for line in f if line.strip()]
                ret = urls
                break
            except Exception:
                # Narrower than a bare except so Ctrl-C reaches the outer handler.
                traceback.print_exc()
                if attempt + 1 < TRIES:
                    time.sleep(1)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret, [index_url] if len(ret) == 0 else [])


if __name__ == '__main__':
    # Manual smoke test: print the first WARC URL of one crawl's path listing.
    index_url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-49/warc.paths.gz"
    warc_urls, failed_urls = download_warc_indice_layer(index_url)
    if warc_urls:
        print(warc_urls[0])
    else:
        # The layer returns an empty URL list on failure; indexing it would raise.
        print(f"failed to download: {failed_urls}")


================================================
FILE: DomainSpecific/core/layers/network/upload_bytes_to_blob_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import traceback
import util

def upload_bytes_to_blob_layer(bytes, blob_path, variables=dict(), STORAGE_PATH=None, BLOB_PREFIX="", TRIES=1):
    """Upload an in-memory payload to blob storage via ``util.upload_bytes_to_blob``.

    Args:
        bytes: Payload to upload.
        blob_path: Destination path, joined onto ``BLOB_PREFIX``.
        variables: Shared variables dictionary, forwarded to ``util.to_real_path``.
        STORAGE_PATH: Path of the YAML storage configuration; must exist.
        BLOB_PREFIX: Prefix joined in front of ``blob_path``.
        TRIES: Maximum number of upload attempts.

    Returns:
        ``(resolved_blob_path, None)`` on success, ``(None, blob_path)`` (the
        path as passed in) on failure.
    """
    ret = (None, blob_path)
    try:
        # Explicit check instead of `assert`: asserts vanish under `python -O`,
        # and a missing config cannot be cured by retrying.
        if STORAGE_PATH is None or not os.path.exists(STORAGE_PATH):
            raise FileNotFoundError(f"storage config not found: {STORAGE_PATH}")
        storage_config = util.load_yaml(STORAGE_PATH)

        # Resolve the destination once, into its own name.  Re-assigning
        # `blob_path` inside the retry loop would prepend BLOB_PREFIX again
        # on every further attempt.
        real_blob_path = util.to_real_path(os.path.join(BLOB_PREFIX, blob_path), variables)

        for attempt in range(TRIES):
            try:
                util.upload_bytes_to_blob(storage_config, bytes, real_blob_path)
                ret = (real_blob_path, None)
                break
            except Exception:
                # Narrower than a bare except so Ctrl-C reaches the outer handler.
                traceback.print_exc()
                if attempt + 1 < TRIES:
                    time.sleep(1)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return ret


if __name__ == '__main__':
    # Manual smoke test: upload a tiny payload and print the result tuple.
    payload = b"hello"
    result = upload_bytes_to_blob_layer(
        payload,
        "$(azure_blob_path)",
        STORAGE_PATH="resources/environment/llmstore.yaml",
    )
    print(result)


================================================
FILE: DomainSpecific/core/layers/network/upload_file_to_blob_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import traceback
import util

def upload_file_to_blob_layer(file_path, blob_path, variables=dict(), STORAGE_PATH=None, BLOB_PREFIX="", TRIES=1):
    """Upload a local file to blob storage via ``util.upload_file_to_blob``.

    Args:
        file_path: Local source path (resolved through ``util.to_real_path``).
        blob_path: Destination path, joined onto ``BLOB_PREFIX``.
        variables: Shared variables dictionary, forwarded to ``util.to_real_path``.
        STORAGE_PATH: Path of the YAML storage configuration; must exist.
        BLOB_PREFIX: Prefix joined in front of ``blob_path``.
        TRIES: Maximum number of upload attempts.

    Returns:
        ``(resolved_blob_path, None)`` on success, ``(None, blob_path)`` (the
        path as passed in) on failure.
    """
    ret = (None, blob_path)
    try:
        # Explicit check instead of `assert`: asserts vanish under `python -O`,
        # and a missing config cannot be cured by retrying.
        if STORAGE_PATH is None or not os.path.exists(STORAGE_PATH):
            raise FileNotFoundError(f"storage config not found: {STORAGE_PATH}")
        storage_config = util.load_yaml(STORAGE_PATH)

        # Resolve both paths once, into their own names.  Re-assigning
        # `blob_path` inside the retry loop would prepend BLOB_PREFIX again
        # on every further attempt.
        real_file_path = util.to_real_path(file_path, variables)
        real_blob_path = util.to_real_path(os.path.join(BLOB_PREFIX, blob_path), variables)

        for attempt in range(TRIES):
            try:
                util.upload_file_to_blob(storage_config, real_file_path, real_blob_path)
                ret = (real_blob_path, None)
                break
            except Exception:
                # Narrower than a bare except so Ctrl-C reaches the outer handler.
                traceback.print_exc()
                if attempt + 1 < TRIES:
                    time.sleep(1)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return ret


if __name__ == '__main__':
    # Manual smoke test: upload one local file and print the result tuple.
    result = upload_file_to_blob_layer(
        "$(local_file_path)",
        "$(azure_blob_path)",
        STORAGE_PATH="resources/environment/llmstore.yaml",
    )
    print(result)


================================================
FILE: DomainSpecific/core/layers/template_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import sys
import traceback

# Spec for adding a new layer:
# 1. The layer function must be registered in the __init__.py file of the current folder.
# 2. The layer function must return a tuple, even when there is nothing to return.
# 3. The layer function must accept a "variables" parameter (a dictionary) for access to globally shared variables.
# 4. Prefer to implement a unit test and put it in the "__main__" block.
# 5. Prefer to wrap the function logic in exception handling.
# 6. Prefer function names that end with "_layer".
# 7. Prefer to document the function's purpose, inputs and outputs in comments.
# 8. Prefer lowercase names for input data arguments.
# 9. Prefer uppercase names for input parameters.

def template_layer(input, variables=dict(), PARAM=None):
    """Skeleton layer: hand ``input`` back unchanged, wrapped in a 1-tuple.

    ``variables`` and ``PARAM`` are accepted but not used; they only show the
    expected signature shape for real layers.
    """
    result = None
    try:
        # Real layers put their processing here.
        result = input
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (result,)


if __name__ == "__main__":
    # Minimal smoke test: the template simply echoes its input.
    output = template_layer(None)


================================================
FILE: DomainSpecific/core/layers/transform/__init__.py
================================================
# Transform
from .tokenize_article_layer import tokenize_article_layer
from .ngrams_layer import ngrams_layer
from .minhash_tokens_layer import minhash_tokens_layer
from .lsh_minhash_layer import lsh_minhash_layer
from .warc_filter_layer import warc_filter_layer
from .warc_encode_layer import warc_encode_layer
from .warc_to_wet_layer import warc_to_wet_layer
from .wet_decode_layer import wet_decode_layer
from .math_filter_layer import math_filter_layer
from .openquestion_filter_layer import openquestion_filter_layer
from .mcq_filter_layer import mcq_filter_layer

# Public API of the transform package: one entry per layer function imported
# above, listed in the same order as the imports.
__all__ = [
    "tokenize_article_layer", 
    "ngrams_layer", 
    "minhash_tokens_layer", 
    "lsh_minhash_layer", 
    "warc_filter_layer", 
    "warc_encode_layer", 
    "warc_to_wet_layer", 
    "wet_decode_layer", 
    "math_filter_layer",
    "openquestion_filter_layer",
    "mcq_filter_layer",
]


================================================
FILE: DomainSpecific/core/layers/transform/lsh_minhash_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import time
import traceback
import numpy as np
from scipy.integrate import quad as integrate

# Unlike datasketch's implementation, hash values here range up to 2^61-1
# instead of 2^32-1.
NUM_PERM = 256
LSH_THRESHOLD = 0.8


class LSH:
    """Split a MinHash signature into LSH bands.

    The band layout (``hashranges``) is derived once in ``__init__`` from
    ``LSH_THRESHOLD`` and ``NUM_PERM`` with equal false-positive and
    false-negative weights.  The parameter search follows
    https://github.com/ekzhu/datasketch/blob/44077457d32887a91297f15c3efee2c1982f690e/datasketch/lsh.py
    """

    def __init__(self):
        band_count, band_width = self.optimal_param(LSH_THRESHOLD, NUM_PERM, 0.5, 0.5)
        # Half-open [start, end) slice bounds, one pair per band.
        self.hashranges = [
            (band * band_width, (band + 1) * band_width)
            for band in range(band_count)
        ]

    def false_positive_probability(self, threshold, b, r):
        """Integrate ``1 - (1 - s^r)^b`` over ``s`` in ``[0, threshold]``."""
        def candidate_probability(s):
            return 1 - (1 - s**float(r))**float(b)

        area, _ = integrate(candidate_probability, 0.0, threshold)
        return area

    def false_negative_probability(self, threshold, b, r):
        """Integrate ``1 - (1 - (1 - s^r)^b)`` over ``s`` in ``[threshold, 1]``."""
        def miss_probability(s):
            return 1 - (1 - (1 - s**float(r))**float(b))

        area, _ = integrate(miss_probability, threshold, 1.0)
        return area

    def optimal_param(self, threshold, num_perm, false_positive_weight,
            false_negative_weight):
        """Return the ``(b, r)`` pair with the smallest weighted error.

        Every ``b`` in ``1..num_perm`` is combined with every ``r`` such that
        ``b * r <= num_perm``; the error is the weighted sum of the
        false-positive and false-negative integrals.  On ties the first pair
        encountered wins.
        """
        best_error = float("inf")
        best_pair = (0, 0)
        for b in range(1, num_perm + 1):
            for r in range(1, int(num_perm / b) + 1):
                fp = self.false_positive_probability(threshold, b, r)
                fn = self.false_negative_probability(threshold, b, r)
                error = fp*false_positive_weight + fn*false_negative_weight
                if error < best_error:
                    best_error, best_pair = error, (b, r)
        return best_pair

    def gen_lsh(self, minhash):
        """Return one ``bytearray`` per band, built from ``minhash[start:end]``."""
        bands = []
        for start, end in self.hashranges:
            bands.append(bytearray(minhash[start:end]))
        return bands

# Module-level instance used by lsh_minhash_layer.  Constructing it runs
# LSH.optimal_param for NUM_PERM permutations once, at import time.
lsh = LSH()

def lsh_minhash_layer(minhash, variables=dict()):
    """Turn a MinHash signature into a list of LSH band keys.

    The signature is converted to a ``uint64`` array and split by the
    module-level ``lsh`` instance.  Each key is the band index followed by
    ``_`` and the band's raw bytes.  ``variables`` is unused.  On error the
    traceback is printed and the keys collected so far are returned.
    """
    keys = list()
    try:
        signature = np.array(minhash, dtype=np.uint64)
        bands = lsh.gen_lsh(signature)
        # bytes + bytearray yields bytes, so every key is an immutable bytes object.
        keys.extend(b"%d_" % index + band for index, band in enumerate(bands))
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return keys


if __name__ == "__main__":
    minhash = [2170239837623632,1287605064391826,7877338491737559,1522708576701298,1959803855170230,136353893425081,3067530819312822,19822079906565762,14191953696745176,371933081470560,2359093478290026,24211742396711177,5207401883495830,3386445753675098,6482843287028185,14956790165792002,7760994632330526,3801562091963312,654119844389846,6118541550243605,1058268864309841,19648312785892006,5519054639081138,17769255728697304,1326859272534844,6541616202650748,11131462447891679,11540424367241221,6416091255362971,1178274890175074,9516296843449206,5019313649584786,556043434180166,3170749841321737,788403856226243,16256424180717928,11536645058081246,13331271075979702,5603975614240490,11332978618315755,49833277925775,28529817665769800,5399529123965416,5804862109442032,10516842515700528,1383775130067327,9593857895450592,344120332429946,3650720026287843,4927677784872807,3114522307389328,1054088699310940,11453703275676121,17145094372333782,11943406601641085,429519913626747,3559765888081715,6380853683568781,13142954055708448,1122751140539670,7679037943867431,23532369906879837,4460946791673399,6284691595180437,5534632051525650,4326069154983305,6645880540672905,1199004738171304,2741143312089611,3315947713975755,33325056362165,17905224452748795,11081894870845940,2429362824597352,8796539339687473,17606225237179401,2406479086961618,25285711888782525,1847958183256316,4198878926995358,5057832224878357,10146090240130753,2413082792037196,3530471135853536,7672611456084586,2230458118023706,9790058494528486,3351632677682193,6902744571969727,4063006572456150,2761280786272613,6242978327908865,26924233559187524,2214283527827093,951652422014210,1577851399523074,282734099627651,4284321096276342,1571021659718705,2064444079057042,25995837896147107,3642452037001290,615591136529782,2579917399379439,10350113780305730,141093940432428,9292013714641581,16926413460125,4351013271280123,4492914008491347,3885988895709230,3643655265951773,4028855757933683,10480484972551973,2399277677842610,391439629014342,451
1050103292841,13930059233224697,10142483490268814,10209387364437517,10291028774837120,1963510243393060,6698235608219585,10249974506598137,2090329927024291,19452257405817527,5395347850501660,1466647506773938,18271233688875585,17909487123073655,22732716574954981,28208124344155426,16118266291737203,6436198404802809,935143955767639,4692764892567773,8853071216371112,1600664618209927,39702070969452097,7552579352900360,2729546584440357,12309935356310386,426760114692333,1297488733224877,153415463561661,18948566290952420,8432980683248649,21321844297374743,8265174613176795,905258690673816,705406607744747,9105597597214747,517772088040257,1591136193162784,27511729624229236,3634922285407283,1831578225426174,13255266977668852,15312685554649660,722931468693513,1049089865098577,3498618026981595,4820015824926872,21126162808808528,27814106051492575,4822875592156961,14999120736412943,10825146296544249,6314954554132894,937945964737656,5459760788750366,3819227047549912,6591064604768721,7907494363943122,3486632627636937,9384132089104933,22104346516322826,6658745931891482,34093012584282609,4995951742943174,3517485897161771,135044219482780,7630383357514628,5162177136386332,10728488430543051,5828055747100055,6893511170015442,11011121196423559,2528283999013590,5080079240873515,19593423843180365,6822359610856040,191087978655560,8846708703413576,33146998994366094,3940701969864300,3507581990705859,6201879648552385,27956522101531374,10178358282977630,2205391899838384,2614926987404300,1090899715885363,6945147978151211,5432157012678156,1250518799355535,3948407147690489,10306927288370802,4580562167416191,8475303907451120,2243101892749971,2451601302451002,2180238663422921,3834240093757495,12119880871693653,12134080723101916,1805202361835209,31781168568203930,42987808989068825,41914343122681270,7985132073155851,16763654385115268,1387995454655588,2351466328427087,3139781779642664,27792958762616566,11961004800461011,6612181571493100,22715857059525182,689087660337260,244785061275028,11511948953811059,82
37401627755449,8214914423544509,5470929524034644,9110614658125771,17166417582628999,18571246019891132,3766276759071421,1226388404627669,9965671498507403,1214978610204088,7808074359603991,1313444080667563,9031456783378283,3783393382666945,34163041205217466,3314866608200743,3451870308271748,11716681494447625,1667361573332888,13859255454740261,7299000064706400,6085019581018810,4996856251238621,5666642298303467]
    lsh_values = lsh_minhash_layer(minhash)
    print(lsh_values)


================================================
FILE: DomainSpecific/core/layers/transform/math_filter_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import re
import requests
import fasttext
from gensim.utils import simple_preprocess
import pyarrow as pa
import pyarrow.parquet as pq
import util
import global_var

whilte_list = {r"\\displaystyle", r"\\alpha", r"\\beta", r"\\gamma", r"\\delta", r"\\zeta", r"\\eta", r"\\iota", r"\\kappa", r"\\mu", r"\\nu", r"\\xi", r"\\rho", r"\\tau", r"\\phi", r"\\chi", r"\\psi", r"\\omicron", r"\\epsilon", r"\\pi", r"\\lambda", r"\\omega", r"\\sigma", r"\\theta", r"\\vartheta", r"\\times", r"\\cdot", r"\\dot", r"\\div", r"\\frac", r"\\log", r"\\exp", r"\\poly", r"\\eq", r"\\neq", r"\\leq", r"\\geq", r"\\approx", r"\\infty", r"\\int", r"\\sum", r"\\lim", r"\\begin", r"\\subset", r"\\supset", r"\\top", r"\\star", r"\\sim", r"\\simeq", r"\\ne", r"\\ll", r"\\gg", r"\\pm", r"\\mp", r"\\triangleleft", r"\\triangleright", r"\\ast", r"\\circ", r"\\bullet", r"\\oplus", r"\\odot", r"\\otimes", r"\\ominus", r"\\oslash", r"\\bigcirc", r"\\wr", r"\\dagger", r"\\bigtriangleup", r"\\bigtriangledown", r"\\setminus", r"\\sqcup", r"\\wedge", r"\\dotplus", r"\\centerdot", r"\\ltimes", r"\\rtimes", r"\\prod", r"\\coprod", r"\\iint", r"\\iiint", r"\\iiiint", r"\\idotsint", r"\\bigoplus", r"\\big", r"\\oint", r"\\rightarrow", r"\\to", r"\\leftarrow", r"\\gets", r"\\uparrow", r"\\downarrow", r"\\forall", r"\\exists", r"\\pmod", r"\\cup", r"\\cap", r"\\hat", r"\\acute", r"\\check", r"\\grave", r"\\vec", r"\\ddot", r"\\tilde", r"\\breve", r"\\mathring", r"\\land", r"\\lor", r"\\lnot", r"\\in", r"\\smile", r"\\frown", r"\\infty", r"\\mid", r"\\sin", r"\\cos", r"\\tan", r"\\equiv", r"\\circ", r"\\dfrac", r"\\prec", r"\\preccurlyeq", r"\\sqrt",}
black_list = {r"\\text", r"\\if", r"\\local", r"\\usr", r"\\include", r"\\lib", r"\\bin", r"\\url", r"\\program", r"\\microsoft", r"\\temp", r"\\windows", r"\\documents", r"\\users", r"\\my", r"\\the",}
# LaTeX commands that count as math evidence: the whitelist minus the
# blacklist.  Each entry gets a trailing "[^a-zA-Z]" so a command only matches
# when the next character is not a letter (e.g. "\in" does not match inside
# "\int").  Consequently every match carries one extra trailing character.
keywords1 = whilte_list - black_list
keywords1 = set(map(lambda x: x + "[^a-zA-Z]", keywords1))

# Secondary evidence: escaped operator/bracket characters, a few bare words
# ("sqrt", "sum", "int"), "$" and the "<math>" / "[math]" tags.
keywords2 = {r"\+", r"\-", r"\*", r"\/", r"\%", r"\=", r"\!\=", r"\<", r"\>", r"\^", r"\_", r"\(", r"\)", r"\[", r"\]", r"\{", r"\}", r"\|\|", r"\&\&", r"sqrt", r"sum", r"int", r"\$", r"\<math\>", r"\[math\]", }

# pattern0: a backslash, up to nine uppercase letters, then two to nine lowercase letters.
pattern0 = re.compile(r"\\[A-Z]{0,9}[a-z]{2,9}")
# pattern1 / pattern2: alternations over the keyword sets above.  The sets are
# unordered, so the order of alternatives may differ between runs.
pattern1 = re.compile("|".join(keywords1))
pattern2 = re.compile("|".join(keywords2))

def ismath_by_model(text, model, thred=0.5):
    """Classify ``text`` as math with a fastText-style model.

    Args:
        text: Candidate text; non-strings and blank strings yield False.
        model: Object with a ``predict`` method returning ``(labels, probs)``;
            ``None`` yields False.
        thred: Accepted for interface compatibility but not applied; the
            decision is based on the top label alone.

    Returns:
        True when the top predicted label is anything other than
        ``"__label__0"``; False on any prediction error (traceback printed).
    """
    if model is None:
        return False
    if not isinstance(text, str) or len(text.strip()) == 0:
        return False
    try:
        x = " ".join(simple_preprocess(text))
        ret = model.predict(x)
        label = ret[0][0]
        return label != "__label__0"
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # propagate to the caller's KeyboardInterrupt handler.
        traceback.print_exc()
        return False

def _record_contains_math(record):
    """Decide whether one parquet row (a dict with "text" and "la") is math content."""
    text = record["text"]

    if record["la"] != "en":
        return False
    if not isinstance(text, str):
        return False

    # Cheap pre-check: at least one backslash command of any kind.
    if pattern0.search(text) is None:
        return False

    lowered = text.lower()

    # pattern1 matches include one trailing non-letter character; drop it.
    symbols1 = {sym[:-1] for sym in pattern1.findall(lowered)}
    if not symbols1:
        return False

    # A single whitelisted command needs supporting operator evidence.
    if len(symbols1) == 1 and pattern2.search(lowered) is None:
        return False

    return len(symbols1) >= 5 or ismath_by_model(text, global_var.ft_math_model)


def math_filter_layer(pq_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", OVERWRITE=False):
    """Copy the math-bearing English rows of one parquet file to a new file.

    Args:
        pq_name: File name, joined onto both ``INPUT_FOLDER`` and ``OUTPUT_FOLDER``.
        variables: Shared variables dictionary, forwarded to ``util.to_real_path``.
        INPUT_FOLDER: Folder containing the source parquet file.
        OUTPUT_FOLDER: Folder receiving the filtered parquet file.
        OVERWRITE: When False, an existing output file is left alone.

    Returns:
        ``([out_pq_path],)`` when the output file was written, ``([],)`` when
        the file was skipped or reading/writing failed.
    """
    ret = list()
    try:
        in_pq_path = util.to_real_path(os.path.join(INPUT_FOLDER, pq_name), variables)
        out_pq_path = util.to_real_path(os.path.join(OUTPUT_FOLDER, pq_name), variables)

        if os.path.exists(in_pq_path) and (OVERWRITE or not os.path.exists(out_pq_path)):
            util.create_folder_by_file_path(out_pq_path)

            # A read failure propagates to the outer handler: one traceback,
            # empty result (previously it cascaded into a NameError).
            table = pq.read_table(in_pq_path)

            kept_rows = list()
            for row_index, record in enumerate(table.to_pylist()):
                try:
                    if _record_contains_math(record):
                        kept_rows.append(row_index)
                except Exception:
                    # Skip the broken row but keep going; not a bare except,
                    # so Ctrl-C still reaches the handler below.
                    traceback.print_exc()

            # Select rows from the original table so the schema survives even
            # when nothing is kept (a pylist round trip would re-infer types).
            filtered = table.take(pa.array(kept_rows, type=pa.int64()))
            pq.write_table(filtered, out_pq_path)

            # Only report the path once the write has actually succeeded.
            ret = [out_pq_path]
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret, )


if __name__ == '__main__':
    # Manual smoke test.  math_filter_layer has no STORAGE_PATH parameter, so
    # the keyword that used to be passed here (a TypeError) has been removed.
    snapshot = "CC-MAIN-2022-49"
    variables = {"workspace_dir": r"workspace", "worker_id": 0, "worker_num": 1}
    INPUT_FOLDER = "$(input_data_folder)"
    OUTPUT_FOLDER = "$(output_data_folder)"
    ret = math_filter_layer(snapshot, variables=variables, INPUT_FOLDER=INPUT_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER)
    print(ret)


================================================
FILE: DomainSpecific/core/layers/transform/mcq_filter_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import re
import json
import fasttext
import requests
from io import BytesIO
from gensim.utils import simple_preprocess
from warcio.limitreader import LimitReader
from warcio.warcwriter import WARCWriter
from warcio.archiveiterator import ArchiveIterator
import util
import global_var


def detect_lang(text):
    """Guess the language of ``text`` with ``global_var.lid_model``.

    All whitespace is removed first.  Texts longer than 256 characters are
    reduced to the 256-character window centred on the middle of the text.

    Returns:
        The top label with the ``__label__`` prefix removed, or the literal
        ``"unkown"`` (spelling kept, since it is written to output records)
        when detection fails.
    """
    try:
        LID_WIN_SIZE = 256
        text = ''.join(text.split())
        if len(text) > LID_WIN_SIZE:
            mid = len(text) // 2
            half_win = LID_WIN_SIZE // 2
            text = text[max(0, mid - half_win): min(len(text), mid + half_win)]
        res = global_var.lid_model.predict(text)
        return res[0][0].replace("__label__", "")
    except Exception:
        # Not a bare except: Ctrl-C must not be reported as an unknown language.
        return "unkown"


def detect_choice_exercise_by_rule(uri, html):
    """Cheap byte-level heuristic for pages containing choice questions.

    ``html`` (bytes) passes when, case-insensitively, it contains the phrase
    "choice question" AND all four markers of at least one option style
    (a./b./c./d., 1)/2)/3)/4), >i</>ii</>iii</>iv<, ...).  ``uri`` is
    lower-cased but otherwise unused.
    """
    uri = uri.lower()
    page = html.lower()

    phrases = (b"choice question",)
    option_styles = (
        (b"a.",   b"b.",   b"c.",   b"d."),
        (b"a)",   b"b)",   b"c)",   b"d)"),
        (b"\na ", b"\nb ", b"\nc ", b"\nd "),
        (b">a<",  b">b<",  b">c<",  b">d<"),

        (b"1.",   b"2.",   b"3.",   b"4."),
        (b"1)",   b"2)",   b"3)",   b"4)"),
        (b"\n1 ", b"\n2 ", b"\n3 ", b"\n4 "),
        (b">1<",  b">2<",  b">3<",  b">4<"),

        (b"i.",   b"ii.",   b"iii.",   b"iv."),
        (b"i)",   b"ii)",   b"iii)",   b"iv)"),
        (b"\ni ", b"\nii ", b"\niii ", b"\niv "),
        (b">i<",  b">ii<",  b">iii<",  b">iv<"),
    )

    mentions_phrase = any(phrase in page for phrase in phrases)
    has_option_run = any(
        all(marker in page for marker in style)
        for style in option_styles
    )
    return mentions_phrase and has_option_run


def detect_choice_exercise_by_ft_model(uri, text, thred=0.5):
    """Classify ``text`` with ``global_var.ft_mcq_model``.

    Args:
        uri: Unused.
        text: Candidate text; non-strings and blank strings yield False.
        thred: A ``"__label__0"`` prediction whose probability is below this
            value is still treated as a positive.

    Returns:
        True when the top label is ``"__label__1"``, or ``"__label__0"`` with
        probability below ``thred``; False otherwise and on any prediction
        error (traceback printed).
    """
    try:
        if not isinstance(text, str) or len(text.strip()) == 0:
            return False
        x = " ".join(simple_preprocess(text))
        ret = global_var.ft_mcq_model.predict(x)
        label, prob = ret[0][0], ret[1][0]
        if label == "__label__0" and prob < thred:
            return True
        return label == "__label__1"
    except Exception:
        # Not a bare except, so Ctrl-C propagates; the error is no longer hidden.
        traceback.print_exc()
        return False

"""
def detect_choice_exercise_by_pt_model(uri, text, thred=0.5):
    try:
        if not isinstance(text, str) or len(text.strip()) == 0:
            return False
        label = global_var.py_mcq_model.run(text, thred)
        return label == "LABEL_1"
    except:
        return False
"""


def detect_choice_exercise_by_LLM(text, engine=None):
    """Ask ``global_var.gpt_api`` whether ``text`` contains a choice question.

    Returns True only when the lower-cased, stripped answer starts with
    "yes"; any other answer counts as False.
    """
    system = '''
You will be given a text converted from a webpage. Your task is to detect whether it contains choice question by responding with 'yes' or 'no'.
'''
    reply = global_var.gpt_api.run(system=system, question=text, engine=engine)
    normalized = reply.lower().strip()
    return normalized.startswith("yes")


def LCS(str1, str2):
    """Return the longest-common-subsequence length divided by ``len(str2)``.

    The result is rounded to six decimals.  An empty ``str2`` yields ``0.0``
    (it used to raise ZeroDivisionError).

    Only two rows of the DP table are kept, so memory is proportional to
    ``len(str2)`` instead of ``len(str1) * len(str2)``.
    """
    n = len(str2)
    if n == 0:
        return 0.0

    previous = [0] * (n + 1)
    for ch in str1:
        current = [0] * (n + 1)
        for j in range(1, n + 1):
            if ch == str2[j-1]:
                current[j] = previous[j-1] + 1
            else:
                current[j] = max(previous[j], current[j-1])
        previous = current

    return round(1.0 * previous[n] / n, 6)


def localize_choice_exercise_by_LLM(text, engine=None):
    """Ask ``global_var.gpt_api`` to copy the multiple-choice questions out of ``text``.

    Returns:
        The extracted questions, or None when the model reports none, when
        nothing is left after removing known boilerplate phrases, or when the
        ``LCS`` similarity between ``text`` and the answer is below 0.9 (the
        answer is then assumed not to be a faithful copy).
    """
    system = '''
Purpose:
Create a multiple-choice question dataset.

Task:
Extract all multiple-choice questions from the provided text.

Requirements:
1. If the given text does not contain multiple-choice questions, respond only with "No multiple-choice questions found".
2. Do not modify the original multiple-choice questions.
3. Ensure all multiple-choice questions are copied without omissions.
4. Ensure all multiple-choice questions are copied in order.
5. Ensure all multiple-choice questions are copied under the original layout.
6. Copy the questions along with their options.
7. If answers and explanations are provided, copy them as well.
8. If source materials or reading passage is provided, copy it as well.
9. Don't add content not from original given text.

Please strictly adhere to these requirements while performing the task.
'''
    exercises = global_var.gpt_api.run(system=system, question=text, engine=engine)
    exercises = exercises.strip()
    if len(exercises) == 0 or "no multiple-choice question" in exercises.lower():
        return None

    # Headings and closing remarks the model tends to add around the copy.
    boilerplate = (
        "Multiple Choice Questions\n",
        "Multiple-choice questions:\n",
        "No other multiple-choice questions found.",
        "No other multiple-choice questions found in the text.",
        "No multiple-choice questions found.",
        "No more multiple-choice questions found.",
    )
    for phrase in boilerplate:
        exercises = exercises.replace(phrase, "")

    # The answer may have consisted of boilerplate only; an empty string
    # would otherwise make the similarity ratio divide by zero.
    if not exercises:
        return None

    sim = LCS(text, exercises)
    if sim < 0.9:
        return None
    return exercises


# rule + model + GPT3.5 turbor.
def mcq_filter_layer(wet_file_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", OVERWRITE=False):
    """Extract likely multiple-choice-question pages from one WET file into JSONL.

    Each "conversion" record must pass the keyword rule
    (``detect_choice_exercise_by_rule``) and then the fastText model
    (``detect_choice_exercise_by_ft_model`` with ``thred=0.825``).  Surviving
    records are written as ``{"uri", "text", "lang"}`` objects, one per line.
    The LLM-based third round and the exercise localisation are disabled.

    Args:
        wet_file_name: Name of the WET file inside ``INPUT_FOLDER``; the
            output name replaces ``.warc.wet.gz`` with ``.jsonl``.
        variables: Shared variables dictionary, forwarded to ``util.to_real_path``.
        INPUT_FOLDER: Folder containing the WET file.
        OUTPUT_FOLDER: Folder receiving the JSONL file.
        OVERWRITE: When False, an existing output file is left alone.

    Returns:
        ``([dst_jsonl_file_path],)`` when the output was written, else ``([],)``.
    """
    ret = list()
    try:
        src_wet_file_path = os.path.join(INPUT_FOLDER, wet_file_name)
        src_wet_file_path = util.to_real_path(src_wet_file_path, variables)
        jsonl_file_name = wet_file_name.replace(".warc.wet.gz", ".jsonl")
        dst_jsonl_file_path = os.path.join(OUTPUT_FOLDER, jsonl_file_name)
        dst_jsonl_file_path = util.to_real_path(dst_jsonl_file_path, variables)

        if os.path.exists(src_wet_file_path) and (OVERWRITE or not os.path.exists(dst_jsonl_file_path)):
            items = list()
            with open(src_wet_file_path, "rb") as wet_stream:
                for record in ArchiveIterator(wet_stream, arc2warc=False):
                    if record.rec_type != "conversion":
                        continue
                    try:
                        # Read the extracted page text.
                        uri = record.rec_headers["WARC-Target-URI"]
                        bs = record.content_stream().read()
                        if bs is None:
                            continue
                        text = str(bs, "utf-8")

                        # 1st round: keyword rule on the raw bytes.
                        if not detect_choice_exercise_by_rule(uri, bs):
                            continue

                        # 2nd round: fastText classifier on the decoded text.
                        if not detect_choice_exercise_by_ft_model(uri, text, thred=0.825):
                            continue

                        items.append({
                            "uri": uri,
                            "text": text,
                            "lang": detect_lang(text),
                        })
                    except Exception:
                        # Skip the broken record but keep going; not a bare
                        # except, so Ctrl-C still reaches the handler below.
                        traceback.print_exc()

            # Make sure the output folder exists, as the sibling layers do.
            util.create_folder_by_file_path(dst_jsonl_file_path)
            with open(dst_jsonl_file_path, "w") as output:
                for item in items:
                    output.write(json.dumps(item) + "\n")
            ret = [dst_jsonl_file_path]
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret, )


if __name__ == '__main__':
    # Manual smoke test: filter one WET file and print the result tuple.
    sample_wet = "CC-MAIN-20210115134101-20210115164101-00005_5.warc.wet.gz"
    shared_vars = {"workspace_dir": r"workspace", "worker_id": 0, "worker_num": 1}
    ret = mcq_filter_layer(
        sample_wet,
        variables=shared_vars,
        INPUT_FOLDER="$(input_data_folder)",
        OUTPUT_FOLDER="$(output_data_folder)",
        OVERWRITE=True,
    )
    print(ret)


================================================
FILE: DomainSpecific/core/layers/transform/minhash_tokens_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import hashlib
import traceback
import numpy as np
from itertools import tee

MER = 2**61 - 1
NUM_PERM = 256
SEED = 1


class MinHasher:
    """MinHash with ``NUM_PERM`` affine hash functions taken modulo ``MER``.

    The coefficients ``a`` and ``b`` are drawn from ``RandomState(SEED)``, so
    every instance uses the same hash functions.
    """

    def __init__(self):
        # NOTE: this also reseeds NumPy's *global* generator; kept as-is
        # because other code may rely on that side effect.
        np.random.seed(1)
        rng = np.random.RandomState(SEED)
        self.gen = rng
        self.a = rng.randint(1, MER, (NUM_PERM,), dtype='u8')
        self.b = rng.randint(0, MER, (NUM_PERM,), dtype='u8')

    def _sha1_hash(self, val):
        """Hash bytes: first 8 bytes of SHA-1 (little-endian), masked with ``MER``."""
        digest = hashlib.sha1(val).digest()
        masked = int.from_bytes(digest[:8], 'little') & MER
        return np.uint64(masked)

    def hash(self, sequence):
        """Return the element-wise minimum of the hash vectors of all tokens.

        An empty ``sequence`` yields a vector filled with ``MER``.
        """
        signature = np.full(NUM_PERM, MER, dtype='u8')
        for token in sequence:
            base = self._sha1_hash(token.encode('utf8'))
            permuted = (base * self.a + self.b) % MER
            signature = np.minimum(signature, permuted)
        return signature

# Module-level instance used by minhash_tokens_layer; created once at import time.
minhasher = MinHasher()

def minhash_tokens_layer(tokens, variables=dict()):
    """Compute the MinHash signature of ``tokens`` with the module-level hasher.

    ``variables`` is unused.  Returns the signature array, or None when
    hashing raised (the traceback is printed).
    """
    signature = None
    try:
        signature = minhasher.hash(tokens)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return signature


if __name__ == "__main__":
    tokens = {'产权 份额 为 土地 出让', '商品 住房 市场 价格 合理', '确定 , 在 售 房', ', 可 向 代 持', '住房 , 划 拨 土地', '增 购 政府 份额 的', '向社会公布 。 划 拨 土地', '为 商品 住房 , 划', '▁来源 : 中国 网 地产', '出让 土地 共有 产权 保障', '的 , 可 向 代', '售 房 阶段 向社会公布 。', '商品 住房 , 划 拨', '以及 累计 缴纳 社保 或', '性质 转 为 商品 住房', '的 非 市区 户籍 家庭', '购房 款 。 ▁在 使用', '地产 ▁ 杭州市 1 日', '《 杭州市 共有 产权 保障', '住房 享有 与 购买 商品', '类型 商品 住房 市场 价格', '的 申请 , 增 购', '价 按 同 地段 、', '款 。 ▁在 使用 管理', '可根据 支付 能力 在 50%', '按照 单 套 销售 价格', '方可 通过 买卖 等方式 上市', '年限 等相关 条件 。 ▁', '10 年后 , 方可 通过', '市场 价格 合理 优惠 后', '拨 土地 共有 产权 保障', '杭州市 共有 产权 保障 住房', '销售 基准 价 按 同', '能力 在 50% 至 80%', '等相关 条件 。 ▁ 办法', '年 的 , 可 向', '至 80% 范围内 选择 产权', '共有 产权 保障 住房 销售', '符合 限购 政策 前提 下', '购房 家庭 可根据 支付 能力', '提出 共有 产权 保障 住房', '住房 , 购房 家庭 可根据', '。 ▁在 使用 管理 方面', '-12- 03 ▁记者 : ▁来源', '保障 住房 面向 符合条件的 市区', '住房 以及 累计 缴纳 社保', '。 ▁ 办法 明确 ,', '购房 家庭 产权 份额 为', '社保 或 个 税 年限', '价 及其 浮动 幅度 确定', '非 市区 户籍 家庭 供应', '购房 款 。 出让 土地', ', 购房 家庭 可根据 支付', '单 套 销售 价格 对应的', '权利 性质 调整为 出让 。', '03 ▁记者 : ▁来源 :', '▁2021 -12- 03 ▁记者 :', '产权 保障 住房 面向 符合条件的', '日 对外 发布 《 杭州市', '就业 的 非 市区 户籍', '增 购 后 住房 性质', ', 购买 共有 产权 保障', '、 同 类型 商品 住房', '同等 的 公共服务 权益 。', '对应的 不同 比例 支付 购房', '的 公共服务 权益 。 ▁根据', '网 地产 ▁ 杭州市 1', '款 。 出让 土地 共有', '套 销售 价格 对应的 产权', '管理 方面 , 杭州 提出', '住房 , 购房 家庭 产权', '和 稳定 就业 的 非', '土地 权利 性质 调整为 出让', '浮动 幅度 确定 , 在', '不动产 权 证 满 10', '▁ 办法 明确 , 共有', '机构 提出 一次性 增 购', '》 , 其中 明确 ,', '权 证 满 10 年后', '在 50% 至 80% 范围内', '方面 , 杭州 提出 共有', '满 10 年后 , 方可', '基准 价 按 同 地段', '产权 份额 比例 , 按照', '保障 住房 管理办法 》 ,', '居住证 、 住房 以及 累计', '销售 价格 对应的 产权 比例', '住房 面向 符合条件的 市区 户籍', '。 ▁根据 办法 , 市区', '单 套 销售 价格 按照', '销售 基准 价 及其 浮动', ': 中国 网 地产 ▁', '持 机构 提出 一次性 增', '价格 按照 销售 基准 价', '家庭 供应 , 购买 共有', '购买 共有 产权 保障 住房', '稳定 就业 的 非 市区', '购买 商品 住房 同等 的', '其中 明确 , 共有 产权', '▁记者 : ▁来源 : 中国', '价格 对应的 不同 比例 支付', '与 购买 商品 住房 同等', '、 住房 等相关 条件 ,', '条件 。 ▁ 办法 明确', '证 满 5 年 的', '满 5 年 的 ,', '管理办法 》 , 其中 明确', '市区 户籍 家庭 需 满足', '份额 的 申请 , 增', '商品 住房 同等 的 公共服务', '支付 能力 在 50% 至', '权 证 满 5 年', '户籍 家庭 需 满足 居住证', ', 方可 通过 买卖 等方式', ', 在 售 房 阶段', '对应的 产权 比例 支付 购房', '产权 保障 住房 购房 家庭', '家庭 需 
满足 居住证 、', '杭州 提出 共有 产权 保障', '1 日 对外 发布 《', ', 其中 明确 , 共有', '满足 居住证 、 住房 以及', '选择 产权 份额 比例 ,', '同时 满足 户籍 、 住房', ', 市区 户籍 家庭 要在', '销售 价格 对应的 不同 比例', '个 税 年限 等相关 条件', '住房 市场 价格 合理 优惠', '产权 保障 住房 , 购房', '、 住房 以及 累计 缴纳', '产权 保障 住房 销售 基准', '后 住房 性质 转 为', '土地 出让 时 已 确定的', '比例 , 按照 单 套', '发布 《 杭州市 共有 产权', '住房 性质 转 为 商品', '累计 缴纳 社保 或 个', '份额 比例 , 按照 单', '时 已 确定的 份额 比例', '划 拨 土地 权利 性质', '基准 价 及其 浮动 幅度', '。 出让 土地 共有 产权', '为 土地 出让 时 已', ', 购房 家庭 产权 份额', '等相关 条件 , 非 市区', '按 同 地段 、 同', '按照 销售 基准 价 及其', '不同 比例 支付 购房 款', '住房 销售 基准 价 按', '家庭 产权 份额 为 土地', '可 向 代 持 机构', '▁在 使用 管理 方面 ,', '家庭 取得 不动产 权 证', '性质 调整为 出让 。 取得', '取得 不动产 权 证 满', '市区 户籍 家庭 要在 符合', ', 杭州 提出 共有 产权', '政策 前提 下 同时 满足', '▁根据 办法 , 市区 户籍', '办法 , 市区 户籍 家庭', '缴纳 社保 或 个 税', '。 划 拨 土地 共有', '家庭 可根据 支付 能力 在', '满足 户籍 、 住房 等相关', '一次性 增 购 政府 份额', '购 政府 份额 的 申请', '需 满足 居住证 、 住房', '同 地段 、 同 类型', '供应 , 购买 共有 产权', '使用 管理 方面 , 杭州', '保障 住房 享有 与 购买', '共有 产权 保障 住房 享有', '限购 政策 前提 下 同时', '套 销售 价格 按照 销售', '户籍 和 稳定 就业 的', '优惠 后 确定 。 单', '住房 管理办法 》 , 其中', '市区 户籍 和 稳定 就业', '支付 购房 款 。 ▁在', '户籍 家庭 供应 , 购买', '同 类型 商品 住房 市场', '保障 住房 购房 家庭 取得', '及其 浮动 幅度 确定 ,', '共有 产权 保障 住房 管理办法', '共有 产权 保障 住房 面向', '在 售 房 阶段 向社会公布', '共有 产权 保障 住房 ,', '政府 份额 的 申请 ,', '买卖 等方式 上市 交易 。', '市区 户籍 家庭 供应 ,', '出让 时 已 确定的 份额', '家庭 要在 符合 限购 政策', '申请 , 增 购 后', ', 非 市区 户籍 家庭', '前提 下 同时 满足 户籍', '划 拨 土地 共有 产权', ', 划 拨 土地 权利', '产权 保障 住房 管理办法 》', '阶段 向社会公布 。 划 拨', '明确 , 共有 产权 保障', '确定的 份额 比例 , 按照', '证 满 10 年后 ,', '通过 买卖 等方式 上市 交易', '已 确定的 份额 比例 ,', '不动产 权 证 满 5', '提出 一次性 增 购 政府', '对外 发布 《 杭州市 共有', '价格 合理 优惠 后 确定', '。 取得 不动产 权 证', '范围内 选择 产权 份额 比例', '房 阶段 向社会公布 。 划', '▁ 杭州市 1 日 对外', '份额 为 土地 出让 时', ', 增 购 后 住房', '地段 、 同 类型 商品', '杭州市 1 日 对外 发布', '户籍 家庭 要在 符合 限购', '保障 住房 销售 基准 价', '调整为 出让 。 取得 不动产', ', 共有 产权 保障 住房', '权益 。 ▁根据 办法 ,', '比例 支付 购房 款 。', '保障 住房 , 购房 家庭', '或 个 税 年限 等相关', '年后 , 方可 通过 买卖', '出让 。 取得 不动产 权', '价格 对应的 产权 比例 支付', '购 后 住房 性质 转', '确定 。 单 套 销售', '支付 购房 款 。 出让', '要在 符合 限购 政策 前提', '拨 土地 权利 性质 调整为', '转 为 商品 住房 ,', '享有 与 购买 商品 住房', '公共服务 权益 。 ▁根据 办法', '中国 网 地产 ▁ 杭州市', 
'5 年 的 , 可', '合理 优惠 后 确定 。', '办法 明确 , 共有 产权', '共有 产权 保障 住房 购房', '套 销售 价格 对应的 不同', '户籍 、 住房 等相关 条件', '下 同时 满足 户籍 、', '产权 保障 住房 享有 与', '面向 符合条件的 市区 户籍 和', '购房 家庭 取得 不动产 权', '条件 , 非 市区 户籍', '幅度 确定 , 在 售', ': ▁来源 : 中国 网', '代 持 机构 提出 一次性', '产权 比例 支付 购房 款', '80% 范围内 选择 产权 份额', '向 代 持 机构 提出', '住房 同等 的 公共服务 权益', '税 年限 等相关 条件 。', '土地 共有 产权 保障 住房', ', 按照 单 套 销售', '非 市区 户籍 家庭 需', '。 单 套 销售 价格', '符合条件的 市区 户籍 和 稳定', '住房 等相关 条件 , 非', '50% 至 80% 范围内 选择', '后 确定 。 单 套', '住房 购房 家庭 取得 不动产', '销售 价格 按照 销售 基准'}
    minhash = minhash_tokens_layer(tokens)
    print(minhash)


================================================
FILE: DomainSpecific/core/layers/transform/ngrams_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
from itertools import tee

NGRAM_SIZE = 5


def ngrams_layer(sequence, variables=dict()):
    """Build the set of space-joined NGRAM_SIZE-grams found in *sequence*.

    Args:
        sequence: A sized iterable of string tokens.
        variables: Unused by this layer.

    Returns:
        A set with one stripped, space-joined string per sliding window of
        NGRAM_SIZE consecutive tokens.  When the sequence holds fewer than
        NGRAM_SIZE tokens, an iterator yielding the untouched sequence once is
        returned instead.  ``None`` is returned if any error occurs (the
        traceback is printed).
    """
    try:
        if len(sequence) < NGRAM_SIZE:
            return iter([sequence])

        # Sliding-window technique adapted from
        # https://github.com/ChenghaoMou/text-dedup/blob/main/text_dedup/utils/tokenization.py
        windows = tee(iter(sequence), NGRAM_SIZE)
        for offset, window in enumerate(windows):
            # Advance the k-th copy by k items so zip() lines the copies up
            # into consecutive windows.
            skipped = 0
            while skipped < offset:
                next(window, None)
                skipped += 1

        ngram_set = set()
        for gram in zip(*windows):
            ngram_set.add(" ".join(gram).strip())
        return ngram_set
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return None


if __name__ == "__main__":
    tokens = ['▁2021', '-12-', '03', '▁记者', ':', '▁来源', ':', '中国', '网', '地产', '▁', '杭州市', '1', '日', '对外', '发布', '《', '杭州市', '共有', '产权', '保障', '住房', '管理办法', '》', ',', '其中', '明确', ',', '共有', '产权', '保障', '住房', '面向', '符合条件的', '市区', '户籍', '和', '稳定', '就业', '的', '非', '市区', '户籍', '家庭', '供应', ',', '购买', '共有', '产权', '保障', '住房', '享有', '与', '购买', '商品', '住房', '同等', '的', '公共服务', '权益', '。', '▁根据', '办法', ',', '市区', '户籍', '家庭', '要在', '符合', '限购', '政策', '前提', '下', '同时', '满足', '户籍', '、', '住房', '等相关', '条件', ',', '非', '市区', '户籍', '家庭', '需', '满足', '居住证', '、', '住房', '以及', '累计', '缴纳', '社保', '或', '个', '税', '年限', '等相关', '条件', '。', '▁', '办法', '明确', ',', '共有', '产权', '保障', '住房', '销售', '基准', '价', '按', '同', '地段', '、', '同', '类型', '商品', '住房', '市场', '价格', '合理', '优惠', '后', '确定', '。', '单', '套', '销售', '价格', '按照', '销售', '基准', '价', '及其', '浮动', '幅度', '确定', ',', '在', '售', '房', '阶段', '向社会公布', '。', '划', '拨', '土地', '共有', '产权', '保障', '住房', ',', '购房', '家庭', '可根据', '支付', '能力', '在', '50%', '至', '80%', '范围内', '选择', '产权', '份额', '比例', ',', '按照', '单', '套', '销售', '价格', '对应的', '不同', '比例', '支付', '购房', '款', '。', '出让', '土地', '共有', '产权', '保障', '住房', ',', '购房', '家庭', '产权', '份额', '为', '土地', '出让', '时', '已', '确定的', '份额', '比例', ',', '按照', '单', '套', '销售', '价格', '对应的', '产权', '比例', '支付', '购房', '款', '。', '▁在', '使用', '管理', '方面', ',', '杭州', '提出', '共有', '产权', '保障', '住房', '购房', '家庭', '取得', '不动产', '权', '证', '满', '5', '年', '的', ',', '可', '向', '代', '持', '机构', '提出', '一次性', '增', '购', '政府', '份额', '的', '申请', ',', '增', '购', '后', '住房', '性质', '转', '为', '商品', '住房', ',', '划', '拨', '土地', '权利', '性质', '调整为', '出让', '。', '取得', '不动产', '权', '证', '满', '10', '年后', ',', '方可', '通过', '买卖', '等方式', '上市', '交易', '。']
    tokens = ngrams_layer(tokens)
    print(tokens)


================================================
FILE: DomainSpecific/core/layers/transform/openquestion_filter_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import re
import gc
import requests
import fasttext
from gensim.utils import simple_preprocess
import pyarrow as pa
import pyarrow.parquet as pq
sys.path.append(".")
import util
import global_var

def _keyword_regex(keyword):
    """Return a regex matching *keyword* only when no ASCII letter touches it.

    Zero-width lookarounds are used instead of consuming the neighbouring
    characters, so a keyword at the very start/end of the text matches and two
    keywords separated by a single delimiter are both counted by ``findall``.
    """
    return "(?<![a-zA-Z])" + re.escape(keyword) + "(?![a-zA-Z])"


# Markers that usually introduce a question in Q&A style pages.
question_keywords = ("q&a", "q & a", "q:", "que:", "question:", "quiz:", "exam:", "examination:", "probe:", "request:", "challenge:", "test:", "query:", "survey:")
# Interrogative words ("whose" replaces the misspelled "whoes").
question_keywords2 = ("what", "where", "why", "when", "who", "whose", "how")
question_keywords += question_keywords2
question_keywords = set(map(_keyword_regex, question_keywords))
# Sorted so the compiled pattern is identical from run to run.
question_pattern = re.compile("|".join(sorted(question_keywords)))

# Markers that usually introduce an answer.
answer_keywords = ("q&a", "q & a", "a:", "ans:", "answer:", "solution:", "reply:", "response:", "result:", "outcome:", "explanation:", "conclusion:", "finding:", "assertion:", "statement:", "clarification:")
answer_keywords = set(map(_keyword_regex, answer_keywords))
answer_pattern = re.compile("|".join(sorted(answer_keywords)))


def is_openquestion_by_model(text, model, thred=0.5):
    """Classify *text* with *model* and report whether it is an open question.

    Args:
        text: Raw document text.
        model: Object exposing ``predict(str)`` that returns
            ``(labels, probabilities)``; ``None`` disables the check.
        thred: Accepted for interface compatibility; the decision currently
            relies on the predicted label only and does not use this value.

    Returns:
        ``True`` when the top predicted label differs from ``"__label__0"``.
        ``False`` when the model is missing, the text is not a non-blank
        string, or prediction fails (the traceback is printed).
    """
    if model is None:
        return False
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        normalized = " ".join(simple_preprocess(text))
        prediction = model.predict(normalized)
        return prediction[0][0] != "__label__0"
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit
        # must reach the calling layer so a run can still be interrupted.
        traceback.print_exc()
        return False

def check_yes_no_question(text_before, text_after):
    """Tell whether *text_after* opens with a yes/no style answer.

    Args:
        text_before: Unused; kept for a signature parallel to the other
            ``check_*`` helpers.
        text_after: Text following the question.

    Returns:
        ``True`` when the lower-cased, stripped text starts with "yes", "y",
        "no" or "n" and that keyword is either the whole text or is followed
        by a non-alphanumeric character; ``False`` otherwise.
    """
    answer = text_after.lower().strip()
    for keyword in ("yes", "y", "no", "n"):
        if not answer.startswith(keyword):
            continue
        remainder = answer[len(keyword):]
        # An empty remainder means the answer is exactly the keyword; indexing
        # past the end used to raise IndexError in that case.
        if not remainder or not remainder[0].isalnum():
            return True
    return False

def check_multiple_choise_question(text_before, text_after):
    """Tell whether *text_before* lists four choice markers in order.

    Each marker family (letters, digits or roman numerals in several
    punctuation styles) is searched left to right: every marker must be found
    at or after the position where the previous one was found.

    Args:
        text_before: Text preceding the answer; lower-cased and stripped
            before searching.
        text_after: Unused by this check.

    Returns:
        ``True`` as soon as one family is found completely, else ``False``.
    """
    marker_families = (
        ("a.",   "b.",   "c.",   "d."),
        ("a)",   "b)",   "c)",   "d)"),
        ("\na ", "\nb ", "\nc ", "\nd "),
        (">a<",  ">b<",  ">c<",  ">d<"),

        ("1.",   "2.",   "3.",   "4."),
        ("1)",   "2)",   "3)",   "4)"),
        ("\n1 ", "\n2 ", "\n3 ", "\n4 "),
        (">1<",  ">2<",  ">3<",  ">4<"),

        ("i.",   "ii.",   "iii.",   "iv."),
        ("i)",   "ii)",   "iii)",   "iv)"),
        ("\ni ", "\nii ", "\niii ", "\niv "),
        (">i<",  ">ii<",  ">iii<",  ">iv<"),
    )
    haystack = text_before.lower().strip()
    for family in marker_families:
        cursor = 0
        for marker in family:
            cursor = haystack.find(marker, cursor)
            if cursor < 0:
                break
        else:
            # No break: every marker of this family was located in order.
            return True
    return False

def check_fill_in_question(text_before, text_after):
    """Tell whether *text_before* contains a fill-in-the-blank placeholder.

    Args:
        text_before: Text preceding the answer; lower-cased and stripped
            before searching.
        text_after: Unused by this check.

    Returns:
        ``True`` when "___", "()" or "..." occurs in the text.
    """
    candidate = text_before.lower().strip()
    placeholders = ("___", "()", "...")
    return any(placeholder in candidate for placeholder in placeholders)

def check_quality(item):
    """Apply length and character-ratio heuristics to ``item["text"]``.

    The text is rejected (``False``) when any of the following holds:
      * the longest stripped line is over 2048 or at most 128 characters;
      * it has at most 3 or more than 256 lines;
      * it is shorter than 256 or longer than 16384 characters;
      * single spaces exceed 33% of the characters, or any of double space,
        tab, ".", "-", "#", "|", "," exceeds 10% (occurrences / length);
      * more than 67% of the lines are at most 32 characters once stripped.

    Args:
        item: Mapping with a ``"text"`` string.

    Returns:
        ``True`` when none of the rejection rules fires.
    """
    text = item["text"]
    lines = text.split("\n")
    stripped_lengths = [len(line.strip()) for line in lines]

    longest = max(stripped_lengths)
    if not 128 < longest <= 2048:
        return False

    if not 3 < len(stripped_lengths) <= 256:
        return False

    total_chars = len(text)
    if not 256 <= total_chars <= 1024 * 16:
        return False

    # (substring, maximum allowed occurrences-per-character ratio)
    ratio_limits = (
        (" ", 0.33),
        ("  ", 0.1),
        ("\t", 0.1),
        (".", 0.1),
        ("-", 0.1),
        ("#", 0.1),
        ("|", 0.1),
        (",", 0.1),
    )
    for needle, limit in ratio_limits:
        if 1.0 * text.count(needle) / total_chars > limit:
            return False

    short_lines = sum(1 for length in stripped_lengths if length <= 32)
    return 1.0 * short_lines / len(lines) <= 0.67

def openquestion_filter_layer(pq_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", OVERWRITE=False):
    """Keep the English open-question records of one parquet file.

    The input file ``INPUT_FOLDER/pq_name`` is read, every record is filtered,
    and the survivors are written to ``OUTPUT_FOLDER/pq_name`` (both paths are
    resolved through ``util.to_real_path``).  A record survives when its
    ``"la"`` field is ``"en"``, its lower-cased text matches
    ``question_pattern`` at least twice and ``answer_pattern`` at least twice,
    and ``is_openquestion_by_model`` accepts it.

    Args:
        pq_name: File name relative to both folders.
        variables: Passed to ``util.to_real_path`` for path resolution.
        INPUT_FOLDER: Folder holding the source parquet file.
        OUTPUT_FOLDER: Folder receiving the filtered parquet file.
        OVERWRITE: When false, an already existing output file is not redone.

    Returns:
        A one-element tuple holding a list: ``[out_pq_path]`` when the file
        was processed, or an empty list when it was skipped, could not be read
        or an unexpected error occurred.
    """
    ret = list()
    try:
        in_pq_path = os.path.join(INPUT_FOLDER, pq_name)
        in_pq_path = util.to_real_path(in_pq_path, variables)
        out_pq_path = os.path.join(OUTPUT_FOLDER, pq_name)
        out_pq_path = util.to_real_path(out_pq_path, variables)

        if os.path.exists(in_pq_path) and (OVERWRITE or not os.path.exists(out_pq_path)):
            util.create_folder_by_file_path(out_pq_path)

            # read parquet file.
            try:
                records = pq.read_table(in_pq_path).to_pylist()
            except Exception:
                # Without records there is nothing to filter; previously the
                # code fell through and hit a NameError on `records`.
                traceback.print_exc()
                return (ret, )

            # filter records containing open question.
            openquestion_records = list()
            for record in records:
                try:
                    if record["la"] != "en":
                        continue

                    text = record["text"]
                    text_low = text.lower()

                    if len(question_pattern.findall(text_low)) < 2:
                        continue

                    if len(answer_pattern.findall(text_low)) < 2:
                        continue

                    if not is_openquestion_by_model(text, global_var.ft_openquestion_model):
                        continue

                    openquestion_records.append(record)
                except Exception:
                    # A malformed record is reported and skipped; interrupts
                    # propagate to the outer KeyboardInterrupt handler.
                    traceback.print_exc()

            # write parquet file.
            try:
                openquestion_table = pa.Table.from_pylist(openquestion_records)
                pq.write_table(openquestion_table, out_pq_path)
            except Exception:
                # NOTE(review): the output path is still reported below even
                # when writing failed, as before - confirm callers expect it.
                traceback.print_exc()

            ret = [out_pq_path]
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret, )


if __name__ == '__main__':
    # Manual smoke run of the layer on a single snapshot name.
    snapshot = "CC-MAIN-2022-49"
    variables = {"workspace_dir": r"workspace", "worker_id": 0, "worker_num": 1}
    INPUT_FOLDER = "$(input_data_folder)"
    OUTPUT_FOLDER = "$(output_data_folder)"
    # openquestion_filter_layer has no STORAGE_PATH parameter; passing it
    # raised TypeError before the layer could run, so it is no longer passed.
    ret = openquestion_filter_layer(snapshot, variables=variables, INPUT_FOLDER=INPUT_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER)
    print(ret)


================================================
FILE: DomainSpecific/core/layers/transform/tokenize_article_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import sentencepiece as spm


# Process-wide SentencePiece processor, created lazily on first use.
tokenizer = None
# Model path the cached processor was loaded from by this module (None when
# nothing was loaded here yet, e.g. if `tokenizer` was assigned externally).
_tokenizer_model_path = None


def tokenize_article_layer(article, variables=dict(), SPM_MODEL_PATH="./dependency/models/sentencepiece.bpe.model"):
    """Split *article* into SentencePiece string tokens.

    The processor is cached at module level.  It is reloaded when a call asks
    for a model path different from the one this function loaded earlier;
    previously the first loaded model was silently reused for every path.

    Args:
        article: Text to tokenize.
        variables: Unused by this layer.
        SPM_MODEL_PATH: Path of the SentencePiece model file.

    Returns:
        The result of ``encode(article, out_type=str)``, or ``None`` if
        loading or encoding fails (the traceback is printed).
    """
    global tokenizer, _tokenizer_model_path
    try:
        loaded_other_model = (
            _tokenizer_model_path is not None
            and _tokenizer_model_path != SPM_MODEL_PATH
        )
        if tokenizer is None or loaded_other_model:
            tokenizer = spm.SentencePieceProcessor(SPM_MODEL_PATH)
            _tokenizer_model_path = SPM_MODEL_PATH
        return tokenizer.encode(article, out_type=str)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return None


if __name__ == "__main__":
    article = "2021-12-03 记者： 来源：中国网地产\n\n杭州市1日对外发布《杭州市共有产权保障住房管理办法》，其中明确，共有产权保障住房面向符合条件的市区户籍和稳定就业的非市区户籍家庭供应，购买共有产权保障住房享有与购买商品住房同等的公共服务权益。\n\n根据办法，市区户籍家庭要在符合限购政策前提下同时满足户籍、住房等相关条件，非市区户籍家庭需满足居住证、住房以及累计缴纳社保或个税年限等相关条件。\n\n办法明确，共有产权保障住房销售基准价按同地段、同类型商品住房市场价格合理优惠后确定。单套销售价格按照销售基准价及其浮动幅度确定，在售房阶段向社会公布。划拨土地共有产权保障住房，购房家庭可根据支付能力在50%至80%范围内选择产权份额比例，按照单套销售价格对应的不同比例支付购房款。出让土地共有产权保障住房，购房家庭产权份额为土地出让时已确定的份额比例，按照单套销售价格对应的产权比例支付购房款。\n\n在使用管理方面，杭州提出共有产权保障住房购房家庭取得不动产权证满5年的，可向代持机构提出一次性增购政府份额的申请，增购后住房性质转为商品住房，划拨土地权利性质调整为出让。取得不动产权证满10年后，方可通过买卖等方式上市交易。"
    tokens = tokenize_article_layer(article)
    print(tokens)


================================================
FILE: DomainSpecific/core/layers/transform/warc_encode_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# coding=utf-8
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import re
import codecs
import logging
import traceback
import requests
from pathlib import Path
from urllib.parse import urlparse
from io import BytesIO
from warcio.limitreader import LimitReader
from warcio.warcwriter import WARCWriter
from warcio.archiveiterator import ArchiveIterator
import lxml.etree as ET
import lxml.html as HT
from py_asciimath.translator.translator import MathML2Tex
from pylatexenc.latexwalker import LatexWalker
from charset_normalizer import detect
import util

def tex_in_script_tag(text):
    """Return True when *text* starts with a TeX/LaTeX/AsciiMath script tag
    or a ``math-formula`` span, in either quoting style."""
    openers = (
        '<script type="math/tex"',
        "<script type='math/tex'",
        '<script type="math/latex"',
        "<script type='math/latex'",
        '<script type="math/asciimath"',
        "<script type='math/asciimath'",
        '<span class="math-formula">',
        "<span class='math-formula'>",
    )
    return text.startswith(openers)

def tex_in_math_tag(text):
    """Return True when *text* starts with an ``application/x-tex``
    ``<annotation>`` opening tag, in either quoting style."""
    openers = (
        "<annotation encoding='application/x-tex'>",
        '<annotation encoding="application/x-tex">',
    )
    return text.startswith(openers)

def tex_in_math_tag2(text):
    """Return True when *text* starts with ``<math`` and contains a closing
    ``</annotation>`` tag somewhere."""
    if not text.startswith("<math"):
        return False
    return "</annotation>" in text

def mathml_in_script_tag(text):
    """Return True when *text* starts with a ``math/mml`` script tag, in
    either quoting style."""
    openers = ('<script type="math/mml"', "<script type='math/mml'")
    return text.startswith(openers)

def mathml_in_math_tag(text):
    """Return True when *text* starts with ``<math `` and contains the
    double-quoted MathML namespace declaration anywhere in it."""
    if not text.startswith("<math "):
        return False
    # The namespace may appear after other attributes, so it is searched for
    # rather than required directly after the tag name.
    return 'xmlns="http://www.w3.org/1998/Math/MathML"' in text

def is_tex(text):
    """Return True when *text* begins with a ``$$ ... $$`` span that does not
    cross a line break."""
    leading_formula = re.match(r"(\$\$.*?\$\$)", text)
    return leading_formula is not None

def contain_tex(text):
    """Return True when a ``$$ ... $$`` span that does not cross a line break
    occurs anywhere in *text*."""
    formula = re.search(r"(\$\$.*?\$\$)", text)
    return formula is not None

def check_latex(latex):
    """Return True when *latex* parses with ``LatexWalker`` in strict mode.

    Args:
        latex: Candidate LaTeX source string.

    Returns:
        ``True`` if ``get_latex_nodes`` completes with
        ``tolerant_parsing=False``; ``False`` if it raises.
    """
    try:
        walker = LatexWalker(latex, tolerant_parsing=False)
        walker.get_latex_nodes(pos=0)
        return True
    except Exception:
        # Parse failures mean "not valid LaTeX".  KeyboardInterrupt/SystemExit
        # are deliberately not caught so a run can still be interrupted.
        return False

def remove_hidden_content(html):
    """Drop every element marked ``aria-hidden="true"`` from *html*.

    Args:
        html: Raw HTML document as bytes.

    Returns:
        The document re-serialized by ``lxml.html.tostring``.  The original
        prefix before ``<html`` is reused as doctype when the stripped input
        starts with ``<!DOCTYPE``; otherwise an HTML 4.01 strict doctype is
        used.
    """
    tree = HT.document_fromstring(html)

    for hidden in tree.xpath('//*[@aria-hidden="true"]'):
        hidden.drop_tree()

    doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">'
    if html.strip().startswith(b'<!DOCTYPE'):
        html_tag_at = html.find(b"<html")
        if html_tag_at != -1:
            doctype = html[:html_tag_at].strip()

    return HT.tostring(tree, method="html", doctype=doctype)

def remove_attr(text, attr):
    """Remove the first ``attr=<quote>value<quote>`` occurrence from *text*.

    The character right before the attribute name (normally the separating
    space) is removed together with the attribute.  The quote character is
    whatever follows ``attr`` plus one more character (the ``=``), and the
    value ends at the next occurrence of that same character.

    Args:
        text: Markup string.
        attr: Attribute name to look for.

    Returns:
        ``(new_text, True)`` when an attribute was removed, or
        ``(text, False)`` when *attr* is absent or its closing quote cannot be
        found.  Reporting ``False`` in the latter case lets callers that loop
        "until nothing is removed" terminate; previously the text was returned
        with ``True`` without the attribute being removed.
    """
    start = text.find(attr)
    if start == -1:
        return text, False

    quote_at = start + len(attr) + 1
    quote = text[quote_at:quote_at + 1]
    closing_at = text.find(quote, quote_at + 1) if quote else -1
    if closing_at == -1:
        return text, False

    # max() guards the attr-at-position-0 case, where start - 1 would turn
    # into the slice text[:-1] instead of an empty prefix.
    before = text[:max(start - 1, 0)]
    after = text[closing_at + 1:]
    return before + after, True

def mathml_to_latex1(text):
    """Convert MathML markup to LaTeX with the bundled XSLT stylesheet.

    Args:
        text: MathML source accepted by ``lxml.etree.fromstring``.

    Returns:
        The string form of the transformation result produced by
        ``./dependency/xsltml_2.0/mmltex.xsl`` (path relative to the current
        working directory).
    """
    source_tree = ET.fromstring(text)
    stylesheet = ET.parse("./dependency/xsltml_2.0/mmltex.xsl")
    to_tex = ET.XSLT(stylesheet)
    return str(to_tex(source_tree))

def mathml_to_latex2(text):
    """Convert MathML markup to LaTeX with ``py_asciimath``'s ``MathML2Tex``.

    Before translation the markup is normalized: a fixed set of named
    entities is replaced, an XML declaration plus MathML 2.0 doctype is
    prepended, a fixed list of attributes is removed with ``remove_attr`` and
    empty ``xmlns=""`` declarations are dropped.

    Args:
        text: MathML source string.

    Returns:
        The translated string returned by ``MathML2Tex.translate``.
    """
    symbol_mappings = {
        "&alpha;": "α",
        "&Alpha;": "A",
        "&beta;": "β",
        "&Beta;": "B",
        "&epsilon;": "ε",
        "&Epsilon;": "Ε",
        "&Mu;": "M",
        "&Nu;": "N",
        "&omicron;": "o",
        "&Omicron;": "O",
        "&iot;": "ι",
        "&conjugate0;": "&#x2015;",
    }
    for entity, replacement in symbol_mappings.items():
        text = text.replace(entity, replacement)

    # add xml head.
    head = "<?xml version='1.0' encoding='UTF-8'?>\n" + \
           '<!DOCTYPE math PUBLIC "-//W3C//DTD MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/mathml2.dtd">'
    text = head + text

    # remove unrecognized attributes.
    attrs = ("fontstyle", "ignorefont", "mathcolor", "rtableid", "altimg-valign", "dspmath", "xmlns:md", "specific-use")
    for attr in attrs:
        find = True
        while find:
            text, find = remove_attr(text, attr)
    text = text.replace(' xmlns=""', '')

    # Silence WARNING-and-below records only while translating, then put the
    # previous process-wide setting back.  Previously logging stayed disabled
    # for the rest of the process after the first call.
    previous_disable_level = logging.root.manager.disable
    logging.disable(logging.WARNING)
    try:
        mathml2tex = MathML2Tex()
        return mathml2tex.translate(text, network=False, from_file=False,)
    finally:
        logging.disable(previous_disable_level)

def separate_content_and_tag(html, start_str, end_str, s=0):
    """Cut *html* around the first ``start_str ... end_str`` span.

    Args:
        html: Markup (``str`` or ``bytes``; the delimiters must match its type).
        start_str: Opening delimiter, searched from offset *s*.
        end_str: Closing delimiter, searched from the opening delimiter on.
        s: Offset where the search for *start_str* begins.

    Returns:
        ``(content, before, after)``: the span including both delimiters, the
        text preceding it and the text following it.  Callers are expected to
        make sure *start_str* is present; no check is done here.
    """
    start = html.find(start_str, s)
    before, remainder = html[:start], html[start:]
    stop = remainder.find(end_str) + len(end_str)
    return remainder[:stop], before, remainder[stop:]

# Regex alternatives wrapped in \b...\b by the first pattern below.  Raw
# strings are used for the escaped operators so that no invalid escape
# sequence ("\+", "\*") appears in a plain string literal.
_CODE_KEYWORDS = (
    'if', 'else', 'for', 'while', 'def', 'class', 'include', 'switch', 'case',
    'default', 'const', 'static', 'try', 'catch', 'exception', 'continue', 'open',
    'close', 'import', 'var', 'None', 'null', 'true', 'True', 'false', 'False', 'print', 'return',
    'sudo', 'apt-get', 'wget',
    r'\+', '-', r'\*', '/', '=',
)

# Compiled once at import time instead of on every call.
_CODE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'\b(?:' + "|".join(_CODE_KEYWORDS) + r')\b',  # keywords
    r'[{};]',  # code indicators (curly braces, semicolon)
    r'\w+\s*\(.*\)',  # function calls or declarations
    r'\w+\s*=\s*\w+',  # variable assignments
))


def detect_code(text):
    """Heuristically tell whether *text* looks like source code.

    Args:
        text: Text content to inspect.

    Returns:
        ``True`` when at least one of the keyword, brace/semicolon,
        call-like or assignment-like patterns is found in *text*.
    """
    return any(pattern.search(text) for pattern in _CODE_PATTERNS)

def encode_code(node, code_tag, not_code_tag):
    """Retag code containers below *node* and report whether any was found.

    For a ``<code>`` element the container is chosen from its ancestors:
      * parent ``<tbody>``  -> the tbody;
      * parent ``<pre>``    -> the nearest ``<tbody>`` above the pre if there
        is one, otherwise the pre itself;
      * any other parent    -> no container, the children are visited instead.
    A container always loses all of its attributes; it is renamed to
    *code_tag* only when ``detect_code`` accepts its text content.

    Args:
        node: lxml element to inspect (visited recursively).
        code_tag: Tag name given to accepted containers.
        not_code_tag: Currently unused.

    Returns:
        ``True`` when at least one container was renamed.
    """
    if node.tag == "code":
        parent = node.getparent()
        parent_tag = parent.tag
        container = None

        if parent_tag == "tbody":
            container = parent
        elif parent_tag == "pre":
            container = parent
            # Prefer an enclosing table body (e.g. line-numbered listings).
            ancestor = parent.getparent()
            while ancestor is not None:
                if ancestor.tag == "tbody":
                    container = ancestor
                    break
                ancestor = ancestor.getparent()

        if container is not None:
            snippet = container.text_content()

            # delete the whole attributes.
            for attr_name in container.attrib.keys():
                container.attrib.pop(attr_name)

            if not detect_code(snippet):
                return False
            container.tag = code_tag
            return True

    found = False
    for child in node.getchildren():
        if encode_code(child, code_tag, not_code_tag):
            found = True
    return found

def filter_code(html, code_tag, not_code_tag):
    """Parse *html*, retag its code containers and serialize it again.

    Args:
        html: Raw HTML document as bytes.
        code_tag: Tag name passed to ``encode_code`` for accepted containers.
        not_code_tag: Passed through to ``encode_code``.

    Returns:
        ``(new_html, contain)``: the re-serialized document and whether
        ``encode_code`` renamed at least one container.  The original prefix
        before ``<html`` is reused as doctype when the stripped input starts
        with ``<!DOCTYPE``; otherwise an HTML 4.01 strict doctype is used.
    """
    tree = HT.document_fromstring(html)
    contain = encode_code(tree, code_tag, not_code_tag)

    doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">'
    if html.strip().startswith(b'<!DOCTYPE'):
        html_tag_at = html.find(b"<html")
        if html_tag_at != -1:
            doctype = html[:html_tag_at].strip()

    serialized = HT.tostring(tree, method="html", doctype=doctype)
    return serialized, contain

def encode_image(uri, node, image_tag):
    """Replace ``<img>`` elements below *node* by *image_tag* placeholders.

    An ``<img>`` is renamed to *image_tag*, its text becomes the ``str()`` of
    a dict with keys ``link`` (``src`` made absolute against *uri*), ``alt``,
    ``width``, ``height`` and ``name`` (``util.md5(link)`` plus the file
    suffix of the link path, or ``None`` without a link), and all of its
    attributes are removed.

    Args:
        uri: Page address used to resolve relative ``src`` values.
        node: lxml element to inspect (visited recursively).
        image_tag: Tag name given to converted elements.

    Returns:
        ``True`` when at least one ``<img>`` was converted.
    """
    if node.tag != "img":
        found = False
        for child in node.getchildren():
            if encode_image(uri, child, image_tag):
                found = True
        return found

    node.tag = image_tag

    source = node.attrib.get("src")
    link = util.relative2absolute_path(uri, source) if source is not None else None
    if link is not None:
        name = util.md5(link) + Path(urlparse(link).path).suffix
    else:
        name = None

    attrs = {
        "link": link,
        "alt": node.attrib.get("alt"),
        "width": node.attrib.get("width"),
        "height": node.attrib.get("height"),
        "name": name,
    }
    node.text = str(attrs)

    # delete the whole attributes.
    for attr_name in node.attrib.keys():
        node.attrib.pop(attr_name)
    return True

def filter_image(uri, html, image_tag):
    """Parse *html*, convert its images and serialize it again.

    Args:
        uri: Page address passed to ``encode_image``.
        html: Raw HTML document as bytes.
        image_tag: Tag name passed to ``encode_image``.

    Returns:
        ``(new_html, contain)``: the re-serialized document and whether
        ``encode_image`` converted at least one element.  The original prefix
        before ``<html`` is reused as doctype when the stripped input starts
        with ``<!DOCTYPE``; otherwise an HTML 4.01 strict doctype is used.
    """
    tree = HT.document_fromstring(html)
    contain = encode_image(uri, tree, image_tag)

    doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">'
    if html.strip().startswith(b'<!DOCTYPE'):
        html_tag_at = html.find(b"<html")
        if html_tag_at != -1:
            doctype = html[:html_tag_at].strip()

    serialized = HT.tostring(tree, method="html", doctype=doctype)
    return serialized, contain

def encode_video(uri, node, video_tag):
    """Replace ``<video>`` elements below *node* by *video_tag* placeholders.

    A ``<video>`` is renamed to *video_tag*, its text becomes the ``str()``
    of a dict with keys ``link`` (``src`` made absolute against *uri*),
    ``alt``, ``width``, ``height`` and ``name`` (``util.md5(link)`` plus the
    file suffix of the link path, or ``None`` without a link), and all of its
    attributes are removed.

    Args:
        uri: Page address used to resolve relative ``src`` values.
        node: lxml element to inspect (visited recursively).
        video_tag: Tag name given to converted elements.

    Returns:
        ``True`` when at least one ``<video>`` was converted.
    """
    if node.tag != "video":
        found = False
        for child in node.getchildren():
            if encode_video(uri, child, video_tag):
                found = True
        return found

    node.tag = video_tag

    source = node.attrib.get("src")
    link = util.relative2absolute_path(uri, source) if source is not None else None
    if link is not None:
        name = util.md5(link) + Path(urlparse(link).path).suffix
    else:
        name = None

    attrs = {
        "link": link,
        "alt": node.attrib.get("alt"),
        "width": node.attrib.get("width"),
        "height": node.attrib.get("height"),
        "name": name,
    }
    node.text = str(attrs)

    # delete the whole attributes.
    for attr_name in node.attrib.keys():
        node.attrib.pop(attr_name)
    return True

def filter_video(uri, html, video_tag):
    """Parse *html*, convert its videos and serialize it again.

    Args:
        uri: Page address passed to ``encode_video``.
        html: Raw HTML document as bytes.
        video_tag: Tag name passed to ``encode_video``.

    Returns:
        ``(new_html, contain)``: the re-serialized document and whether
        ``encode_video`` converted at least one element.  The original prefix
        before ``<html`` is reused as doctype when the stripped input starts
        with ``<!DOCTYPE``; otherwise an HTML 4.01 strict doctype is used.
    """
    tree = HT.document_fromstring(html)
    contain = encode_video(uri, tree, video_tag)

    doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">'
    if html.strip().startswith(b'<!DOCTYPE'):
        html_tag_at = html.find(b"<html")
        if html_tag_at != -1:
            doctype = html[:html_tag_at].strip()

    serialized = HT.tostring(tree, method="html", doctype=doctype)
    return serialized, contain

def encode_math_html(uri, html, encoding):
    """Replace math markup in *html* by ``[[[math-tex]]]`` / ``[[[math-ml]]]`` spans.

    For every opening/closing delimiter pair in ``start_end_strs`` the
    document is scanned repeatedly; each matched span is either rewritten as
    ``<span>[[[math-...]]]formula[[[/math-...]]]</span>`` (with ``<`` and
    ``>`` inside the formula replaced by ``[[[less]]]`` / ``[[[large]]]``) or
    removed from the document when it cannot be converted.

    Args:
        uri: Not used by this function.
        html: Document as bytes (all delimiters are byte strings).
        encoding: Codec name used to decode matched spans and encode formulas.

    Returns:
        ``(html, contain_tag)``.  ``contain_tag`` is True when at least one
        span was rewritten.  When a span cannot be decoded, or a formula is
        rejected by ``check_latex``, the function returns immediately with
        the document in its current state and ``False``.

    Raises:
        AssertionError: if the document already contains one of the
            ``[[[math-...]]]`` marker byte strings.
    """
    # Formula characters that would otherwise be read as markup.
    encode_table = {
        b"<": b"[[[less]]]",
        b">": b"[[[large]]]",
    }

    tag_head_mathml  = b"[[[math-ml]]]"
    tag_tail_mathml  = b"[[[/math-ml]]]"
    tag_head_mathtex = b"[[[math-tex]]]"
    tag_tail_mathtex = b"[[[/math-tex]]]"
    
    # Delimiter pairs are processed in this order, each until exhausted.
    start_end_strs = (
        (b"<maths", b"</maths>"),#1
        (b"<math>", b"</math>"),#2
        (b"<math ", b"</math>"),#2
        (b"<annotation encoding='application/x-tex'>", b"</annotation>"),
        (b'<annotation encoding="application/x-tex">', b'</annotation>'),
        (b"<span class='math-formula'>", b"</span>"),
        (b'<span class="math-formula">', b'</span>'),
        (b'<script type="math/mml"', b'</script>'),
        (b"<script type='math/mml'", b"</script>"),
        (b'<script type="math/tex"', b'</script>'),
        (b"<script type='math/tex'", b"</script>"),
        (b'<script type="math/latex"', b'</script>'),
        (b"<script type='math/latex'", b"</script>"),
        (b'<script type="math/asciimath"', b'</script>'),
        (b"<script type='math/asciimath'", b"</script>"),
    )

    # Inner spans looked for inside a matched <math...> span.
    sub_start_end_strs = (
        (b"<math", b"</math>"),#1
        (b"<annotation encoding='application/x-tex'>", b"</annotation>"),#2
        (b'<annotation encoding="application/x-tex">', b'</annotation>'),#2
    )

    # The marker strings must be free for use as sentinels.
    assert tag_head_mathml not in html and tag_tail_mathml not in html
    assert tag_head_mathtex not in html and tag_tail_mathtex not in html

    contain_tag = False
    for (start_str, end_str) in start_end_strs:
        while start_str in html:
            content, before, after = separate_content_and_tag(html, start_str, end_str)

            if start_str[:5] == b"<math":
                # Narrow the span to a nested <math> or x-tex annotation when
                # one is found between the outer delimiters.  `content` is
                # reassigned inside the loop, so later sub pairs are checked
                # against the already narrowed span.
                for sub_start_str, sub_end_str in sub_start_end_strs:
                    if sub_start_str in content[len(start_str):-len(end_str)]:
                        content = content[len(start_str):-len(end_str)]
                        content, sub_before, sub_after = separate_content_and_tag(content, sub_start_str, sub_end_str)

            contain = True
            try:
                content_str = str(content, encoding)
            except:
                # Undecodable span: give up on the whole document.
                return html, False

            if contain and (tex_in_script_tag(content_str) or tex_in_math_tag(content_str)):
                # TeX source sits between the opening and the closing tag.
                try:
                    index1 = content.find(b">") + 1
                    index2 = content.rfind(b"<")
                    formula = content[index1:index2]
                    formula = formula.strip()
                    formula_str = str(formula, encoding)

                    if not check_latex(formula_str):
                        return html, False
                    for key1, key2 in encode_table.items():
                        formula = formula.replace(key1, key2)
                    content = b"<span>" + tag_head_mathtex + formula + tag_tail_mathtex + b"</span>"
                except:
                    contain = False
            elif contain and (tex_in_math_tag2(content_str)):
                # <math> span holding an annotation that was not narrowed above.
                try:
                    index2 = content_str.find("</annotation>")
                    index1 = content_str[:index2].rfind("</mrow>") + len("</mrow>")
                    formula = content_str[index1:index2]
                    formula = formula.strip()
                    # NOTE(review): `formula` is a str here, and str(<str>, encoding)
                    # raises TypeError, which the bare except below turns into
                    # contain = False - so this branch appears to always drop the
                    # span. The intended extraction should be confirmed before fixing.
                    formula_str = str(formula, encoding)

                    if not check_latex(formula_str):
                        return html, False
                    for key1, key2 in encode_table.items():
                        formula = formula.replace(key1, key2)
                    content = b"<span>" + tag_head_mathtex + formula + tag_tail_mathtex + b"</span>"
                except:
                    contain = False
            elif contain and (mathml_in_script_tag(content_str) or mathml_in_math_tag(content_str)):
                try:
                    # convert mathml to latex.
                    # An unclosed <semantics> is dropped before conversion.
                    if "<semantics>" in content_str and "</semantics>" not in content_str:
                        content_str = content_str.replace("<semantics>", "")
                    # The XSLT converter is tried first; the MathML2Tex based
                    # one is used when it raises.
                    try:
                        formula_str = mathml_to_latex1(content_str)
                    except:
                        formula_str = mathml_to_latex2(content_str)
                    formula = bytes(formula_str, encoding)
                    formula = formula.replace(codecs.BOM_UTF8, b"")
                    formula = formula.strip(b"$")
                    formula = formula.strip()
                    formula_str = str(formula, encoding)

                    if not check_latex(formula_str):
                        return html, False
                    for key1, key2 in encode_table.items():
                        formula = formula.replace(key1, key2)
                    content = b"<span>" + tag_head_mathml + formula + tag_tail_mathml + b"</span>"
                except:
                    contain = False
            else:
                contain = False

            if contain:
                html = before + content + after
                contain_tag = True
            else:
                # Unconvertible span: remove it so the while loop advances.
                html = before + after

    return html, contain_tag

def get_tag_info(tag):
    """Return the byte-string forms derived from the tag name *tag*.

    Args:
        tag: Tag name as ``str``.

    Returns:
        A 5-tuple of bytes: the tag name, ``<tag>``, ``</tag>``,
        ``[[[tag]]]`` and ``[[[/tag]]]``.
    """
    raw = tag.encode()
    return (
        raw,
        b"<" + raw + b">",
        b"</" + raw + b">",
        b"[[[" + raw + b"]]]",
        b"[[[/" + raw + b"]]]",
    )

def encode_code_html(uri, html, encoding):
    """Wrap detected code containers of *html* in ``[[[code-encode]]]`` markers.

    ``filter_code`` renames accepted containers to ``<code-encode>``; the
    resulting tags are then replaced by ``<pre>`` elements whose content is
    framed by ``[[[code-encode]]]`` / ``[[[/code-encode]]]`` lines.

    Args:
        uri: Unused by this function.
        html: Document as bytes.
        encoding: Unused by this function.

    Returns:
        ``(html, contain)``; ``contain`` is False when nothing was found or
        when any error occurred.

    Raises:
        AssertionError: if the document already contains one of the
            temporary ``code-encode`` / ``not-code-encode`` tags.
    """
    code_name = "code-encode"
    not_code_name = "not-code-encode"
    _, code_open, code_close, marker_open, marker_close = get_tag_info(code_name)
    _, not_code_open, not_code_close, _, _ = get_tag_info(not_code_name)
    assert code_open not in html and code_close not in html
    assert not_code_open not in html and not_code_close not in html

    opening = b"<pre>" + b"\n" + marker_open + b"\n"
    closing = b"\n" + marker_close + b"\n" + b"</pre>"
    try:
        html, contain = filter_code(html, code_name, not_code_name)
        if contain:
            html = html.replace(code_open, opening)
            html = html.replace(code_close, closing)
    except:
        contain = False

    return html, contain

def encode_image_html(uri, html, encoding):
    """Wrap converted images of *html* in ``[[[image-encode]]]`` markers.

    ``filter_image`` renames ``<img>`` elements to ``<image-encode>``; the
    resulting tags are then replaced by ``<span>`` elements whose content is
    framed by ``[[[image-encode]]]`` / ``[[[/image-encode]]]`` lines.

    Args:
        uri: Page address passed to ``filter_image``.
        html: Document as bytes.
        encoding: Unused by this function.

    Returns:
        ``(html, contain)``; ``contain`` is False when nothing was found or
        when any error occurred.

    Raises:
        AssertionError: if the document already contains the temporary
            ``image-encode`` tag.
    """
    image_name = "image-encode"
    _, image_open, image_close, marker_open, marker_close = get_tag_info(image_name)
    assert image_open not in html and image_close not in html

    opening = b"<span>" + b"\n" + marker_open + b"\n"
    closing = b"\n" + marker_close + b"\n" + b"</span>"
    try:
        html, contain = filter_image(uri, html, image_name)
        if contain:
            html = html.replace(image_open, opening)
            html = html.replace(image_close, closing)
    except:
        contain = False

    return html, contain

def encode_video_html(uri, html, encoding):
    """Wrap converted videos of *html* in ``[[[video-encode]]]`` markers.

    ``filter_video`` renames ``<video>`` elements to ``<video-encode>``; the
    resulting tags are then replaced by ``<span>`` elements whose content is
    framed by ``[[[video-encode]]]`` / ``[[[/video-encode]]]`` lines.

    Args:
        uri: Page address passed to ``filter_video``.
        html: Document as bytes.
        encoding: Unused by this function.

    Returns:
        ``(html, contain)``; ``contain`` is False when nothing was found or
        when any error occurred.

    Raises:
        AssertionError: if the document already contains the temporary
            ``video-encode`` tag.
    """
    video_name = "video-encode"
    _, video_open, video_close, marker_open, marker_close = get_tag_info(video_name)
    assert video_open not in html and video_close not in html

    opening = b"<span>" + b"\n" + marker_open + b"\n"
    closing = b"\n" + marker_close + b"\n" + b"</span>"
    try:
        html, contain = filter_video(uri, html, video_name)
        if contain:
            html = html.replace(video_open, opening)
            html = html.replace(video_close, closing)
    except:
        contain = False

    return html, contain

def encode_html(uri, html, encoding, TAG):
    """Dispatch *html* to the encoder selected by *TAG*.

    Args:
        uri: Address of the page, forwarded to the encoder.
        html: Document as bytes, or ``None``.
        encoding: Codec name, forwarded to the encoder.
        TAG: One of ``"math"``, ``"code"``, ``"image"`` or ``"video"``.

    Returns:
        ``(None, False)`` when *html* is ``None``; otherwise the
        ``(html, contain_tag)`` pair produced by the selected encoder.

    Raises:
        ValueError: if *TAG* is not one of the supported values.  Previously
            this case failed with UnboundLocalError on the return statement.
    """
    if html is None:
        return None, False

    if TAG == "math":
        return encode_math_html(uri, html, encoding)
    if TAG == "code":
        return encode_code_html(uri, html, encoding)
    if TAG == "image":
        return encode_image_html(uri, html, encoding)
    if TAG == "video":
        return encode_video_html(uri, html, encoding)
    raise ValueError(f"unsupported TAG {TAG!r}; expected 'math', 'code', 'image' or 'video'")


def warc_encode_layer(warc_file_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", TAG=None, DEFAULT_ENCODING=None, OVERWRITE=False):
    """Rewrite a WARC file, keeping only HTML responses that contain tagged content.

    Every ``text/html`` response record of ``INPUT_FOLDER/warc_file_name`` is
    read, a character encoding is determined for it, and the payload is passed
    through ``encode_html``. Records for which a tag was found are written with
    the rewritten payload to ``OUTPUT_FOLDER/warc_file_name`` (gzip).

    Args:
        warc_file_name: File name relative to both folders.
        variables: Values handed to ``util.to_real_path`` for both paths.
        INPUT_FOLDER: Folder of the source WARC file.
        OUTPUT_FOLDER: Folder of the rewritten WARC file.
        TAG: Tag family to encode; ``None`` runs "math", "code" and "image".
        DEFAULT_ENCODING: Fallback when detection yields an empty encoding;
            if it is ``None`` such records are skipped.
        OVERWRITE: Process the file even if the destination already exists.

    Returns:
        A one-element tuple holding ``[warc_file_name]`` when the file was
        processed, or an empty list when it was skipped or processing failed.
    """
    ret = list()
    try:
        src_warc_file_path = os.path.join(INPUT_FOLDER, warc_file_name)
        src_warc_file_path = util.to_real_path(src_warc_file_path, variables)
        dst_warc_file_path = os.path.join(OUTPUT_FOLDER, warc_file_name)
        dst_warc_file_path = util.to_real_path(dst_warc_file_path, variables)

        if os.path.exists(src_warc_file_path) and (OVERWRITE or not os.path.exists(dst_warc_file_path)):
            util.create_folder_by_file_path(dst_warc_file_path)
            with open(dst_warc_file_path, "wb") as dst_stream:
                writer = WARCWriter(dst_stream, gzip=True)
                with open(src_warc_file_path, "rb") as src_stream:
                    for record in ArchiveIterator(src_stream, arc2warc=True):
                        if record.rec_type == "response" and record.http_headers.get_header("Content-Type", "").startswith("text/html"):
                            try:
                                uri = record.rec_headers["WARC-Target-URI"]

                                # read raw html.
                                html = record.content_stream().read()

                                # check html codec: HTTP header first, then a
                                # <meta charset="..."> tag, then detection.
                                charset = record.http_headers["Content-Type"].split(";")[-1].split("=")
                                if charset[0].strip().lower() == "charset":
                                    encoding = charset[1]
                                else:
                                    index1 = html.find(b'<meta charset="')
                                    if index1 >= 0:
                                        index1 += len(b'<meta charset="')
                                        index2 = html.find(b'"', index1)
                                        encoding = str(html[index1:index2], encoding="ascii")
                                    else:
                                        # Silence warnings only while the detector
                                        # runs, then restore the previous threshold
                                        # so logging is not muted process-wide.
                                        previous_disable = logging.root.manager.disable
                                        logging.disable(logging.WARNING)
                                        try:
                                            encoding = detect(html)["encoding"]
                                        except Exception:
                                            encoding = ""
                                        finally:
                                            logging.disable(previous_disable)
                                if encoding is not None:
                                    encoding = encoding.strip().strip('"').lower()

                                if encoding in ("",):
                                    encoding = DEFAULT_ENCODING

                                # remove hidden tag.
                                if encoding is not None and b'aria-hidden="true"' in html:
                                    try:
                                        html = remove_hidden_content(html)
                                    except Exception:
                                        encoding = DEFAULT_ENCODING

                                # encode html.
                                if encoding is not None:
                                    if TAG is not None:
                                        html, contain_tag = encode_html(uri, html, encoding, TAG)
                                    else:
                                        contain_tag_cnt = 0
                                        TAGS = ("math", "code", "image")# "video"
                                        for tag in TAGS:
                                            html, contain_tag = encode_html(uri, html, encoding, tag)
                                            if contain_tag:
                                                contain_tag_cnt += 1
                                        contain_tag = contain_tag_cnt > 0
                                else:
                                    html = None
                                    contain_tag = False

                                # write encoded html.
                                if contain_tag and html is not None:
                                    content = BytesIO(html)
                                    assert content.getbuffer().nbytes == len(html)
                                    raw_length = len(html)
                                    record.raw_stream = LimitReader(content, raw_length)

                                    # The payload size changed: clear the stored
                                    # length fields before writing the record.
                                    record.rec_headers["Content-Length"] = None
                                    record.length = None

                                    writer.write_record(record)
                            except Exception:
                                # A broken record must not abort the whole file;
                                # KeyboardInterrupt still reaches the handler below.
                                traceback.print_exc()

            ret = [warc_file_name]
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret, )


# Script entry point: run the encode layer once on a single WARC file and print the result.
if __name__ == "__main__":
    warc_file_name = "CC-MAIN-20221127073607-20221127103607-00007.warc.gz"
    # NOTE(review): the folder values look like template placeholders that must be
    # replaced with real paths before running — confirm.
    INPUT_FOLDER = "$(input_data_folder)"
    OUTPUT_FOLDER = "$(output_data_folder)"
    TAG = "math"
    output = warc_encode_layer(warc_file_name, INPUT_FOLDER=INPUT_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER, TAG=TAG)
    print(output)


================================================
FILE: DomainSpecific/core/layers/transform/warc_filter_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import re
from io import BytesIO
from warcio.warcwriter import WARCWriter
from warcio.limitreader import LimitReader
from warcio.archiveiterator import ArchiveIterator
import util

def warc_filter_layer(warc_file_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", TAGS=(), OVERWRITE=False):
    """Copy the HTML response records whose payload matches one of ``TAGS``.

    Args:
        warc_file_name: File name relative to both folders.
        variables: Values handed to ``util.to_real_path`` for both paths.
        INPUT_FOLDER: Folder of the source WARC file.
        OUTPUT_FOLDER: Folder of the filtered WARC file (gzip).
        TAGS: ASCII strings joined with ``|`` and compiled as one bytes regex,
            so each entry is interpreted as a pattern. An empty sequence
            yields an empty pattern, which matches every payload.
        OVERWRITE: Process the file even if the destination already exists.

    Returns:
        A one-element tuple holding ``[warc_file_name]`` when the file was
        processed, or an empty list when it was skipped or processing failed.
    """
    ret = list()
    try:
        src_warc_file_path = os.path.join(INPUT_FOLDER, warc_file_name)
        src_warc_file_path = util.to_real_path(src_warc_file_path, variables)
        dst_warc_file_path = os.path.join(OUTPUT_FOLDER, warc_file_name)
        dst_warc_file_path = util.to_real_path(dst_warc_file_path, variables)
        TAGS = [bytes(tag, "ascii") for tag in TAGS]
        regex = re.compile(b'|'.join(TAGS))

        if os.path.exists(src_warc_file_path) and (OVERWRITE or not os.path.exists(dst_warc_file_path)):
            util.create_folder_by_file_path(dst_warc_file_path)
            with open(dst_warc_file_path, "wb") as dst_stream:
                writer = WARCWriter(dst_stream, gzip=True)
                with open(src_warc_file_path, "rb") as src_stream:
                    for record in ArchiveIterator(src_stream, arc2warc=True):
                        if record.rec_type == "response" and record.http_headers.get_header("Content-Type", "").startswith("text/html"):
                            try:
                                # read raw html.
                                html = record.content_stream().read()

                                # filter.
                                if regex.search(html):
                                    # The stream was consumed by the read above,
                                    # so give the record a fresh one to write from.
                                    content = BytesIO(html)
                                    assert len(html) == record.payload_length
                                    record.raw_stream = LimitReader(content, record.payload_length)
                                    writer.write_record(record)
                            except Exception:
                                # Skip the broken record only; KeyboardInterrupt
                                # still reaches the handler below.
                                traceback.print_exc()

            ret = [warc_file_name]
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret, )


# Script entry point: run the filter layer once on a single WARC file and print the result.
if __name__ == "__main__":
    warc_file_name = "CC-MAIN-20221127073607-20221127103607-00007.warc.gz"
    # NOTE(review): the folder values look like template placeholders that must be
    # replaced with real paths before running — confirm.
    INPUT_FOLDER = "$(input_data_folder)"
    OUTPUT_FOLDER = "$(output_data_folder)"
    # Patterns searched in the raw HTML payload; a record is kept when any of them matches.
    TAGS = (
        "<math",
        "MathJax",
    )
    output = warc_filter_layer(warc_file_name, INPUT_FOLDER=INPUT_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER, TAGS=TAGS)
    print(output)


================================================
FILE: DomainSpecific/core/layers/transform/warc_to_wet_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import util

def warc_to_wet_layer(warc_file_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", OVERWRITE=False):
    """Convert a WARC file to WET with the bundled ia-hadoop-tools jar.

    The jar is run on the source file, the produced WET file is copied from
    the sibling ``../wet/`` folder of the source to ``OUTPUT_FOLDER``, and the
    temporary WET and WAT files are removed afterwards.

    Args:
        warc_file_name: File name relative to ``INPUT_FOLDER``.
        variables: Values handed to ``util.to_real_path`` for both paths.
        INPUT_FOLDER: Folder of the source WARC file.
        OUTPUT_FOLDER: Folder that receives the ``.warc.wet.gz`` file.
        OVERWRITE: Convert even if the destination already exists.

    Returns:
        A one-element tuple holding ``[wet_file_name]`` on success, or an
        empty list when the file was skipped or a step failed.
    """
    # Imported locally so this block does not depend on the module's import list.
    import subprocess

    ret = list()
    try:
        wet_file_name = warc_file_name.replace(".warc.gz", ".warc.wet.gz")
        wat_file_name = warc_file_name.replace(".warc.gz", ".warc.wat.gz")

        src_warc_file_path = os.path.join(INPUT_FOLDER, warc_file_name)
        src_warc_file_path = util.to_real_path(src_warc_file_path, variables)

        dst_wet_file_path = os.path.join(OUTPUT_FOLDER, wet_file_name)
        dst_wet_file_path = util.to_real_path(dst_wet_file_path, variables)

        if os.path.exists(src_warc_file_path) and (OVERWRITE or not os.path.exists(dst_wet_file_path)):
            util.create_folder_by_file_path(dst_wet_file_path)

            # export SPARK_USER=$USER
            # Commands are passed as argument lists (no shell), so paths with
            # spaces or shell metacharacters are handed over unchanged.
            java_package = "./dependency/ia-hadoop-tools-jar-with-dependencies.jar"
            generate_cmd = [
                "sudo", "java", "-jar", java_package,
                "WEATGenerator", "-strictMode", "-skipExisting", "batch-id-xyz",
                src_warc_file_path,
            ]
            exit_status1 = subprocess.run(generate_cmd).returncode
            # Explicit raise instead of assert: asserts vanish under ``python -O``.
            if exit_status1 != 0:
                raise RuntimeError(f"WEATGenerator failed with exit status {exit_status1}")

            tmp_base_path = os.path.dirname(src_warc_file_path)
            tmp_wet_file_path = os.path.join(tmp_base_path, "..", "wet/", wet_file_name)
            tmp_wat_file_path = os.path.join(tmp_base_path, "..", "wat/", wat_file_name)
            exit_status2 = subprocess.run(["sudo", "cp", "-f", tmp_wet_file_path, dst_wet_file_path]).returncode
            if exit_status2 != 0:
                raise RuntimeError(f"Copying the WET file failed with exit status {exit_status2}")

            # Cleanup is best effort; its exit status is not checked.
            subprocess.run(["sudo", "rm", tmp_wet_file_path])
            subprocess.run(["sudo", "rm", tmp_wat_file_path])

            ret = [wet_file_name]
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret, )


# Script entry point: convert a single WARC file to WET and print the result.
if __name__ == "__main__":
    warc_file_name = "CC-MAIN-20221127073607-20221127103607-00007.warc.gz"
    # NOTE(review): the folder values look like template placeholders that must be
    # replaced with real paths before running — confirm.
    INPUT_FOLDER = "$(input_data_folder)"
    OUTPUT_FOLDER = "$(output_data_folder)"
    output = warc_to_wet_layer(warc_file_name, INPUT_FOLDER=INPUT_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER)
    print(output)


================================================
FILE: DomainSpecific/core/layers/transform/wet_decode_layer.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
import re
from io import BytesIO
from warcio.limitreader import LimitReader
from warcio.warcwriter import WARCWriter
from warcio.archiveiterator import ArchiveIterator
from pylatexenc.latex2text import LatexNodes2Text
from guesslang import Guess
import util

def decode_tag(tag):
    """Convert a ``[[[name]]]`` placeholder (bytes) into the ``<name>`` tag."""
    opened = tag.replace(b"[[[", b"<")
    return opened.replace(b"]]]", b">")

def latex2text(latex, encoding="utf-8"):
    """Render LaTeX bytes as plain-text bytes.

    The input is decoded with ``encoding``, converted with
    ``LatexNodes2Text.latex_to_text`` and encoded again with ``encoding``.
    """
    converter = LatexNodes2Text()
    plain = converter.latex_to_text(str(latex, encoding))
    return bytes(plain, encoding)

def separate_content_and_tag(html, start_str, end_str):
    """Split ``html`` around the first ``start_str ... end_str`` block.

    Args:
        html: Text to split (bytes in this module).
        start_str: Opening marker.
        end_str: Closing marker, searched from the opening marker onwards.

    Returns:
        ``(content, before, after)`` where ``content`` spans from the opening
        marker through the closing marker and
        ``before + content + after == html``.

        * If the opening marker is missing, ``content`` and ``after`` are
          empty and ``before`` is the whole input.
        * If the closing marker is missing, ``content`` is just the opening
          marker, so callers that slice the markers off see an empty block.
    """
    start_index = html.find(start_str)
    if start_index < 0:
        # Nothing to separate; html[:0] keeps the element type of the input.
        return html[:0], html, html[:0]

    before = html[:start_index]
    tail = html[start_index:]

    end_index = tail.find(end_str)
    if end_index < 0:
        # Unterminated block: consume only the opening marker instead of
        # slicing at an offset derived from find()'s -1.
        split_at = len(start_str)
    else:
        split_at = end_index + len(end_str)

    return tail[:split_at], before, tail[split_at:]

def remove_number_and_merge_snippet(html, NumberThred = 7):
    """Strip line-number columns from decoded code text and merge adjacent snippets.

    Args:
        html: Decoded text as bytes, one logical line per ``\\n``.
        NumberThred: A run of consecutive line numbers is removed only when
            its last number is greater than this value.

    Returns:
        The cleaned text joined with ``\\n``, or ``b''`` when the
        ``<code-encode>`` / ``</code-encode>`` lines are not properly paired.
    """
    lines = html.split(b'\n')

    # Pass 1: look for counters 1, 2, 3, ... placed on every `interval`-th line.
    for interval in (1, 2, 3, 4):
        line_no_list = list()
        last_code_no = -1
        for line_no in range(0, len(lines), interval):
            try:
                code_no = int(lines[line_no].strip())
            except:
                # Not a number: -1 marks "no counter on this line".
                code_no = -1
            if (last_code_no == -1 and code_no == 1) or last_code_no + 1 == code_no:
                # The line starts or continues a run of consecutive numbers.
                last_code_no = code_no
                line_no_list.append(line_no)
            else:
                # The run is broken; blank its lines if it was long enough.
                if last_code_no > NumberThred:
                    for hist_line_no in line_no_list:
                        lines[hist_line_no] = b''
                line_no_list = list()
                last_code_no = -1
        # Drop the lines blanked above (and any other empty line).
        lines = list(filter(lambda line: len(line) > 0, lines))

    # Pass 2 (run twice): counters may be separated by arbitrary other lines;
    # a run is only closed when a new counter 0 or 1 shows up.
    for i in range(2):
        line_no_list = list()
        last_code_no = -1
        for line_no in range(len(lines)):
            try:
                code_no = int(lines[line_no].strip())
            except:
                code_no = -1
            if (last_code_no == -1 and code_no == 1) or last_code_no + 1 == code_no:
                last_code_no = code_no
                line_no_list.append(line_no)
            elif code_no == 0 or code_no == 1:
                if last_code_no > NumberThred:
                    for hist_line_no in line_no_list:
                        lines[hist_line_no] = b''
                # The current line becomes the first element of the next run.
                line_no_list = [line_no]
                last_code_no = code_no
        lines = list(filter(lambda line: len(line) > 0, lines))
    
    # Remove lines that consist of whitespace only.
    for line_no in range(len(lines)):
        if len(lines[line_no].strip()) == 0:
            lines[line_no] = b''
    lines = list(filter(lambda line: len(line) > 0, lines))

    # Merge a code snippet with a directly following single-line snippet:
    # when a closing tag line is followed by an opening tag line and the
    # closing tag of that next snippet sits two lines later, drop the
    # closing/opening pair in between.
    #html = re.sub(b"</code-encode>\n<code-encode>\n", b"\n", html)
    code_head = b"<code-encode>"
    code_tail = b"</code-encode>"
    for line_no in range(max(0, len(lines)-3)):
        if code_tail in lines[line_no] and code_head in lines[line_no+1] and code_tail in lines[line_no+3]:
            lines[line_no] = b''
            lines[line_no+1] = b''
    lines = list(filter(lambda line: len(line) > 0, lines))

    # Reject text whose code tags are nested or unbalanced: the running
    # count of open blocks must stay at 0 or 1 after every line.
    cnt = 0
    for line in lines:
        if code_head in line:
            cnt += 1
        elif code_tail in line:
            cnt -= 1
        # error happens.
        if cnt != 0 and cnt != 1:
            return b''
    
    html = b'\n'.join(lines)
    return html

# Lazily created guesslang model, shared by all identify_code() calls.
guess = None
def identify_code(text):
    """Guess the programming language of a code snippet.

    Args:
        text: The snippet handed to ``Guess.probabilities``.

    Returns:
        ``(name, prob)``: the first entry returned by
        ``Guess.probabilities``, or ``("unknown", 1.0)`` when the guess
        fails.
    """
    global guess
    if guess is None:
        # Created on first use so importing the module stays cheap.
        guess = Guess()
    try:
        name, prob = guess.probabilities(text)[0]
    except Exception:
        # Best effort: failures map to "unknown", while KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        name, prob = "unknown", 1.0
    return name, prob

def decode_html(uri, html, encoding, TAG):
    """Turn the ``[[[...]]]`` placeholders of one tag family back into ``<...>`` tags.

    Args:
        uri: Address of the page; not used by the decoding itself.
        html: Extracted page text as bytes, or ``None``.
        encoding: Codec used to build the ``<metadata ... />`` bytes that
            are inserted into code blocks.
        TAG: One of ``"math"``, ``"code"``, ``"image"`` or ``"video"``.

    Returns:
        ``(html, contain)``. ``contain`` tells whether a decoded start tag of
        the family is present in the result. ``(None, False)`` is returned
        when ``html`` is ``None`` or when an empty image/video block is met.

    Raises:
        ValueError: If ``TAG`` is not a supported name. Previously this case
            failed later with an UnboundLocalError on ``start_end``.
    """
    if html is None:
        return None, False

    if TAG == "math":
        # Escaped comparison signs that are restored inside formulas.
        decode_table = {
            b"[[[less]]]": b"<",
            b"[[[large]]]": b">",
        }

        tag_head_mathml = b"[[[math-ml]]]"
        tag_tail_mathml = b"[[[/math-ml]]]"
        tag_head_mathtex = b"[[[math-tex]]]"
        tag_tail_mathtex = b"[[[/math-tex]]]"

        start_end = (
            (tag_head_mathml, tag_tail_mathml),
            (tag_head_mathtex, tag_tail_mathtex),
        )

        for (start, end) in start_end:
            while start in html:
                content, before, after = separate_content_and_tag(html, start, end)
                formula = content[len(start): -len(end)]

                if len(formula.strip()) != 0:
                    # decode < and >.
                    for key1, key2 in decode_table.items():
                        formula = formula.replace(key1, key2)

                    # decode math tag.
                    content = decode_tag(start) + formula + decode_tag(end)

                    # dedup math formula around context: drop a plain-text
                    # copy of the formula that directly precedes or follows it.
                    formula_ascii = latex2text(formula).strip()
                    n = len(formula_ascii)
                    if n > 0 and before.rstrip()[-n:] == formula_ascii:
                        before = before.rstrip()[:-n]
                    elif n > 0 and after.lstrip()[:n] == formula_ascii:
                        after = after.lstrip()[n:]
                    html = before + content + after
                else:
                    # remove empty formula.
                    html = before + after

    elif TAG == "code":
        tag_head_code = b"[[[code-encode]]]"
        tag_tail_code = b"[[[/code-encode]]]"

        start_end = (
            (tag_head_code, tag_tail_code),
        )

        for (start, end) in start_end:
            while start in html:
                content, before, after = separate_content_and_tag(html, start, end)
                code = content[len(start): -len(end)].strip()

                if len(code) != 0:
                    # Prefix the block with the guessed language and its probability.
                    lang, prob = identify_code(code)
                    meta_lang = bytes(f"<metadata lang={lang} prob={prob:.2f} />", encoding=encoding)
                    decode_start = decode_tag(start)
                    decode_end = decode_tag(end)
                    content = decode_start + meta_lang + b"\n" + code + b"\n" + decode_end
                    html = before + content + after
                else:
                    # remove empty code.
                    html = before + after

        # remove number of code block.
        html = remove_number_and_merge_snippet(html)

    elif TAG == "image":
        tag_head_image = b"[[[image-encode]]]"
        tag_tail_image = b"[[[/image-encode]]]"

        start_end = (
            (tag_head_image, tag_tail_image),
        )

        for (start, end) in start_end:
            while start in html:
                content, before, after = separate_content_and_tag(html, start, end)
                image_meta = content[len(start): -len(end)].strip()

                if len(image_meta) != 0:
                    decode_start = decode_tag(start)
                    decode_end = decode_tag(end)
                    content = decode_start + image_meta + decode_end
                    html = before + content + after
                else:
                    # An empty image block rejects the whole document.
                    return None, False

    elif TAG == "video":
        tag_head_video = b"[[[video-encode]]]"
        tag_tail_video = b"[[[/video-encode]]]"

        start_end = (
            (tag_head_video, tag_tail_video),
        )

        for (start, end) in start_end:
            while start in html:
                content, before, after = separate_content_and_tag(html, start, end)
                video_meta = content[len(start): -len(end)].strip()

                if len(video_meta) != 0:
                    decode_start = decode_tag(start)
                    decode_end = decode_tag(end)
                    content = decode_start + video_meta + decode_end
                    html = before + content + after
                else:
                    # An empty video block rejects the whole document.
                    return None, False

    else:
        raise ValueError(f"Unsupported TAG for decode_html: {TAG!r}")

    # remove continuous empty lines.
    if html is not None and len(html) > 0:
        html = re.sub(b"(\n\r)+", b"\n", html)
        html = re.sub(b"(\r\n)+", b"\n", html)
        html = re.sub(b"\n+", b"\n", html)

    # The document "contains" the tag family if any decoded start tag survived.
    contain = False
    for (start, end) in start_end:
        decode_start = decode_tag(start)
        if decode_start in html:
            contain = True

    return html, contain

def wet_decode_layer(wet_file_name, variables=dict(), INPUT_FOLDER="./", OUTPUT_FOLDER="./", TAG=None, OVERWRITE=False):
    """Decode the tag placeholders of every conversion record in a WET file.

    Conversion records whose URI matches one of the black-listed patterns are
    skipped. The remaining payloads are passed through ``decode_html`` and
    only records that still contain a decoded tag are written to the
    destination file (gzip).

    Args:
        wet_file_name: File name relative to both folders.
        variables: Values handed to ``util.to_real_path`` for both paths.
        INPUT_FOLDER: Folder of the source WET file.
        OUTPUT_FOLDER: Folder of the decoded WET file.
        TAG: Tag family to decode; ``None`` runs "math", "code" and "image".
        OVERWRITE: Process the file even if the destination already exists.

    Returns:
        A one-element tuple holding ``[dst_wet_file_path]`` when the file was
        processed, or an empty list when it was skipped or processing failed.
    """
    ret = list()
    try:
        BLACK_URLS = ("blame.php", "diff.php")
        regex = re.compile('|'.join(BLACK_URLS))
        src_wet_file_path = os.path.join(INPUT_FOLDER, wet_file_name)
        src_wet_file_path = util.to_real_path(src_wet_file_path, variables)
        dst_wet_file_path = os.path.join(OUTPUT_FOLDER, wet_file_name)
        dst_wet_file_path = util.to_real_path(dst_wet_file_path, variables)

        if os.path.exists(src_wet_file_path) and (OVERWRITE or not os.path.exists(dst_wet_file_path)):
            util.create_folder_by_file_path(dst_wet_file_path)
            with open(dst_wet_file_path, "wb") as dst_stream:
                writer = WARCWriter(dst_stream, gzip=True)
                with open(src_wet_file_path, "rb") as src_stream:
                    for record in ArchiveIterator(src_stream, arc2warc=False):
                        if record.rec_type == "conversion":
                            try:
                                uri = record.rec_headers["WARC-Target-URI"]
                                if regex.search(uri):
                                    continue

                                # read raw html.
                                html = record.content_stream().read()
                                # The payload is always treated as UTF-8 here.
                                encoding = "utf-8"

                                # decode html.
                                if TAG is not None:
                                    html, contain_tag = decode_html(uri, html, encoding, TAG)
                                else:
                                    contain_tag_cnt = 0
                                    TAGS = ("math", "code", "image")# "video"
                                    for tag in TAGS:
                                        html, contain_tag = decode_html(uri, html, encoding, tag)
                                        if contain_tag:
                                            contain_tag_cnt += 1
                                    contain_tag = contain_tag_cnt > 0

                                # write decoded html.
                                if contain_tag and html is not None:
                                    content = BytesIO(html)
                                    assert content.getbuffer().nbytes == len(html)
                                    raw_length = len(html)
                                    record.raw_stream = LimitReader(content, raw_length)

                                    # The payload size changed: clear the stored
                                    # length fields before writing the record.
                                    record.rec_headers["Content-Length"] = None
                                    record.length = None

                                    writer.write_record(record)
                            except Exception:
                                # Skip the broken record only; KeyboardInterrupt
                                # still reaches the handler below.
                                traceback.print_exc()
            ret = [dst_wet_file_path]
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        traceback.print_exc()
    return (ret, )


# Script entry point: decode a single file with the "math" tag family and print the result.
if __name__ == "__main__":
    # Passed as the `wet_file_name` argument of wet_decode_layer below.
    warc_file_name = "CC-MAIN-20221127073607-20221127103607-00007.warc.gz"
    # NOTE(review): the folder values look like template placeholders that must be
    # replaced with real paths before running — confirm.
    INPUT_FOLDER = "$(input_data_folder)"
    OUTPUT_FOLDER = "$(output_data_folder)"
    TAG = "math"
    output = wet_decode_layer(warc_file_name, INPUT_FOLDER=INPUT_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER, TAG=TAG)
    print(output)


================================================
FILE: DomainSpecific/core/layers/util.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import copy
import yaml
import hashlib
import logging
import datetime
import requests
from urllib.parse import urljoin
from azure.storage.blob import ContainerClient, BlobSasPermissions, generate_blob_sas
from azure.identity import DefaultAzureCredential
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

def load_yaml(config_path):
    """Parse the YAML file at ``config_path``; return ``None`` if it does not exist."""
    if not os.path.exists(config_path):
        return None
    with open(config_path, "r") as stream:
        return yaml.safe_load(stream)

def save_yaml(config, config_path):
    """Write ``config`` as YAML to ``config_path``.

    Nothing is written when the parent folder of ``config_path`` does not
    exist. A path without a folder part (e.g. ``"config.yaml"``) refers to the
    current directory and is written; previously it was silently skipped
    because ``os.path.exists("")`` is ``False``.
    """
    folder = os.path.dirname(config_path)
    if not folder or os.path.exists(folder):
        with open(config_path, "w") as file:
            yaml.safe_dump(config, file)

def str2bytes(data):
    """Return ``data`` encoded as UTF-8 bytes."""
    return bytes(data, encoding="utf-8")

def md5(data):
    """Return the hexadecimal MD5 digest of ``data`` (text is UTF-8 encoded first)."""
    payload = str2bytes(data) if isinstance(data, str) else data
    return hashlib.md5(payload).hexdigest()

def sha256(data):
    """Return the hexadecimal SHA-256 digest of ``data`` (text is UTF-8 encoded first)."""
    payload = str2bytes(data) if isinstance(data, str) else data
    return hashlib.sha256(payload).hexdigest()

def suffix(path):
    """Return the extension of ``path`` including the leading dot, or ``""``."""
    _, extension = os.path.splitext(path)
    return extension

def relative2absolute_path(prefix, link):
    """Resolve ``link`` against ``prefix`` unless it already looks absolute.

    A link is joined with ``prefix`` when it is root-relative (starts with
    ``/``), document-relative (starts with ``.``) or has no ``:`` within its
    first ten characters. Anything else, such as ``http://``, ``https://``,
    ``ftp://`` or ``file://`` links, is returned unchanged.
    """
    is_relative = link.startswith("/") or link.startswith(".") or ":" not in link[:10]
    if is_relative:
        return urljoin(prefix, link)
    return link

def create_folder_by_file_path(local_file_path):
    """Create the parent folder of ``local_file_path`` if it is missing.

    Paths without a folder part are ignored. Creation is best effort: an
    ``OSError`` (e.g. missing permission) is suppressed here and will surface
    when the caller tries to open the file.
    """
    local_folder_path = os.path.dirname(local_file_path)
    if not os.path.exists(local_folder_path) and len(local_folder_path.strip()) != 0:
        try:
            os.makedirs(local_folder_path, exist_ok=True)
        except OSError:
            # Only file-system errors are ignored; KeyboardInterrupt and
            # programming errors are no longer swallowed.
            pass

def to_real_path(path, variables):
    """Fill the ``{workspace_dir}``, ``{worker_id}`` and ``{worker_num}`` placeholders of ``path``.

    Only these three names are substituted; other entries of ``variables``
    are ignored. Values are converted with ``str``.
    """
    supported = ("workspace_dir", "worker_id", "worker_num")
    resolved = copy.copy(path)
    for name in variables:
        if name not in supported:
            continue
        resolved = resolved.replace("{%s}" % name, str(variables[name]))
    return resolved

def get_container_client(storage_config):
    """Build an Azure ``ContainerClient`` from a storage configuration.

    Args:
        storage_config: An existing ``ContainerClient`` (returned as is), the
            path of a YAML file, or a mapping with an ``"azstorage"`` section
            providing ``"account-name"``, ``"container"`` and ``"appid"``.

    Returns:
        A ``ContainerClient`` authenticated with ``DefaultAzureCredential``
        using ``appid`` as the managed identity client id.
    """
    if isinstance(storage_config, ContainerClient):
        return storage_config

    # A string that names an existing file is treated as a YAML config path.
    if isinstance(storage_config, str) and os.path.exists(storage_config):
        storage_config = load_yaml(storage_config)

    account_domain = "blob.core.windows.net"
    account_name = storage_config["azstorage"]["account-name"]
    container_name = storage_config["azstorage"]["container"]
    identity_id = storage_config["azstorage"]["appid"]

    return ContainerClient(
        account_url=f"https://{account_name}.{account_domain}/",
        container_name=container_name,
        credential=DefaultAzureCredential(managed_identity_client_id=identity_id),
    )

def get_blob_client(storage_config, blob_path):
    """Return the blob client for ``blob_path`` in the container described by ``storage_config``."""
    return get_container_client(storage_config).get_blob_client(blob_path)

def exist_blob(container_client, blob_path):
    """Return the result of ``exists()`` for the blob at ``blob_path``."""
    with container_client.get_blob_client(blob_path) as client:
        return client.exists()

def get_blob_size(container_client, blob_path):
    """Return the ``size`` property of the blob at ``blob_path``."""
    with container_client.get_blob_client(blob_path) as client:
        return client.get_blob_properties().size

def list_blob_dir(container_client, blob_path):
    """Return the names yielded by ``walk_blobs`` for the prefix ``blob_path``."""
    return [entry.name for entry in container_client.walk_blobs(name_starts_with=blob_path)]

def create_blob_dir(container_client, blob_path):
    """Create a "folder" by uploading an empty blob named ``_`` below ``blob_path``."""
    marker_name = os.path.join(blob_path, "_")
    container_client.upload_blob(name=marker_name, data=b"", overwrite=True)

def upload_bytes_to_blob(storage_config, content, blob_path):
    """Upload ``content`` to ``blob_path``, replacing an existing blob; return ``blob_path``."""
    client = get_blob_client(storage_config, blob_path)
    with client as opened_client:
        opened_client.upload_blob(content, overwrite=True)
    return blob_path

def upload_file_to_blob(storage_config, local_path, blob_path):
    """Upload the local file ``local_path`` to ``blob_path``; return ``blob_path``."""
    with open(local_path, "rb") as stream:
        upload_bytes_to_blob(storage_config, stream, blob_path)
    return blob_path

def upload_bytes_to_internet(content, blob_path):
    """Placeholder: performs no upload and returns ``blob_path`` unchanged."""
    # TODO: to be implemented.
    return blob_path

def upload_file_to_internet(local_path, blob_path):
    """Placeholder: performs no upload and returns ``blob_path`` unchanged."""
    # TODO: to be implemented.
    return blob_path

def download_bytes_from_blob(storage_config, blob_path):
    """Download the blob at ``blob_path`` and return its complete content."""
    with get_blob_client(storage_config, blob_path) as client:
        downloader = client.download_blob()
        payload = downloader.readall()
    return payload

def download_file_from_blob(storage_config, blob_path, local_path):
    """Download the blob at ``blob_path`` into ``local_path`` and return ``local_path``.

    The blob is fetched completely before the parent folder is created and
    the file is written.
    """
    payload = download_bytes_from_blob(storage_config, blob_path)
    create_folder_by_file_path(local_path)
    with open(local_path, "wb") as target:
        target.write(payload)
    return local_path

def download_bytes_from_internet(url, timeout=3):
    """Fetch ``url`` with an HTTP GET and return the response body.

    Args:
        url: Address to download; redirects are followed.
        timeout: Timeout passed to ``requests.get``.

    Returns:
        The body bytes when the status code is 200, otherwise ``None``.
        ``None`` is also returned when the request raises an exception.
    """
    try:
        resp = requests.get(url, allow_redirects=True, timeout=timeout)
    except Exception:
        # Best effort: request failures yield None, while KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return None
    if resp.status_code != 200:
        return None
    return resp.content

def download_file_from_internet(url, local_path):
    """Download ``url`` into the file ``local_path``.

    Returns:
        ``(local_path, size_in_bytes)`` on success, or ``(None, 0)`` when the
        download yields no content or the file cannot be written.
    """
    try:
        content = download_bytes_from_internet(url)
        if content is None:
            return None, 0
        create_folder_by_file_path(local_path)
        with open(local_path, "wb") as data:
            data.write(content)
        return local_path, len(content)
    except Exception:
        # Best effort: failures are reported as (None, 0), while
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return None, 0


================================================
FILE: DomainSpecific/core/network.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
os.sys.path.append(f"{os.path.dirname(os.path.realpath(__file__))}/..")
import traceback
from core.layers import LayerType, util

class Network:
    """A named collection of layers executed in data-dependency order.

    Each layer is a callable exposing ``input_names`` and ``output_names``.
    Running the network repeatedly picks a not-yet-run layer whose inputs are
    all present in ``self.datas``, calls it, and stores its outputs back into
    ``self.datas`` under the layer's output names.
    """

    def __init__(self):
        self.type = None
        # Names bound, in order, to the positional inputs/outputs of __call__.
        self.input_names = list()
        self.output_names = list()
        # name -> value for every piece of data supplied or produced so far.
        self.datas = dict()
        # name -> layer object.
        self.layers = dict()

    def set_input_names(self, input_names):
        """Set the names that positional network inputs are stored under."""
        self.input_names = input_names

    def set_output_names(self, output_names):
        """Set the names of the data entries returned by ``__call__``."""
        self.output_names = output_names

    def add_data(self, name, value):
        """Store ``value`` under ``name``, replacing any existing entry."""
        self.datas[name] = value

    def add_layer(self, name, value):
        """Register layer ``value`` under ``name``, replacing any existing one."""
        self.layers[name] = value

    def next_layer(self, invisited_layer_names):
        """Pick the first pending layer whose inputs are all available.

        Args:
            invisited_layer_names: Mutable list of layer names not yet run.
                The chosen name is removed from it in place.

        Returns:
            ``(layer, name, input_values)`` for the chosen layer, or ``None``
            when no pending layer has all of its inputs in ``self.datas``.
        """
        for name in invisited_layer_names:
            layer = self.layers[name]
            input_names = layer.input_names
            if all(input_name in self.datas for input_name in input_names):
                input_values = [self.datas[input_name] for input_name in input_names]
                # Removing while iterating is safe here only because we
                # return immediately afterwards.
                invisited_layer_names.remove(name)
                return layer, name, input_values
        return None

    def __call__(self, inputs=None, worker_id=0, worker_num=1, variables=None):
        """Run every registered layer once and return the network outputs.

        Args:
            inputs: Values bound to ``self.input_names`` in order. They are
                only bound when the lengths match; otherwise they are ignored.
                Defaults to no inputs.
            worker_id: Forwarded unchanged to every layer.
            worker_num: Forwarded unchanged to every layer.
            variables: Dict forwarded unchanged to every layer. Defaults to a
                fresh empty dict per call, so mutations made by layers do not
                leak into later calls.

        Returns:
            The values stored under ``self.output_names``, in order. If any
            exception other than ``KeyboardInterrupt`` occurs, the traceback
            is printed and an empty list is returned.

        Raises:
            SystemExit: On ``KeyboardInterrupt``.
        """
        # None sentinels instead of mutable defaults (list()/dict() would be
        # created once and shared by every call).
        if inputs is None:
            inputs = []
        if variables is None:
            variables = {}

        outputs = list()
        try:
            if len(inputs) == len(self.input_names):
                for name, value in zip(self.input_names, inputs):
                    self.add_data(name, value)

            # Sorted so that ties between runnable layers resolve by name.
            invisited_layer_names = sorted(self.layers)
            while invisited_layer_names:
                item = self.next_layer(invisited_layer_names)
                if item is None:
                    raise Exception("There are some layers which misses input data.")
                layer, layer_name, input_values = item
                print(f"{layer_name} - input: {layer.input_names}, output: {layer.output_names}", flush=True)

                output_values = layer(input_values, worker_id=worker_id, worker_num=worker_num, variables=variables)
                for name, value in zip(layer.output_names, output_values):
                    self.add_data(name, value)
            outputs = [self.datas[output_name] for output_name in self.output_names]
        except KeyboardInterrupt:
            sys.exit()
        except Exception:
            traceback.print_exc()
        return outputs

    """
    def spark(self, inputs, spark_session, spark_context, worker_num=1, variables=dict()):
        from pyspark import TaskContext, StorageLevel

        def merge(x, n):
            if n == 0:
                return []
            elif n == 1:
                return [x]
            elif n == 2:
                return list(x)
            else:
                for _ in range(n - 2):
                    x = x[0] + x[1:]
                return list(x)
        
        def func(layer, input, worker_id, worker_num, variables):
            input = list(input)
            assert len(input) == 1
            input = input[0]
            output = layer(input, worker_id=worker_id, worker_num=worker_num, variables=variables)
            return [output]
        
        outputs = list()
        try:
            if len(inputs) == len(self.input_names):
                for name, value in zip(self.input_names, inputs):
                    self.add_data(name, value)
            
            for name, data in self.datas.items():
                input_rdd = spark_context.parallelize(worker_num * [data], worker_num)
                # Avoid recomputation, because each rdd may be used multiple times.
                input_rdd.persist(StorageLevel.MEMORY_AND_DISK)
                self.add_data(name, input_rdd)
            
            invisited_layer_names = sorted(list(self.layers.keys()))
            while len(invisited_layer_names) > 0:
                item = self.next_layer(invisited_layer_names)
                if item is None:
                    raise Exception("There are some layers which misses input data.")
                layer, layer_name, input_values = item

                input_rdds = None
                for i, input_rdd in enumerate(input_values):
                    input_rdds = input_rdd if i == 0 else input_rdds.zip(input_rdd)
                input_rdds = input_rdds.map(lambda x: merge(x, len(layer.input_names)))

                native_io = True
                if native_io:
                    output_rdds = input_rdds.mapPartitionsWithIndex(
                        lambda worker_id, input: 
                        func(layer, input, worker_id, worker_num, variables), preservesPartitioning=True
                    )
                else:# (Deprecated)
                    #if layer.type in (LayerType.To_Line_File, LayerType.To_Jsonl_File, LayerType.To_Parquet_File):
                    if layer.type == LayerType.To_Line_File:
                        inputs = input_rdds.collect()
                        outputs = list()
                        for worker_id, input in enumerate(inputs):
                            variables["worker_id"] = worker_id
                            variables["worker_num"] = worker_num
                            assert len(input) == 2
                            file_path = util.to_real_path(input[1], variables)
                            
                            spark_context.parallelize(input[0], 1).saveAsTextFile(file_path)
                            #rdd = spark_context.parallelize(input[0], 1)
                            #rdd.toDF().write.mode("overwrite").text(file_path)
                            #rdd.toDF().write.mode("overwrite").json(file_path)
                            #rdd.toDF().write.mode("overwrite").parquet(file_path)
                            
                            output = [file_path]
                            outputs.append(output)
                        output_rdds = spark_context.parallelize(outputs, worker_num)
                    #elif layer.type in (LayerType.From_Line_File, LayerType.From_Jsonl_File, LayerType.From_Parquet_File):
                    elif layer.type == LayerType.From_Line_File:
                        inputs = input_rdds.collect()
                        outputs = list()
                        for worker_id, input in enumerate(inputs):
                            variables["worker_id"] = worker_id
                            variables["worker_num"] = worker_num
                            assert len(input) == 1
                            file_path = util.to_real_path(input[0], variables)
                            
                            lines = spark_context.textFile(file_path).collect()
                            #rdd = spark_session.read.option("mode", "DROPMALFORMED").text(file_path).rdd
                            #rdd = spark_session.read.option("mode", "DROPMALFORMED").json(file_path).rdd
                            #rdd = spark_session.read.option("mode", "DROPMALFORMED").parquet(file_path).rdd
                            #lines = rdd.collect()
                            
                            output = [lines]
                            outputs.append(output)
                        output_rdds = spark_context.parallelize(outputs, worker_num)
                    else:
                        output_rdds = input_rdds.mapPartitionsWithIndex(
                            lambda worker_id, input: 
                            func(layer, input, worker_id, worker_num, variables), preservesPartitioning=True
                        )

                # Avoid recomputation, because each rdd may be used multiple times.
                output_rdds.persist(StorageLevel.MEMORY_AND_DISK)
                for i, name in enumerate(layer.output_names):
                    output_rdd = output_rdds.map(lambda _:_[i])
                    # Avoid recomputation, because each rdd may be used multiple times.
                    output_rdd.persist(StorageLevel.MEMORY_AND_DISK)
                    self.add_data(name, output_rdd)

                print(f"{layer_name} - {layer.input_names}, {layer.output_names}", flush=True)
            outputs = [self.datas[output_name].collect() for output_name in self.output_names]
        except KeyboardInterrupt:
            sys.exit()
        except Exception as ex:
            traceback.print_exc()
        return outputs
    """


if __name__ == "__main__":
    # Manual smoke test: build an empty network and print its default repr.
    print(Network())


================================================
FILE: DomainSpecific/dependency/gpt_api.py
================================================
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
import os
import sys
import time
import traceback
import tiktoken
import collections
from datetime import datetime
import openai
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider


class GPTAPI:
    def __init__(self, engine, endpoint, identity_id):
        """
        Create an Azure OpenAI chat client authenticated via managed identity.

        For setup details see: https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/managed-identity
        Supported authentication methods include key-based, CLI-based and identity-based ones.
        This class uses the identity-based method; switch to another one if needed.

        Args:
            engine: Default model/deployment name used by ``run`` when none is given.
            endpoint: Azure OpenAI endpoint URL.
            identity_id: Client id of the managed identity used to obtain tokens.
        """
        self.keep_history = False
        # uid -> list of (question, answer) pairs; replayed by ``run`` when keep_history is set.
        self.user_QAs = collections.defaultdict(list)
        # Prompt token budget; ``run`` trims longer questions to this many tokens.
        # NOTE(review): presumably context window (8192) minus completion budget (800) minus a margin (192) - confirm.
        self.max_tokens_per_requests = 8192 - 800 - 192
        # Per-minute quota bookkeeping (only referenced by the disabled quota check in ``run``).
        self.quato_tokens_per_minute = 120000#140000
        self.quato_requests_per_minute = 720#840
        self.last_minute = -1
        self.acc_tokens = 0
        self.acc_requests = 0

        try:
            self.enc = tiktoken.encoding_for_model("gpt-4")
        except Exception:
            # Best effort: without a tokenizer ``run`` returns "" instead of calling the API.
            # ``except Exception`` (not a bare ``except:``) so Ctrl-C / SystemExit still propagate.
            self.enc = None
        self.engine = engine
        self.endpoint = endpoint

        token_provider = get_bearer_token_provider(DefaultAzureCredential(managed_identity_client_id=identity_id), "https://cognitiveservices.azure.com/.default")
        self.client = AzureOpenAI(
            azure_endpoint=endpoint,
            azure_ad_token_provider=token_provider,
            #api_version="2024-02-15-preview",
            api_version="2024-08-01-preview",
            # Retries are disabled in the SDK client.
            max_retries=0,
        )

    def switch_api(self, api_idx=-1):
        """Placeholder for switching between API backends.

        TBD: not implemented yet. ``api_idx`` is currently ignored and the
        method does nothing.
        """
        return None

    def get_tokens(self, text):
        """Return a rough token-count estimate for ``text``.

        The estimate is the larger of the whitespace-separated word count and
        one quarter of the character count (integer division). No tokenizer
        is involved.
        """
        word_count = len(text.split())
        char_quarters = len(text) // 4
        return word_count if word_count >= char_quarters else char_quarters

    def run(self, system, question, engine=None, uid=None, temperature=0.0, max_tokens=800):
        if engine is None:
            engine = self.engine
        
        if self.enc is None:
            return ""

        # question check.
        #if self.get_tokens(question) > self.max_tokens_per_requests:
        #    question = question[:self.max_tokens_per_requests * 4]
        tokens = self.enc.encode(question)
        tokens_len = len(tokens)
        if tokens_len > self.max_tokens_per_requests:
            offset = (tokens_len - self.max_tokens_per_requests) // 2
            cut_tokens = tokens[offset:offset+self.max_tokens_per_requests]
            question = self.enc.decode(cut_tokens)

        # system setting.
        messages = [{"role": "system", "content": system}]
        
        # chat setting.
        if self.keep_history:
            for Q, A in self.user_QAs[uid]:
                messages.append({"role": "user", "content": Q})
                messages.append({"role": "assistant", "content": A})
        messages.append({"role": "user", "content": question})

        # quato check.
        """
        while True:
            cur_minute = datetime.now().minute
            cur_tokens = self.get_tokens(str(messages))
            if self.last_minute != cur_minute:
                self.last_minute = cur_minute
                self.acc_tokens = 0
                self.acc_requests = 0
            if self.acc_requests + 1  < self.quato_requests_per_minute and self.acc_tokens + cur_tokens < self.quato_tokens_per_minute:
                self.acc_requests += 1
                self.acc_tokens += cur_tokens
                break
            time.sleep(1)
        """

        # robot running.
        try:
            response = self.client.chat.completions.create(
                model=engine,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                #top_p=0.95,
                #frequency_penalty=0,
                #presence_penalty=0,
                #stop=None
            )
            answer = response.choices[0].message.content
        # https://github.com/openai/openai-python/blob/main/openai/error.py
        except (openai.RateLimitError, op

Download .txt

gitextract_ayw6h_qv/

├── .github/
│   └── workflows/
│       └── codeql.yml
├── CODE_OF_CONDUCT.md
├── DomainSpecific/
│   ├── .gitignore
│   ├── configs/
│   │   ├── cc_math_filter.CC-MAIN-2023-23.json
│   │   ├── cc_openquestion_filter.CC-MAIN-2023-23.json
│   │   ├── cc_warc_download.CC-MAIN-2023-23.json
│   │   ├── cc_warc_filter.CC-MAIN-2023-23.json
│   │   ├── cc_warc_to_wet.code.CC-MAIN-2023-23.json
│   │   ├── cc_warc_to_wet.math.CC-MAIN-2023-23.json
│   │   └── network_template.json
│   ├── core/
│   │   ├── __init__.py
│   │   ├── data.py
│   │   ├── layer.py
│   │   ├── layers/
│   │   │   ├── __init__.py
│   │   │   ├── control/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── data_concat_layer.py
│   │   │   │   ├── data_filter_layer.py
│   │   │   │   ├── data_order_layer.py
│   │   │   │   ├── data_partition_layer.py
│   │   │   │   ├── data_sample_layer.py
│   │   │   │   └── data_shuffle_layer.py
│   │   │   ├── extract/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── build_index_layer.py
│   │   │   │   ├── extract_article_layer.py
│   │   │   │   └── search_index_layer.py
│   │   │   ├── global_var.py
│   │   │   ├── io/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── from_binary_file_layer.py
│   │   │   │   ├── from_index_file_layer.py
│   │   │   │   ├── from_jsonl_file_layer.py
│   │   │   │   ├── from_line_file_layer.py
│   │   │   │   ├── from_parquet_file_layer.py
│   │   │   │   ├── from_warc_file_layer.py
│   │   │   │   ├── from_wet_file_layer.py
│   │   │   │   ├── to_binary_file_layer.py
│   │   │   │   ├── to_index_file_layer.py
│   │   │   │   ├── to_jsonl_file_layer.py
│   │   │   │   ├── to_line_file_layer.py
│   │   │   │   └── to_parquet_file_layer.py
│   │   │   ├── network/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── download_bytes_from_blob_layer.py
│   │   │   │   ├── download_bytes_from_internet_layer.py
│   │   │   │   ├── download_file_from_blob_layer.py
│   │   │   │   ├── download_file_from_internet_layer.py
│   │   │   │   ├── download_starcoder_layer.py
│   │   │   │   ├── download_url_list_layer.py
│   │   │   │   ├── download_urls_from_website_layer.py
│   │   │   │   ├── download_warc_file_layer.py
│   │   │   │   ├── download_warc_indice_layer.py
│   │   │   │   ├── upload_bytes_to_blob_layer.py
│   │   │   │   └── upload_file_to_blob_layer.py
│   │   │   ├── template_layer.py
│   │   │   ├── transform/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── lsh_minhash_layer.py
│   │   │   │   ├── math_filter_layer.py
│   │   │   │   ├── mcq_filter_layer.py
│   │   │   │   ├── minhash_tokens_layer.py
│   │   │   │   ├── ngrams_layer.py
│   │   │   │   ├── openquestion_filter_layer.py
│   │   │   │   ├── tokenize_article_layer.py
│   │   │   │   ├── warc_encode_layer.py
│   │   │   │   ├── warc_filter_layer.py
│   │   │   │   ├── warc_to_wet_layer.py
│   │   │   │   └── wet_decode_layer.py
│   │   │   └── util.py
│   │   └── network.py
│   ├── dependency/
│   │   ├── gpt_api.py
│   │   ├── ia-hadoop-tools-jar-with-dependencies.jar
│   │   ├── install.py
│   │   ├── requirements.txt
│   │   └── xsltml_2.0/
│   │       ├── cmarkup.xsl
│   │       ├── entities.xsl
│   │       ├── glayout.xsl
│   │       ├── mmltex.xsl
│   │       ├── scripts.xsl
│   │       ├── tables.xsl
│   │       └── tokens.xsl
│   ├── readme.md
│   ├── requirements.txt
│   ├── resources/
│   │   ├── computation/
│   │   │   ├── batch_dca_eastus.yaml
│   │   │   └── local.yaml
│   │   ├── environment/
│   │   │   ├── amlt_sing.yaml
│   │   │   └── local.yaml
│   │   └── storage/
│   │       ├── llmstore.yaml
│   │       └── local.yaml
│   ├── sample_run.sh
│   ├── submit.py
│   ├── tools/
│   │   ├── __init__.py
│   │   ├── submit_batch_job.py
│   │   └── submit_local_job.py
│   └── wrapper/
│       ├── __init__.py
│       ├── interpreter.py
│       ├── parser.py
│       ├── runner.py
│       └── utility/
│           ├── __init__.py
│           ├── azure_env.py
│           ├── cpu_count.py
│           ├── load_yaml.py
│           ├── logger.py
│           └── save_yaml.py
├── GeneralDomain/
│   ├── .gitignore
│   ├── README.md
│   ├── pyproject.toml
│   └── redstone_cc/
│       ├── __init__.py
│       ├── __main__.py
│       ├── algos/
│       │   ├── __init__.py
│       │   ├── deduplication/
│       │   │   ├── __init__.py
│       │   │   ├── minhash.py
│       │   │   ├── sha1.py
│       │   │   └── utils.py
│       │   ├── fasttext_classifier.py
│       │   ├── rule_based_filters/
│       │   │   ├── __init__.py
│       │   │   ├── func/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── document.py
│       │   │   │   ├── line.py
│       │   │   │   └── repetition.py
│       │   │   ├── model/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── document.py
│       │   │   │   └── violations.py
│       │   │   ├── ruleset/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── gopher.py
│       │   │   │   └── refinedweb.py
│       │   │   └── utils.py
│       │   └── trafilatura_process.py
│       ├── download_utils.py
│       └── process.py
├── LICENSE
├── README.md
├── SECURITY.md
└── SUPPORT.md

Download .txt

SYMBOL INDEX (272 symbols across 78 files)

FILE: DomainSpecific/core/data.py
  class DataType (line 6) | class DataType(Enum):
    method belong (line 44) | def belong(a, b):
  class Data (line 52) | class Data:
    method __init__ (line 56) | def __init__(self, type=DataType.Mem_Any, value=None):

FILE: DomainSpecific/core/layer.py
  class JointType (line 12) | class JointType(Enum):
  class Layer (line 17) | class Layer:
    method __init__ (line 18) | def __init__(self, type, joint=JointType.Default, repetition=1, param=...
    method __call__ (line 27) | def __call__(self, inputs, worker_id=0, worker_num=1, variables=dict()):

FILE: DomainSpecific/core/layers/__init__.py
  class LayerType (line 21) | class LayerType(Enum):

FILE: DomainSpecific/core/layers/control/data_concat_layer.py
  function data_concat_layer (line 9) | def data_concat_layer(lists, variables=dict()):

FILE: DomainSpecific/core/layers/control/data_filter_layer.py
  function data_filter_layer (line 9) | def data_filter_layer(lines, variables=dict(), IN=False, FILTERS=(None,)):

FILE: DomainSpecific/core/layers/control/data_order_layer.py
  function data_order_layer (line 9) | def data_order_layer(lines, variables=dict(), REVERSE=False):

FILE: DomainSpecific/core/layers/control/data_partition_layer.py
  function data_partition_layer (line 9) | def data_partition_layer(lines, variables=dict(), WORKER_ID=-1):

FILE: DomainSpecific/core/layers/control/data_sample_layer.py
  function data_sample_layer (line 10) | def data_sample_layer(lines, variables=dict(), N=-1, SEED=1):

FILE: DomainSpecific/core/layers/control/data_shuffle_layer.py
  function data_shuffle_layer (line 10) | def data_shuffle_layer(lines, variables=dict(), SEED=1):

FILE: DomainSpecific/core/layers/extract/build_index_layer.py
  function build_index_layer (line 9) | def build_index_layer(base_vectors, variables=dict(), SEED=1, DIM=4096, ...

FILE: DomainSpecific/core/layers/extract/extract_article_layer.py
  function filter_tags_in_html (line 16) | def filter_tags_in_html(soup):
  function lid (line 43) | def lid(soup, model):
  function get_main_text_html (line 59) | def get_main_text_html(soup):
  function remove_dup_newline (line 68) | def remove_dup_newline(text):
  class User_MarkdownConverter (line 74) | class User_MarkdownConverter(MarkdownConverter):
    method convert_tr (line 75) | def convert_tr(self, el, text, convert_as_inline):
    method convert_a (line 98) | def convert_a(self, el, text, convert_as_inline):
    method convert_pre (line 117) | def convert_pre(self, el, text, convert_as_inline):
  function html2text (line 127) | def html2text(soup, **options):
  function trans2md (line 147) | def trans2md(html):
  function _patch_newspaper_parser_clean (line 157) | def _patch_newspaper_parser_clean(cls, node):
  function extract (line 161) | def extract(soup):
  function extract_article_layer (line 166) | def extract_article_layer(id_html, variables=dict()):

FILE: DomainSpecific/core/layers/extract/search_index_layer.py
  function search_index_layer (line 10) | def search_index_layer(index, query_vectors, variables=dict(), TOPK=1):

FILE: DomainSpecific/core/layers/io/from_binary_file_layer.py
  function from_binary_file_layer (line 10) | def from_binary_file_layer(file_path, variables=dict(), STORAGE_PATH=None):

FILE: DomainSpecific/core/layers/io/from_index_file_layer.py
  function from_index_file_layer (line 11) | def from_index_file_layer(file_path, variables=dict(), STORAGE_PATH=None):

FILE: DomainSpecific/core/layers/io/from_jsonl_file_layer.py
  function from_jsonl_file_layer (line 11) | def from_jsonl_file_layer(file_path, variables=dict(), STORAGE_PATH=None):

FILE: DomainSpecific/core/layers/io/from_line_file_layer.py
  function from_line_file_layer (line 10) | def from_line_file_layer(file_path, variables=dict(), STORAGE_PATH=None):

FILE: DomainSpecific/core/layers/io/from_parquet_file_layer.py
  function from_parquet_file_layer (line 12) | def from_parquet_file_layer(file_path, variables=dict(), STORAGE_PATH=No...

FILE: DomainSpecific/core/layers/io/from_warc_file_layer.py
  function from_warc_file_layer (line 11) | def from_warc_file_layer(file_path, variables=dict(), STORAGE_PATH=None):

FILE: DomainSpecific/core/layers/io/from_wet_file_layer.py
  function from_wet_file_layer (line 11) | def from_wet_file_layer(file_path, variables=dict(), STORAGE_PATH=None):

FILE: DomainSpecific/core/layers/io/to_binary_file_layer.py
  function to_binary_file_layer (line 10) | def to_binary_file_layer(bytes, file_path, variables=dict(), STORAGE_PAT...

FILE: DomainSpecific/core/layers/io/to_index_file_layer.py
  function to_index_file_layer (line 11) | def to_index_file_layer(index, file_path, variables=dict(), STORAGE_PATH...

FILE: DomainSpecific/core/layers/io/to_jsonl_file_layer.py
  function to_jsonl_file_layer (line 11) | def to_jsonl_file_layer(data, file_path, variables=dict(), STORAGE_PATH=...

FILE: DomainSpecific/core/layers/io/to_line_file_layer.py
  function to_line_file_layer (line 10) | def to_line_file_layer(lines, file_path, variables=dict(), STORAGE_PATH=...

FILE: DomainSpecific/core/layers/io/to_parquet_file_layer.py
  function to_parquet_file_layer (line 12) | def to_parquet_file_layer(data, file_path, variables=dict(), STORAGE_PAT...

FILE: DomainSpecific/core/layers/network/download_bytes_from_blob_layer.py
  function download_bytes_from_blob_layer (line 11) | def download_bytes_from_blob_layer(blob_path, variables=dict(), STORAGE_...

FILE: DomainSpecific/core/layers/network/download_bytes_from_internet_layer.py
  function download_bytes_from_internet_layer (line 11) | def download_bytes_from_internet_layer(url, variables=dict(), TRIES=1):

FILE: DomainSpecific/core/layers/network/download_file_from_blob_layer.py
  function download_file_from_blob_layer (line 11) | def download_file_from_blob_layer(blob_path, variables=dict(), DOWNLOAD_...

FILE: DomainSpecific/core/layers/network/download_file_from_internet_layer.py
  function download_file_from_internet_layer (line 11) | def download_file_from_internet_layer(url, variables=dict(), DOWNLOAD_PA...

FILE: DomainSpecific/core/layers/network/download_starcoder_layer.py
  function download_contents (line 19) | def download_contents(blob_id, src_encoding):
  function download_starcoder_layer (line 25) | def download_starcoder_layer(data_repo, variables=dict(), OUTPUT_FOLDER=...

FILE: DomainSpecific/core/layers/network/download_url_list_layer.py
  function download_url_list_layer (line 13) | def download_url_list_layer(index_url, variables=dict(), FILTER_SUFFIXES...

FILE: DomainSpecific/core/layers/network/download_urls_from_website_layer.py
  function download_urls_from_website_layer (line 12) | def download_urls_from_website_layer(website_url, variables=dict(), FILT...

FILE: DomainSpecific/core/layers/network/download_warc_file_layer.py
  function download_warc_file_layer (line 11) | def download_warc_file_layer(warc_url, variables=dict(), DOWNLOAD_FOLDER...

FILE: DomainSpecific/core/layers/network/download_warc_indice_layer.py
  function download_warc_indice_layer (line 12) | def download_warc_indice_layer(index_url, variables=dict(), TRIES=1, URL...

FILE: DomainSpecific/core/layers/network/upload_bytes_to_blob_layer.py
  function upload_bytes_to_blob_layer (line 11) | def upload_bytes_to_blob_layer(bytes, blob_path, variables=dict(), STORA...

FILE: DomainSpecific/core/layers/network/upload_file_to_blob_layer.py
  function upload_file_to_blob_layer (line 11) | def upload_file_to_blob_layer(file_path, blob_path, variables=dict(), ST...

FILE: DomainSpecific/core/layers/template_layer.py
  function template_layer (line 18) | def template_layer(input, variables=dict(), PARAM=None):

FILE: DomainSpecific/core/layers/transform/lsh_minhash_layer.py
  class LSH (line 16) | class LSH:
    method __init__ (line 17) | def __init__(self):
    method false_positive_probability (line 24) | def false_positive_probability(self, threshold, b, r):
    method false_negative_probability (line 29) | def false_negative_probability(self, threshold, b, r):
    method optimal_param (line 34) | def optimal_param(self, threshold, num_perm, false_positive_weight,
    method gen_lsh (line 53) | def gen_lsh(self, minhash):
  function lsh_minhash_layer (line 58) | def lsh_minhash_layer(minhash, variables=dict()):

FILE: DomainSpecific/core/layers/transform/math_filter_layer.py
  function ismath_by_model (line 28) | def ismath_by_model(text, model, thred=0.5):
  function math_filter_layer (line 42) | def math_filter_layer(pq_name, variables=dict(), INPUT_FOLDER="./", OUTP...

FILE: DomainSpecific/core/layers/transform/mcq_filter_layer.py
  function detect_lang (line 21) | def detect_lang(text):
  function detect_choice_exercise_by_rule (line 40) | def detect_choice_exercise_by_rule(uri, html):
  function detect_choice_exercise_by_ft_model (line 76) | def detect_choice_exercise_by_ft_model(uri, text, thred=0.5):
  function detect_choice_exercise_by_LLM (line 101) | def detect_choice_exercise_by_LLM(text, engine=None):
  function LCS (line 115) | def LCS(str1, str2):
  function localize_choice_exercise_by_LLM (line 131) | def localize_choice_exercise_by_LLM(text, engine=None):
  function mcq_filter_layer (line 172) | def mcq_filter_layer(wet_file_name, variables=dict(), INPUT_FOLDER="./",...

FILE: DomainSpecific/core/layers/transform/minhash_tokens_layer.py
  class MinHasher (line 16) | class MinHasher:
    method __init__ (line 17) | def __init__(self):
    method _sha1_hash (line 23) | def _sha1_hash(self, val):
    method hash (line 28) | def hash(self, sequence):
  function minhash_tokens_layer (line 39) | def minhash_tokens_layer(tokens, variables=dict()):

FILE: DomainSpecific/core/layers/transform/ngrams_layer.py
  function ngrams_layer (line 12) | def ngrams_layer(sequence, variables=dict()):

FILE: DomainSpecific/core/layers/transform/openquestion_filter_layer.py
  function is_openquestion_by_model (line 31) | def is_openquestion_by_model(text, model, thred=0.5):
  function check_yes_no_question (line 45) | def check_yes_no_question(text_before, text_after):
  function check_multiple_choise_question (line 53) | def check_multiple_choise_question(text_before, text_after):
  function check_fill_in_question (line 83) | def check_fill_in_question(text_before, text_after):
  function check_quality (line 89) | def check_quality(item):
  function openquestion_filter_layer (line 141) | def openquestion_filter_layer(pq_name, variables=dict(), INPUT_FOLDER="....

FILE: DomainSpecific/core/layers/transform/tokenize_article_layer.py
  function tokenize_article_layer (line 13) | def tokenize_article_layer(article, variables=dict(), SPM_MODEL_PATH="./...

FILE: DomainSpecific/core/layers/transform/warc_encode_layer.py
  function tex_in_script_tag (line 26) | def tex_in_script_tag(text):
  function tex_in_math_tag (line 36) | def tex_in_math_tag(text):
  function tex_in_math_tag2 (line 40) | def tex_in_math_tag2(text):
  function mathml_in_script_tag (line 43) | def mathml_in_script_tag(text):
  function mathml_in_math_tag (line 47) | def mathml_in_math_tag(text):
  function is_tex (line 53) | def is_tex(text):
  function contain_tex (line 56) | def contain_tex(text):
  function check_latex (line 59) | def check_latex(latex):
  function remove_hidden_content (line 67) | def remove_hidden_content(html):
  function remove_attr (line 84) | def remove_attr(text, attr):
  function mathml_to_latex1 (line 97) | def mathml_to_latex1(text):
  function mathml_to_latex2 (line 105) | def mathml_to_latex2(text):
  function separate_content_and_tag (line 142) | def separate_content_and_tag(html, start_str, end_str, s=0):
  function detect_code (line 151) | def detect_code(text):
  function encode_code (line 173) | def encode_code(node, code_tag, not_code_tag):
  function filter_code (line 218) | def filter_code(html, code_tag, not_code_tag):
  function encode_image (line 233) | def encode_image(uri, node, image_tag):
  function filter_image (line 259) | def filter_image(uri, html, image_tag):
  function encode_video (line 274) | def encode_video(uri, node, video_tag):
  function filter_video (line 300) | def filter_video(uri, html, video_tag):
  function encode_math_html (line 315) | def encode_math_html(uri, html, encoding):
  function get_tag_info (line 433) | def get_tag_info(tag):
  function encode_code_html (line 441) | def encode_code_html(uri, html, encoding):
  function encode_image_html (line 463) | def encode_image_html(uri, html, encoding):
  function encode_video_html (line 481) | def encode_video_html(uri, html, encoding):
  function encode_html (line 499) | def encode_html(uri, html, encoding, TAG):
  function warc_encode_layer (line 514) | def warc_encode_layer(warc_file_name, variables=dict(), INPUT_FOLDER="./...

FILE: DomainSpecific/core/layers/transform/warc_filter_layer.py
  function warc_filter_layer (line 15) | def warc_filter_layer(warc_file_name, variables=dict(), INPUT_FOLDER="./...

FILE: DomainSpecific/core/layers/transform/warc_to_wet_layer.py
  function warc_to_wet_layer (line 10) | def warc_to_wet_layer(warc_file_name, variables=dict(), INPUT_FOLDER="./...

FILE: DomainSpecific/core/layers/transform/wet_decode_layer.py
  function decode_tag (line 17) | def decode_tag(tag):
  function latex2text (line 20) | def latex2text(latex, encoding="utf-8"):
  function separate_content_and_tag (line 27) | def separate_content_and_tag(html, start_str, end_str):
  function remove_number_and_merge_snippet (line 36) | def remove_number_and_merge_snippet(html, NumberThred = 7):
  function identify_code (line 107) | def identify_code(text):
  function decode_html (line 118) | def decode_html(uri, html, encoding, TAG):
  function wet_decode_layer (line 256) | def wet_decode_layer(wet_file_name, variables=dict(), INPUT_FOLDER="./",...

FILE: DomainSpecific/core/layers/util.py
  function load_yaml (line 17) | def load_yaml(config_path):
  function save_yaml (line 24) | def save_yaml(config, config_path):
  function str2bytes (line 29) | def str2bytes(data):
  function md5 (line 33) | def md5(data):
  function sha256 (line 39) | def sha256(data):
  function suffix (line 45) | def suffix(path):
  function relative2absolute_path (line 49) | def relative2absolute_path(prefix, link):
  function create_folder_by_file_path (line 63) | def create_folder_by_file_path(local_file_path):
  function to_real_path (line 71) | def to_real_path(path, variables):
  function get_container_client (line 79) | def get_container_client(storage_config):
  function get_blob_client (line 101) | def get_blob_client(storage_config, blob_path):
  function exist_blob (line 106) | def exist_blob(container_client, blob_path):
  function get_blob_size (line 111) | def get_blob_size(container_client, blob_path):
  function list_blob_dir (line 117) | def list_blob_dir(container_client, blob_path):
  function create_blob_dir (line 123) | def create_blob_dir(container_client, blob_path):
  function upload_bytes_to_blob (line 126) | def upload_bytes_to_blob(storage_config, content, blob_path):
  function upload_file_to_blob (line 131) | def upload_file_to_blob(storage_config, local_path, blob_path):
  function upload_bytes_to_internet (line 136) | def upload_bytes_to_internet(content, blob_path):
  function upload_file_to_internet (line 140) | def upload_file_to_internet(local_path, blob_path):
  function download_bytes_from_blob (line 144) | def download_bytes_from_blob(storage_config, blob_path):
  function download_file_from_blob (line 149) | def download_file_from_blob(storage_config, blob_path, local_path):
  function download_bytes_from_internet (line 156) | def download_bytes_from_internet(url, timeout=3):
  function download_file_from_internet (line 167) | def download_file_from_internet(url, local_path):

FILE: DomainSpecific/core/network.py
  class Network (line 10) | class Network:
    method __init__ (line 11) | def __init__(self):
    method set_input_names (line 18) | def set_input_names(self, input_names):
    method set_output_names (line 21) | def set_output_names(self, output_names):
    method add_data (line 24) | def add_data(self, name, value):
    method add_layer (line 27) | def add_layer(self, name, value):
    method next_layer (line 30) | def next_layer(self, invisited_layer_names):
    method __call__ (line 40) | def __call__(self, inputs=list(), worker_id=0, worker_num=1, variables...

FILE: DomainSpecific/dependency/gpt_api.py
  class GPTAPI (line 16) | class GPTAPI:
    method __init__ (line 17) | def __init__(self, engine, endpoint, identity_id):
    method switch_api (line 48) | def switch_api(self, api_idx=-1):
    method get_tokens (line 52) | def get_tokens(self, text):
    method run (line 56) | def run(self, system, question, engine=None, uid=None, temperature=0.0...

FILE: DomainSpecific/dependency/install.py
  function install (line 16) | def install(local_id, storage_path):

FILE: DomainSpecific/submit.py
  function submit_job (line 7) | def submit_job(network_path, run_mode, docker_path, computation_path, st...

FILE: DomainSpecific/tools/submit_batch_job.py
  function submit_batch_job (line 18) | def submit_batch_job(network_path, run_mode, docker_path, computation_pa...

FILE: DomainSpecific/tools/submit_local_job.py
  function submit_local_job (line 10) | def submit_local_job(network_path, run_mode, docker_path, computation_pa...

FILE: DomainSpecific/wrapper/interpreter.py
  class Interpreter (line 14) | class Interpreter:
    method __init__ (line 15) | def __init__(self):
    method check_config (line 19) | def check_config(self, config):
    method __call__ (line 88) | def __call__(self, config_path):

FILE: DomainSpecific/wrapper/parser.py
  class Parser (line 9) | class Parser:
    method __init__ (line 10) | def __init__(self):
    method __call__ (line 13) | def __call__(self, config_path):

FILE: DomainSpecific/wrapper/runner.py
  class RunMode (line 15) | class RunMode(Enum):
  class Runner (line 20) | class Runner:
    method __init__ (line 21) | def __init__(self, network_path):
    method __call__ (line 25) | def __call__(self, run_mode, worker_id, worker_num, workspace_dir):

FILE: DomainSpecific/wrapper/utility/azure_env.py
  function get_local_rank (line 6) | def get_local_rank():
  function get_world_rank (line 12) | def get_world_rank():
  function get_world_size (line 21) | def get_world_size():
  function get_process_per_node (line 33) | def get_process_per_node():

FILE: DomainSpecific/wrapper/utility/cpu_count.py
  function cpu_count (line 8) | def cpu_count():

FILE: DomainSpecific/wrapper/utility/load_yaml.py
  function load_yaml (line 7) | def load_yaml(config_path):

FILE: DomainSpecific/wrapper/utility/logger.py
  class Logger (line 8) | class Logger:
    method __init__ (line 9) | def __init__():
    method init (line 13) | def init(log_path=None):
    method debug (line 28) | def debug(msg):
    method info (line 32) | def info(msg):
    method warning (line 36) | def warning(msg):
    method error (line 40) | def error(msg):
    method critical (line 44) | def critical(msg):

FILE: DomainSpecific/wrapper/utility/save_yaml.py
  function save_yaml (line 7) | def save_yaml(config, config_path):

FILE: GeneralDomain/redstone_cc/__main__.py
  function main (line 10) | def main():

FILE: GeneralDomain/redstone_cc/algos/deduplication/minhash.py
  function gen_lsh_param (line 10) | def gen_lsh_param(num_perm, lsh_threshold):
  class CalcMinhash (line 14) | class CalcMinhash:
    method __init__ (line 15) | def __init__(self, num_perm, seed=DEFAULT_SEED, mer=DEFAULT_MER):
    method _sha1_hash (line 23) | def _sha1_hash(self, val):
    method hash (line 28) | def hash(self, sequence: list[str]) -> np.ndarray:
  class CalcLsh (line 38) | class CalcLsh:
    method __init__ (line 39) | def __init__(self, b, r):
    method gen_lsh (line 44) | def gen_lsh(self, minhash) -> list[bytearray]:
  class CalcMinhashLsh (line 48) | class CalcMinhashLsh:
    method __init__ (line 49) | def __init__(self, b, r, seed=DEFAULT_SEED, mer=DEFAULT_MER):
    method hash (line 54) | def hash(self, tokens) -> list[bytearray]:
  class LocalMinhashLshDedup (line 60) | class LocalMinhashLshDedup:
    method __init__ (line 61) | def __init__(self, b, r, seed=DEFAULT_SEED, mer=DEFAULT_MER):
    method add (line 66) | def add(self, id, tokens):
    method dedup (line 70) | def dedup(self):

FILE: GeneralDomain/redstone_cc/algos/deduplication/sha1.py
  function sha1_hash (line 8) | def sha1_hash(line, hash_size=DEFAULT_HASH_SIZE) -> bytes:
  class LocalSha1Dedup (line 14) | class LocalSha1Dedup:
    method __init__ (line 15) | def __init__(self, hash_size):
    method add_line (line 20) | def add_line(self, line_id, line):
    method add_hashes (line 24) | def add_hashes(self, line_id, hval):
    method dedup (line 28) | def dedup(self):

FILE: GeneralDomain/redstone_cc/algos/deduplication/utils.py
  function ccnet_normalize (line 13) | def ccnet_normalize(line) -> str:
  function slimpajama_tokenize (line 31) | def slimpajama_tokenize(text, num_ngrams=13):
  function spm_tokenize (line 42) | def spm_tokenize(text, spm_model, num_ngrams=5):

FILE: GeneralDomain/redstone_cc/algos/fasttext_classifier.py
  class FastTextClassifier (line 10) | class FastTextClassifier:
    method __init__ (line 11) | def __init__(self, model_path):
    method predict (line 14) | def predict(self, text):

FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/func/document.py
  function document_word_count (line 4) | def document_word_count(words):
  function document_mean_word_length (line 8) | def document_mean_word_length(words):
  function document_alpha_words (line 15) | def document_alpha_words(words):
  function document_start_with_bullet (line 33) | def document_start_with_bullet(lines):
  function document_end_with_ellipsis (line 47) | def document_end_with_ellipsis(lines):
  function document_gopher_symbols (line 54) | def document_gopher_symbols(text):
  function document_gopher_stopwords (line 61) | def document_gopher_stopwords(words):

FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/func/line.py
  function line_uppercase_ratio (line 7) | def line_uppercase_ratio(line):
  function line_all_numeric (line 18) | def line_all_numeric(line):
  function line_refinedweb_counter (line 25) | def line_refinedweb_counter(line):
  function line_regex_match (line 29) | def line_regex_match(line, patterns):
  function test_line_uppercase_ratio (line 36) | def test_line_uppercase_ratio():
  function test_line_all_numeric (line 46) | def test_line_all_numeric():
  function test_line_refinedweb_counter (line 53) | def test_line_refinedweb_counter():
  function test_line_regex_match (line 60) | def test_line_regex_match():

FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/func/repetition.py
  function repetition_ngram_top_char_frac (line 7) | def repetition_ngram_top_char_frac(words, n: int):
  function repetition_ngram_dup_char_frac (line 22) | def repetition_ngram_dup_char_frac(words, n: int):
  function repetition_line_dup_frac (line 35) | def repetition_line_dup_frac(lines):
  function test_ngram_top (line 53) | def test_ngram_top():
  function test_ngram_dup (line 67) | def test_ngram_dup():
  function test_dup_line (line 77) | def test_dup_line():

FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/model/document.py
  class Document (line 17) | class Document:
    method __init__ (line 18) | def __init__(self, text, lang):
    method sents (line 23) | def sents(self):
    method paragraphs (line 33) | def paragraphs(self):
    method normalized_text (line 37) | def normalized_text(self):
    method normalized_sents (line 41) | def normalized_sents(self):
    method normalized_words (line 45) | def normalized_words(self):

FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/model/violations.py
  class Violations (line 6) | class Violations:
    method __init__ (line 7) | def __init__(self):
    method doc (line 12) | def doc(self, key):
    method line (line 17) | def line(self, key, lines: List[int]):
    method apply_to_doc (line 25) | def apply_to_doc(self, doc: Document) -> str | None:

FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/ruleset/gopher.py
  function gopher_filter (line 24) | def gopher_filter(doc: Document):
  function apply_gopher_rules (line 79) | def apply_gopher_rules(text, lang):

FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/ruleset/refinedweb.py
  function refinedweb_filter (line 22) | def refinedweb_filter(doc: Document):
  function apply_refinedweb_rules (line 68) | def apply_refinedweb_rules(text, lang):

FILE: GeneralDomain/redstone_cc/algos/rule_based_filters/utils.py
  function remove_url (line 14) | def remove_url(text):
  function remove_consecutive_new_lines (line 18) | def remove_consecutive_new_lines(text):
  function remove_punct (line 22) | def remove_punct(text):
  function normalize (line 26) | def normalize(text):

FILE: GeneralDomain/redstone_cc/algos/trafilatura_process.py
  class EmptyResultException (line 15) | class EmptyResultException(Exception):
  function _remove_dup_newline (line 19) | def _remove_dup_newline(text):
  function _normalize_whitespace (line 29) | def _normalize_whitespace(tree):
  function _traf_xml_to_html (line 46) | def _traf_xml_to_html(tree):
  function _build_traf_doc_full (line 111) | def _build_traf_doc_full(traf_bare_res):
  function _build_traf_doc (line 131) | def _build_traf_doc(traf_bare_res):
  function _reset_caches (line 146) | def _reset_caches():
  function _detect_zip_bomb (line 154) | def _detect_zip_bomb(data):
  function trafilatura_process (line 189) | def trafilatura_process(html):

FILE: GeneralDomain/redstone_cc/download_utils.py
  function _url_basename (line 12) | def _url_basename(url):
  function _normalize_dst (line 17) | def _normalize_dst(src, dst):
  function detect_aria2 (line 25) | def detect_aria2():
  function download_with_aria2 (line 30) | def download_with_aria2(src, dst, num_connections=16, quiet=False, extra...
  function download_with_requests (line 71) | def download_with_requests(src, dst):
  function download (line 81) | def download(src, dst):

FILE: GeneralDomain/redstone_cc/process.py
  function process_items (line 17) | def process_items(remote_cc_path, items, disable_tqdm=False):
  function process_file (line 70) | def process_file(index_path):

Download .json

Condensed preview — 130 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (455K chars).

[
  {
    "path": ".github/workflows/codeql.yml",
    "chars": 4294,
    "preview": "# For most projects, this workflow file will not need changing; you simply need\n# to commit it to your repository.\n#\n# Y"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "chars": 444,
    "preview": "# Microsoft Open Source Code of Conduct\n\nThis project has adopted the [Microsoft Open Source Code of Conduct](https://op"
  },
  {
    "path": "DomainSpecific/.gitignore",
    "chars": 52,
    "preview": "__pycache__/\ndependency/models/\nenv_ready\nworkspace\n"
  },
  {
    "path": "DomainSpecific/configs/cc_math_filter.CC-MAIN-2023-23.json",
    "chars": 2023,
    "preview": "{\n    \"name\": \"cc_math_extraction\",\n    \"description\": \"math extraction from cc parquet file - 202323.\",\n    \"date\": \"20"
  },
  {
    "path": "DomainSpecific/configs/cc_openquestion_filter.CC-MAIN-2023-23.json",
    "chars": 2064,
    "preview": "{\n    \"name\": \"cc_openquestion_extraction\",\n    \"description\": \"open question extraction from cc parquet file - 202323.\""
  },
  {
    "path": "DomainSpecific/configs/cc_warc_download.CC-MAIN-2023-23.json",
    "chars": 3023,
    "preview": "{\n    \"name\": \"cc_warc_download\",\n    \"description\": \"download warc files for a specific cc snapshot - CC-MAIN-2023-23.\""
  },
  {
    "path": "DomainSpecific/configs/cc_warc_filter.CC-MAIN-2023-23.json",
    "chars": 2247,
    "preview": "{\n    \"name\": \"cc_warc_filter\",\n    \"description\": \"filter html containing specific tags on warc files - CC-MAIN-2023-23"
  },
  {
    "path": "DomainSpecific/configs/cc_warc_to_wet.code.CC-MAIN-2023-23.json",
    "chars": 3928,
    "preview": "{\n    \"name\": \"cc_warc_to_wet\",\n    \"description\": \"convert cc warc to wet and keep math formula - CC-MAIN-2023-23.\",\n  "
  },
  {
    "path": "DomainSpecific/configs/cc_warc_to_wet.math.CC-MAIN-2023-23.json",
    "chars": 3928,
    "preview": "{\n    \"name\": \"cc_warc_to_wet\",\n    \"description\": \"convert cc warc to wet and keep math formula - CC-MAIN-2023-23.\",\n  "
  },
  {
    "path": "DomainSpecific/configs/network_template.json",
    "chars": 706,
    "preview": "{\n    \"name\": \"template_network\",\n    \"description\": \"Toy example of network.\",\n    \"date\": \"20230713\",\n    \"version\": \""
  },
  {
    "path": "DomainSpecific/core/__init__.py",
    "chars": 290,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nfrom .data import DataType\nfrom .layer import Layer, Joi"
  },
  {
    "path": "DomainSpecific/core/data.py",
    "chars": 1630,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nfrom enum import Enum\n\nclass DataType(Enum):\n    # Memor"
  },
  {
    "path": "DomainSpecific/core/layer.py",
    "chars": 4076,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/__init__.py",
    "chars": 9848,
    "preview": "from enum import Enum\nfrom ..data import DataType\n\nfrom .template_layer import template_layer\n\n# Control layers\nfrom .co"
  },
  {
    "path": "DomainSpecific/core/layers/control/__init__.py",
    "chars": 481,
    "preview": "# Control\nfrom .data_sample_layer import data_sample_layer\nfrom .data_filter_layer import data_filter_layer\nfrom .data_o"
  },
  {
    "path": "DomainSpecific/core/layers/control/data_concat_layer.py",
    "chars": 616,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/control/data_filter_layer.py",
    "chars": 647,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/control/data_order_layer.py",
    "chars": 537,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/control/data_partition_layer.py",
    "chars": 887,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/control/data_sample_layer.py",
    "chars": 679,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/control/data_shuffle_layer.py",
    "chars": 580,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/extract/__init__.py",
    "chars": 265,
    "preview": "# Extract\nfrom .extract_article_layer import extract_article_layer\nfrom .build_index_layer import build_index_layer\nfrom"
  },
  {
    "path": "DomainSpecific/core/layers/extract/build_index_layer.py",
    "chars": 939,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport sys\nimport faiss\nimport numpy as np\nimport traceb"
  },
  {
    "path": "DomainSpecific/core/layers/extract/extract_article_layer.py",
    "chars": 6827,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/extract/search_index_layer.py",
    "chars": 1095,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nimport faiss\nimport numpy as np\nimp"
  },
  {
    "path": "DomainSpecific/core/layers/global_var.py",
    "chars": 2837,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nimport traceback\n#import torch\nimpo"
  },
  {
    "path": "DomainSpecific/core/layers/io/__init__.py",
    "chars": 1046,
    "preview": "# IO - read/write\nfrom .to_binary_file_layer import to_binary_file_layer\nfrom .to_line_file_layer import to_line_file_la"
  },
  {
    "path": "DomainSpecific/core/layers/io/from_binary_file_layer.py",
    "chars": 795,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/from_index_file_layer.py",
    "chars": 784,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/from_jsonl_file_layer.py",
    "chars": 838,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/from_line_file_layer.py",
    "chars": 808,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/from_parquet_file_layer.py",
    "chars": 833,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/from_warc_file_layer.py",
    "chars": 1545,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/from_wet_file_layer.py",
    "chars": 1471,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/to_binary_file_layer.py",
    "chars": 889,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/to_index_file_layer.py",
    "chars": 1232,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/to_jsonl_file_layer.py",
    "chars": 971,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/to_line_file_layer.py",
    "chars": 933,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/io/to_parquet_file_layer.py",
    "chars": 1004,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/__init__.py",
    "chars": 1223,
    "preview": "# Network - download/upload\nfrom .upload_file_to_blob_layer import upload_file_to_blob_layer\nfrom .upload_bytes_to_blob_"
  },
  {
    "path": "DomainSpecific/core/layers/network/download_bytes_from_blob_layer.py",
    "chars": 1210,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/download_bytes_from_internet_layer.py",
    "chars": 952,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/download_file_from_blob_layer.py",
    "chars": 1421,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/download_file_from_internet_layer.py",
    "chars": 1317,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/download_starcoder_layer.py",
    "chars": 3931,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/download_url_list_layer.py",
    "chars": 1417,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/download_urls_from_website_layer.py",
    "chars": 2494,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/download_warc_file_layer.py",
    "chars": 1824,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/download_warc_indice_layer.py",
    "chars": 1159,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/upload_bytes_to_blob_layer.py",
    "chars": 1187,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/network/upload_file_to_blob_layer.py",
    "chars": 1280,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/template_layer.py",
    "chars": 1155,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport sys\nimport traceback\n\n# Spec of adding a new laye"
  },
  {
    "path": "DomainSpecific/core/layers/transform/__init__.py",
    "chars": 876,
    "preview": "# Transform\nfrom .tokenize_article_layer import tokenize_article_layer\nfrom .ngrams_layer import ngrams_layer\nfrom .minh"
  },
  {
    "path": "DomainSpecific/core/layers/transform/lsh_minhash_layer.py",
    "chars": 7037,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/transform/math_filter_layer.py",
    "chars": 5774,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/transform/mcq_filter_layer.py",
    "chars": 9211,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/transform/minhash_tokens_layer.py",
    "chars": 5859,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/transform/ngrams_layer.py",
    "chars": 2624,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/transform/openquestion_filter_layer.py",
    "chars": 7338,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/transform/tokenize_article_layer.py",
    "chars": 1310,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/transform/warc_encode_layer.py",
    "chars": 24498,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\n# coding=utf-8\nimport os\nimport sys\nos.sys.path.append(f"
  },
  {
    "path": "DomainSpecific/core/layers/transform/warc_filter_layer.py",
    "chars": 2711,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/transform/warc_to_wet_layer.py",
    "chars": 2239,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/transform/wet_decode_layer.py",
    "chars": 12822,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/core/layers/util.py",
    "chars": 5807,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport copy\nimport yaml\nimport hashlib\nimport "
  },
  {
    "path": "DomainSpecific/core/network.py",
    "chars": 8444,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/dependency/gpt_api.py",
    "chars": 5804,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nimport time\nimport traceback\nimport"
  },
  {
    "path": "DomainSpecific/dependency/install.py",
    "chars": 3684,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/dependency/requirements.txt",
    "chars": 771,
    "preview": "lxml==5.1.0\n#fasttext==0.9.2\nfasttext-wheel==0.9.2\nsentencepiece==0.1.99\ntrafilatura==1.6.1\nhtml5lib==1.1\nnewspaper3k==0"
  },
  {
    "path": "DomainSpecific/dependency/xsltml_2.0/cmarkup.xsl",
    "chars": 35631,
    "preview": "<?xml version='1.0' encoding=\"UTF-8\"?>\n<xsl:stylesheet\n\t\txmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n\t\txmlns:m=\"htt"
  },
  {
    "path": "DomainSpecific/dependency/xsltml_2.0/entities.xsl",
    "chars": 71213,
    "preview": "<?xml version='1.0' encoding=\"UTF-8\"?>\n<xsl:stylesheet xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n\t\txmlns:m=\"http:"
  },
  {
    "path": "DomainSpecific/dependency/xsltml_2.0/glayout.xsl",
    "chars": 6114,
    "preview": "<?xml version='1.0' encoding=\"UTF-8\"?>\n<xsl:stylesheet xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n\t\txmlns:m=\"http:"
  },
  {
    "path": "DomainSpecific/dependency/xsltml_2.0/mmltex.xsl",
    "chars": 1599,
    "preview": "<?xml version='1.0' encoding=\"UTF-8\"?>\n<xsl:stylesheet xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n\t\txmlns:m=\"http:"
  },
  {
    "path": "DomainSpecific/dependency/xsltml_2.0/scripts.xsl",
    "chars": 9610,
    "preview": "<?xml version='1.0' encoding=\"UTF-8\"?>\n<xsl:stylesheet xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n\t\txmlns:m=\"http:"
  },
  {
    "path": "DomainSpecific/dependency/xsltml_2.0/tables.xsl",
    "chars": 4198,
    "preview": "<?xml version='1.0' encoding=\"UTF-8\"?>\n<xsl:stylesheet xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n\t\txmlns:m=\"http:"
  },
  {
    "path": "DomainSpecific/dependency/xsltml_2.0/tokens.xsl",
    "chars": 10359,
    "preview": "<?xml version='1.0' encoding=\"UTF-8\"?>\n<xsl:stylesheet xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"\n\t\txmlns:m=\"http:"
  },
  {
    "path": "DomainSpecific/readme.md",
    "chars": 3188,
    "preview": "# Domain-specific Knowledge Extraction from CommonCrawl\n\n## Introduction \nDeveloping data workflows for specific require"
  },
  {
    "path": "DomainSpecific/requirements.txt",
    "chars": 136,
    "preview": "pyyaml==6.0\nwheel==0.43.0\nsetuptools==70.0.0\nazure-ai-ml==1.16.0\nazure-batch==14.2.0\nazure-identity==1.16.1\nazure-storag"
  },
  {
    "path": "DomainSpecific/resources/computation/batch_dca_eastus.yaml",
    "chars": 138,
    "preview": "# To be filled.\nbatch_url: ${batch_url}\nbatch_pool_id: ${pool_id}\nbatch_node_num: ${node_num}\nbatch_process_per_node: ${"
  },
  {
    "path": "DomainSpecific/resources/computation/local.yaml",
    "chars": 29,
    "preview": "#worker_num: 1\nworker_num: 2\n"
  },
  {
    "path": "DomainSpecific/resources/environment/amlt_sing.yaml",
    "chars": 96,
    "preview": "name: datanetwork\ndescription: Environment for DataNetwork\n# To be filled.\nimage: ${image_repo}\n"
  },
  {
    "path": "DomainSpecific/resources/environment/local.yaml",
    "chars": 72,
    "preview": "name: datanetwork\ndescription: Environment for DataNetwork\nimage: local\n"
  },
  {
    "path": "DomainSpecific/resources/storage/llmstore.yaml",
    "chars": 673,
    "preview": "allow-other: true\n\nlogging:\n  type: syslog\n  level: log_debug\n\ncomponents:\n  - libfuse\n  - file_cache\n  - attr_cache\n  -"
  },
  {
    "path": "DomainSpecific/resources/storage/local.yaml",
    "chars": 41,
    "preview": "workspace_dir: ./workspace/\nmount: false\n"
  },
  {
    "path": "DomainSpecific/sample_run.sh",
    "chars": 1987,
    "preview": "#!/usr/bin/env bash\n\n# -------------------------------------------------------------------------------------------------"
  },
  {
    "path": "DomainSpecific/submit.py",
    "chars": 1443,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport argparse\n\ndef submit_job(network_path, "
  },
  {
    "path": "DomainSpecific/tools/__init__.py",
    "chars": 210,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nfrom .submit_local_job import submit_local_job\nfrom .sub"
  },
  {
    "path": "DomainSpecific/tools/submit_batch_job.py",
    "chars": 6569,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nimport argparse\nos.sys.path.append("
  },
  {
    "path": "DomainSpecific/tools/submit_local_job.py",
    "chars": 1720,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nimport argparse\nos.sys.path.append("
  },
  {
    "path": "DomainSpecific/wrapper/__init__.py",
    "chars": 245,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nfrom .parser import Parser\nfrom .interpreter import Inte"
  },
  {
    "path": "DomainSpecific/wrapper/interpreter.py",
    "chars": 6651,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/wrapper/parser.py",
    "chars": 825,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nimport json\nimport traceback\n\nclass"
  },
  {
    "path": "DomainSpecific/wrapper/runner.py",
    "chars": 3050,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport sys\nos.sys.path.append(f\"{os.path.dirna"
  },
  {
    "path": "DomainSpecific/wrapper/utility/__init__.py",
    "chars": 421,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nfrom .logger import Logger\nfrom .cpu_count import cpu_co"
  },
  {
    "path": "DomainSpecific/wrapper/utility/azure_env.py",
    "chars": 1142,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\n\ndef get_local_rank():\n    # Azure Singularity"
  },
  {
    "path": "DomainSpecific/wrapper/utility/cpu_count.py",
    "chars": 2848,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport re\nimport subprocess\n\ndef cpu_count():\n"
  },
  {
    "path": "DomainSpecific/wrapper/utility/load_yaml.py",
    "chars": 274,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport yaml\n\ndef load_yaml(config_path):\n    c"
  },
  {
    "path": "DomainSpecific/wrapper/utility/logger.py",
    "chars": 1179,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport logging\n\nlogger = None\n\nclass Logger:\n    def __i"
  },
  {
    "path": "DomainSpecific/wrapper/utility/save_yaml.py",
    "chars": 262,
    "preview": "#\n# Copyright (c) Microsoft Corporation. All rights reserved.\n#\nimport os\nimport yaml\n\ndef save_yaml(config, config_path"
  },
  {
    "path": "GeneralDomain/.gitignore",
    "chars": 12,
    "preview": "__pycache__/"
  },
  {
    "path": "GeneralDomain/README.md",
    "chars": 848,
    "preview": "# Redstone General CC\n\nLibrary for reproducing the general CC part of RedStone dataset from the released index Parquet f"
  },
  {
    "path": "GeneralDomain/pyproject.toml",
    "chars": 833,
    "preview": "[build-system]\nrequires = [\"flit_core >=3.2, <4\"]\nbuild-backend = \"flit_core.buildapi\"\n\n[project]\nname = \"redstone-cc\"\nd"
  },
  {
    "path": "GeneralDomain/redstone_cc/__init__.py",
    "chars": 49,
    "preview": "from .process import process_file, process_items\n"
  },
  {
    "path": "GeneralDomain/redstone_cc/__main__.py",
    "chars": 655,
    "preview": "import argparse\n\nimport pyarrow as pa\nimport pyarrow.parquet as pq\nfrom loguru import logger\n\nfrom .process import proce"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/deduplication/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/deduplication/minhash.py",
    "chars": 2522,
    "preview": "import hashlib\n\nimport numpy as np\nfrom datasketch.lsh import _optimal_param\n\nDEFAULT_MER = 2**61 - 1\nDEFAULT_SEED = 1\n\n"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/deduplication/sha1.py",
    "chars": 956,
    "preview": "import hashlib\n\nfrom .utils import ccnet_normalize\n\nDEFAULT_HASH_SIZE = 8\n\n\ndef sha1_hash(line, hash_size=DEFAULT_HASH_S"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/deduplication/utils.py",
    "chars": 1323,
    "preview": "import unicodedata\nimport re\nimport string\n\nimport regex\nimport ftfy\nfrom nltk import ngrams\n\nDIGIT_RE = regex.compile(r"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/fasttext_classifier.py",
    "chars": 594,
    "preview": "import fasttext\n\nfasttext.FastText.eprint = lambda x: None\n\nFASTTEXT_LID_176_URL = (\n    \"https://dl.fbaipublicfiles.com"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/func/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/func/document.py",
    "chars": 1372,
    "preview": "import regex\n\n\ndef document_word_count(words):\n    return len(words)\n\n\ndef document_mean_word_length(words):\n    return "
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/func/line.py",
    "chars": 1823,
    "preview": "import regex\n\nRE_UPPER = regex.compile(r\"\\p{Lu}\")\nRE_LETTER = regex.compile(r\"\\p{L}\")\n\n\ndef line_uppercase_ratio(line):\n"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/func/repetition.py",
    "chars": 2504,
    "preview": "from collections import Counter\n\nimport numpy as np\nfrom nltk.util import ngrams\n\n\ndef repetition_ngram_top_char_frac(wo"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/model/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/model/document.py",
    "chars": 1082,
    "preview": "import sys\nfrom functools import cached_property\n\nimport stopit\nfrom loguru import logger\nfrom sentence_splitter import "
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/model/violations.py",
    "chars": 973,
    "preview": "from typing import List\n\nfrom .document import Document\n\n\nclass Violations:\n    def __init__(self):\n        self.doc_vio"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/ruleset/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/ruleset/gopher.py",
    "chars": 3005,
    "preview": "from ..model.document import Document\nfrom ..model.violations import Violations\nfrom ..func.document import (\n    docume"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/ruleset/refinedweb.py",
    "chars": 1953,
    "preview": "import regex\nfrom .gopher import gopher_filter\nfrom ..model.document import Document\nfrom ..func.line import (\n    line_"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/rule_based_filters/utils.py",
    "chars": 739,
    "preview": "import unicodedata\n\nimport regex\n\nRE_PUNCT = regex.compile(r\"\\p{P}\")\nRE_URL = regex.compile(\n    r\"https?:\\/\\/(www\\.)?[-"
  },
  {
    "path": "GeneralDomain/redstone_cc/algos/trafilatura_process.py",
    "chars": 6389,
    "preview": "import zlib\nimport re\n\nimport brotlicffi\nimport lxml.etree as ET\nfrom lxml.html import tostring\nfrom trafilatura import "
  },
  {
    "path": "GeneralDomain/redstone_cc/download_utils.py",
    "chars": 2079,
    "preview": "import os\nimport subprocess\nimport shlex\nimport shutil\nfrom functools import lru_cache\nfrom urllib.parse import urlparse"
  },
  {
    "path": "GeneralDomain/redstone_cc/process.py",
    "chars": 2641,
    "preview": "import tempfile\nimport os\n\nimport pyarrow.parquet as pq\nfrom tqdm import tqdm\nfrom warcio.archiveiterator import Archive"
  },
  {
    "path": "LICENSE",
    "chars": 1141,
    "preview": "    MIT License\n\n    Copyright (c) Microsoft Corporation.\n\n    Permission is hereby granted, free of charge, to any pers"
  },
  {
    "path": "README.md",
    "chars": 9805,
    "preview": "<p align=\"center\">\n  <img src=\"assets/icon.png\" width=\"150\">\n  <br />\n  <br />\n  <a href=\"https://huggingface.co/dataset"
  },
  {
    "path": "SECURITY.md",
    "chars": 2656,
    "preview": "<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->\n\n## Security\n\nMicrosoft takes the security of our software products an"
  },
  {
    "path": "SUPPORT.md",
    "chars": 1244,
    "preview": "# TODO: The maintainer of this repo has not yet edited this file\r\n\r\n**REPO OWNER**: Do you want Customer Service & Suppo"
  }
]

// ... and 1 more files (download for full content)

About this extraction

This page contains the full source code of the microsoft/RedStone GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 130 files (57.7 MB), approximately 120.2k tokens, and a symbol index with 272 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo