[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n"
  },
  {
    "path": "README.md",
    "content": "# HTMLSimilarity\n根据网页结构判断页面相似性(Determine page similarity based on HTML page structure)\n\n[![PyV](https://img.shields.io/badge/python-3.7-brightgreen.svg)]()\n\n使用方法\n-----------\n\n```\nfrom htmlsimilarity import get_html_similarity\n\nis_similarity, value = get_html_similarity(html_doc1, html_doc2)\n```\n\n说明\n-----------\n\n##### 输入参数：\n* HTML文档1\n* HTML文档2\n* 降维后的维数，默认是5000\n\n##### 返回值：\n* 是否相似\n* 相似值（value<0.2时相似，value>0.2时不相似）\n\n\n判断方法\n-----------\n\n根据网页的DOM树确定网页的模板特征向量，对模板特征向量计算网页结构相似性。\n\n详细参考：[李景阳, 张波. 网页结构相似性确定方法及装置:.](http://cprs.patentstar.com.cn/Search/Detail?ANE=9HCC7BGA7AHACGEA7GAA8BHA5ADA9FGF8CBA9EDA9BDC9FCG)\n\n原理参考上述专利文章，对其判断相似性部分进行简单实现。\n\n用途\n-----------\n\n判断越权时，需要对response进行对比，当后端返回渲染后HTML的情况下，无法直接判断是否出现了越权，利用常规的文本相似度对比如difflib，通过分词或最长公共子串等方法进行判断并不适用于用来判断越权，所以使用根据页面结构判断相似度，确定是否出现了越权。\n"
  },
  {
    "path": "calc.py",
    "content": "# -*- coding: utf-8 -*-\n\n\"\"\"\n@author: taoqi\n@file: calc.py\n@time: 2019-07-03 18:22\n\"\"\"\n\n\ndef calculated_similarity(dom1_eigenvector, dom2_eigenvector, dimension):\n    a, b = 0, 0\n    for i in range(dimension):\n        a += dom1_eigenvector[i]-dom2_eigenvector[i]\n        if dom1_eigenvector[i] and dom2_eigenvector[i]:\n            b += dom1_eigenvector[i] + dom2_eigenvector[i]\n    similarity = abs(a)/b\n    return similarity\n"
  },
  {
    "path": "domtree2data.py",
    "content": "# -*- coding: utf-8 -*-\n\n\"\"\"\n@author: taoqi\n@file: domtree2data.py\n@time: 2019-07-03 14:47\n\"\"\"\n\n\nclass Converter:\n    def __init__(self, dom_tree, dimension):\n        self.dom_tree = dom_tree\n        self.node_info_list = []\n        self.dimension = dimension\n        self.initial_weight = 1\n        self.attenuation_ratio = 0.6\n        self.dom_eigenvector = {}.fromkeys(range(0, dimension), 0)\n\n    def get_eigenvector(self):\n        for node_id in range(1, self.dom_tree.size() + 1):\n            node = self.dom_tree.get_node(node_id)\n            node_feature = self.create_feature(node)\n            feature_hash = self.feature_hash(node_feature)\n            node_weight = self.calculate_weight(node, node_id, feature_hash)\n            self.construct_eigenvector(feature_hash, node_weight)\n        return self.dom_eigenvector\n\n    @staticmethod\n    def create_feature(node):\n        node_attr_list = []\n        node_feature = node.data.label + '|'\n        for attr in node.data.attrs.keys():\n            node_attr_list.append(attr + ':' + str(node.data.attrs[attr]))\n        node_feature += '|'.join(node_attr_list)\n        return node_feature\n\n    @staticmethod\n    def feature_hash(node_feature):\n        return abs(hash(node_feature)) % (10 ** 8)\n\n    def calculate_weight(self, node, node_id, feature_hash):\n        brother_node_count = 0\n        depth = self.dom_tree.depth(node)\n        for brother_node in self.dom_tree.siblings(node_id):\n            brother_node_feature_hash = self.feature_hash(self.create_feature(brother_node))\n            if brother_node_feature_hash == feature_hash:\n                brother_node_count = brother_node_count + 1\n        if brother_node_count:\n            node_weight = self.initial_weight * self.attenuation_ratio ** depth * self.attenuation_ratio ** brother_node_count\n        else:\n            node_weight = self.initial_weight * self.attenuation_ratio ** depth\n        return node_weight\n\n    def construct_eigenvector(self, feature_hash, node_weight):\n        feature_hash = feature_hash % self.dimension\n        self.dom_eigenvector[feature_hash] += node_weight\n"
  },
  {
    "path": "htmlparser.py",
    "content": "# -*- coding: utf-8 -*-\n\n\"\"\"\n@author: taoqi\n@file: htmlparser.py\n@time: 2019-07-02 17:17\n\"\"\"\n\nfrom treelib import Tree\nfrom bs4 import BeautifulSoup\nimport bs4\n\n\nclass DOMTree:\n    def __init__(self, label, attrs):\n        self.label = label\n        self.attrs = attrs\n\n\nclass HTMLParser:\n\n    def __init__(self, html):\n        self.dom_id = 1\n        self.dom_tree = Tree()\n        self.bs_html = BeautifulSoup(html, 'lxml')\n\n    def get_dom_structure_tree(self):\n        for content in self.bs_html.contents:\n            if isinstance(content, bs4.element.Tag):\n                self.bs_html = content\n        self.recursive_descendants(self.bs_html, 1)\n        return self.dom_tree\n\n    def recursive_descendants(self, descendants, parent_id):\n        if self.dom_id == 1:\n            self.dom_tree.create_node(descendants.name, self.dom_id, data=DOMTree(descendants.name, descendants.attrs))\n            self.dom_id = self.dom_id + 1\n        for child in descendants.contents:\n            if isinstance(child, bs4.element.Tag):\n                self.dom_tree.create_node(child.name, self.dom_id, parent_id, data=DOMTree(child.name, child.attrs))\n                self.dom_id = self.dom_id + 1\n                self.recursive_descendants(child, self.dom_id - 1)\n"
  },
  {
    "path": "htmlsimilarity.py",
    "content": "# -*- coding: utf-8 -*-\n\n\"\"\"\n@author: taoqi\n@file: HTMLSimilarity.py\n@time: 2019-07-02 16:57\n\"\"\"\n\nfrom htmlparser import HTMLParser\nfrom domtree2data import Converter\nfrom calc import calculated_similarity\n\n\ndef get_html_similarity(html_doc1, html_doc2, dimension=5000):\n    hp1 = HTMLParser(html_doc1)\n    html_doc1_dom_tree = hp1.get_dom_structure_tree()\n    hp2 = HTMLParser(html_doc2)\n    html_doc2_dom_tree = hp2.get_dom_structure_tree()\n    converter = Converter(html_doc1_dom_tree, dimension)\n    dom1_eigenvector = converter.get_eigenvector()\n    converter = Converter(html_doc2_dom_tree, dimension)\n    dom2_eigenvector = converter.get_eigenvector()\n    value = calculated_similarity(dom1_eigenvector, dom2_eigenvector, dimension)\n    if value > 0.2:\n        return False, value\n    else:\n        return True, value\n"
  }
]