Repository: SPuerBRead/HTMLSimilarity Branch: master Commit: 4f0a3f115b7c Files: 6 Total size: 6.5 KB Directory structure: gitextract_zhbu9mib/ ├── .gitignore ├── README.md ├── calc.py ├── domtree2data.py ├── htmlparser.py └── htmlsimilarity.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ ================================================ FILE: README.md ================================================ # HTMLSimilarity 根据网页结构判断页面相似性(Determine page similarity based on HTML page structure) [![PyV](https://img.shields.io/badge/python-3.7-brightgreen.svg)]() 使用方法 ----------- ``` from htmlsimilarity import get_html_similarity is_similarity, value = get_html_similarity(html_doc1, html_doc2) ``` 说明 ----------- ##### 输入参数: * HTML文档1 * HTML文档2 * 降维后的维数,默认是5000 ##### 返回值: * 是否相似 * 相似值(value<0.2时相似,value>0.2时不相似) 判断方法 ----------- 根据网页的DOM树确定网页的模板特征向量,对模板特征向量计算网页结构相似性。 详细参考:[李景阳, 张波. 网页结构相似性确定方法及装置:.](http://cprs.patentstar.com.cn/Search/Detail?ANE=9HCC7BGA7AHACGEA7GAA8BHA5ADA9FGF8CBA9EDA9BDC9FCG) 原理参考上述专利文章,对其判断相似性部分进行简单实现。 用途 ----------- 判断越权时,需要对response进行对比,当后端返回渲染后HTML的情况下,无法直接判断是否出现了越权,利用常规的文本相似度对比如difflib,通过分词或最长公共子串等方法进行判断并不适用于用来判断越权,所以使用根据页面结构判断相似度,确定是否出现了越权。 ================================================ FILE: calc.py ================================================ # -*- coding: utf-8 -*- """ @author: taoqi @file: calc.py @time: 2019-07-03 18:22 """ def calculated_similarity(dom1_eigenvector, dom2_eigenvector, dimension): a, b = 0, 0 for i in range(dimension): a += dom1_eigenvector[i]-dom2_eigenvector[i] if dom1_eigenvector[i] and dom2_eigenvector[i]: b += dom1_eigenvector[i] + dom2_eigenvector[i] similarity = abs(a)/b return similarity ================================================ FILE: domtree2data.py ================================================ # -*- coding: utf-8 -*- """ @author: taoqi @file: domtree2data.py @time: 2019-07-03 14:47 """ class Converter: def __init__(self, dom_tree, dimension): self.dom_tree = dom_tree self.node_info_list = [] self.dimension = dimension self.initial_weight = 1 self.attenuation_ratio = 0.6 self.dom_eigenvector = {}.fromkeys(range(0, dimension), 0) def get_eigenvector(self): for node_id in range(1, self.dom_tree.size() + 1): node = self.dom_tree.get_node(node_id) node_feature = self.create_feature(node) feature_hash = self.feature_hash(node_feature) node_weight = self.calculate_weight(node, node_id, feature_hash) self.construct_eigenvector(feature_hash, node_weight) return self.dom_eigenvector @staticmethod def create_feature(node): node_attr_list = [] node_feature = node.data.label + '|' for attr in node.data.attrs.keys(): node_attr_list.append(attr + ':' + str(node.data.attrs[attr])) node_feature += '|'.join(node_attr_list) return node_feature @staticmethod def feature_hash(node_feature): return abs(hash(node_feature)) % (10 ** 8) def calculate_weight(self, node, node_id, feature_hash): brother_node_count = 0 depth = self.dom_tree.depth(node) for brother_node in self.dom_tree.siblings(node_id): brother_node_feature_hash = self.feature_hash(self.create_feature(brother_node)) if brother_node_feature_hash == feature_hash: brother_node_count = brother_node_count + 1 if brother_node_count: node_weight = self.initial_weight * self.attenuation_ratio ** depth * self.attenuation_ratio ** brother_node_count else: node_weight = self.initial_weight * self.attenuation_ratio ** depth return node_weight def construct_eigenvector(self, feature_hash, node_weight): feature_hash = feature_hash % self.dimension self.dom_eigenvector[feature_hash] += node_weight ================================================ FILE: htmlparser.py ================================================ # -*- coding: utf-8 -*- """ @author: taoqi @file: htmlparser.py @time: 2019-07-02 17:17 """ from treelib import Tree from bs4 import BeautifulSoup import bs4 class DOMTree: def __init__(self, label, attrs): self.label = label self.attrs = attrs class HTMLParser: def __init__(self, html): self.dom_id = 1 self.dom_tree = Tree() self.bs_html = BeautifulSoup(html, 'lxml') def get_dom_structure_tree(self): for content in self.bs_html.contents: if isinstance(content, bs4.element.Tag): self.bs_html = content self.recursive_descendants(self.bs_html, 1) return self.dom_tree def recursive_descendants(self, descendants, parent_id): if self.dom_id == 1: self.dom_tree.create_node(descendants.name, self.dom_id, data=DOMTree(descendants.name, descendants.attrs)) self.dom_id = self.dom_id + 1 for child in descendants.contents: if isinstance(child, bs4.element.Tag): self.dom_tree.create_node(child.name, self.dom_id, parent_id, data=DOMTree(child.name, child.attrs)) self.dom_id = self.dom_id + 1 self.recursive_descendants(child, self.dom_id - 1) ================================================ FILE: htmlsimilarity.py ================================================ # -*- coding: utf-8 -*- """ @author: taoqi @file: HTMLSimilarity.py @time: 2019-07-02 16:57 """ from htmlparser import HTMLParser from domtree2data import Converter from calc import calculated_similarity def get_html_similarity(html_doc1, html_doc2, dimension=5000): hp1 = HTMLParser(html_doc1) html_doc1_dom_tree = hp1.get_dom_structure_tree() hp2 = HTMLParser(html_doc2) html_doc2_dom_tree = hp2.get_dom_structure_tree() converter = Converter(html_doc1_dom_tree, dimension) dom1_eigenvector = converter.get_eigenvector() converter = Converter(html_doc2_dom_tree, dimension) dom2_eigenvector = converter.get_eigenvector() value = calculated_similarity(dom1_eigenvector, dom2_eigenvector, dimension) if value > 0.2: return False, value else: return True, value