Repository: SPuerBRead/HTMLSimilarity
Branch: master
Commit: 4f0a3f115b7c
Files: 6
Total size: 6.5 KB
Directory structure:
gitextract_zhbu9mib/
├── .gitignore
├── README.md
├── calc.py
├── domtree2data.py
├── htmlparser.py
└── htmlsimilarity.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
================================================
FILE: README.md
================================================
# HTMLSimilarity
根据网页结构判断页面相似性(Determine page similarity based on HTML page structure)
[]()
使用方法
-----------
```
from htmlsimilarity import get_html_similarity
is_similarity, value = get_html_similarity(html_doc1, html_doc2)
```
说明
-----------
##### 输入参数:
* HTML文档1
* HTML文档2
* 降维后的维数,默认是5000
##### 返回值:
* 是否相似
* 相似值(value<=0.2时相似,value>0.2时不相似)
判断方法
-----------
根据网页的DOM树确定网页的模板特征向量,对模板特征向量计算网页结构相似性。
详细参考:[李景阳, 张波. 网页结构相似性确定方法及装置.](http://cprs.patentstar.com.cn/Search/Detail?ANE=9HCC7BGA7AHACGEA7GAA8BHA5ADA9FGF8CBA9EDA9BDC9FCG)
原理参考上述专利文章,对其判断相似性部分进行简单实现。
用途
-----------
判断越权时,需要对response进行对比,当后端返回渲染后HTML的情况下,无法直接判断是否出现了越权,利用常规的文本相似度对比如difflib,通过分词或最长公共子串等方法进行判断并不适用于用来判断越权,所以使用根据页面结构判断相似度,确定是否出现了越权。
================================================
FILE: calc.py
================================================
# -*- coding: utf-8 -*-
"""
@author: taoqi
@file: calc.py
@time: 2019-07-03 18:22
"""
def calculated_similarity(dom1_eigenvector, dom2_eigenvector, dimension):
    """Return a distance score between two DOM feature vectors.

    The score is ``abs(a) / b`` where ``a`` is the sum of the signed
    per-index differences over ``range(dimension)`` and ``b`` is the sum of
    both weights over the indices that are non-zero in *both* vectors.
    Smaller values mean the vectors are closer.

    Args:
        dom1_eigenvector: Object indexable by ``0 .. dimension - 1`` holding
            numeric weights.
        dom2_eigenvector: Same layout as ``dom1_eigenvector``.
        dimension: Number of leading indices to compare.

    Returns:
        The score as a number. When no index is non-zero in both vectors the
        ratio is undefined; in that case ``0.0`` is returned if every compared
        weight is zero (two empty vectors), otherwise ``float('inf')`` (the
        vectors share no feature at all).
    """
    difference_total, shared_total = 0, 0
    saw_weight = False
    for index in range(dimension):
        weight1 = dom1_eigenvector[index]
        weight2 = dom2_eigenvector[index]
        difference_total += weight1 - weight2
        if weight1 and weight2:
            shared_total += weight1 + weight2
        if weight1 or weight2:
            saw_weight = True

    if not shared_total:
        # The original divided by zero here. With nothing in common the
        # vectors are either both empty (identical) or completely disjoint.
        return float('inf') if saw_weight else 0.0

    # NOTE(review): the numerator sums signed differences before taking the
    # absolute value, so opposite differences cancel out. Kept as-is because
    # the 0.2 threshold used by callers is tuned against this formula.
    return abs(difference_total) / shared_total
================================================
FILE: domtree2data.py
================================================
# -*- coding: utf-8 -*-
"""
@author: taoqi
@file: domtree2data.py
@time: 2019-07-03 14:47
"""
class Converter:
    """Convert a DOM tree into a fixed-size weighted feature vector.

    Every node is turned into a ``label|attr:value|...`` feature string, the
    string is hashed into one of ``dimension`` buckets, and a weight that
    decays with node depth and with the number of identical siblings is
    added to that bucket.

    The tree object must provide ``size()``, ``get_node(node_id)``,
    ``depth(node)`` and ``siblings(node_id)``; nodes are looked up by the
    integer ids ``1 .. size()`` and must expose ``data.label`` and
    ``data.attrs``.
    """

    def __init__(self, dom_tree, dimension):
        """Store the tree and prepare an all-zero vector of ``dimension`` buckets."""
        self.dom_tree = dom_tree
        self.node_info_list = []
        self.dimension = dimension
        self.initial_weight = 1
        self.attenuation_ratio = 0.6
        self.dom_eigenvector = dict.fromkeys(range(dimension), 0)

    def get_eigenvector(self):
        """Build and return the feature vector as a ``{bucket: weight}`` dict.

        The vector is rebuilt from scratch on every call, so calling this
        method repeatedly returns the same weights instead of accumulating
        them.
        """
        self.dom_eigenvector = dict.fromkeys(range(self.dimension), 0)
        for node_id in range(1, self.dom_tree.size() + 1):
            node = self.dom_tree.get_node(node_id)
            feature_hash = self.feature_hash(self.create_feature(node))
            node_weight = self.calculate_weight(node, node_id, feature_hash)
            self.construct_eigenvector(feature_hash, node_weight)
        return self.dom_eigenvector

    @staticmethod
    def create_feature(node):
        """Return the feature string ``label|attr:value|attr:value...`` of a node."""
        attrs = node.data.attrs
        attr_parts = [attr + ':' + str(attrs[attr]) for attr in attrs.keys()]
        return node.data.label + '|' + '|'.join(attr_parts)

    @staticmethod
    def feature_hash(node_feature):
        """Map a feature string to a stable integer in ``[0, 10 ** 8)``.

        The builtin ``hash()`` is salted per interpreter process for strings,
        which made bucket assignment (and therefore collisions and the final
        score) differ from run to run. CRC32 of the UTF-8 bytes is
        deterministic.
        """
        import zlib

        # Mask keeps the value unsigned on interpreters where crc32 is signed.
        checksum = zlib.crc32(node_feature.encode('utf-8')) & 0xffffffff
        return checksum % (10 ** 8)

    def calculate_weight(self, node, node_id, feature_hash):
        """Return the weight of a node.

        The weight is ``initial_weight * attenuation_ratio ** depth``, further
        multiplied by ``attenuation_ratio ** k`` where ``k`` is the number of
        siblings whose feature hash equals ``feature_hash``.
        """
        depth = self.dom_tree.depth(node)
        same_feature_siblings = sum(
            1
            for sibling in self.dom_tree.siblings(node_id)
            if self.feature_hash(self.create_feature(sibling)) == feature_hash
        )
        # With no matching sibling the last factor is 0.6 ** 0 == 1.0, which
        # leaves the depth-only weight unchanged.
        return (self.initial_weight
                * self.attenuation_ratio ** depth
                * self.attenuation_ratio ** same_feature_siblings)

    def construct_eigenvector(self, feature_hash, node_weight):
        """Add ``node_weight`` to the bucket ``feature_hash % dimension``."""
        bucket = feature_hash % self.dimension
        self.dom_eigenvector[bucket] += node_weight
================================================
FILE: htmlparser.py
================================================
# -*- coding: utf-8 -*-
"""
@author: taoqi
@file: htmlparser.py
@time: 2019-07-02 17:17
"""
from treelib import Tree
from bs4 import BeautifulSoup
import bs4
class DOMTree:
    """Payload attached to each tree node: the tag name and its attributes."""

    def __init__(self, label, attrs):
        # label: tag name; attrs: attribute mapping of the tag.
        self.label, self.attrs = label, attrs
class HTMLParser:
    """Build a tag-only structure tree from an HTML document.

    The document is parsed with BeautifulSoup (``lxml`` backend) and every
    ``Tag`` is copied into a ``treelib`` tree. Node ids are consecutive
    integers starting at 1, assigned in document (pre-order) order; each
    node carries a ``DOMTree`` with the tag name and attributes.
    """

    def __init__(self, html):
        """Parse ``html`` and prepare an empty structure tree."""
        self.dom_id = 1
        self.dom_tree = Tree()
        self.bs_html = BeautifulSoup(html, 'lxml')

    def get_dom_structure_tree(self):
        """Return the structure tree, building it on the first call.

        The root is the last top-level ``Tag`` of the parsed document; if
        there is none, the parsed document object itself is used as root.
        Later calls return the tree built by the first call. Previously a
        second call walked the document again and attached duplicate nodes
        under the root.
        """
        if self.dom_id > 1:
            # Tree already populated; walking again would duplicate nodes.
            return self.dom_tree
        for content in self.bs_html.contents:
            if isinstance(content, bs4.element.Tag):
                self.bs_html = content
        self.recursive_descendants(self.bs_html, 1)
        return self.dom_tree

    def recursive_descendants(self, descendants, parent_id):
        """Add every ``Tag`` below ``descendants`` to the tree.

        Args:
            descendants: Element whose ``contents`` are walked. When no node
                has been created yet it is also inserted as the root.
            parent_id: Tree id under which the direct children are attached.

        The walk uses an explicit stack rather than Python recursion, so the
        nesting depth of the markup is not bounded by the interpreter's
        recursion limit. Ids are still assigned in pre-order.
        """
        if self.dom_id == 1:
            self.dom_tree.create_node(
                descendants.name, self.dom_id,
                data=DOMTree(descendants.name, descendants.attrs))
            self.dom_id += 1

        # Each frame is (iterator over remaining children, id of their parent).
        pending = [(iter(descendants.contents), parent_id)]
        while pending:
            children, current_parent_id = pending[-1]
            for child in children:
                if not isinstance(child, bs4.element.Tag):
                    continue
                node_id = self.dom_id
                self.dom_tree.create_node(
                    child.name, node_id, current_parent_id,
                    data=DOMTree(child.name, child.attrs))
                self.dom_id += 1
                # Descend into this child before visiting its next sibling.
                pending.append((iter(child.contents), node_id))
                break
            else:
                # Children exhausted: go back up to the parent's iterator.
                pending.pop()
================================================
FILE: htmlsimilarity.py
================================================
# -*- coding: utf-8 -*-
"""
@author: taoqi
@file: htmlsimilarity.py
@time: 2019-07-02 16:57
"""
from htmlparser import HTMLParser
from domtree2data import Converter
from calc import calculated_similarity
def get_html_similarity(html_doc1, html_doc2, dimension=5000):
    """Compare two HTML documents by the structure of their DOM trees.

    Args:
        html_doc1: First HTML document.
        html_doc2: Second HTML document.
        dimension: Number of buckets in the feature vectors.

    Returns:
        A ``(is_similar, value)`` tuple. ``value`` is the score computed by
        ``calculated_similarity``; ``is_similar`` is ``False`` when ``value``
        is greater than 0.2 and ``True`` otherwise.
    """
    documents = (html_doc1, html_doc2)
    # Parse both documents first, then vectorise both trees.
    trees = [HTMLParser(document).get_dom_structure_tree() for document in documents]
    vectors = [Converter(tree, dimension).get_eigenvector() for tree in trees]

    value = calculated_similarity(vectors[0], vectors[1], dimension)
    is_similar = not value > 0.2
    return is_similar, value