
的微博:还记得和宝宝陪着@Dear-迪丽热巴 走过的花路吗?谢谢阿丝们一直以来的陪伴
的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆
转发了 @WCS野生生物保护学会
的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也能获得同样感受与动力。
://@Dear-迪丽热巴:在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。 举报 赞[0] 回复 01月10日 12:43 来自来自河南
的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆

的微博:#幸福触手可及##幸福触手可及定档0519# 从没有一个时刻,幸福如此靠近,只因有你在身边
的微博:【爱豆喊你来助力#北京2022#】
的微博:【想看看战疫一线医护人员们的脸!#极限挑战致敬医护人员#】脱下防疫服,援鄂人员们原来是这个模样。八位医护人员集体分享支援一线的故事,是他们为后方的我们竖起了最坚实的屏障,感谢这群医护天使的负重前行,致敬!@央视网青年 @雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴...全文 赞[364004] 原文转发[1056354] 原文评论[3645]
的微博:#五四致敬战疫青年# #青春万岁#各地应急响应级别陆续下调,我们正在走向痊愈。回望这些年轻医务人员的脸,不应忘记,正是他们在危难之下,白衣执甲,毅然逆行,为我们筑起血肉长城。感恩提灯天使,致敬最可爱的人!春暖花开,等到疫情完全解除,无论你是从医还是就医,请记住医患之间的休戚与共、唇齿...全文 [组图共12张]
的微博:鸡条君目睹了vivo#极限挑战#第六季首发阵容@雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴 @郭京飞 @邓伦 集结的整个过程,这就是欢迎新人的方式
pic_list = self.selector.xpath('//div[@class="c"]//img/@src')
for i, pic in enumerate(pic_list):
if "?" in pic:
pic = pic[:pic.index("?")]
pic_list[i] = pic
return pic_list
================================================
FILE: weibo_spider/parser/comment_parser.py
================================================
import logging
import random
import requests
import re
from time import sleep
from lxml.html import tostring
from lxml.html import fromstring
from lxml import etree
from .parser import Parser
from .util import handle_garbled, handle_html
logger = logging.getLogger('spider.comment_parser')
class CommentParser(Parser):
def __init__(self, cookie, weibo_id):
    """Remember the cookie, build the comment-page URL and fetch it once.

    Args:
        cookie: Cookie header value sent with every request.
        weibo_id: Id of the weibo; appended to the comment-page base URL.
    """
    comment_url = 'https://weibo.cn/comment/' + weibo_id
    self.cookie = cookie
    self.url = comment_url
    # handle_html performs the request right away; it yields None on failure.
    self.selector = handle_html(cookie, comment_url)
def get_long_weibo(self):
"""获取长原创微博"""
try:
for i in range(5):
self.selector = handle_html(self.cookie, self.url)
if self.selector is not None:
info_div = self.selector.xpath("//div[@class='c' and @id='M_']")[0]
info_span = info_div.xpath("//span[@class='ctt']")[0]
# 1. 获取 info_span 中的所有 HTML 代码作为字符串
html_string = etree.tostring(info_span, encoding='unicode', method='html')
# 2. 将
result = self.selector.xpath('//img[@alt="头像相册"]/../@href')
if len(result) > 0:
return "https://weibo.cn" + result[0]
else:
return "https://weibo.cn/" + str(self.user_id) + "/avatar?rl=0"
================================================
FILE: weibo_spider/parser/util.py
================================================
import hashlib
import json
import logging
import sys
import aiohttp
import requests
from lxml import etree
# Set GENERATE_TEST_DATA to True when generating test data.
# While it is True, handle_html / handle_html_async also dump every fetched
# response to disk and record it in the URL map (see below).
GENERATE_TEST_DATA = False
# Directory that receives the dumped responses, one '<sha224(url)>.html' each.
TEST_DATA_DIR = 'tests/testdata'
# JSON file inside TEST_DATA_DIR mapping each fetched URL to its dump file.
URL_MAP_FILE = 'url_map.json'
# Module-level logger shared by every helper in this file.
logger = logging.getLogger('spider.util')
def hash_url(url):
    """Return the hexadecimal SHA-224 digest of *url* (encoded as UTF-8)."""
    digest = hashlib.sha224()
    digest.update(url.encode('utf8'))
    return digest.hexdigest()
async def handle_html_async(cookie, url, session):
    """Fetch *url* through *session* and parse the body into an lxml tree.

    Args:
        cookie: Cookie header value sent with the request.
        url: Page to download.
        session: Object whose ``get(url, headers=...)`` is an async context
            manager yielding a response with an awaitable ``read()``
            (the spider passes an ``aiohttp.ClientSession``).

    Returns:
        The tree produced by ``etree.HTML``, or ``None`` when anything fails;
        the exception is logged, never propagated.
    """
    try:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
        headers = {'User-Agent': user_agent, 'Cookie': cookie}
        async with session.get(url, headers=headers) as resp:
            content = await resp.read()
        if GENERATE_TEST_DATA:
            import io
            import os
            resp_file = os.path.join(TEST_DATA_DIR, '%s.html' % hash_url(url))
            with io.open(resp_file, 'wb') as f:
                f.write(content)
            map_path = os.path.join(TEST_DATA_DIR, URL_MAP_FILE)
            # The map is dumped with ensure_ascii=False, so pin the encoding
            # instead of depending on the platform default.
            with io.open(map_path, 'r+', encoding='utf-8') as f:
                url_map = json.loads(f.read())
                url_map[url] = resp_file
                f.seek(0)
                f.write(json.dumps(url_map, indent=4, ensure_ascii=False))
                f.truncate()
        selector = etree.HTML(content)
        return selector
    except Exception as e:
        logger.exception(e)
        return None
def handle_html(cookie, url):
    """Download *url* with ``requests`` and parse the body into an lxml tree.

    Args:
        cookie: Cookie header value sent with the request.
        url: Page to download.

    Returns:
        The tree produced by ``etree.HTML``, or ``None`` when anything fails;
        the exception is logged, never propagated.
    """
    try:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
        headers = {'User-Agent': user_agent, 'Cookie': cookie}
        resp = requests.get(url, headers=headers)
        if GENERATE_TEST_DATA:
            import io
            import os
            resp_file = os.path.join(TEST_DATA_DIR, '%s.html' % hash_url(url))
            with io.open(resp_file, 'w', encoding='utf-8') as f:
                f.write(resp.text)
            map_path = os.path.join(TEST_DATA_DIR, URL_MAP_FILE)
            # Same encoding as the response dump above; the map is written
            # with ensure_ascii=False, so the platform default is not safe.
            with io.open(map_path, 'r+', encoding='utf-8') as f:
                url_map = json.loads(f.read())
                url_map[url] = resp_file
                f.seek(0)
                f.write(json.dumps(url_map, indent=4, ensure_ascii=False))
                f.truncate()
        selector = etree.HTML(resp.content)
        return selector
    except Exception as e:
        logger.exception(e)
        return None
def handle_garbled(info):
    """Return *info* as text that the console encoding can represent.

    Zero-width spaces (U+200B) are removed and characters that cannot be
    encoded with the stdout encoding are dropped.

    Args:
        info: An object exposing ``xpath`` (its ``string(.)`` value is used)
            or anything else, which is converted with ``str``.

    Returns:
        The cleaned string, or ``u'无'`` if the conversion raises.
    """
    try:
        if hasattr(info, 'xpath'):
            info_str = info.xpath('string(.)')
        else:
            info_str = str(info)
        # sys.stdout can be None or lack an encoding (e.g. when replaced or
        # detached); fall back to UTF-8 instead of failing every call.
        encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
        cleaned = info_str.replace(u'\u200b', '')
        return cleaned.encode(encoding, 'ignore').decode(encoding)
    except Exception as e:
        logger.exception(e)
        return u'无'
def bid2mid(bid):
    """Convert a base-62 weibo ``bid`` string to its decimal ``mid`` string.

    The bid is split from the right into groups of four characters (the
    leading group may be shorter).  Every group is decoded as a base-62
    number; all groups except the leading one are zero-padded to seven
    digits, and the pieces are concatenated.

    Args:
        bid: Base-62 identifier using the alphabet ``0-9a-zA-Z``.

    Returns:
        The decimal mid as a string; an empty string for an empty bid.

    Raises:
        ValueError: If *bid* contains a character outside the alphabet.
    """
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    base = len(alphabet)
    head = len(bid) % 4
    # Leading (possibly short) group first, then consecutive 4-char groups.
    chunks = [bid[:head]] if head else []
    chunks.extend(bid[start:start + 4] for start in range(head, len(bid), 4))
    parts = []
    for position, chunk in enumerate(chunks):
        num = 0
        for char in chunk:
            num = num * base + alphabet.index(char)
        text = str(num)
        if position > 0:
            # Only non-leading groups carry leading zeros.
            text = text.zfill(7)
        parts.append(text)
    return ''.join(parts)
def to_video_download_url(cookie, video_page_url):
    """Resolve a weibo video page URL to a downloadable stream URL.

    Args:
        cookie: Cookie header value sent with the request.
        video_page_url: ``m.weibo.cn/s/video/show`` page URL; an empty
            string short-circuits to ``''``.

    Returns:
        The ``hd_url`` of the stream when present, otherwise its ``url``;
        ``''`` when neither is set (live stream) or when the response is not
        JSON (the account may not view the video).
    """
    if video_page_url == '':
        return ''
    video_object_url = video_page_url.replace('m.weibo.cn/s/video/show',
                                              'm.weibo.cn/s/video/object')
    # Bound up front so the JSON-error path below still has a value to return.
    video_url = ''
    try:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
        headers = {'User-Agent': user_agent, 'Cookie': cookie}
        wb_info = requests.get(video_object_url, headers=headers).json()
        stream = wb_info['data']['object']['stream']
        # Prefer the HD stream, then the plain one; '' means a live stream.
        video_url = stream.get('hd_url') or stream['url'] or ''
    except json.decoder.JSONDecodeError:
        logger.warning(u'当前账号没有浏览该视频的权限')
    return video_url
def string_to_int(string):
    """Convert a weibo counter such as ``'123'``, ``'1.5万'`` or ``'2亿'`` to int.

    Args:
        string: An int (returned unchanged) or a numeric string, optionally
            suffixed with ``万`` / ``万+`` (x10,000) or ``亿`` (x100,000,000).

    Returns:
        The integer value; ``0`` (with a warning) for an empty string.

    Raises:
        ValueError: If a plain numeric string is not an integer literal.
        decimal.InvalidOperation: If the part before a unit suffix is not a
            number.
    """
    # Check for int first: len() is not defined for integers.
    if isinstance(string, int):
        return string
    if len(string) == 0:
        logger.warning("string to int, the input string is empty!")
        return 0
    from decimal import Decimal
    # Longest suffix first so that '万+' is not mistaken for a bare '+'.
    units = ((u'万+', 10000), (u'万', 10000), (u'亿', 100000000))
    for suffix, multiplier in units:
        if string.endswith(suffix):
            # Decimal keeps fractional counts such as '1.5万' exact.
            return int(Decimal(string[:-len(suffix)]) * multiplier)
    return int(string)
================================================
FILE: weibo_spider/spider.py
================================================
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import json
import logging
import logging.config
import os
import random
import shutil
import sys
import asyncio
import aiohttp
from datetime import date, datetime, timedelta
from time import sleep
from absl import app, flags
from tqdm import tqdm
from . import config_util, datetime_util
from .downloader import AvatarPictureDownloader
from .parser import AlbumParser, IndexParser, PageParser, PhotoParser
from .parser.util import handle_html_async
from .user import User
# Command-line flags (absl).  Each one, when given, overrides the matching
# setting that would otherwise come from config.json or a built-in default.
FLAGS = flags.FLAGS
flags.DEFINE_string('config_path', None, 'The path to config.json.')
flags.DEFINE_string('u', None, 'The user_id we want to input.')
flags.DEFINE_string('user_id_list', None, 'The path to user_id_list.txt.')
flags.DEFINE_string('output_dir', None, 'The dir path to store results.')
# Logging is configured at import time from the logging.conf file that sits
# next to this module.
logging_path = os.path.split(
os.path.realpath(__file__))[0] + os.sep + 'logging.conf'
logging.config.fileConfig(logging_path)
logger = logging.getLogger('spider')
class Spider:
def __init__(self, config):
    """Initialise the spider from the parsed ``config.json`` mapping.

    Besides copying settings onto the instance, this resolves which users to
    crawl.  The source is, in increasing priority: ``config['user_id_list']``,
    the ``--user_id_list`` flag, the ``--u`` flag (comma-separated ids).
    A non-list value is treated as the path of a user-id file and must exist,
    otherwise the process exits.

    Args:
        config: Mapping with the keys read below; optional keys use ``get``.
    """
    # 0 (default) crawls every weibo of a user, 1 only original weibo.
    self.filter = config['filter']
    since_date = config['since_date']
    if isinstance(since_date, int):
        # An int means "this many days before today".
        since_date = date.today() - timedelta(since_date)
    # Start of the crawl window, formatted yyyy-mm-dd.
    self.since_date = str(since_date)
    # End of the crawl window (yyyy-mm-dd); the special value "now" is allowed.
    self.end_date = config['end_date']
    random_wait_pages = config['random_wait_pages']
    # [min, max] number of pages crawled between two random pauses.
    self.random_wait_pages = [
        min(random_wait_pages),
        max(random_wait_pages),
    ]
    random_wait_seconds = config['random_wait_seconds']
    # [min, max] seconds slept at each random pause.
    self.random_wait_seconds = [
        min(random_wait_seconds),
        max(random_wait_seconds),
    ]
    # Global waits, e.g. "sleep 3600 seconds every 1000 pages".
    self.global_wait = config['global_wait']
    # Pages crawled since the last global wait.
    self.page_count = 0
    # List of output kinds (txt, csv, json, mongo, mysql, ...).
    self.write_mode = config['write_mode']
    # 1 downloads original pictures, 0 (default) does not.
    self.pic_download = config['pic_download']
    # 1 downloads videos, 0 (default) does not.
    self.video_download = config['video_download']
    # [max retries, connect timeout, read timeout] for file downloads.
    self.file_download_timeout = config.get('file_download_timeout',
                                            [5, 5, 10])
    # 0 stores results under the nickname folder, 1 under the user id folder.
    self.result_dir_name = config.get('result_dir_name', 0)
    self.cookie = config['cookie']
    # Optional backend configurations.
    self.mysql_config = config.get('mysql_config')
    self.sqlite_config = config.get('sqlite_config')
    self.kafka_config = config.get('kafka_config')
    self.mongo_config = config.get('mongo_config')
    self.post_config = config.get('post_config')
    self.user_config_file_path = ''

    user_id_list = config['user_id_list']
    if FLAGS.user_id_list:
        user_id_list = FLAGS.user_id_list
    if not isinstance(user_id_list, list):
        # A non-list value is the path of a user-id file.
        if not os.path.isabs(user_id_list):
            user_id_list = os.getcwd() + os.sep + user_id_list
        if not os.path.isfile(user_id_list):
            logger.warning('不存在%s文件', user_id_list)
            sys.exit()
        self.user_config_file_path = user_id_list
    if FLAGS.u:
        user_id_list = FLAGS.u.split(',')

    if isinstance(user_id_list, list):
        # Dict entries may carry their own crawl window.
        dict_entries = [
            {
                'user_uri': entry['id'],
                'since_date': entry.get('since_date', self.since_date),
                'end_date': entry.get('end_date', self.end_date),
            }
            for entry in user_id_list if isinstance(entry, dict)
        ]
        # Plain ids are de-duplicated; dict.fromkeys keeps the configured
        # order, so the crawl order is the same on every run.
        plain_ids = dict.fromkeys(
            entry for entry in user_id_list if not isinstance(entry, dict))
        plain_entries = [
            {
                'user_uri': uri,
                'since_date': self.since_date,
                'end_date': self.end_date,
            }
            for uri in plain_ids
        ]
        user_config_list = dict_entries + plain_entries
        if FLAGS.u:
            config_util.add_user_uri_list(self.user_config_file_path,
                                          user_id_list)
    else:
        user_config_list = config_util.get_user_config_list(
            user_id_list, self.since_date)
        for user_config in user_config_list:
            user_config['end_date'] = self.end_date

    # One user_config dict per user to crawl.
    self.user_config_list = user_config_list
    # Config of the user currently being crawled.
    self.user_config = {}
    # since_date to persist for a user once their crawl has finished.
    self.new_since_date = ''
    # Profile of the user currently being crawled.
    self.user = User()
    # Number of weibo fetched for the current user.
    self.got_num = 0
    # Ids of every weibo fetched for the current user.
    self.weibo_id_list = []
    # aiohttp session, set by start().
    self.session = None
async def write_weibo(self, weibos):
    """Download attached files for *weibos*, then hand them to every writer.

    Downloaders run first (awaited one after another), writers afterwards.
    """
    for file_downloader in self.downloaders:
        await file_downloader.download_files(weibos, self.session)
    for result_writer in self.writers:
        result_writer.write_weibo(weibos)
def write_user(self, user):
    """Pass the crawled *user* to every configured writer, in order."""
    for result_writer in self.writers:
        result_writer.write_user(user)
async def get_user_info(self, user_uri):
    """Fetch the profile page of *user_uri* and store the parsed user.

    The result is kept on ``self.user``; the page counter is incremented
    once the user has been parsed.
    """
    profile_url = 'https://weibo.cn/%s/profile' % (user_uri)
    profile_page = await handle_html_async(self.cookie, profile_url,
                                           self.session)
    index_parser = IndexParser(self.cookie, user_uri, selector=profile_page)
    self.user = await index_parser.get_user_async(self.session)
    self.page_count += 1
async def download_user_avatar(self, user_uri):
    """Download the pictures of the user's avatar album into the img folder."""
    # The two parser calls below are plain (non-awaited) calls; only the
    # final download is awaited.
    photo_parser = PhotoParser(self.cookie, user_uri)
    avatar_album_url = photo_parser.extract_avatar_album_url()
    album_parser = AlbumParser(self.cookie, avatar_album_url)
    pic_urls = album_parser.extract_pic_urls()
    avatar_downloader = AvatarPictureDownloader(self._get_filepath('img'),
                                                self.file_download_timeout)
    await avatar_downloader.handle_download(pic_urls, self.session)
async def get_weibo_info(self):
    """Asynchronously yield the weibo of the current user, one page at a time.

    Nothing is crawled when the user's ``since_date`` lies in the future.
    Pages are fetched with up to three attempts each, random pauses are
    inserted between pages and a global wait is taken whenever the page
    counter reaches the first ``global_wait`` entry.  After the last page the
    user config file is updated when one is in use.  Any exception is logged
    and ends the generator.
    """
    try:
        since_date = datetime_util.str_to_time(self.user_config['since_date'])
        now = datetime.now()
        if not since_date <= now:
            return

        # Fetch the profile page to learn how many pages there are.
        user_uri = self.user_config['user_uri']
        profile_url = 'https://weibo.cn/%s/profile' % (user_uri)
        profile_page = await handle_html_async(self.cookie, profile_url,
                                               self.session)
        page_num = IndexParser(self.cookie, user_uri,
                               selector=profile_page).get_page_num()
        self.page_count += 1

        # Wait up front if this user would push us past the global limit.
        if self.page_count > 2 and (
                self.page_count + page_num) > self.global_wait[0][0]:
            wait_seconds = int(
                self.global_wait[0][1] *
                min(1, self.page_count / self.global_wait[0][0]))
            logger.info(u'即将进入全局等待时间,%d秒后程序继续执行' % wait_seconds)
            for _tick in tqdm(range(wait_seconds)):
                await asyncio.sleep(1)
            self.page_count = 0
            # Rotate to the next global-wait rule.
            self.global_wait.append(self.global_wait.pop(0))

        last_pause_page = 0
        pause_every = random.randint(*self.random_wait_pages)
        for page in tqdm(range(1, page_num + 1), desc='Progress'):
            # Built with defer_fetch=True, used here only for its URL.
            url_source = PageParser(self.cookie,
                                    self.user_config,
                                    page,
                                    self.filter,
                                    defer_fetch=True)

            # Up to three attempts; stop at the first page with content.
            page_selector = None
            for _attempt in range(3):
                page_selector = await handle_html_async(
                    self.cookie, url_source.url, self.session)
                if page_selector is not None:
                    if page_selector.xpath("//div[@class='c']"):
                        break

            page_parser = PageParser(self.cookie,
                                     self.user_config,
                                     page,
                                     self.filter,
                                     selector=page_selector)
            weibos, self.weibo_id_list, to_continue = page_parser.get_one_page(
                self.weibo_id_list)
            logger.info(
                u'%s已获取%s(%s)的第%d页微博%s',
                '-' * 30,
                self.user.nickname,
                self.user.id,
                page,
                '-' * 30,
            )
            self.page_count += 1
            if weibos:
                yield weibos
            if not to_continue:
                break

            # Random pause to lower the risk of being rate limited.
            if (page - last_pause_page) % pause_every == 0 and page < page_num:
                await asyncio.sleep(
                    random.randint(*self.random_wait_seconds))
                last_pause_page = page
                pause_every = random.randint(*self.random_wait_pages)

            if self.page_count >= self.global_wait[0][0]:
                logger.info(u'即将进入全局等待时间,%d秒后程序继续执行' %
                            self.global_wait[0][1])
                for _tick in tqdm(range(self.global_wait[0][1])):
                    await asyncio.sleep(1)
                self.page_count = 0
                self.global_wait.append(self.global_wait.pop(0))

        # Persist the new since_date for this user.
        if self.user_config_file_path or FLAGS.u:
            config_util.update_user_config_file(
                self.user_config_file_path,
                self.user_config['user_uri'],
                self.user.nickname,
                self.new_since_date,
            )
    except Exception as e:
        logger.exception(e)
def _get_filepath(self, type):
    """Return the output location for *type*, creating its directory.

    For ``'img'`` and ``'video'`` the directory itself is returned; for any
    other value the path ``<dir>/<user id>.<type>`` is returned.  The base
    directory is ``--output_dir`` when given, else ``./weibo``.  Errors are
    logged and result in ``None``.
    """
    try:
        if self.result_dir_name:
            dir_name = self.user.id
        else:
            dir_name = self.user.nickname
        if FLAGS.output_dir is not None:
            base_dir = FLAGS.output_dir
        else:
            base_dir = os.getcwd() + os.sep + 'weibo'
        file_dir = base_dir + os.sep + dir_name
        is_media = type in ('img', 'video')
        if is_media:
            file_dir = file_dir + os.sep + type
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        if is_media:
            return file_dir
        return file_dir + os.sep + self.user.id + '.' + type
    except Exception as e:
        logger.exception(e)
def initialize_info(self, user_config):
    """Reset per-user state and build the writers and downloaders.

    Args:
        user_config: Config dict of the user about to be crawled.

    Writers are created in a fixed order (csv, txt, json, mysql, mongo,
    sqlite, kafka, post) for every kind present in ``self.write_mode``.
    Writer and downloader classes are imported lazily so that optional
    backends are only loaded when requested.
    """
    self.got_num = 0
    self.user_config = user_config
    self.weibo_id_list = []
    if self.end_date == 'now':
        self.new_since_date = datetime.now().strftime('%Y-%m-%d %H:%M')
    else:
        self.new_since_date = self.end_date

    self.writers = []
    if 'csv' in self.write_mode:
        from .writer import CsvWriter
        self.writers.append(
            CsvWriter(self._get_filepath('csv'), self.filter))
    if 'txt' in self.write_mode:
        from .writer import TxtWriter
        self.writers.append(
            TxtWriter(self._get_filepath('txt'), self.filter))
    if 'json' in self.write_mode:
        from .writer import JsonWriter
        self.writers.append(JsonWriter(self._get_filepath('json')))
    if 'mysql' in self.write_mode:
        from .writer import MySqlWriter
        self.writers.append(MySqlWriter(self.mysql_config))
    if 'mongo' in self.write_mode:
        from .writer import MongoWriter
        self.writers.append(MongoWriter(self.mongo_config))
    if 'sqlite' in self.write_mode:
        from .writer import SqliteWriter
        self.writers.append(SqliteWriter(self.sqlite_config))
    if 'kafka' in self.write_mode:
        from .writer import KafkaWriter
        self.writers.append(KafkaWriter(self.kafka_config))
    if 'post' in self.write_mode:
        from .writer import PostWriter
        self.writers.append(PostWriter(self.post_config))

    self.downloaders = []
    if self.pic_download == 1:
        from .downloader import (
            OriginPictureDownloader,
            RetweetPictureDownloader)
        self.downloaders.append(
            OriginPictureDownloader(self._get_filepath('img'),
                                    self.file_download_timeout))
        # Kept under the same guard as the import above so the class name is
        # always bound when it is used.  Retweet pictures are only wanted
        # when retweets are crawled at all (filter == 0).
        if not self.filter:
            self.downloaders.append(
                RetweetPictureDownloader(self._get_filepath('img'),
                                         self.file_download_timeout))
    if self.video_download == 1:
        from .downloader import VideoDownloader
        self.downloaders.append(
            VideoDownloader(self._get_filepath('video'),
                            self.file_download_timeout))
async def get_one_user(self, user_config):
    """Crawl one user: profile, optional avatar album, then every weibo page.

    Exceptions are logged and swallowed so that the next user can still be
    processed.
    """
    separator = '*' * 100
    try:
        user_uri = user_config['user_uri']
        await self.get_user_info(user_uri)
        logger.info(self.user)
        logger.info(separator)

        self.initialize_info(user_config)
        self.write_user(self.user)
        logger.info(separator)

        # Download the pictures in the user's avatar album.
        if self.pic_download:
            await self.download_user_avatar(user_uri)

        async for page_weibos in self.get_weibo_info():
            await self.write_weibo(page_weibos)
            self.got_num += len(page_weibos)

        if self.filter:
            logger.info(u'共爬取' + str(self.got_num) + u'条原创微博')
        else:
            logger.info(u'共爬取' + str(self.got_num) + u'条微博')
        logger.info(u'信息抓取完毕')
        logger.info(separator)
    except Exception as e:
        logger.exception(e)
async def start(self):
    """Run the crawl for every configured user inside one aiohttp session.

    A random pause is inserted between users, reusing the page-based random
    wait settings.  Exceptions are logged and swallowed.
    """
    try:
        if not self.user_config_list:
            logger.info(
                u'没有配置有效的user_id,请通过config.json或user_id_list.txt配置user_id')
            return
        async with aiohttp.ClientSession() as session:
            self.session = session
            last_pause_at = random.randint(*self.random_wait_pages)
            pause_every = random.randint(*self.random_wait_pages)
            for users_done, user_config in enumerate(self.user_config_list):
                if (users_done - last_pause_at) % pause_every == 0:
                    await asyncio.sleep(
                        random.randint(*self.random_wait_seconds))
                    last_pause_at = users_done
                    pause_every = random.randint(*self.random_wait_pages)
                await self.get_one_user(user_config)
    except Exception as e:
        logger.exception(e)
def _get_config():
    """Load and return the parsed ``config.json``.

    ``--config_path`` wins when given.  Otherwise ``./config.json`` is used;
    if it does not exist the bundled sample is copied there, the user is told
    to edit it and the process exits.  Invalid JSON also ends the process.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    sample_path = module_dir + os.sep + 'config_sample.json'
    config_path = os.getcwd() + os.sep + 'config.json'
    if FLAGS.config_path:
        config_path = FLAGS.config_path
    elif not os.path.isfile(config_path):
        shutil.copy(sample_path, config_path)
        logger.info(u'请先配置当前目录(%s)下的config.json文件,'
                    u'如果想了解config.json参数的具体意义及配置方法,请访问\n'
                    u'https://github.com/dataabc/weiboSpider#2程序设置' %
                    os.getcwd())
        sys.exit()
    try:
        with open(config_path) as config_file:
            # check_cookie runs before the file content is read; a failure
            # there is not fatal.
            try:
                config_util.check_cookie(config_path)
            except Exception:
                logger.info("Using the cookie field in config.json as the request cookie.")
            return json.load(config_file)
    except ValueError:
        logger.error(u'config.json 格式不正确,请访问 '
                     u'https://github.com/dataabc/weiboSpider#2程序设置')
        sys.exit()
async def async_main(_):
    """Load and validate the config, then run the spider to completion.

    The positional argument (absl passes argv) is ignored.  Exceptions are
    logged instead of propagated.
    """
    try:
        config = _get_config()
        config_util.validate_config(config)
        spider = Spider(config)
        # Crawl the weibo information.
        await spider.start()
    except Exception as e:
        logger.exception(e)
def main(_):
    """Synchronous entry point for absl: drive ``async_main`` to completion."""
    crawl = async_main(_)
    asyncio.run(crawl)


if __name__ == '__main__':
    app.run(main)
================================================
FILE: weibo_spider/user.py
================================================
class User:
    """Plain record holding the profile of one weibo user."""

    __slots__ = (
        'id', 'nickname', 'gender', 'location', 'birthday', 'description',
        'verified_reason', 'talent', 'education', 'work', 'weibo_num',
        'following', 'followers'
    )

    def __init__(self):
        """Start with empty text fields and zeroed counters."""
        self.id = self.nickname = self.gender = ''
        self.location = self.birthday = self.description = ''
        self.verified_reason = self.talent = ''
        self.education = self.work = ''
        self.weibo_num = self.following = self.followers = 0

    def to_dict(self):
        """Return the set attributes as a dict, in ``__slots__`` order."""
        snapshot = {}
        for name in self.__slots__:
            if hasattr(self, name):
                snapshot[name] = getattr(self, name)
        return snapshot

    def __str__(self):
        """Return a printable multi-line summary of the user."""
        lines = (
            u'用户昵称: %s\n' % self.nickname,
            u'用户id: %s\n' % self.id,
            u'微博数: %d\n' % self.weibo_num,
            u'关注数: %d\n' % self.following,
            u'粉丝数: %d\n' % self.followers,
        )
        return ''.join(lines)
================================================
FILE: weibo_spider/user_id_list.txt
================================================
1669879400 Dear-迪丽热巴 2020-01-13 19:18
1223178222 胡歌 2020-01-13 19:28
1729370543 郭碧婷 2020-01-13 19:33
================================================
FILE: weibo_spider/weibo.py
================================================
class Weibo:
    """Plain record holding one crawled weibo."""

    __slots__ = (
        'id', 'user_id', 'content', 'article_url', 'original_pictures',
        'retweet_pictures', 'original', 'video_url', 'original_pictures_list',
        'retweet_pictures_list', 'media', 'publish_place', 'publish_time',
        'publish_tool', 'up_num', 'retweet_num', 'comment_num'
    )

    def __init__(self):
        """Start with empty text, fresh containers and zeroed counters."""
        self.id = self.user_id = self.content = self.article_url = ''
        self.video_url = ''
        self.publish_place = self.publish_time = self.publish_tool = ''
        # Every instance gets its own mutable containers.
        self.original_pictures, self.retweet_pictures = [], []
        self.original_pictures_list, self.retweet_pictures_list = [], []
        self.media = {}
        self.original = True
        self.up_num = self.retweet_num = self.comment_num = 0

    def to_dict(self):
        """Return the set attributes as a dict, in ``__slots__`` order."""
        snapshot = {}
        for name in self.__slots__:
            if hasattr(self, name):
                snapshot[name] = getattr(self, name)
        return snapshot

    def __str__(self):
        """Return a printable multi-line description of the weibo."""
        lines = (
            self.content + '\n',
            u'微博发布位置:%s\n' % self.publish_place,
            u'发布时间:%s\n' % self.publish_time,
            u'发布工具:%s\n' % self.publish_tool,
            u'点赞数:%d\n' % self.up_num,
            u'转发数:%d\n' % self.retweet_num,
            u'评论数:%d\n' % self.comment_num,
            u'url:https://weibo.cn/comment/%s\n' % self.id,
        )
        return ''.join(lines)
================================================
FILE: weibo_spider/writer/__init__.py
================================================
from .csv_writer import CsvWriter
from .json_writer import JsonWriter
from .mongo_writer import MongoWriter
from .mysql_writer import MySqlWriter
from .txt_writer import TxtWriter
from .sqlite_writer import SqliteWriter
from .kafka_writer import KafkaWriter
from .post_writer import PostWriter
# Public API of the writer package.  ``__all__`` must contain the names as
# strings; class objects make ``from weibo_spider.writer import *`` fail
# with a TypeError.
__all__ = [
    'CsvWriter',
    'TxtWriter',
    'JsonWriter',
    'MongoWriter',
    'MySqlWriter',
    'SqliteWriter',
    'KafkaWriter',
    'PostWriter',
]
================================================
FILE: weibo_spider/writer/csv_writer.py
================================================
import csv
import logging
from .writer import Writer
logger = logging.getLogger('spider.csv_writer')
class CsvWriter(Writer):
    """Append crawled weibo to a CSV file, one row per weibo."""

    def __init__(self, file_path, filter):
        """Prepare the column layout and make sure the file has a header.

        Args:
            file_path: Path of the CSV file; created when missing.
            filter: Falsy when retweets are crawled too, in which case the
                retweet-picture and is-original columns are added.

        The header row is only written when the file is missing or empty, so
        running the spider again no longer leaves header rows in the middle
        of the data.  Errors are logged, never raised.
        """
        self.file_path = file_path
        # (column title, Weibo attribute) pairs, in output order.
        self.result_headers = [('微博id', 'id'), ('微博正文', 'content'),
                               ('头条文章url', 'article_url'),
                               ('原始图片url', 'original_pictures'),
                               ('微博视频url', 'video_url'),
                               ('发布位置', 'publish_place'),
                               ('发布时间', 'publish_time'),
                               ('发布工具', 'publish_tool'), ('点赞数', 'up_num'),
                               ('转发数', 'retweet_num'), ('评论数', 'comment_num')]
        if not filter:
            self.result_headers.insert(4, ('被转发微博原始图片url', 'retweet_pictures'))
            self.result_headers.insert(5, ('是否为原创微博', 'original'))
        try:
            import os
            has_content = (os.path.isfile(self.file_path)
                           and os.path.getsize(self.file_path) > 0)
            if not has_content:
                with open(self.file_path, 'a', encoding='utf-8-sig',
                          newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow([kv[0] for kv in self.result_headers])
        except Exception as e:
            logger.exception(e)

    def write_user(self, user):
        """Remember the user being crawled; nothing is written for it."""
        self.user = user

    def write_weibo(self, weibos):
        """Append one CSV row per weibo; errors are logged, never raised."""
        try:
            result_data = [[getattr(w, kv[1]) for kv in self.result_headers]
                           for w in weibos]
            with open(self.file_path, 'a', encoding='utf-8-sig',
                      newline='') as f:
                writer = csv.writer(f)
                writer.writerows(result_data)
            logger.info(u'%d条微博写入csv文件完毕,保存路径:%s', len(weibos), self.file_path)
        except Exception as e:
            logger.exception(e)
================================================
FILE: weibo_spider/writer/json_writer.py
================================================
import codecs
import json
import logging
import os
from .writer import Writer
logger = logging.getLogger('spider.json_writer')
class JsonWriter(Writer):
    """Keep a JSON file with the user profile and all crawled weibo."""

    def __init__(self, file_path):
        """Remember the path of the JSON result file."""
        self.file_path = file_path

    def write_user(self, user):
        """Remember the user; it is written together with the weibo."""
        self.user = user

    def _update_json_data(self, data, weibo_info):
        """Merge *weibo_info* into *data* and refresh the user entry.

        Weibo whose id is already stored replace the stored entry (the first
        one with that id); unknown ones are appended in order.  Every batch
        is merged by id, so a partially overlapping batch cannot introduce
        duplicates, and an empty batch is a no-op instead of an IndexError.

        Args:
            data: Previously stored document (may be empty); modified in
                place.
            weibo_info: List of weibo dicts, each with an ``'id'`` key.

        Returns:
            The updated *data*.
        """
        data['user'] = self.user.to_dict()
        stored = data.get('weibo')
        if not stored:
            data['weibo'] = weibo_info
            return data
        # id -> position of its first occurrence, for O(1) lookups.
        position_by_id = {}
        for position, old in enumerate(stored):
            position_by_id.setdefault(old['id'], position)
        for new in weibo_info:
            position = position_by_id.get(new['id'])
            if position is None:
                position_by_id[new['id']] = len(stored)
                stored.append(new)
            else:
                stored[position] = new
        return data

    def write_weibo(self, weibos):
        """Merge *weibos* into the JSON file, rewriting it completely."""
        data = {}
        if os.path.isfile(self.file_path):
            with codecs.open(self.file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        data = self._update_json_data(data, [w.to_dict() for w in weibos])
        with codecs.open(self.file_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=4, ensure_ascii=False))
        logger.info(u'%d条微博写入json文件完毕,保存路径:%s', len(weibos), self.file_path)
================================================
FILE: weibo_spider/writer/kafka_writer.py
================================================
import json
import logging
import sys
from .writer import Writer
logger = logging.getLogger('spider.kafka_writer')
class KafkaWriter(Writer):
    """Publish crawled users and weibo to Kafka topics as UTF-8 JSON."""

    def __init__(self, kafka_config):
        """Create the producer described by *kafka_config*.

        Args:
            kafka_config: Mapping with ``'bootstrap-server'`` (comma
                separated), ``'weibo_topics'`` and ``'user_topics'``.

        Exits the process when the ``kafka`` package is not installed.
        """
        try:
            from kafka import KafkaProducer
        except ImportError:
            logger.warning(
                u'系统中可能没有安装kafka库,请先运行 pip install kafka-python ,再运行程序')
            sys.exit()
        self.kafka_config = kafka_config
        self.producer = KafkaProducer(
            bootstrap_servers=str(kafka_config['bootstrap-server']).split(','),
            value_serializer=lambda m: json.dumps(
                m, ensure_ascii=False).encode('UTF-8'))
        self.weibo_topics = list(kafka_config['weibo_topics'])
        self.user_topics = list(kafka_config['user_topics'])
        # logging formats lazily with %-style placeholders, not '{}'.
        logger.info('%s', kafka_config)

    def write_weibo(self, weibo):
        """Send every weibo, tagged with the current user id, to each topic."""
        for w in weibo:
            w.user_id = self.user.id
            for topic in self.weibo_topics:
                self.producer.send(topic, value=w.to_dict())

    def write_user(self, user):
        """Remember *user* and send its profile to every user topic."""
        self.user = user
        for topic in self.user_topics:
            self.producer.send(topic, value=user.to_dict())

    def __del__(self):
        """Close the producer if one was created."""
        # __init__ may have failed before the producer attribute was set.
        producer = getattr(self, 'producer', None)
        if producer is not None:
            producer.close()
================================================
FILE: weibo_spider/writer/mongo_writer.py
================================================
import copy
import logging
import sys
from .writer import Writer
logger = logging.getLogger('spider.mongo_writer')
class MongoWriter(Writer):
    """Upsert crawled users and weibo into the MongoDB database ``weibo``."""

    def __init__(self, mongo_config):
        """Read connection settings from *mongo_config*.

        Args:
            mongo_config: Mapping with ``'connection_string'`` and optional
                ``'dba_name'`` / ``'dba_password'`` credentials.
        """
        self.mongo_config = mongo_config
        self.connection_string = mongo_config['connection_string']
        self.dba_name = mongo_config.get('dba_name', None)
        self.dba_password = mongo_config.get('dba_password', None)

    def _info_to_mongodb(self, collection, info_list):
        """Insert or update every dict of *info_list*, keyed by ``'id'``.

        Args:
            collection: Name of the collection inside the ``weibo`` database.
            info_list: Dicts to store; each must contain ``'id'``.

        Exits the process when pymongo is missing or no server is reachable.
        """
        try:
            import pymongo
        except ImportError:
            logger.warning(
                u'系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序')
            sys.exit()
        client_options = {}
        if self.dba_name or self.dba_password:
            # Credentials go to the client itself (authenticating against the
            # admin database); Database.authenticate() was deprecated in
            # PyMongo 3.6 and removed in PyMongo 4.
            client_options = {
                'username': self.dba_name,
                'password': self.dba_password,
                'authSource': 'admin',
                'authMechanism': 'SCRAM-SHA-1',
            }
        client = pymongo.MongoClient(self.connection_string, **client_options)
        try:
            target = client['weibo'][collection]
            for info in info_list:
                # A single upsert replaces the find-then-insert/update pair.
                target.update_one({'id': info['id']}, {'$set': info},
                                  upsert=True)
        except pymongo.errors.ServerSelectionTimeoutError:
            logger.warning(
                u'系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序')
            sys.exit()
        finally:
            # A client is opened per call, so release its connections.
            client.close()

    def write_weibo(self, weibos):
        """Store *weibos*, tagged with the current user id."""
        weibo_list = []
        for w in weibos:
            w.user_id = self.user.id
            weibo_list.append(w.to_dict())
        self._info_to_mongodb('weibo', weibo_list)
        logger.info(u'%d条微博写入MongoDB数据库完毕', len(weibos))

    def write_user(self, user):
        """Remember *user* and store its profile."""
        self.user = user
        user_list = [user.to_dict()]
        self._info_to_mongodb('user', user_list)
        logger.info(u'%s信息写入MongoDB数据库完毕', user.nickname)
================================================
FILE: weibo_spider/writer/mysql_writer.py
================================================
import copy
import logging
import sys
from .writer import Writer
logger = logging.getLogger('spider.mysql_writer')
class MySqlWriter(Writer):
    """Insert or update crawled users and weibo in the MySQL database ``weibo``."""

    def __init__(self, mysql_config):
        """Create the ``weibo`` database if needed and select it.

        Args:
            mysql_config: Keyword arguments for ``pymysql.connect``.  The
                mapping is copied, so the caller's dict is left untouched
                when the ``db`` key is added.
        """
        self.mysql_config = dict(mysql_config)
        # Create the 'weibo' database.
        create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT
CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"""
        self._mysql_create_database(create_database)
        self.mysql_config['db'] = 'weibo'

    def _mysql_create(self, connection, sql):
        """Execute one DDL statement, then close *connection*."""
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql)
        finally:
            connection.close()

    def _mysql_create_database(self, sql):
        """Run the CREATE DATABASE statement *sql*.

        Exits the process when pymysql is missing or the server cannot be
        reached with the configured settings.
        """
        try:
            import pymysql
        except ImportError:
            logger.warning(
                u'系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序')
            sys.exit()
        try:
            connection = pymysql.connect(**self.mysql_config)
            self._mysql_create(connection, sql)
        except pymysql.OperationalError:
            logger.warning(u'系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序')
            sys.exit()

    def _mysql_create_table(self, sql):
        """Run the CREATE TABLE statement *sql* on a fresh connection."""
        import pymysql
        connection = pymysql.connect(**self.mysql_config)
        self._mysql_create(connection, sql)

    def _mysql_insert(self, table, data_list):
        """Insert the dicts of *data_list* into *table*, updating duplicates.

        Column names are taken from the first dict after ``None`` values have
        been dropped.  A failure rolls the transaction back and is logged.
        """
        import pymysql
        if not data_list:
            return
        # We use this to filter out unset values.
        data_list = [{k: v
                      for k, v in data.items() if v is not None}
                     for data in data_list]
        keys = ', '.join(data_list[0].keys())
        values = ', '.join(['%s'] * len(data_list[0]))
        connection = pymysql.connect(**self.mysql_config)
        cursor = connection.cursor()
        sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON
DUPLICATE KEY UPDATE""".format(table=table,
                               keys=keys,
                               values=values)
        update = ','.join([
            ' {key} = values({key})'.format(key=key)
            for key in data_list[0]
        ])
        sql += update
        try:
            cursor.executemany(
                sql, [tuple(data.values()) for data in data_list])
            connection.commit()
        except Exception as e:
            connection.rollback()
            logger.exception(e)
        finally:
            connection.close()

    def write_weibo(self, weibos):
        """Create the ``weibo`` table if needed and upsert *weibos*."""
        try:
            # Create the 'weibo' table.
            create_table = """
CREATE TABLE IF NOT EXISTS weibo (
id varchar(10) NOT NULL,
user_id varchar(12),
content varchar(5000),
article_url varchar(200),
original_pictures varchar(3000),
retweet_pictures varchar(3000),
original BOOLEAN NOT NULL DEFAULT 1,
video_url varchar(300),
publish_place varchar(100),
publish_time DATETIME NOT NULL,
publish_tool varchar(30),
up_num INT NOT NULL,
retweet_num INT NOT NULL,
comment_num INT NOT NULL,
PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"""
            self._mysql_create_table(create_table)
            # Insert or update the rows; work on copies so the caller's
            # objects are not modified.
            weibo_list = []
            info_list = copy.deepcopy(weibos)
            for weibo in info_list:
                weibo.user_id = self.user.id
                weibo_list.append(weibo.to_dict())
            self._mysql_insert('weibo', weibo_list)
            logger.info(u'%d条微博写入MySQL数据库完毕', len(weibos))
        except Exception as e:
            logger.exception(e)

    def write_user(self, user):
        """Remember *user*, create the ``user`` table if needed and upsert it."""
        try:
            self.user = user
            # Create the 'user' table.
            create_table = """
CREATE TABLE IF NOT EXISTS user (
id varchar(20) NOT NULL,
nickname varchar(30),
gender varchar(10),
location varchar(200),
birthday varchar(40),
description varchar(400),
verified_reason varchar(140),
talent varchar(200),
education varchar(200),
work varchar(200),
weibo_num INT,
following INT,
followers INT,
PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"""
            self._mysql_create_table(create_table)
            self._mysql_insert('user', [user.to_dict()])
            logger.info(u'%s信息写入MySQL数据库完毕', user.nickname)
        except Exception as e:
            logger.exception(e)
================================================
FILE: weibo_spider/writer/post_writer.py
================================================
import codecs
import json
import logging
import os
import requests
from .writer import Writer
from time import sleep
from requests.exceptions import RequestException
logger = logging.getLogger('spider.post_writer')
class PostWriter(Writer):
    """POST crawled weibo, together with the user profile, to an HTTP API."""

    def __init__(self, post_config):
        """Read the API settings from *post_config*.

        Args:
            post_config: Mapping with ``'api_url'`` and optional
                ``'api_token'`` / ``'dba_password'``.
        """
        self.post_config = post_config
        self.api_url = post_config['api_url']
        self.api_token = post_config.get('api_token', None)
        self.dba_password = post_config.get('dba_password', None)

    def write_user(self, user):
        """Remember the user; it is sent together with the weibo."""
        self.user = user

    def _update_json_data(self, data, weibo_info):
        """Shape the payload like the JSON writer's output.

        Sets ``data['user']`` and appends *weibo_info* to ``data['weibo']``.
        """
        data['user'] = self.user.to_dict()
        if data.get('weibo'):
            data['weibo'] += weibo_info
        else:
            data['weibo'] = weibo_info
        return data

    def send_post_request_with_token(self, url, data, token, max_retries, backoff_factor):
        """POST *data* as JSON, retrying with a linearly growing delay.

        Args:
            url: Endpoint to post to.
            data: JSON-serialisable payload.
            token: Value of the ``api-token`` header.
            max_retries: Retries after the first attempt.
            backoff_factor: Seconds multiplied by the attempt number.

        Returns:
            The decoded JSON response, or ``None`` once every attempt failed
            (the last error is logged).
        """
        headers = {
            'Content-Type': 'application/json',
            'api-token': f'{token}',
        }
        for attempt in range(max_retries + 1):
            try:
                response = requests.post(url, json=data, headers=headers)
                if response.status_code == requests.codes.ok:
                    return response.json()
                raise RequestException(f"Unexpected response status: {response.status_code}")
            except RequestException as e:
                if attempt < max_retries:
                    # Wait a little longer before each new attempt.
                    sleep(backoff_factor * (attempt + 1))
                    continue
                logger.error(f"在尝试{max_retries}次发出POST连接后,请求失败:{e}")
        return None

    def write_weibo(self, weibos):
        """POST *weibos* to the API; an empty batch is skipped."""
        # The payload always carries the 'user' key, so emptiness has to be
        # judged on the batch itself.
        if not weibos:
            logger.info(u'没有获取到微博,略过API POST')
            return
        data = self._update_json_data({}, [w.to_dict() for w in weibos])
        self.send_post_request_with_token(self.api_url, data, self.api_token, 3, 2)
        logger.info(u'%d条微博通过POST发送到 %s', len(weibos), self.api_url)
================================================
FILE: weibo_spider/writer/sqlite_writer.py
================================================
import copy
import logging
import sys
from .writer import Writer
logger = logging.getLogger('spider.sqlite_writer')
class SqliteWriter(Writer):
    """Writer that stores crawled user and weibo data in a SQLite database."""

    # DDL for the 'weibo' table, one SQL line per element.
    _CREATE_WEIBO_TABLE = "\n".join((
        "",
        "CREATE TABLE IF NOT EXISTS weibo (",
        "id varchar(10) NOT NULL,",
        "user_id varchar(12),",
        "content varchar(2000),",
        "article_url varchar(200),",
        "original_pictures varchar(3000),",
        "retweet_pictures varchar(3000),",
        "original BOOLEAN NOT NULL DEFAULT 1,",
        "video_url varchar(300),",
        "publish_place varchar(100),",
        "publish_time DATETIME NOT NULL,",
        "publish_tool varchar(30),",
        "up_num INT NOT NULL,",
        "retweet_num INT NOT NULL,",
        "comment_num INT NOT NULL,",
        "PRIMARY KEY (id)",
        ")",
    ))

    # DDL for the 'user' table, one SQL line per element.
    _CREATE_USER_TABLE = "\n".join((
        "",
        "CREATE TABLE IF NOT EXISTS user (",
        "id varchar(20) NOT NULL,",
        "nickname varchar(30),",
        "gender varchar(10),",
        "location varchar(200),",
        "birthday varchar(40),",
        "description varchar(400),",
        "verified_reason varchar(140),",
        "talent varchar(200),",
        "education varchar(200),",
        "work varchar(200),",
        "weibo_num INT,",
        "following INT,",
        "followers INT,",
        "PRIMARY KEY (id)",
        ")",
    ))

    def __init__(self, sqlite_config):
        """Store ``sqlite_config``, the value later passed to ``sqlite3.connect``."""
        self.sqlite_config = sqlite_config

    def _sqlite_create(self, connection, sql):
        """Execute ``sql`` on ``connection`` and always close the connection."""
        try:
            cursor = connection.cursor()
            cursor.execute(sql)
        finally:
            connection.close()

    def _sqlite_create_table(self, sql):
        """Open a connection to the configured database and run the DDL ``sql``."""
        import sqlite3
        connection = sqlite3.connect(self.sqlite_config)
        self._sqlite_create(connection, sql)

    def _sqlite_insert(self, table, data_list):
        """Insert or replace the rows of ``data_list`` in ``table``.

        Keys whose value is ``None`` are dropped from each row so that the
        column is left unset. Because that can leave different rows with
        different column sets, every row is bound against its *own* column
        list: consecutive rows sharing the same columns are sent in one
        ``executemany`` call, and all batches run in a single transaction.

        Any error rolls the whole transaction back and is logged; it is not
        re-raised.
        """
        import sqlite3
        if not data_list:
            return

        # Build [(columns, [row_values, ...]), ...] preserving input order.
        batches = []
        for data in data_list:
            row = {k: v for k, v in data.items() if v is not None}
            if not row:
                # Nothing to store; "INSERT INTO t() VALUES ()" is invalid SQL.
                continue
            columns = tuple(row)
            values = tuple(row.values())
            if batches and batches[-1][0] == columns:
                batches[-1][1].append(values)
            else:
                batches.append((columns, [values]))

        connection = sqlite3.connect(self.sqlite_config)
        try:
            cursor = connection.cursor()
            for columns, rows in batches:
                sql = """INSERT OR REPLACE INTO {table}({keys}) VALUES ({values})""".format(
                    table=table,
                    keys=', '.join(columns),
                    values=', '.join(['?'] * len(columns)))
                cursor.executemany(sql, rows)
            connection.commit()
        except Exception as e:
            connection.rollback()
            logger.exception(e)
        finally:
            connection.close()

    def write_weibo(self, weibos):
        """Write the crawled weibos to the SQLite ``weibo`` table.

        Reads ``self.user.id``, so ``write_user`` must have been called
        first. The weibos are deep-copied before ``user_id`` is set on them,
        leaving the caller's objects untouched.
        """
        # Create the 'weibo' table if needed.
        self._sqlite_create_table(self._CREATE_WEIBO_TABLE)
        # Insert or update the weibo rows.
        weibo_list = []
        for weibo in copy.deepcopy(weibos):
            weibo.user_id = self.user.id
            weibo_list.append(weibo.to_dict())
        self._sqlite_insert('weibo', weibo_list)
        logger.info(u'%d条微博写入sqlite数据库完毕', len(weibos))

    def write_user(self, user):
        """Write the crawled user information to the SQLite ``user`` table.

        Also stores ``user`` on ``self.user`` for later ``write_weibo`` calls.
        """
        self.user = user
        # Create the 'user' table if needed.
        self._sqlite_create_table(self._CREATE_USER_TABLE)
        self._sqlite_insert('user', [user.to_dict()])
        logger.info(u'%s信息写入sqlite数据库完毕', user.nickname)
================================================
FILE: weibo_spider/writer/txt_writer.py
================================================
import logging
import sys
from .writer import Writer
logger = logging.getLogger('spider.txt_writer')
class TxtWriter(Writer):
    """Writer that appends crawled user and weibo data to a text file."""

    def __init__(self, file_path, filter):
        """Set the output path and the section headers / field labels.

        Args:
            file_path: Path of the text file that is appended to.
            filter: When truthy the weibo section is titled as original
                weibos only; otherwise as weibos in general.
        """
        self.file_path = file_path
        self.user_header = u'用户信息'
        # (attribute name, label) pairs written for the user.
        self.user_desc = [('nickname', '用户昵称'), ('id', '用户id'),
                          ('weibo_num', '微博数'), ('following', '关注数'),
                          ('followers', '粉丝数')]
        if filter:
            self.weibo_header = u'原创微博内容'
        else:
            self.weibo_header = u'微博内容'
        # (attribute name, label) pairs written after each weibo's content.
        self.weibo_desc = [('publish_place', '微博位置'), ('publish_time', '发布时间'),
                           ('up_num', '点赞数'), ('retweet_num', '转发数'),
                           ('comment_num', '评论数'), ('publish_tool', '发布工具')]

    def _encode(self, text):
        """Encode ``text`` for the output file using the console encoding.

        ``sys.stdout`` (or its ``encoding``) can be ``None`` when output is
        detached or replaced, so fall back to UTF-8 in that case. Characters
        the console codec cannot represent (e.g. emoji under GBK) are
        replaced instead of aborting the whole write.
        """
        encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
        return text.encode(encoding, errors='replace')

    def write_user(self, user):
        """Append the user section to the text file and remember ``user``."""
        self.user = user
        user_info = '\n'.join(
            [label + ':' + str(getattr(self.user, attr))
             for attr, label in self.user_desc])
        with open(self.file_path, 'ab') as f:
            f.write(self._encode(self.user_header + ':\n' + user_info + '\n\n'))
        logger.info(u'%s信息写入txt文件完毕,保存路径:%s', self.user.nickname,
                    self.file_path)

    def write_weibo(self, weibo):
        """Append the given weibos to the text file.

        The weibo section header is written once, in front of the first
        batch that is written successfully. Any exception is logged and
        swallowed.
        """
        weibo_header = self.weibo_header + ':\n' if self.weibo_header else ''
        try:
            entries = []
            for w in weibo:
                details = '\n'.join(
                    [label + ':' + str(getattr(w, attr))
                     for attr, label in self.weibo_desc])
                entries.append(w.content + '\n' + details)
            result = '\n\n'.join(entries) + '\n\n'
            with open(self.file_path, 'ab') as f:
                f.write(self._encode(weibo_header + result))
            # Only consume the header once it has really reached the file,
            # so a failed first write does not lose it for later batches.
            self.weibo_header = ''
            logger.info(u'%d条微博写入txt文件完毕,保存路径:%s', len(weibo), self.file_path)
        except Exception as e:
            logger.exception(e)
================================================
FILE: weibo_spider/writer/writer.py
================================================
from abc import ABC, abstractmethod
class Writer(ABC):
    """Abstract base class for result writers.

    Subclasses must implement both ``write_weibo`` and ``write_user``.
    """

    def __init__(self):
        """Initialise result paths, headers, databases, etc. as needed."""

    @abstractmethod
    def write_weibo(self, weibo):
        """Write the given weibo information to the target text or database."""

    @abstractmethod
    def write_user(self, user):
        """Write the given user information to the target text or database."""