Repository: nghuyong/ZhengFang Branch: master Commit: 7752a5235c07 Files: 6 Total size: 15.6 KB Directory structure: gitextract_0mu_khm0/ ├── .gitignore ├── README.md ├── model.py ├── parseHtml.py ├── requirements.txt └── spider.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .idea/ ================================================ FILE: README.md ================================================ # 正方教务系统助手 该项目中的解析代码是2016年编写的,不保证可用于当前你学校的教务系统。代码简单,仅供参考。 ## 1.项目定义 这个项目实现了正方教务系统的一套API: 包括模拟登陆,个人信息查询,课表获取,成绩查询等等。 随着API的不断完善于扩充,可以很方便的作为后台服务。 比如教务系统手机客户端,桌面客户端,也可以作为某些特定应用需要学生课表,信息的后台。 同时这个项目定义为助手,可以继续开发其他便捷的工具: * 自动完成评教任务 * 期末新的成绩公布,邮件通知 * 分学期,分学年绩点计算 * 公选课抢课功能 ## 2.项目结构 1. ZhengFang.db 数据库 2. model.py 模型层,通过ORM与数据库相连 3. spider.py 业务层,网页爬虫,**项目入口** 4. parseHtml.py 业务层,网页解析工具 ## 3.如何使用 ```bash git clone git@github.com:nghuyong/ZhengFang.git cd ZhengFang/ pip install -r requirements.txt # 将spider.py文件中的教务系统地址,账号,密码替换成你自己的 python spider.py ``` ## 4.项目功能 项目均已江南大学正方教务系统为例测试 ### 4.1模拟登陆 登陆有两种方式 1. 默认登陆: 需要处理验证码。将验证码下载到本地。code.jpg。人工识别验证码后,手动输入验证码。实现登陆。 2. 绕过验证码登陆: 由于正方教务系统的漏洞在若存在**default5.asp**页面,可以不用验证码直接登陆。可以从default3,default4,都试一试。 ### 4.2个人信息获取 通过教务系统个人信息页面,抓取,个人信息(真的有很多信息!)并持久化保存到数据库中。 ### 4.3课表获取 通过抓取的个人信息读到学生入学的年份,在结合当前时间,就可以知道能够抓取到哪些学期的课表。 例如学生2014年入学,当前是2016年8月,说明至少可以抓取到: 2014-2015年度 第 1 学期 2014-2015年度 第 2 学期 2015-2016年度 第 1 学期 2015-2016年度 第 2 学期 这4个学期的课表,当然由于现在是2016年8月,可能可以抓取到2016-2017年度第 1 学期课表,可以试着抓取。 将抓取到的课表持久化到数据库中。 ### 4.4成绩查询 通过教务系统成绩查询页面中的“历年成绩”抓取,课程成绩,并持久化到数据库中 ### 4.5分学期分学年绩点计算 根据抓取下来的成绩,通过(总的(学分*绩点)/总的学分)分别计算各个学期的绩点与各个学年的绩点 ================================================ FILE: model.py ================================================ # author: HuYong # coding=utf-8 from peewee import * db = SqliteDatabase('ZhengFang.db') class Student(Model): name = CharField(null=True) # 姓名 urlName = CharField(null=True) # url编码后的姓名 studentnumber = CharField(null=True) # 学号 password = CharField(null=True) # 教务系统密码 idCardNumber = CharField(null=True) # 身份证号 sex = CharField(null=True) # 性别 enterSchoolTime = CharField(null=True) # 入学时间 birthsday = CharField(null=True) # 出生日期 highschool = CharField(null=True) # 毕业中学 nationality = CharField(null=True) # 名族 hometown = CharField(null=True) # 籍贯 politicsStatus = CharField(null=True) # 政治面貌 college = CharField(null=True) # 学院 major = CharField(null=True) # 专业 classname = CharField(null=True) # 所在班级 gradeClass = CharField(null=True) # 年级 class Meta: database = db class ClassSchedule(Model): student = ForeignKeyField(Student, related_name="classSchedule") # 学生 year = CharField(null=True) # 年度 term = IntegerField(null=True) # 学期 class Meta: database = db class Class(Model): schedule = ForeignKeyField(ClassSchedule, related_name="classes") # 归属课表 name = CharField(null=True) # 课程名称 type = CharField(null=True) # 课程性质 timeInTheWeek = CharField(null=True) # 星期几 timeInTheDay = CharField(null=True) # 第几节课 timeInTheTerm = CharField(null=True) # 上课周数 teacher = CharField(null=True) # 授课教师 location = CharField(null=True) # 授课地点 class Meta: database = db class YearGrade(Model): student = ForeignKeyField(Student, related_name="grades") # 归属学生 year = CharField(null=True) # 学年 yearGPA = DoubleField(null=True) # 学年GPA yearCredit = DoubleField(null=True) # 学年总学分 class Meta: database = db class TermGrade(Model): year = ForeignKeyField(YearGrade,related_name="terms") # 归属学年 term = IntegerField(null=True) # 学期 termGPA = DoubleField(null=True) # 学期GPA termCredit = DoubleField(null=True) #学期总学分 class Meta: database = db class OneLessonGrade(Model): term = ForeignKeyField(TermGrade, related_name="lessonsGrades") # 归属学期 name = CharField(null=True) # 课程名 type = CharField(null=True) # 课程性质 credit = DoubleField(null=True) # 学分 gradePoint = DoubleField(null=True) # 绩点 grade = CharField(null=True) # 成绩 class Meta: database = db ================================================ FILE: parseHtml.py ================================================ # author: HuYong # coding=utf-8 from bs4 import BeautifulSoup # 从网页中解析学生信息 def getStudentInfor(response): html = response.content.decode("gb2312") soup = BeautifulSoup(html.decode("utf-8"), "html5lib") d = {} d["studentnumber"] = soup.find(id="xh").string d["idCardNumber"] = soup.find(id="lbl_sfzh").string d["name"] = soup.find(id="xm").string d["sex"] = soup.find(id="lbl_xb").string d["enterSchoolTime"] = soup.find(id="lbl_rxrq").string d["birthsday"] = soup.find(id="lbl_csrq").string d["highschool"] = soup.find(id="lbl_byzx").string d["nationality"] = soup.find(id="lbl_mz").string d["hometown"] = soup.find(id="lbl_jg").string d["politicsStatus"] = soup.find(id="lbl_zzmm").string d["college"] = soup.find(id="lbl_xy").string d["major"] = soup.find(id="lbl_zymc").string d["classname"] = soup.find(id="lbl_xzb").string d["gradeClass"] = soup.find(id="lbl_dqszj").string return d # 从网页中解析课表信息 def getClassScheduleFromHtml(response): html = response.content.decode("gb2312","ignore") soup = BeautifulSoup(html.decode("utf-8"), "html5lib") __VIEWSTATE = soup.findAll(name="input")[2]["value"] trs = soup.find(id="Table1").find_all('tr') classes = [] for tr in trs: tds = tr.find_all('td') for td in tds: if td.string == None: oneClassKeys = ["name", "type", "time", "teacher", "location"] oneClassValues = [] for child in td.children: if child.string != None: oneClassValues.append(child.string) while len(oneClassValues) < len(oneClassKeys): oneClassValues.append("") oneClass = dict((key, value) for key, value in zip(oneClassKeys, oneClassValues)) oneClass["timeInTheWeek"] = oneClass["time"].split("{")[0][:2] oneClass["timeInTheDay"] = oneClass["time"].split("{")[0][2:] oneClass["timeInTheTerm"] = oneClass["time"].split("{")[1][:-1] classes.append(oneClass) return {"classes": classes, "__VIEWSTATE": __VIEWSTATE} def get__VIEWSTATE(response): html = response.content.decode("gb2312") soup = BeautifulSoup(html.decode("utf-8"), "html5lib") __VIEWSTATE = soup.findAll(name="input")[2]["value"] return __VIEWSTATE def getGrade(response): html = response.content.decode("gb2312") soup = BeautifulSoup(html.decode("utf-8"), "html5lib") trs = soup.find(id="Datagrid1").findAll("tr")[1:] Grades = [] for tr in trs: tds = tr.findAll("td") tds = tds[:2] + tds[3:5] + tds[6:9] oneGradeKeys = ["year", "term", "name", "type", "credit","gradePonit","grade"] oneGradeValues = [] for td in tds: oneGradeValues.append(td.string) oneGrade = dict((key, value) for key, value in zip(oneGradeKeys, oneGradeValues)) Grades.append(oneGrade) return Grades ================================================ FILE: requirements.txt ================================================ peewee==3.2.2 requests==2.19.1 beautifulsoup4==4.6.3 lxml==4.2.5 ================================================ FILE: spider.py ================================================ # author: HuYong # coding=utf-8 import os import sys import urllib import datetime import requests from lxml import etree from parseHtml import getClassScheduleFromHtml, getStudentInfor, get__VIEWSTATE, getGrade from model import Student, db, ClassSchedule, Class, YearGrade, OneLessonGrade, TermGrade class ZhengFangSpider: def __init__(self, student, baseUrl="http://202.195.144.168/jndx"): self.student = student self.baseUrl = baseUrl self.session = requests.session() self.session.headers[ 'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36' # 含验证码登陆 def login(self): loginurl = self.baseUrl + "/default2.aspx" response = self.session.get(loginurl) selector = etree.HTML(response.content) __VIEWSTATE = selector.xpath('//*[@id="form1"]/input/@value')[0] imgUrl = self.baseUrl + "/CheckCode.aspx?" imgresponse = self.session.get(imgUrl, stream=True) image = imgresponse.content DstDir = os.getcwd() + "\\" print("保存验证码到:" + DstDir + "code.jpg" + "\n") try: with open(DstDir + "code.jpg", "wb") as jpg: jpg.write(image) except IOError: print("IO Error\n") finally: jpg.close() code = input("验证码是:") RadioButtonList1 = u"学生".encode('gb2312', 'replace') data = { "RadioButtonList1": RadioButtonList1, "__VIEWSTATE": __VIEWSTATE, "TextBox1": self.student.studentnumber, "TextBox2": self.student.password, "TextBox3": code, "Button1": "", "lbLanguage": "" } # 登陆教务系统 Loginresponse = self.session.post(loginurl, data=data) if Loginresponse.status_code == requests.codes.ok: print("成功进入教务系统!") # 绕过验证码登陆 def loginWithOutCode(self): loginurl = self.baseUrl + "/default5.aspx" response = self.session.get(loginurl) selector = etree.HTML(response.content) __VIEWSTATE = selector.xpath('//*[@id="form1"]/input/@value')[0] RadioButtonList1 = u"学生".encode('gb2312', 'replace') data = { "RadioButtonList1": RadioButtonList1, "__VIEWSTATE": __VIEWSTATE, "TextBox1": self.student.studentnumber, "TextBox2": self.student.password, "Button1": "", } # 登陆教务系统 Loginresponse = self.session.post(loginurl, data=data) if Loginresponse.status_code == requests.codes.ok: print("成功进入教务系统!") # 获取学生基本信息 def getStudentBaseInfo(self): self.session.headers['Referer'] = self.baseUrl + "/xs_main.aspx?xh=" + self.student.studentnumber url = self.baseUrl + "/xsgrxx.aspx?xh=" + self.student.studentnumber + "&" response = self.session.get(url) d = getStudentInfor(response) self.student.idCardNumber = d["idCardNumber"] self.student.name = d["name"] self.student.urlName = urllib.quote_plus(str(d["name"].encode('gb2312'))) self.student.sex = d["sex"] self.student.enterSchoolTime = d["enterSchoolTime"] self.student.birthsday = d["birthsday"] self.student.highschool = d["highschool"] self.student.nationality = d["nationality"] self.student.hometown = d["hometown"] self.student.politicsStatus = d["politicsStatus"] self.student.college = d["college"] self.student.major = d["major"] self.student.classname = d["classname"] self.student.gradeClass = d["gradeClass"] self.student.save() print("读取学生基本信息成功") # 获取学生课表 def getClassSchedule(self): self.session.headers['Referer'] = self.baseUrl + "/xs_main.aspx?xh=" + self.student.studentnumber url = self.baseUrl + "/xskbcx.aspx?xh=" + self.student.studentnumber + "&xm=" + self.student.urlName + "&gnmkdm=N121603" response = self.session.get(url, allow_redirects=False) __VIEWSTATE = getClassScheduleFromHtml(response)["__VIEWSTATE"] year = int(self.student.gradeClass) term = 1 today = datetime.date.today() while today.year > year or (today.year == year and today.month >= 7 and term == 1): data = { "__EVENTTARGET": "xqd", "__EVENTARGUMENT": "", "__VIEWSTATE": __VIEWSTATE, "xnd": str(year) + "-" + str(year + 1), "xqd": str(term), } self.session.headers['Referer'] = url response = self.session.post(url, data) print("正在获取" + str(year) + "-" + str(year + 1) + "学年第" + str(term) + "学期课表") classes = getClassScheduleFromHtml(response)["classes"] __VIEWSTATE = getClassScheduleFromHtml(response)["__VIEWSTATE"] classSchedule = ClassSchedule(student=self.student, year=str(year) + "-" + str(year + 1), term=term) classSchedule.save() for each in classes: oneClass = Class(schedule=classSchedule, name=each["name"], type=each["type"], timeInTheWeek=each["timeInTheWeek"], timeInTheDay=each["timeInTheDay"], timeInTheTerm=each["timeInTheTerm"], teacher=each["teacher"], location=each["location"] ) oneClass.save() term = term + 1 if term > 2: term = 1 year = year + 1 print("成功获取课表") # 获取学生绩点 def getStudentGrade(self): url = self.baseUrl + "/xscjcx.aspx?xh=" + self.student.studentnumber + "&xm=" + self.student.urlName + "&gnmkdm=N121605" self.session.headers['Referer'] = self.baseUrl + "/xs_main.aspx?xh=" + self.student.studentnumber response = self.session.get(url) __VIEWSTATE = get__VIEWSTATE(response) self.session.headers['Referer'] = url data = { "__EVENTTARGET": "", "__EVENTARGUMENT": "", "__VIEWSTATE": __VIEWSTATE, 'hidLanguage': "", "ddlXN": "", "ddlXQ": "", "ddl_kcxz": "", "btn_zcj": u"历年成绩".encode('gb2312', 'replace') } response = self.session.post(url, data=data) grades = getGrade(response) for onegrade in grades: year = onegrade["year"] term = onegrade["term"] try: yearGrade = YearGrade.get(YearGrade.year == year, YearGrade.student == self.student) except: yearGrade = YearGrade(year=year, student=self.student) yearGrade.save() try: termGrade = TermGrade.get(TermGrade.year == yearGrade, TermGrade.term == int(term)) except: termGrade = TermGrade(year=yearGrade, term=int(term)) termGrade.save() try: gradePoint = float(onegrade["gradePonit"]) except: gradePoint = None oneLessonGrade = OneLessonGrade(term=termGrade, name=onegrade["name"], type=onegrade["type"], credit=float(onegrade["credit"]), gradePoint=gradePoint, grade=onegrade["grade"]) oneLessonGrade.save() print("获取成绩成功") # 计算每学期,每学年的绩点 def calculateOneTermAndOneYearGPA(self): years = self.student.grades for year in years: terms = year.terms for term in terms: sumCredit = 0.0 sumGrade = 0.0 grades = term.lessonsGrades for grade in grades: if grade.gradePoint is None: continue sumGrade = sumGrade + (grade.credit * grade.gradePoint) sumCredit = sumCredit + grade.credit termGPA = float('%.2f' % (sumGrade / sumCredit)) term.termGPA = termGPA term.termCredit = sumCredit term.save() sumGrade = 0.0 sumCredit = 0.0 for term in terms: sumGrade += term.termGPA * term.termCredit sumCredit += term.termCredit year.yearGPA = float('%.2f' % (sumGrade / sumCredit)) year.yearCredit = sumCredit year.save() print("绩点计算完毕") if __name__ == "__main__": # 连接数据库,建立数据表 try: db.connect() db.create_tables([Student, ClassSchedule, Class, YearGrade, TermGrade, OneLessonGrade]) except: pass # 查找学生,若不存在则创建账号 try: student = Student.get(Student.studentnumber == "xxxxxxxx") except Exception as e: student = Student(studentnumber="xxxxxxxx", password="xxxxxxxxx") # 用自己的教务系统账号密码 student.save() spider = ZhengFangSpider(student, baseUrl="http://202.195.144.168/jndx") # 实例化爬虫 spider.loginWithOutCode() if student.name is None: spider.getStudentBaseInfo() spider.getStudentGrade() spider.calculateOneTermAndOneYearGPA() spider.getClassSchedule()