finish the home work

This commit is contained in:
2020-03-25 11:34:46 +08:00
commit c7fab80594
9 changed files with 643 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
.vscode
*.pyc
config.json
out.txt
*.zip

277
GUI.py Normal file
View File

@@ -0,0 +1,277 @@
import tkinter as tk
import thread
class EntryWithLabel():
    """A text entry with a caption on its left, grouped in one frame.

    The entry is bound to a ``tk.Variable`` stored in ``self.var`` so the
    content can be read, written and traced by callers.
    """

    def __init__(self, root, labelValue, default, **kw):
        """Create the widgets without placing them on screen.

        Args:
            root: parent widget that owns the internal frame.
            labelValue: caption shown by the label.
            default: initial content of the entry.
            **kw: extra options forwarded unchanged to ``tk.Entry``.
        """
        self.root, self.labelValue, self.default = root, labelValue, default
        container = tk.Frame(root)
        self.frame = container
        self.label = tk.Label(container, text=labelValue)
        backing = tk.Variable()
        backing.set(default)
        self.var = backing
        self.entry = tk.Entry(container, textvariable=backing, **kw)

    def pack(self, **kw):
        """Place label (left) and entry (right), then pack the frame.

        ``**kw`` is forwarded to the frame's ``pack``. Returns ``self`` so
        calls can be chained.
        """
        for widget, side in ((self.label, tk.LEFT), (self.entry, tk.RIGHT)):
            widget.pack(side=side)
        self.frame.pack(**kw)
        return self

    def get(self):
        """Return the current value of the bound variable."""
        current = self.var.get()
        return current

    def set(self, value):
        """Store ``value`` in the bound variable and return its result."""
        outcome = self.var.set(value)
        return outcome
class LabelWithReadOnlyText():
    """A caption label followed by a second label that displays a value.

    The displayed value is kept in ``self.var`` (a plain Python object, not
    a Tk variable) and mirrored into the right-hand label.
    """

    def __init__(self, root, labelValue, default):
        """Create the two labels inside a new frame owned by ``root``.

        Args:
            root: parent widget.
            labelValue: caption text.
            default: value shown initially.
        """
        self.root, self.labelValue = root, labelValue
        self.var = default
        holder = tk.Frame(root)
        self.frame = holder
        self.label = tk.Label(holder, text=labelValue)
        self.text = tk.Label(holder, text=self.var)

    def pack(self, **kw):
        """Pack caption left and value right, then pack the frame with ``**kw``.

        Returns ``self`` to allow chaining.
        """
        for widget, side in ((self.label, tk.LEFT), (self.text, tk.RIGHT)):
            widget.pack(side=side)
        self.frame.pack(**kw)
        return self

    def set(self, value):
        """Remember ``value`` and show it in the value label."""
        self.var = value
        shown = self.var
        self.text.config(text=shown)

    def get(self):
        """Return the value most recently stored."""
        stored = self.var
        return stored
class ProgressBar():
    """A titled horizontal progress bar drawn on a canvas, with a caption.

    The bar consists of a white outlined base rectangle and a blue fill
    rectangle whose right edge is moved by ``setProgress``.
    """

    def __init__(self, root, labelValue, width=200, height=22):
        """Create the widgets without placing them.

        Args:
            root: parent widget.
            labelValue: title shown left of the bar.
            width: canvas width in pixels (default 200, the original size).
            height: canvas height in pixels (default 22, the original size).
        """
        self.width = width
        self.height = height
        self.frame = tk.Frame(root)
        self.label_title = tk.Label(self.frame, text=labelValue)
        self.canvas = tk.Canvas(self.frame, width=width, height=height, bg="white")
        self.base_line = self.canvas.create_rectangle(
            2, 2, width, height + 1, width=1, outline="black", fill="white")
        self.fill_line = self.canvas.create_rectangle(
            2, 2, 0, height + 1, width=0, fill="blue")
        self.label_progress = tk.Label(self.frame, width=10)
        self.progress = 0

    def setProgress(self, progress, text):
        """Move the fill to ``progress`` and optionally update the caption.

        Args:
            progress: fraction of the bar to fill; clamped to ``[0, 1]``.
            text: new caption; a falsy value leaves the caption unchanged.
        """
        if text:
            self.label_progress.config(text=text)
        # Resizing the existing fill rectangle is enough to shrink the bar.
        # Creating fresh rectangles on every decrease leaked two canvas items
        # per timer cycle.
        progress = min(max(progress, 0), 1)
        self.progress = progress
        self.canvas.coords(
            self.fill_line, (0, 0, self.width * progress, self.height + 8))
        self.frame.update()

    def pack(self, **kw):
        """Pack title, canvas and caption left to right, then the frame.

        ``**kw`` is forwarded to the frame's ``pack``. Returns ``self``.
        """
        self.label_title.pack(side=tk.LEFT)
        self.canvas.pack(side=tk.LEFT)
        self.label_progress.pack(side=tk.LEFT)
        self.frame.pack(**kw)
        return self
class BaseInforFrame():
    """Panel with the student number, derived task, and machine settings.

    Write traces on the three entries sanitise what the user types: the
    inputs are reduced to digits, ``B`` and the task label are derived from
    the student number, and the machine number is kept inside
    ``1..total``.
    """

    def __init__(self, root):
        """Build the widgets inside a ``LabelFrame`` owned by ``root``."""
        self.labelframe = tk.LabelFrame(root, text="基本信息")
        self.entry_number = EntryWithLabel(self.labelframe, "学号", "2017210281")
        self.label_b = LabelWithReadOnlyText(self.labelframe, "B:", 3)
        self.label_task = LabelWithReadOnlyText(self.labelframe, "任务:", "B")
        self.entry_no = EntryWithLabel(self.labelframe, "本机编号:", 1)
        self.entry_total = EntryWithLabel(self.labelframe, "总机器数量:", 1)
        # trace_add("write", ...) replaces the deprecated trace("w", ...).
        self.entry_number.var.trace_add("write", self._updateNumber)
        self.entry_no.var.trace_add("write", self._updateNo)
        self.entry_total.var.trace_add("write", self._updateTotal)

    def disable(self):
        """Make the three entries read-only."""
        for field in (self.entry_no, self.entry_number, self.entry_total):
            field.entry.config(state="disable")

    def normal(self):
        """Make the three entries editable again."""
        for field in (self.entry_no, self.entry_number, self.entry_total):
            field.entry.config(state="normal")

    def pack(self, **kw):
        """Pack all child widgets, then the frame with ``**kw``; returns ``self``."""
        self.entry_number.pack()
        self.label_b.pack()
        self.label_task.pack()
        self.entry_no.pack()
        self.entry_total.pack()
        self.labelframe.pack(**kw)
        return self

    def _bToTask(self, b):
        """Map ``b`` to a task letter: ``"B"`` when ``b > 2``, else ``"A"``."""
        return "B" if b > 2 else "A"

    def inputToInt(self, entry):
        """Return the digits of ``entry.get()`` as an int, or ``None`` if there are none."""
        digits = ''.join(ch for ch in str(entry.get()) if ch in '0123456789')
        if digits == '':
            return None
        return int(digits)

    def _updateNumber(self, *args):
        """Trace callback: normalise the student number and refresh B and the task."""
        number = self.inputToInt(self.entry_number)
        if number is None:
            return
        try:
            self.entry_number.set(number)
            b = number % 19
            self.label_b.set(b)
            self.label_task.set(self._bToTask(b))
        except tk.TclError:
            # Best effort: a failing widget update must not break typing.
            pass

    def _updateNo(self, *args):
        """Trace callback: keep the machine number within ``1..total``."""
        no = self.inputToInt(self.entry_no)
        total = self.inputToInt(self.entry_total)
        if no is None or total is None:
            return
        if no > total:
            self.entry_no.set(total)
        elif no < 1:
            self.entry_no.set(1)
        else:
            self.entry_no.set(no)

    def _updateTotal(self, *args):
        """Trace callback: keep the machine total at least 1.

        The machine number is re-validated afterwards, because lowering the
        total can leave it above the new limit.
        """
        total = self.inputToInt(self.entry_total)
        if total is None:
            return
        if total < 1:
            total = 1
        self.entry_total.set(total)
        self._updateNo()
class SpiderStatus():
    """Scrolling text panel that shows crawl results line by line."""

    def __init__(self, root):
        """Create a 100x20 text widget inside a ``LabelFrame`` owned by ``root``."""
        box = tk.LabelFrame(root, text="爬取结果")
        self.labelframe = box
        self.text_info = tk.Text(box, width=100, height=20)

    def pack(self, **kw):
        """Pack the text widget, then the frame with ``**kw``."""
        self.text_info.pack()
        self.labelframe.pack(**kw)

    def set(self, text):
        """Append ``text`` plus a newline and scroll so the end is visible."""
        log = self.text_info
        line = text + "\n"
        log.insert(tk.END, line)
        log.see(tk.END)
class Timmer():
    """Timer panel: two progress bars, delay and limit entries, start/pause buttons.

    State is exposed through plain attributes: ``start`` and ``pause``
    (booleans), ``sleeptime`` (seconds between steps) and ``maxoffset``
    (last offset to crawl).
    """

    def __init__(self, root, start, stop):
        """Build the widgets.

        Args:
            root: parent widget.
            start: callable invoked when the timer is started.
            stop: callable invoked when the timer is stopped.
        """
        self.pause = False
        self.start = False
        self.sleeptime = 4.0
        self.maxoffset = 212
        self.startP = start
        self.stop = stop
        self.labelframe = tk.LabelFrame(root, text="计时器")
        self.progress_each = ProgressBar(self.labelframe, "每次计时:")
        self.progress_totle = ProgressBar(self.labelframe, "总进度:")
        self.entry_sleepTime = EntryWithLabel(self.labelframe, "每次延时(s):", self.sleeptime)
        self.entry_maxOffset = EntryWithLabel(self.labelframe, "最大偏移:", self.maxoffset)
        self.btnframe = tk.Frame(self.labelframe)
        self.button_startStop = tk.Button(self.btnframe, text="开始", command=self.startStop)
        self.button_pause = tk.Button(self.btnframe, text="暂停", command=self._pause, state="disable")
        # trace_add("write", ...) replaces the deprecated trace("w", ...).
        self.entry_sleepTime.var.trace_add("write", self._updateSleepTime)
        self.entry_maxOffset.var.trace_add("write", self._updateMaxOffset)

    def pack(self, **kw):
        """Pack all children, then the frame with ``**kw``.

        Returns ``self`` for chaining, like the other widgets in this file.
        """
        self.progress_each.pack()
        self.progress_totle.pack()
        self.entry_sleepTime.pack()
        self.entry_maxOffset.pack()
        self.button_startStop.pack(side=tk.LEFT)
        self.button_pause.pack(side=tk.LEFT)
        self.btnframe.pack()
        self.labelframe.pack(**kw)
        return self

    def inputToInt(self, entry):
        """Return the digits of ``entry.get()`` as an int, or ``None`` if there are none."""
        digits = ''.join(ch for ch in str(entry.get()) if ch in '0123456789')
        if digits == '':
            return None
        return int(digits)

    def inputToFloat(self, entry):
        """Return the numeric part of ``entry.get()`` as a float.

        Only digits and dots are kept, and everything from the second dot
        onwards is dropped. Returns ``None`` when nothing numeric remains,
        for example for ``""`` or a lone ``"."``.
        """
        cleaned = ''.join(ch for ch in str(entry.get()) if ch in '0123456789.')
        if cleaned == '':
            return None
        if cleaned.count('.') > 1:
            first = cleaned.find('.')
            second = cleaned.find('.', first + 1)
            cleaned = cleaned[:second]
        try:
            return float(cleaned)
        except ValueError:
            # "." or ".." survive the filter but are not numbers.
            return None

    def _updateSleepTime(self, *args):
        """Trace callback: store the sanitised delay and write it back."""
        sleeptime = self.inputToFloat(self.entry_sleepTime)
        if sleeptime is None:
            return
        self.sleeptime = sleeptime
        self.entry_sleepTime.set(self.sleeptime)

    def _updateMaxOffset(self, *args):
        """Trace callback: store the new limit and rescale the total bar."""
        maxoffset = self.inputToInt(self.entry_maxOffset)
        if maxoffset is None:
            return
        self.maxoffset = maxoffset
        try:
            done = int(str(self.progress_totle.label_progress.cget("text")))
        except ValueError:
            done = None  # caption is not a number (e.g. still empty)
        if done is not None:
            # A limit of 0 would divide by zero; show a full bar instead.
            ratio = done / maxoffset if maxoffset else 1
            self.progress_totle.setProgress(ratio, None)
        self.entry_maxOffset.set(self.maxoffset)

    def startStop(self):
        """Handle the start/stop button: stop, start or resume by current state."""
        if self.start and not self.pause:  # stop
            self.start = False
            self.pause = False
            self.stop()
            self.button_startStop.config(text="开始")
            self.button_pause.config(state="disable")
        elif not self.start and not self.pause:  # start
            self.start = True
            self.pause = False
            self.startP()
            self.button_startStop.config(text="停止")
            self.button_pause.config(state="normal")
        elif self.start and self.pause:  # resume
            self.start = True
            self.pause = False
            self.button_startStop.config(text="停止")
            self.button_pause.config(state="normal")

    def _pause(self):
        """Handle the pause button: mark paused and turn start/stop into "resume"."""
        self.button_pause.config(state="disable")
        self.button_startStop.config(text="继续")
        self.pause = True
class MainGui():
    """Top-level window wiring the info panel, timer panel and result log.

    ``backend`` is the spider object. This class reads ``backend.config``,
    calls ``backend.getOffset()`` and sets ``backend.b`` / ``backend.task``.
    """

    def __init__(self, backend):
        """Create the root window and the three panels (not yet packed)."""
        self.root = tk.Tk()
        self.root.title("Web搜索技术第一次作业")
        self.backend = backend
        self.thread = None
        self.topFrame = tk.Frame(self.root)
        self.baseinfo = BaseInforFrame(self.topFrame)
        self.timmer = Timmer(self.topFrame, self.startThread, self.stopThread)
        self.spiderstatus = SpiderStatus(self.root)

    def start(self):
        """Lay out the panels, show the saved progress and run the Tk main loop."""
        self.baseinfo.pack(side=tk.LEFT)
        self.timmer.pack(side=tk.LEFT)
        self.topFrame.pack()
        self.spiderstatus.pack()
        offset = self.backend.getOffset()
        self.timmer.progress_totle.setProgress(offset / self.timmer.maxoffset, str(offset))
        self.root.protocol("WM_DELETE_WINDOW", self.on_closing)
        self.root.mainloop()

    def on_closing(self):
        """Stop the worker thread, if any, and destroy the window."""
        worker = self.thread
        if worker and worker.is_alive():
            worker.terminate()
            try:
                thread.stop_thread(worker)
            except (ValueError, SystemError):
                # The worker exited between is_alive() and the async raise;
                # that must not keep the window from closing.
                pass
            worker.join()
        self.root.destroy()

    def startThread(self):
        """Persist the settings from the info panel and start a worker thread."""
        self.baseinfo.disable()
        settings = self.backend.config.data
        fields = (
            ("number", self.baseinfo.entry_number),
            ("totle", self.baseinfo.entry_total),
            ("no", self.baseinfo.entry_no),
        )
        for key, entry in fields:
            value = self.baseinfo.inputToInt(entry)
            # An empty entry yields None. Keep the previous setting instead of
            # persisting None, which would break getOffset() on the next run.
            if value is not None:
                settings[key] = value
        self.backend.config.saveData()
        self.backend.b = self.baseinfo.label_b.get()
        self.backend.task = self.baseinfo.label_task.get()
        self.thread = thread.TimmerThread(self.timmer, self.spiderstatus, self.backend)
        self.thread.start()

    def stopThread(self):
        """Re-enable the info entries.

        This method does not stop the worker itself.
        """
        self.baseinfo.normal()

78
README.md Normal file
View File

@@ -0,0 +1,78 @@
Web搜索技术 第一次作业 网站信息精准采集技术
===
## 前言
姑且这就算做实验报告吧。首先由于被当年信通院的OJ的各种奇怪问题惊艳到而产生心理阴影我希望我的程序可以被**人工审核**。
## 需求分析
网站信息精准采集程序的要求
- [x] 采集想要的内容,不多不少不重复 ~ 完备性
- [x] 对网站不构成负担 (性能压力不大)~ 轻量采集
- [x] 出现异常情况 ~ 自行停止程序
- [ ] 程序异常退出 ~ 自动重启
- [x] 断点续采 ~ 鲁棒性
- [x] 精准采集 ~ 正则表达式
- [x] 采集过程信息及时展示 ~ 界面友好
- [x] 采集结果实时存储 ~ 写文件
- [x] 允许多个机器同时运行，采集不同的内容 ~ 并行性（所有机器采集的内容共同构成完备集）。
## 技术实现
为了可以达到“界面友好”的需求使用PyTk实现了一个GUI因此整个程序分为两大部分
- UI 负责处理GUI交互
- Spider负责爬虫相关逻辑
程序运行时会涉及两个线程:
- Mainloop：GUI 主循环，负责处理点击事件以及页面绘制
- TimmerThread：子线程，负责计时并发送网络请求
为了降低代码耦合度,抽离如下模块:
- errors.py 定义各种自定义错误
- GUI.py 主要的GUI方面的代码包括所有的子组件的实现
- main.py 程序入口
- spider.py 爬虫相关代码
- thread.py 计时器子线程的代码子线程中调用GUI相关组件更新状态
- utils.py 包括正则匹配,格式化输入输出的一些工具类
## 具体使用
测试平台Windows 10 专业版 1909 18363.752
Python版本：3.7.3
安装依赖:
```bash
pip install -r requirements.txt
```
运行:
```bash
python main.py
```
填写学号并配置多机器(可选),点击开始即可。
中途可自由暂停、继续、停止、关闭窗口,基本不会引起文件保存错误,并且可以继续之前的进度进行爬取。
个人认为程序异常退出还要重启有点问题,会导致程序无法正常关闭,无论如何关闭,对于程序来说都是发生了一个异常,所以无论如何都会重启,并非爬虫的本分,所以未作实现。
由于本人是任务B所以对于任务A的并未做任何实现但是考虑到接口类似仅仅声明了接口如果需要可以对其进行实现即可
## 目前的问题
有一些符号，例如 '•'(\u2022) 和 ' '(\xa0)，不存在 GBK 对应符号，因此使用空格进行替换。
这也是为什么我对于自动判别持怀疑态度。
## 明知存在问题为什么还要早早交
判别程序的问题不是我的责任,我认为我已经完成了规定的项目,我既然大晚上十分兴奋写完,一定要提交一下。见识一下自动判决会出什么问题。

27
errors.py Normal file
View File

@@ -0,0 +1,27 @@
class SpiderError(RuntimeError):
    """Base class of all crawler errors; renders as a generic message."""

    _MESSAGE = "一个未知的爬虫错误"

    def __init__(self):
        """Initialise the underlying ``RuntimeError`` without arguments."""
        super().__init__()

    def __str__(self):
        """Return the fixed generic error text."""
        return self._MESSAGE
class SpiderTaskTypeError(SpiderError):
    """Raised for a task type the spider does not support."""

    _TEMPLATE = "Task的类型: %s 不受支持"

    def __init__(self, task):
        """Remember the offending ``task`` value."""
        super().__init__()
        self.task = task

    def __str__(self):
        """Return the message with the task value filled in."""
        return self._TEMPLATE % self.task
class SpiderNotFond(SpiderError):
    """Raised when the page for a given id does not exist."""

    def __init__(self, id):
        """Remember the ``id`` of the missing page."""
        super().__init__()
        self.id = id

    def __str__(self):
        """Return the message naming the missing id."""
        page_id = str(self.id)
        return f"id: {page_id} 的页面并不存在"
class SpiderReFailed(SpiderError):
    """Raised when the regular expressions fail to extract id/title from a page."""

    def __init__(self, id, error):
        """Remember the page ``id`` and the underlying ``error``."""
        super().__init__()
        self.id, self.err = id, error

    def __str__(self):
        """Return the message: page id on the first line, cause on the next."""
        headline = "id: %s 的页面在正则识别id/title的时候出错, 具体错误如下:" % str(self.id)
        detail = str(self.err)
        return headline + "\n" + detail

13
main.py Normal file
View File

@@ -0,0 +1,13 @@
#-*- coding: UTF-8 -*-
__author__ = 'Xice <admin@xice.wang>'
__data__ = '2020-03-24'
import GUI
import spider
def _launch(config_path="./config.json"):
    """Load the checkpoint/config file, build the spider and run the GUI."""
    checkpoint = spider.CheckPointAndConfig(config_path)
    crawler = spider.Spider(checkpoint)
    GUI.MainGui(crawler).start()


if __name__ == "__main__":
    _launch()

1
requirements.txt Normal file
View File

@@ -0,0 +1 @@
requests==2.21.0

67
spider.py Normal file
View File

@@ -0,0 +1,67 @@
import requests
import html
import unicodedata
from utils import ReForTaskB, ReForTaskA, OutPutFile, CheckPointAndConfig
from errors import SpiderNotFond, SpiderTaskTypeError, SpiderReFailed
class Spider():
    """Fetches one project page per step and appends the parsed record to a file.

    Progress is tracked through ``config.data['checkpoint']``; together with
    ``'totle'`` (number of machines) and ``'no'`` (this machine's number) it
    determines the offset of the next page.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }
    # Seconds to wait for the server. Without a timeout a stalled connection
    # blocks the worker thread forever.
    timeout = 10

    def __init__(self, config):
        """Bind the config object and open the GBK output file wrapper."""
        self.config = config
        self.out = OutPutFile(config.data["outputFile"], "gbk")
        self.b = 3
        self.task = "B"

    def getOffset(self):
        """Return ``checkpoint * totle + no - 1`` from the config data."""
        data = self.config.data
        return data['checkpoint'] * data['totle'] + data['no'] - 1

    urlTaskB = "https://win.bupt.edu.cn/program.do?id=%d"
    staticReTaskB = ReForTaskB()

    def _doOneStepOnTaskB(self):
        """Download and parse the page for the current offset.

        Returns:
            The formatted record text.

        Raises:
            SpiderNotFond: the page reports that the project does not exist.
            SpiderReFailed: a mandatory field could not be matched.
            requests.RequestException: the request failed or timed out.
        """
        pid = (self.b - 3) * 150 + 1 + self.getOffset()
        r = requests.get(self.urlTaskB % pid, headers=self.headers, timeout=self.timeout)
        text = html.unescape(r.text)
        # These two characters have no GBK equivalent; replace them with spaces.
        text = text.replace('\u2022', ' ').replace('\xa0', ' ')
        if '抱歉,没有该项目' in text:
            raise SpiderNotFond(pid)
        try:
            return self.staticReTaskB.search(text).display()
        except AttributeError as e:
            raise SpiderReFailed(pid, e) from e

    urlTaskA = "https://win.bupt.edu.cn/project.do?next=collectlist&p=%d"
    staticReTaskA = ReForTaskA()

    def _doOneStepOnTaskA(self):
        """Placeholder for task A; not implemented."""
        pass

    def doOneStep(self):
        """Crawl one page, store the record and advance the checkpoint.

        The checkpoint advances after a success and after every failure that
        concerns the page itself (missing page, parse or write error), as
        before. It does not advance when the page was never fetched, that is
        for an unsupported task type or a network error, so the page is
        retried instead of being silently skipped.

        Returns:
            The record text that was written.

        Raises:
            SpiderTaskTypeError: ``self.task`` is not ``"B"``.
            Whatever ``_doOneStepOnTaskB`` or the output file raise.
        """
        advance = True
        try:
            if self.task != "B":
                raise SpiderTaskTypeError(self.task)
            res = self._doOneStepOnTaskB()
            self.out.write(res)
            return res
        except (SpiderTaskTypeError, requests.RequestException):
            advance = False
            raise
        finally:
            if advance:
                self.config.data['checkpoint'] += 1
                self.config.saveData()
if __name__ == "__main__":
    # Command-line mode: crawl without the GUI, printing each record.
    import time

    # Same defaults the GUI timer uses. Without a bound the loop never ends
    # once the pages run out, and without a delay it hammers the site.
    MAX_OFFSET = 212
    DELAY_SECONDS = 4.0

    c = CheckPointAndConfig("./config.json")
    s = Spider(c)
    while s.getOffset() <= MAX_OFFSET:
        try:
            res = s.doOneStep()
            print(res)
        except (SpiderTaskTypeError, SpiderReFailed) as e:
            print(str(e))
            break
        except SpiderNotFond as e:
            # A missing page is expected; report it and go on.
            print(str(e))
        except Exception as e:
            print("未知错误:\n" + str(e))
            break
        time.sleep(DELAY_SECONDS)

87
thread.py Normal file
View File

@@ -0,0 +1,87 @@
import threading
import time
from errors import SpiderTaskTypeError,SpiderNotFond,SpiderReFailed
import inspect
import ctypes
def _async_raise(tid, exctype):
"""raises the exception, performs cleanup if needed"""
tid = ctypes.c_long(tid)
if not inspect.isclass(exctype):
exctype = type(exctype)
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
if res == 0:
raise ValueError("invalid thread id")
elif res != 1:
# """if it returns a number greater than one, you're in trouble,
# and you should call it again with exc=NULL to revert the effect"""
ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
raise SystemError("PyThreadState_SetAsyncExc failed")
def stop_thread(thread):
    """Request ``thread`` to stop by raising ``RuntimeError`` inside it."""
    target_id = thread.ident
    _async_raise(target_id, RuntimeError)
class TimmerThread(threading.Thread):
    """Worker thread: counts the delay, then asks the backend for one step.

    The timer panel, status panel and backend are reached through accessor
    methods that raise ``RuntimeError`` once ``terminate()`` was called, which
    unwinds ``run``.

    NOTE(review): the panels are Tk widgets updated from this thread; confirm
    that this is safe with the Tk build in use.
    """

    def __init__(self, timmer, status, backend):
        """Store the collaborators and mark the thread as running."""
        super().__init__()
        self._timmer = timmer
        self._status = status
        self._backend = backend
        self._running = True

    def terminate(self):
        """Ask the thread to stop; later accessor calls raise ``RuntimeError``."""
        self._running = False

    def timmer(self):
        """Return the timer panel, or raise ``RuntimeError`` after ``terminate()``."""
        if not self._running:
            raise RuntimeError()
        return self._timmer

    def status(self):
        """Return the status panel, or raise ``RuntimeError`` after ``terminate()``."""
        if not self._running:
            raise RuntimeError()
        return self._status

    def backend(self):
        """Return the backend, or raise ``RuntimeError`` after ``terminate()``."""
        if not self._running:
            raise RuntimeError()
        return self._backend

    @staticmethod
    def _ratio(done, limit):
        """Return ``done / limit``, or 1 (a full bar) when ``limit`` is zero."""
        try:
            return done / limit
        except ZeroDivisionError:
            return 1

    def _show_total_progress(self):
        """Refresh the total progress bar from the backend's current offset."""
        offset = self.backend().getOffset()
        self.timmer().progress_totle.setProgress(
            self._ratio(offset, self.timmer().maxoffset), str(offset))

    def _stop_timer(self):
        """Put the timer panel into its stopped state.

        ``startStop()`` toggles by state, so calling it blindly would resume
        a timer the user paused meanwhile, or restart one already stopped.
        """
        timer = self.timmer()
        if timer.start:
            timer.pause = False
            timer.startStop()

    def run(self):
        """Tick every 0.02 s; after each full delay perform one crawl step.

        The loop ends when the timer is stopped, the offset passes
        ``maxoffset``, a step fails with anything other than a missing page,
        or ``terminate()`` was called.
        """
        elapsed = 0.0
        try:
            while self._running and self.timmer().start:
                if self.backend().getOffset() > self.timmer().maxoffset:
                    self._show_total_progress()
                    self.status().set("爬取结束")
                    self._stop_timer()
                    return
                time.sleep(0.02)
                if self.timmer().pause:
                    continue
                elapsed += 0.02
                self.timmer().progress_each.setProgress(
                    self._ratio(elapsed, self.timmer().sleeptime),
                    str(round(elapsed, 2)) + "s")
                if elapsed > self.timmer().sleeptime:
                    self._show_total_progress()
                    elapsed = 0.0
                    try:
                        res = self.backend().doOneStep()
                        self.status().set(res)
                    except SpiderNotFond as e:
                        # A missing page is expected; report it and go on.
                        self.status().set(str(e))
                    except (SpiderTaskTypeError, SpiderReFailed) as e:
                        self.status().set(str(e))
                        self._stop_timer()
                        return
                    except UnicodeEncodeError as e:
                        self.status().set("编码错误:\n" + str(e))
                        self._stop_timer()
                        return
                    except Exception as e:
                        self.status().set("未知错误:\n" + str(e))
                        self._stop_timer()
                        return
        except RuntimeError:
            # Raised by the accessors after terminate(), or injected from
            # outside to stop the thread.
            return

87
utils.py Normal file
View File

@@ -0,0 +1,87 @@
import re
import os
import json
class _resInTextForTaskB():
def __init__(self):
self.id = ""
self.title = ""
self.en_title = ""
self.college = ""
self.info = ""
self.source = ""
def __str__(self):
return self.display()
def display(self):
res = ""
res += "id:%s\n" % self.id
res += "title:%s\n" % self.title
res += "en_title:%s\n" % self.en_title
res += "college:%s\n" % self.college
res += "info:%s\n" % self.info
res += "source:%s" % self.source
return res
class ReForTaskB():
    """Regular expressions that pull the task-B fields out of a page."""

    reId = re.compile(r'"http://win\.bupt\.edu\.cn/program\.do\?id=(\d+)"')
    reTitle = re.compile(r'<h2 style="display:inline">\s*(.*?)\s*</h2>')
    reEnTitle = re.compile(r'<div style="margin-top:-7px;overflow: hidden;white-space: nowrap;text-overflow: ellipsis;">\s*(.*?)\s*</div>')
    reCollege = re.compile(r'<h3 style="display:inline;">\s*(.*?)\s*</h3>')
    reInfo = re.compile(r'<br>\s*<div style="font-size:17px;line-height:25px;">\s*(.*?)(\s*</div>){7}', re.S)
    reSource = re.compile(r'\[\{"score":".*?","type":".*?","time":".*?","name":"(.*?)"\}')

    @staticmethod
    def _first_group(pattern, text, default=""):
        """Return group 1 of the first match of ``pattern``, else ``default``."""
        found = pattern.search(text)
        return found.group(1) if found else default

    @staticmethod
    def _decode_escapes(raw):
        """Turn ``\\uXXXX``-style escapes in ``raw`` into real characters.

        ``unicode_escape`` decodes bytes as Latin-1, so the text is encoded
        as Latin-1 with ``backslashreplace`` first. Encoding it as UTF-8
        turned every literal non-ASCII character into mojibake. Text with a
        malformed escape is returned unchanged.
        """
        try:
            return raw.encode("latin-1", "backslashreplace").decode("unicode_escape")
        except UnicodeDecodeError:
            return raw

    def search(self, text):
        """Extract all fields from ``text`` into a ``_resInTextForTaskB``.

        ``id`` and ``title`` are mandatory. The optional fields default to
        the empty string when their pattern does not match.

        Raises:
            AttributeError: ``id`` or ``title`` could not be matched.
        """
        res = _resInTextForTaskB()
        res.id = self.reId.search(text).group(1)
        res.title = self.reTitle.search(text).group(1)
        res.en_title = self._first_group(self.reEnTitle, text)
        res.college = self._first_group(self.reCollege, text)
        info = self._first_group(self.reInfo, text)
        # Keep the record on one line.
        res.info = info.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        res.source = self._decode_escapes(self._first_group(self.reSource, text))
        return res
class ReForTaskA:
    """Placeholder for the task-A patterns; it defines no behaviour."""

    def __init__(self):
        """Nothing to initialise."""
class OutPutFile():
    """Appends records to a text file, separating them with a newline."""

    def __init__(self, fileName, encoding):
        """Remember the target path and the text encoding used for writing."""
        self.fileName = fileName
        self.encoding = encoding

    def write(self, text):
        """Append ``text``, preceded by a newline if the file already has content.

        The file is checked for being non-empty rather than merely existing,
        so an existing empty file does not start with a blank line.
        """
        has_content = (
            os.path.exists(self.fileName) and os.path.getsize(self.fileName) > 0
        )
        if has_content:
            text = "\n" + text
        with open(self.fileName, mode='a', encoding=self.encoding) as f:
            f.write(text)
class CheckPointAndConfig():
    """JSON-backed settings and crawl checkpoint, exposed as ``self.data``."""

    # Values used for a new file and for keys missing from an existing one.
    DEFAULTS = {
        "number": 2017210281,
        "outputFile": "out.txt",
        "totle": 1,
        "no": 1,
        "checkpoint": 0
    }

    def __init__(self, fileName):
        """Bind to ``fileName`` and load it, creating it if missing."""
        self.fileName = fileName
        self.data = {}
        self.loadData()

    def loadData(self):
        """Load the file into ``self.data``.

        Keys absent from the file keep their default value. When the file
        does not exist, the defaults are written to it.
        """
        self.data = dict(self.DEFAULTS)
        if os.path.exists(self.fileName):
            with open(self.fileName, 'r') as load_f:
                self.data.update(json.load(load_f))
        else:
            self.saveData()

    def saveData(self):
        """Write ``self.data`` to the file.

        The data goes to a temporary sibling file that then replaces the
        target, so an interrupted write cannot leave a truncated checkpoint.
        """
        tmp_name = self.fileName + ".tmp"
        with open(tmp_name, "w") as dump_f:
            json.dump(self.data, dump_f)
        os.replace(tmp_name, self.fileName)