commit c7fab80594dbb88254f356891f40a710ef0f2e40 Author: LiangXiao Date: Wed Mar 25 11:34:46 2020 +0800 finish the home work diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7191673 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.vscode +*.pyc +config.json +out.txt + +*.zip \ No newline at end of file diff --git a/GUI.py b/GUI.py new file mode 100644 index 0000000..a6ccc1e --- /dev/null +++ b/GUI.py @@ -0,0 +1,277 @@ +import tkinter as tk +import thread +class EntryWithLabel(): + def __init__(self, root, labelValue, default, **kw): + self.root = root + self.labelValue = labelValue + self.default = default + self.frame = tk.Frame(root) + self.label = tk.Label(self.frame, text=labelValue) + self.var = tk.Variable() + self.var.set(default) + self.entry = tk.Entry(self.frame, textvariable=self.var, **kw) + + def pack(self, **kw): + self.label.pack(side=tk.LEFT) + self.entry.pack(side=tk.RIGHT) + self.frame.pack(**kw) + return self + def get(self): + return self.var.get() + def set(self, value): + return self.var.set(value) + +class LabelWithReadOnlyText(): + def __init__(self, root, labelValue, default): + self.root = root + self.labelValue = labelValue + self.var = default + self.frame = tk.Frame(root) + self.label = tk.Label(self.frame, text=labelValue) + self.text = tk.Label(self.frame, text=self.var) + def pack(self, **kw): + self.label.pack(side=tk.LEFT) + self.text.pack(side=tk.RIGHT) + self.frame.pack(**kw) + return self + def set(self, value): + self.var = value + self.text.config(text=self.var) + def get(self): + return self.var + +class ProgressBar(): + def __init__(self, root, labelValue): + self.frame = tk.Frame(root) + self.label_title = tk.Label(self.frame, text=labelValue) + self.canvas = tk.Canvas(self.frame, width=200, height=22, bg="white") + self.base_line = self.canvas.create_rectangle(2,2,200,23,width = 1,outline = "black", fill="white") + self.fill_line = self.canvas.create_rectangle(2,2,0,23,width = 0,fill = "blue") + 
self.label_progress = tk.Label(self.frame, width=10) + self.progress = 0 + def setProgress(self, progress, text): + if text: + self.label_progress.config(text=text) + if progress < self.progress: + self.canvas.create_rectangle(2,2,200,23,width = 1,outline = "black", fill="white") + self.fill_line = self.canvas.create_rectangle(2,2,0,23,width = 0,fill = "blue") + if progress > 1: + progress = 1 + self.progress = progress + self.canvas.coords(self.fill_line, (0, 0, 200*progress, 30)) + self.frame.update() + def pack(self, **kw): + self.label_title.pack(side=tk.LEFT) + self.canvas.pack(side=tk.LEFT) + self.label_progress.pack(side=tk.LEFT) + self.frame.pack(**kw) + return self + + +class BaseInforFrame(): + def __init__(self, root): + self.labelframe = tk.LabelFrame(root, text="基本信息") + self.entry_number = EntryWithLabel(self.labelframe, "学号", "2017210281") + self.label_b = LabelWithReadOnlyText(self.labelframe, "B:", 3) + self.label_task = LabelWithReadOnlyText(self.labelframe, "任务:", "B") + self.entry_no = EntryWithLabel(self.labelframe, "本机编号:", 1) + self.entry_total = EntryWithLabel(self.labelframe, "总机器数量:", 1) + + self.entry_number.var.trace("w", self._updateNumber) + self.entry_no.var.trace("w", self._updateNo) + self.entry_total.var.trace("w", self._updateTotal) + def disable(self): + self.entry_no.entry.config(state="disable") + self.entry_number.entry.config(state="disable") + self.entry_total.entry.config(state="disable") + def normal(self): + self.entry_no.entry.config(state="normal") + self.entry_number.entry.config(state="normal") + self.entry_total.entry.config(state="normal") + def pack(self, **kw): + self.entry_number.pack() + self.label_b.pack() + self.label_task.pack() + self.entry_no.pack() + self.entry_total.pack() + self.labelframe.pack(**kw) + return self + + def _bToTask(self, b): + if b > 2: + return "B" + else: + return "A" + def inputToInt(self, entry): + textcheck = str(entry.get()) + textcheck = ''.join(i for i in textcheck if i in 
'0123456789') + if textcheck == '': + return None + return int(textcheck) + + def _updateNumber(self, *args): + number = self.inputToInt(self.entry_number) + if number == None: + return + try: + self.entry_number.set(number) + b = number % 19 + self.label_b.set(b) + self.label_task.set(self._bToTask(b)) + return False + except: + pass + def _updateNo(self, *args): + no = self.inputToInt(self.entry_no) + total = self.inputToInt(self.entry_total) + if no == None or total == None: + return + if no > total: + self.entry_no.set(total) + elif no < 1: + self.entry_no.set(1) + else: + self.entry_no.set(no) + + def _updateTotal(self, *args): + total = self.inputToInt(self.entry_total) + if total == None: + return + if total < 1: + total = 1 + self.entry_total.set(total) + + +class SpiderStatus(): + def __init__(self, root): + self.labelframe = tk.LabelFrame(root, text="爬取结果") + self.text_info = tk.Text(self.labelframe, width= 100, height= 20) + def pack(self, **kw): + self.text_info.pack() + self.labelframe.pack(**kw) + def set(self, text): + # self.text_info.delete('1.0','end') + self.text_info.insert('end', text+"\n") + self.text_info.see(tk.END) + +class Timmer(): + def __init__(self, root, start, stop): + self.pause = False + self.start = False + self.sleeptime = 4.0 + self.maxoffset = 212 + self.startP = start + self.stop = stop + + self.labelframe = tk.LabelFrame(root, text="计时器") + self.progress_each = ProgressBar(self.labelframe, "每次计时:") + self.progress_totle = ProgressBar(self.labelframe, "总进度:") + self.entry_sleepTime = EntryWithLabel(self.labelframe, "每次延时(s):", self.sleeptime) + self.entry_maxOffset = EntryWithLabel(self.labelframe, "最大偏移:", self.maxoffset) + self.btnframe = tk.Frame(self.labelframe) + self.button_startStop = tk.Button(self.btnframe, text="开始", command=self.startStop) + self.button_pause = tk.Button(self.btnframe, text="暂停", command=self._pause, state="disable") + + self.entry_sleepTime.var.trace("w", self._updateSleepTime) + 
self.entry_maxOffset.var.trace("w", self._updateMaxOffset) + def pack(self, **kw): + self.progress_each.pack() + self.progress_totle.pack() + self.entry_sleepTime.pack() + self.entry_maxOffset.pack() + self.button_startStop.pack(side=tk.LEFT) + self.button_pause.pack(side=tk.LEFT) + self.btnframe.pack() + self.labelframe.pack(**kw) + def inputToInt(self, entry): + textcheck = str(entry.get()) + textcheck = ''.join(i for i in textcheck if i in '0123456789') + if textcheck == '': + return None + return int(textcheck) + def inputToFloat(self, entry): + textcheck = str(entry.get()) + textcheck = ''.join(i for i in textcheck if i in '0123456789.') + if textcheck == '': + return None + if textcheck.count('.') > 1: + first = textcheck.find('.') + second = textcheck.find('.', first+1) + textcheck = textcheck[0:second] + return float(textcheck) + def _updateSleepTime(self, *args): + sleeptime = self.inputToFloat(self.entry_sleepTime) + if sleeptime == None: + return + self.sleeptime = sleeptime + self.entry_sleepTime.set(self.sleeptime) + def _updateMaxOffset(self, *args): + maxoffset = self.inputToInt(self.entry_maxOffset) + if maxoffset == None: + return + self.maxoffset = maxoffset + self.progress_totle.setProgress(int(self.progress_totle.label_progress.config("text")[-1])/self.maxoffset, None) + self.entry_maxOffset.set(self.maxoffset) + def startStop(self): + if self.start and not self.pause: # 停止 + self.start = False + self.pause = False + self.stop() + self.button_startStop.config(text="开始") + self.button_pause.config(state="disable") + elif not self.start and not self.pause: # 开始 + self.start = True + self.pause = False + self.startP() + self.button_startStop.config(text="停止") + self.button_pause.config(state="normal") + elif self.start and self.pause: # 继续 + self.start = True + self.pause = False + self.button_startStop.config(text="停止") + self.button_pause.config(state="normal") + def _pause(self): + self.button_pause.config(state="disable") + 
self.button_startStop.config(text="继续") + self.pause = True + + +class MainGui(): + def __init__(self, backend): + self.root = tk.Tk() + self.root.title("Web搜索技术第一次作业") + self.backend = backend + self.thread = None + self.topFrame = tk.Frame(self.root) + self.baseinfo = BaseInforFrame(self.topFrame) + self.timmer = Timmer(self.topFrame, self.startThread, self.stopThread) + self.spiderstatus = SpiderStatus(self.root) + + def start(self): + self.baseinfo.pack(side=tk.LEFT) + self.timmer.pack(side=tk.LEFT) + self.topFrame.pack() + self.spiderstatus.pack() + self.timmer.progress_totle.setProgress(self.backend.getOffset()/self.timmer.maxoffset, str(self.backend.getOffset())) + self.root.protocol("WM_DELETE_WINDOW", self.on_closing) + self.root.mainloop() + + def on_closing(self): + if self.thread and self.thread.is_alive(): + self.thread.terminate() + thread.stop_thread(self.thread) + self.thread.join() + self.root.destroy() + + def startThread(self): + self.baseinfo.disable() + self.backend.config.data["number"] = self.baseinfo.inputToInt(self.baseinfo.entry_number) + self.backend.config.data["totle"] = self.baseinfo.inputToInt(self.baseinfo.entry_total) + self.backend.config.data["no"] = self.baseinfo.inputToInt(self.baseinfo.entry_no) + self.backend.config.saveData() + self.backend.b = self.baseinfo.label_b.get() + self.backend.task = self.baseinfo.label_task.get() + self.thread = thread.TimmerThread(self.timmer, self.spiderstatus, self.backend) + self.thread.start() + def stopThread(self): + self.baseinfo.normal() \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..e9096f5 --- /dev/null +++ b/README.md @@ -0,0 +1,78 @@ +Web搜索技术 第一次作业 网站信息精准采集技术 +=== + +## 前言 + +姑且这就算做实验报告吧。首先由于被当年信通院的OJ的各种奇怪问题惊艳到而产生心理阴影,我希望我的程序可以被**人工审核**。 + +## 需求分析 + +网站信息精准采集程序的要求 + +- [x] 采集想要的内容,不多不少不重复 ~ 完备性 +- [x] 对网站不构成负担 (性能压力不大)~ 轻量采集 +- [x] 出现异常情况 ~ 自行停止程序 +- [ ] 程序异常退出 ~ 自动重启 +- [x] 断点续采 ~ 鲁棒性 +- [x] 精准采集 ~ 正则表达式 +- [x] 采集过程信息及时展示 ~ 界面友好 +- 
[x] 采集结果实时存储 ~ 写文件 +- [x] 允许多个机器同时运行,采集不同的内容 ~ 并行性,所有内容构成完备集。 + +## 技术实现 + +为了可以达到“界面友好”的需求,使用Tkinter实现了一个GUI,因此整个程序分为两大部分: + +- UI: 负责处理GUI交互 +- Spider:负责爬虫相关逻辑 + +程序运行时会涉及两个线程: + +- Mainloop:GUI主循环事件,处理点击以及页面绘制 +- TimmerThread:子线程,负责计时并发送网络请求 + +为了降低代码耦合度,抽离如下模块: + +- errors.py 定义各种自定义错误 +- GUI.py 主要的GUI方面的代码,包括所有的子组件的实现 +- main.py 程序入口 +- spider.py 爬虫相关代码 +- thread.py 计时器子线程的代码,子线程中调用GUI相关组件更新状态 +- utils.py 包括正则匹配,格式化输入输出的一些工具类 + +## 具体使用 + +测试平台:Windows 10 专业版 1909 18363.752 + +Python版本:3.7.3 + +安装依赖: + +```bash +pip install -r requirements.txt +``` + +运行: + +```bash +python main.py +``` + +填写学号并配置多机器(可选),点击开始即可。 + +中途可自由暂停、继续、停止、关闭窗口,基本不会引起文件保存错误,并且可以继续之前的进度进行爬取。 + +个人认为程序异常退出还要重启有点问题,会导致程序无法正常关闭,无论如何关闭,对于程序来说都是发生了一个异常,所以无论如何都会重启,并非爬虫的本分,所以未作实现。 + +由于本人是任务B,所以对于任务A并未做任何实现,但是考虑到接口类似,仅仅声明了接口,如果需要可以对其进行实现即可。 + + +## 目前的问题 + +有一些符号例如'•'(\u2022)和' '(\xa0)不存在GBK对应符号,因此使用空格进行替换。 + +这也是为什么我对于自动判别持怀疑态度。 + +## 明知存在问题为什么还要早早交 + +判别程序的问题不是我的责任,我认为我已经完成了规定的项目,我既然大晚上十分兴奋写完,一定要提交一下。见识一下自动判决会出什么问题。 \ No newline at end of file diff --git a/errors.py b/errors.py new file mode 100644 index 0000000..226809f --- /dev/null +++ b/errors.py @@ -0,0 +1,27 @@ +class SpiderError(RuntimeError): + def __init__(self): + super(SpiderError, self).__init__() + def __str__(self): + return "一个未知的爬虫错误" + +class SpiderTaskTypeError(SpiderError): + def __init__(self, task): + super(SpiderTaskTypeError, self).__init__() + self.task = task + def __str__(self): + return "Task的类型: %s 不受支持" % self.task + +class SpiderNotFond(SpiderError): + def __init__(self, id): + super(SpiderNotFond, self).__init__() + self.id = id + def __str__(self): + return "id: %s 的页面并不存在" % str(self.id) + +class SpiderReFailed(SpiderError): + def __init__(self, id, error): + super(SpiderReFailed, self).__init__() + self.id = id + self.err = error + def __str__(self): + return "id: %s 的页面在正则识别id/title的时候出错, 具体错误如下:\n%s" % (str(self.id), str(self.err)) \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 
index 0000000..e9b4e04 --- /dev/null +++ b/main.py @@ -0,0 +1,13 @@ +#-*- coding: UTF-8 -*- +__author__ = 'Xice ' +__data__ = '2020-03-24' + +import GUI +import spider + +if __name__ == "__main__": + c = spider.CheckPointAndConfig("./config.json") + s = spider.Spider(c) + gui = GUI.MainGui(s) + gui.start() + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8024749 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +requests==2.21.0 \ No newline at end of file diff --git a/spider.py b/spider.py new file mode 100644 index 0000000..e5a2648 --- /dev/null +++ b/spider.py @@ -0,0 +1,67 @@ +import requests +import html +import unicodedata +from utils import ReForTaskB, ReForTaskA, OutPutFile, CheckPointAndConfig +from errors import SpiderNotFond, SpiderTaskTypeError, SpiderReFailed +class Spider(): + headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0' + } + def __init__(self, config): + self.config = config + self.out = OutPutFile(config.data["outputFile"], "gbk") + self.b = 3 + self.task = "B" + + def getOffset(self): + return self.config.data['checkpoint'] * self.config.data['totle'] + self.config.data['no'] - 1 + + urlTaskB = "https://win.bupt.edu.cn/program.do?id=%d" + staticReTaskB = ReForTaskB() + def _doOneStepOnTaskB(self): + pid = (self.b - 3) * 150 + 1 + self.getOffset() + r = requests.get(self.urlTaskB % pid, headers=self.headers) + text = html.unescape(r.text) + text = text.replace('\u2022', ' ').replace('\xa0', ' ') + # text = unicodedata.normalize('NFKD', text) + if '抱歉,没有该项目' in text: + raise SpiderNotFond(pid) + try: + res = self.staticReTaskB.search(text).display() + return res + except AttributeError as e: + raise SpiderReFailed(pid, e) + urlTaskA = "https://win.bupt.edu.cn/project.do?next=collectlist&p=%d" + staticReTaskA = ReForTaskA() + def _doOneStepOnTaskA(self): + pass + def doOneStep(self): + try: + if self.task == "B": + res = self._doOneStepOnTaskB() + 
else: + raise SpiderTaskTypeError(self.task) + self.out.write(res) + return res + finally: + self.config.data['checkpoint'] += 1 + self.config.saveData() + +if __name__ == "__main__": + c = CheckPointAndConfig("./config.json") + s = Spider(c) + while True: + try: + res = s.doOneStep() + print(res) + except SpiderTaskTypeError as e: + print(str(e)) + break + except SpiderReFailed as e: + print(str(e)) + break + except SpiderNotFond as e: + print(str(e)) + except Exception as e: + print("未知错误:\n" + str(e)) + break \ No newline at end of file diff --git a/thread.py b/thread.py new file mode 100644 index 0000000..93c02a5 --- /dev/null +++ b/thread.py @@ -0,0 +1,87 @@ +import threading +import time +from errors import SpiderTaskTypeError,SpiderNotFond,SpiderReFailed +import inspect +import ctypes +def _async_raise(tid, exctype): + """raises the exception, performs cleanup if needed""" + tid = ctypes.c_long(tid) + if not inspect.isclass(exctype): + exctype = type(exctype) + res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype)) + if res == 0: + raise ValueError("invalid thread id") + elif res != 1: + # """if it returns a number greater than one, you're in trouble, + # and you should call it again with exc=NULL to revert the effect""" + ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None) + raise SystemError("PyThreadState_SetAsyncExc failed") +def stop_thread(thread): + _async_raise(thread.ident, RuntimeError) +class TimmerThread(threading.Thread): + def __init__(self, timmer, status, backend): + threading.Thread.__init__(self) + self._timmer = timmer + self._status = status + self._backend = backend + self._running = True + def terminate(self): + self._running = False + def timmer(self): + if self._running: + return self._timmer + else: + raise RuntimeError() + def status(self): + if self._running: + return self._status + else: + raise RuntimeError() + def backend(self): + if self._running: + return self._backend + else: + raise RuntimeError() 
+ def run(self): + nowTime = 0.0 + try: + while self._running and self.timmer().start: + if self.backend().getOffset() > self.timmer().maxoffset: + self.timmer().progress_totle.setProgress(self.backend().getOffset()/self.timmer().maxoffset, str(self.backend().getOffset())) + self.status().set("爬取结束") + self.timmer().startStop() + return + time.sleep(0.02) + if self.timmer().pause: + continue + nowTime += 0.02 + try: + self.timmer().progress_each.setProgress(nowTime/self.timmer().sleeptime, str(round(nowTime,2)) + "s") + except ZeroDivisionError: + self.timmer().progress_each.setProgress(1, str(round(nowTime,2)) + "s") + if nowTime > self.timmer().sleeptime: + self.timmer().progress_totle.setProgress(self.backend().getOffset()/self.timmer().maxoffset, str(self.backend().getOffset())) + nowTime = 0.0 + try: + res = self.backend().doOneStep() + self.status().set(res) + except SpiderTaskTypeError as e: + self.status().set(str(e)) + self.timmer().startStop() + return + except SpiderReFailed as e: + self.status().set(str(e)) + self.timmer().startStop() + return + except SpiderNotFond as e: + self.status().set(str(e)) + except UnicodeEncodeError as e: + self.status().set("编码错误:\n" + str(e)) + self.timmer().startStop() + return + except Exception as e: + self.status().set("未知错误:\n" + str(e)) + self.timmer().startStop() + return + except RuntimeError: + return \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..0e4c8f7 --- /dev/null +++ b/utils.py @@ -0,0 +1,87 @@ +import re +import os +import json +class _resInTextForTaskB(): + def __init__(self): + self.id = "" + self.title = "" + self.en_title = "" + self.college = "" + self.info = "" + self.source = "" + def __str__(self): + return self.display() + def display(self): + res = "" + res += "id:%s\n" % self.id + res += "title:%s\n" % self.title + res += "en_title:%s\n" % self.en_title + res += "college:%s\n" % self.college + res += "info:%s\n" % self.info + res += "source:%s" % 
self.source + return res + +class ReForTaskB(): + reId = re.compile(r'"http://win\.bupt\.edu\.cn/program\.do\?id=(\d+)"') + reTitle = re.compile(r'

\s*(.*?)\s*

') + reEnTitle = re.compile(r'
\s*(.*?)\s*
') + reCollege = re.compile(r'

\s*(.*?)\s*

') + reInfo = re.compile(r'
\s*
\s*(.*?)(\s*
){7}', re.S) + reSource = re.compile(r'\[\{"score":".*?","type":".*?","time":".*?","name":"(.*?)"\}') + def search(self, text): + res = _resInTextForTaskB() + res.id = self.reId.search(text).group(1) + res.title = self.reTitle.search(text).group(1) + try: + res.en_title = self.reEnTitle.search(text).group(1) + except AttributeError: + res.en_title = "" + try: + res.college = self.reCollege.search(text).group(1) + except AttributeError: + res.college = "" + try: + res.info = self.reInfo.search(text).group(1).replace("\r\n"," ").replace("\n"," ").replace("\r"," ") + except AttributeError: + res.info = "" + try: + res.source = self.reSource.search(text).group(1).encode('utf-8').decode('unicode_escape') + except AttributeError: + res.source = "" + return res + +class ReForTaskA(): + def __init__(self): + pass + +class OutPutFile(): + def __init__(self, fileName, encoding): + self.fileName = fileName + self.encoding = encoding + def write(self, text): + if os.path.exists(self.fileName): + text = "\n" + text + with open(self.fileName,mode='a',encoding=self.encoding) as f: + f.write(text) + +class CheckPointAndConfig(): + def __init__(self, fileName): + self.fileName = fileName + self.data = {} + self.loadData() + def loadData(self): + if os.path.exists(self.fileName): + with open(self.fileName,'r') as load_f: + self.data = json.load(load_f) + else: + self.data = { + "number": 2017210281, + "outputFile": "out.txt", + "totle": 1, + "no": 1, + "checkpoint": 0 + } + self.saveData() + def saveData(self): + with open(self.fileName,"w") as dump_f: + json.dump(self.data,dump_f) \ No newline at end of file