finish the home work
This commit is contained in:
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
.vscode
|
||||
*.pyc
|
||||
config.json
|
||||
out.txt
|
||||
|
||||
*.zip
|
||||
277
GUI.py
Normal file
277
GUI.py
Normal file
@@ -0,0 +1,277 @@
|
||||
import tkinter as tk
|
||||
import thread
|
||||
class EntryWithLabel:
    """A caption label and a text entry placed side by side in one frame.

    The entry is bound to ``self.var`` so that callers can read, write and
    trace the value without touching the widget itself.
    """

    def __init__(self, root, labelValue, default, **kw):
        """Create the widgets; nothing is packed until ``pack`` is called.

        root: parent widget that owns the internal frame.
        labelValue: caption text shown in the label.
        default: initial value stored in the bound variable.
        **kw: extra options forwarded unchanged to ``tk.Entry``.
        """
        self.root = root
        self.labelValue = labelValue
        self.default = default

        container = tk.Frame(root)
        self.frame = container
        self.label = tk.Label(container, text=labelValue)

        bound = tk.Variable()
        bound.set(default)
        self.var = bound
        self.entry = tk.Entry(container, textvariable=bound, **kw)

    def pack(self, **kw):
        """Pack label (left) and entry (right), then the frame with ``**kw``.

        Returns ``self`` so construction and packing can be chained.
        """
        for widget, side in ((self.label, tk.LEFT), (self.entry, tk.RIGHT)):
            widget.pack(side=side)
        self.frame.pack(**kw)
        return self

    def get(self):
        """Return the current value of the bound variable."""
        current = self.var.get()
        return current

    def set(self, value):
        """Store ``value`` in the bound variable and return what it returns."""
        outcome = self.var.set(value)
        return outcome
|
||||
|
||||
class LabelWithReadOnlyText:
    """A caption label followed by a second label that displays a value.

    The displayed value is kept in ``self.var`` as a plain Python object (not
    a Tk variable) and mirrored into the value label on every ``set``.
    """

    def __init__(self, root, labelValue, default):
        """Create the widgets; nothing is packed until ``pack`` is called.

        root: parent widget that owns the internal frame.
        labelValue: caption text.
        default: value displayed initially.
        """
        self.root = root
        self.labelValue = labelValue
        self.var = default

        container = tk.Frame(root)
        self.frame = container
        self.label = tk.Label(container, text=labelValue)
        self.text = tk.Label(container, text=default)

    def pack(self, **kw):
        """Pack caption (left) and value (right), then the frame with ``**kw``.

        Returns ``self`` so construction and packing can be chained.
        """
        for widget, side in ((self.label, tk.LEFT), (self.text, tk.RIGHT)):
            widget.pack(side=side)
        self.frame.pack(**kw)
        return self

    def set(self, value):
        """Remember ``value`` and show it in the value label."""
        self.var = value
        self.text.config(text=value)

    def get(self):
        """Return the value most recently stored (or the initial default)."""
        stored = self.var
        return stored
|
||||
|
||||
class ProgressBar():
    """A titled progress bar drawn on a canvas, with a text read-out beside it.

    Progress is a fraction in ``[0, 1]``; values outside that range are
    clamped.
    """

    def __init__(self, root, labelValue):
        """Create the widgets; nothing is packed until ``pack`` is called.

        root: parent widget that owns the internal frame.
        labelValue: title shown to the left of the bar.
        """
        self.frame = tk.Frame(root)
        self.label_title = tk.Label(self.frame, text=labelValue)
        self.canvas = tk.Canvas(self.frame, width=200, height=22, bg="white")
        # Outline of the bar, and the blue rectangle whose width shows progress.
        self.base_line = self.canvas.create_rectangle(2, 2, 200, 23, width=1, outline="black", fill="white")
        self.fill_line = self.canvas.create_rectangle(2, 2, 0, 23, width=0, fill="blue")
        self.label_progress = tk.Label(self.frame, width=10)
        self.progress = 0

    def setProgress(self, progress, text):
        """Show ``progress`` (a fraction) and, if given, a new read-out text.

        progress: fraction completed; clamped to the range 0..1.
        text: read-out shown next to the bar; a falsy value (``None`` or
            ``""``) leaves the current read-out untouched.
        """
        if text:
            self.label_progress.config(text=text)
        # The single fill rectangle is resized in both directions. Creating
        # fresh rectangles whenever the bar shrinks (as was done before) left
        # an ever-growing pile of hidden canvas items behind.
        progress = min(max(progress, 0), 1)
        self.progress = progress
        self.canvas.coords(self.fill_line, (0, 0, 200 * progress, 30))
        # Redraw immediately so the change is visible without waiting for idle.
        self.frame.update()

    def pack(self, **kw):
        """Pack title, bar and read-out left to right, then the frame.

        Returns ``self`` so construction and packing can be chained.
        """
        self.label_title.pack(side=tk.LEFT)
        self.canvas.pack(side=tk.LEFT)
        self.label_progress.pack(side=tk.LEFT)
        self.frame.pack(**kw)
        return self
|
||||
|
||||
|
||||
class BaseInforFrame():
    """The "basic information" panel: student number, derived B/task, and
    this machine's number out of the total number of machines.

    Every entry is normalised to digits as the user types. B is computed as
    ``number % 19`` and the task letter is derived from B.
    """

    def __init__(self, root):
        """Create the panel inside ``root``; nothing is packed yet."""
        self.labelframe = tk.LabelFrame(root, text="基本信息")
        self.entry_number = EntryWithLabel(self.labelframe, "学号", "2017210281")
        self.label_b = LabelWithReadOnlyText(self.labelframe, "B:", 3)
        self.label_task = LabelWithReadOnlyText(self.labelframe, "任务:", "B")
        self.entry_no = EntryWithLabel(self.labelframe, "本机编号:", 1)
        self.entry_total = EntryWithLabel(self.labelframe, "总机器数量:", 1)

        # Re-validate on every write. trace_add replaces the deprecated
        # trace("w", ...) spelling.
        self.entry_number.var.trace_add("write", self._updateNumber)
        self.entry_no.var.trace_add("write", self._updateNo)
        self.entry_total.var.trace_add("write", self._updateTotal)

    def disable(self):
        """Make the three entries read-only (used while a crawl is running)."""
        for field in (self.entry_no, self.entry_number, self.entry_total):
            field.entry.config(state=tk.DISABLED)

    def normal(self):
        """Make the three entries editable again."""
        for field in (self.entry_no, self.entry_number, self.entry_total):
            field.entry.config(state=tk.NORMAL)

    def pack(self, **kw):
        """Pack all rows top to bottom, then the panel itself with ``**kw``.

        Returns ``self`` so construction and packing can be chained.
        """
        self.entry_number.pack()
        self.label_b.pack()
        self.label_task.pack()
        self.entry_no.pack()
        self.entry_total.pack()
        self.labelframe.pack(**kw)
        return self

    def _bToTask(self, b):
        """Map B to the task letter: ``"B"`` when b > 2, otherwise ``"A"``."""
        return "B" if b > 2 else "A"

    def inputToInt(self, entry):
        """Return the digits of ``entry.get()`` as an int.

        Every non-digit character is dropped; returns ``None`` when no digit
        is left.
        """
        digits = ''.join(ch for ch in str(entry.get()) if ch in '0123456789')
        if digits == '':
            return None
        return int(digits)

    def _updateNumber(self, *args):
        """Trace callback: normalise the student number and refresh B/task."""
        number = self.inputToInt(self.entry_number)
        if number is None:
            return
        self.entry_number.set(number)
        b = number % 19
        self.label_b.set(b)
        self.label_task.set(self._bToTask(b))

    def _updateNo(self, *args):
        """Trace callback: keep the machine number within ``1..total``."""
        no = self.inputToInt(self.entry_no)
        total = self.inputToInt(self.entry_total)
        if no is None or total is None:
            return
        if no > total:
            self.entry_no.set(total)
        elif no < 1:
            self.entry_no.set(1)
        else:
            self.entry_no.set(no)

    def _updateTotal(self, *args):
        """Trace callback: keep the machine count >= 1 and re-clamp the number."""
        total = self.inputToInt(self.entry_total)
        if total is None:
            return
        if total < 1:
            total = 1
        self.entry_total.set(total)
        # Shrinking the total may leave the machine number above it, which
        # would make this machine crawl offsets that belong to another one.
        self._updateNo()
|
||||
|
||||
|
||||
class SpiderStatus:
    """The "crawl result" panel: a text box that logs one message per line."""

    def __init__(self, root):
        """Create the panel and its text box inside ``root`` (not yet packed)."""
        panel = tk.LabelFrame(root, text="爬取结果")
        self.labelframe = panel
        self.text_info = tk.Text(panel, width=100, height=20)

    def pack(self, **kw):
        """Pack the text box, then the panel itself with ``**kw``."""
        self.text_info.pack()
        self.labelframe.pack(**kw)

    def set(self, text):
        """Append ``text`` as a new line and scroll to the end of the log."""
        line = text + "\n"
        log = self.text_info
        log.insert('end', line)
        log.see(tk.END)
|
||||
|
||||
class Timmer():
    """The "timer" panel: two progress bars, the delay and max-offset
    settings, and the start/stop and pause buttons.

    State flags:
      * ``start`` -- True between pressing start and pressing stop.
      * ``pause`` -- True while a started run is paused.
    """

    def __init__(self, root, start, stop):
        """Create the panel inside ``root``; nothing is packed yet.

        start: callable invoked when a run starts.
        stop: callable invoked when a run stops.
        """
        self.pause = False
        self.start = False
        self.sleeptime = 4.0
        self.maxoffset = 212
        self.startP = start
        self.stop = stop

        self.labelframe = tk.LabelFrame(root, text="计时器")
        self.progress_each = ProgressBar(self.labelframe, "每次计时:")
        self.progress_totle = ProgressBar(self.labelframe, "总进度:")
        self.entry_sleepTime = EntryWithLabel(self.labelframe, "每次延时(s):", self.sleeptime)
        self.entry_maxOffset = EntryWithLabel(self.labelframe, "最大偏移:", self.maxoffset)
        self.btnframe = tk.Frame(self.labelframe)
        self.button_startStop = tk.Button(self.btnframe, text="开始", command=self.startStop)
        self.button_pause = tk.Button(self.btnframe, text="暂停", command=self._pause, state=tk.DISABLED)

        # Re-validate on every write. trace_add replaces the deprecated
        # trace("w", ...) spelling.
        self.entry_sleepTime.var.trace_add("write", self._updateSleepTime)
        self.entry_maxOffset.var.trace_add("write", self._updateMaxOffset)

    def pack(self, **kw):
        """Pack bars, entries and buttons, then the panel itself with ``**kw``."""
        self.progress_each.pack()
        self.progress_totle.pack()
        self.entry_sleepTime.pack()
        self.entry_maxOffset.pack()
        self.button_startStop.pack(side=tk.LEFT)
        self.button_pause.pack(side=tk.LEFT)
        self.btnframe.pack()
        self.labelframe.pack(**kw)

    def inputToInt(self, entry):
        """Return the digits of ``entry.get()`` as an int.

        Every non-digit character is dropped; returns ``None`` when no digit
        is left.
        """
        digits = ''.join(ch for ch in str(entry.get()) if ch in '0123456789')
        if digits == '':
            return None
        return int(digits)

    def inputToFloat(self, entry):
        """Return ``entry.get()`` as a float, ignoring invalid characters.

        Only digits and dots are kept, and everything from the second dot on
        is discarded. Returns ``None`` when what remains is not a number
        (empty, or just ``"."``).
        """
        cleaned = ''.join(ch for ch in str(entry.get()) if ch in '0123456789.')
        if cleaned == '':
            return None
        if cleaned.count('.') > 1:
            first = cleaned.find('.')
            second = cleaned.find('.', first + 1)
            cleaned = cleaned[0:second]
        try:
            return float(cleaned)
        except ValueError:
            # A lone "." (for example while the user is still typing ".5")
            # is not a number yet.
            return None

    def _updateSleepTime(self, *args):
        """Trace callback: store the delay and normalise the entry text."""
        sleeptime = self.inputToFloat(self.entry_sleepTime)
        if sleeptime is None:
            return
        self.sleeptime = sleeptime
        self.entry_sleepTime.set(self.sleeptime)

    def _updateMaxOffset(self, *args):
        """Trace callback: store the max offset and rescale the total bar."""
        maxoffset = self.inputToInt(self.entry_maxOffset)
        if maxoffset is None:
            return
        self.maxoffset = maxoffset
        # The total bar's read-out holds the current offset as text.
        shown = self.progress_totle.label_progress.config("text")[-1]
        try:
            offset = int(shown)
        except (TypeError, ValueError):
            offset = None  # read-out not filled in yet; nothing to rescale
        if offset is not None:
            # A limit of zero counts as already reached (avoids dividing by 0).
            ratio = offset / maxoffset if maxoffset > 0 else 1
            self.progress_totle.setProgress(ratio, None)
        self.entry_maxOffset.set(self.maxoffset)

    def startStop(self):
        """Handle the start/stop button: stop, start or resume by state."""
        if self.start and not self.pause:  # running -> stop
            self.start = False
            self.pause = False
            self.stop()
            self.button_startStop.config(text="开始")
            self.button_pause.config(state=tk.DISABLED)
        elif not self.start and not self.pause:  # idle -> start
            self.start = True
            self.pause = False
            self.startP()
            self.button_startStop.config(text="停止")
            self.button_pause.config(state=tk.NORMAL)
        elif self.start and self.pause:  # paused -> resume
            self.start = True
            self.pause = False
            self.button_startStop.config(text="停止")
            self.button_pause.config(state=tk.NORMAL)

    def _pause(self):
        """Handle the pause button: the start/stop button becomes "resume"."""
        self.button_pause.config(state=tk.DISABLED)
        self.button_startStop.config(text="继续")
        self.pause = True
|
||||
|
||||
|
||||
class MainGui():
    """Top-level window: wires the panels to the crawler backend and owns the
    worker thread that drives it.
    """

    def __init__(self, backend):
        """Build the window and panels around ``backend``.

        backend: the crawler object. This class uses its ``config.data``
            dict, ``config.saveData()``, ``getOffset()`` and the ``b`` /
            ``task`` attributes.
        """
        self.root = tk.Tk()
        self.root.title("Web搜索技术第一次作业")
        self.backend = backend
        self.thread = None
        self.topFrame = tk.Frame(self.root)
        self.baseinfo = BaseInforFrame(self.topFrame)
        self.timmer = Timmer(self.topFrame, self.startThread, self.stopThread)
        self.spiderstatus = SpiderStatus(self.root)
        self._showSavedConfig()

    def _configEntries(self):
        """Return (config key, entry) pairs; the total comes before the
        machine number because the number is clamped against the total."""
        return (
            ("number", self.baseinfo.entry_number),
            ("totle", self.baseinfo.entry_total),
            ("no", self.baseinfo.entry_no),
        )

    def _showSavedConfig(self):
        """Show the saved number/total/no in the entries.

        Without this the entries always show their defaults, and pressing
        start overwrites the saved machine layout that the displayed offset
        was computed from.
        """
        saved = self.backend.config.data
        for key, entry in self._configEntries():
            value = saved.get(key)
            if value is not None:
                entry.set(value)

    def start(self):
        """Lay out the window, show the saved offset and run the event loop."""
        self.baseinfo.pack(side=tk.LEFT)
        self.timmer.pack(side=tk.LEFT)
        self.topFrame.pack()
        self.spiderstatus.pack()
        offset = self.backend.getOffset()
        self.timmer.progress_totle.setProgress(offset / self.timmer.maxoffset, str(offset))
        self.root.protocol("WM_DELETE_WINDOW", self.on_closing)
        self.root.mainloop()

    def on_closing(self):
        """Stop the worker thread, if any, and close the window."""
        worker = self.thread
        try:
            if worker and worker.is_alive():
                worker.terminate()
                try:
                    thread.stop_thread(worker)
                except (ValueError, SystemError):
                    # The worker finished between the liveness check and the
                    # interrupt; nothing is left to stop.
                    pass
                # NOTE(review): the worker updates Tk widgets from its own
                # thread; confirm this join cannot block on a pending widget
                # call.
                worker.join()
        finally:
            # Always close the window, even if stopping the worker failed.
            self.root.destroy()

    def startThread(self):
        """Freeze the inputs, persist them to the config and start a worker."""
        self.baseinfo.disable()
        data = self.backend.config.data
        for key, entry in self._configEntries():
            value = self.baseinfo.inputToInt(entry)
            if value is None:
                # A blank entry must not be saved as None (the offset
                # arithmetic would fail); keep the saved value and show it.
                saved = data.get(key)
                if saved is not None:
                    entry.set(saved)
                continue
            data[key] = value
        self.backend.config.saveData()
        self.backend.b = self.baseinfo.label_b.get()
        self.backend.task = self.baseinfo.label_task.get()
        self.thread = thread.TimmerThread(self.timmer, self.spiderstatus, self.backend)
        self.thread.start()

    def stopThread(self):
        """Re-enable the inputs once a run has stopped."""
        self.baseinfo.normal()
|
||||
78
README.md
Normal file
78
README.md
Normal file
@@ -0,0 +1,78 @@
|
||||
Web搜索技术 第一次作业 网站信息精准采集技术
|
||||
===
|
||||
|
||||
## 前言
|
||||
|
||||
姑且这就算做实验报告吧。首先由于被当年信通院的OJ的各种奇怪问题惊艳到而产生心理阴影,我希望我的程序可以被**人工审核**。
|
||||
|
||||
## 需求分析
|
||||
|
||||
网站信息精准采集程序的要求
|
||||
|
||||
- [x] 采集想要的内容,不多不少不重复 ~ 完备性
|
||||
- [x] 对网站不构成负担 (性能压力不大)~ 轻量采集
|
||||
- [x] 出现异常情况 ~ 自行停止程序
|
||||
- [ ] 程序异常退出 ~ 自动重启
|
||||
- [x] 断点续采 ~ 鲁棒性
|
||||
- [x] 精准采集 ~ 正则表达式
|
||||
- [x] 采集过程信息及时展示 ~ 界面友好
|
||||
- [x] 采集结果实时存储 ~ 写文件
|
||||
- [x] 允许多个机器同时运行,采集不同的内容 ~ 并行性所有内容构成完备集。
|
||||
|
||||
## 技术实现
|
||||
|
||||
为了可以达到“界面友好”的需求,使用PyTk实现了一个GUI,因此整个程序分为两大部分:
|
||||
|
||||
- UI: 负责处理GUI交互
|
||||
- Spider:负责爬虫相关逻辑
|
||||
|
||||
程序运行时会涉及两个线程:
|
||||
|
||||
- Mainloop:GUI主循环事件,处理点击以及页面绘制
|
||||
- TimmerThread:子线程,负责计时并发送网络请求
|
||||
|
||||
为了降低代码耦合度,抽离如下模块:
|
||||
|
||||
- errors.py 定义各种自定义错误
|
||||
- GUI.py 主要的GUI方面的代码,包括所有的子组件的实现
|
||||
- main.py 程序入口
|
||||
- spider.py 爬虫相关代码
|
||||
- thread.py 计时器子线程的代码,子线程中调用GUI相关组件更新状态
|
||||
- utils.py 包括正则匹配,格式化输入输出的一些工具类
|
||||
|
||||
## 具体使用
|
||||
|
||||
测试平台:Windows 10 专业版 1909 18363.752
|
||||
|
||||
Python版本:3.7.3
|
||||
|
||||
安装依赖:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
运行:
|
||||
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
|
||||
填写学号并配置多机器(可选),点击开始即可。
|
||||
|
||||
中途可自由暂停、继续、停止、关闭窗口,基本不会引起文件保存错误,并且可以继续之前的进度进行爬取。
|
||||
|
||||
个人认为程序异常退出还要重启有点问题,会导致程序无法正常关闭,无论如何关闭,对于程序来说都是发生了一个异常,所以无论如何都会重启,并非爬虫的本分,所以未作实现。
|
||||
|
||||
由于本人是任务B,所以对于任务A的并未做任何实现,但是考虑到接口类似,仅仅声明了接口,如果需要可以对其进行实现即可
|
||||
|
||||
|
||||
## 目前的问题
|
||||
|
||||
有一些符号例如'•'(\u2022)和' '(\xa0)不存在GBK对应符号,因此使用空格进行替换。
|
||||
|
||||
这也是为什么我对于自动判别持怀疑态度。
|
||||
|
||||
## 明知存在问题为什么还要早早交
|
||||
|
||||
判别程序的问题不是我的责任,我认为我已经完成了规定的项目,我既然大晚上十分兴奋写完,一定要提交一下。见识一下自动判决会出什么问题。
|
||||
27
errors.py
Normal file
27
errors.py
Normal file
@@ -0,0 +1,27 @@
|
||||
class SpiderError(RuntimeError):
    """Base class for crawler errors; its message is a fixed generic text."""

    def __init__(self):
        """Initialise without arguments, so ``args`` stays empty."""
        RuntimeError.__init__(self)

    def __str__(self):
        """Return the generic "unknown crawler error" message."""
        message = "一个未知的爬虫错误"
        return message
|
||||
|
||||
class SpiderTaskTypeError(SpiderError):
    """Raised when the requested task type is not supported."""

    def __init__(self, task):
        """Remember the offending ``task`` value for the message."""
        super().__init__()
        self.task = task

    def __str__(self):
        """Return a message naming the unsupported task."""
        template = "Task的类型: %s 不受支持"
        return template % self.task
|
||||
|
||||
class SpiderNotFond(SpiderError):
    """Raised when the page for a given id does not exist."""

    def __init__(self, id):
        """Remember the ``id`` of the missing page for the message."""
        super().__init__()
        self.id = id

    def __str__(self):
        """Return a message naming the missing page id."""
        page_id = str(self.id)
        return "id: %s 的页面并不存在" % page_id
|
||||
|
||||
class SpiderReFailed(SpiderError):
    """Raised when the id/title patterns fail to match a fetched page."""

    def __init__(self, id, error):
        """Remember the page ``id`` and the underlying ``error``."""
        super().__init__()
        self.id = id
        self.err = error

    def __str__(self):
        """Return a message with the page id and the underlying error text."""
        page_id = str(self.id)
        detail = str(self.err)
        return "id: %s 的页面在正则识别id/title的时候出错, 具体错误如下:\n%s" % (page_id, detail)
|
||||
13
main.py
Normal file
13
main.py
Normal file
@@ -0,0 +1,13 @@
|
||||
#-*- coding: UTF-8 -*-
|
||||
__author__ = 'Xice <admin@xice.wang>'
|
||||
__data__ = '2020-03-24'
|
||||
|
||||
import GUI
|
||||
import spider
|
||||
|
||||
def main():
    """Load the config, build the crawler and run the GUI until it closes."""
    config = spider.CheckPointAndConfig("./config.json")
    crawler = spider.Spider(config)
    window = GUI.MainGui(crawler)
    window.start()


if __name__ == "__main__":
    main()
|
||||
|
||||
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
requests==2.21.0
|
||||
67
spider.py
Normal file
67
spider.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import requests
|
||||
import html
|
||||
import unicodedata
|
||||
from utils import ReForTaskB, ReForTaskA, OutPutFile, CheckPointAndConfig
|
||||
from errors import SpiderNotFond, SpiderTaskTypeError, SpiderReFailed
|
||||
class Spider():
    """Fetches one project page per step and appends the result to a file.

    Progress is kept in ``config.data['checkpoint']`` and saved after each
    step that advances it, so a later run continues where this one stopped.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }
    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block the caller indefinitely.
    requestTimeout = 10

    def __init__(self, config):
        """Bind the crawler to ``config``.

        config: object with a ``data`` dict (keys used here: ``outputFile``,
            ``checkpoint``, ``totle``, ``no``) and a ``saveData()`` method.
        """
        self.config = config
        self.out = OutPutFile(config.data["outputFile"], "gbk")
        self.b = 3
        self.task = "B"

    def getOffset(self):
        """Return the zero-based offset of the next page for this machine.

        Computed as ``checkpoint * totle + no - 1``.
        """
        return self.config.data['checkpoint'] * self.config.data['totle'] + self.config.data['no'] - 1

    urlTaskB = "https://win.bupt.edu.cn/program.do?id=%d"
    staticReTaskB = ReForTaskB()

    def _doOneStepOnTaskB(self):
        """Fetch and parse the next task-B page; return it as display text.

        Raises:
            SpiderNotFond: the site reports that the project does not exist.
            SpiderReFailed: a required field could not be extracted.
            requests.RequestException: the request failed or timed out.
        """
        pid = (self.b - 3) * 150 + 1 + self.getOffset()
        r = requests.get(self.urlTaskB % pid, headers=self.headers, timeout=self.requestTimeout)
        text = html.unescape(r.text)
        # Replace the bullet and the no-break space with plain spaces before
        # the text is parsed and written out.
        text = text.replace('\u2022', ' ').replace('\xa0', ' ')
        if '抱歉,没有该项目' in text:
            raise SpiderNotFond(pid)
        try:
            res = self.staticReTaskB.search(text).display()
            return res
        except AttributeError as e:
            raise SpiderReFailed(pid, e)

    urlTaskA = "https://win.bupt.edu.cn/project.do?next=collectlist&p=%d"
    staticReTaskA = ReForTaskA()

    def _doOneStepOnTaskA(self):
        """Placeholder for task A; not implemented."""
        pass

    def _advanceCheckpoint(self):
        """Move on to the next page and persist the new checkpoint."""
        self.config.data['checkpoint'] += 1
        self.config.saveData()

    def doOneStep(self):
        """Crawl one page, append it to the output file and return its text.

        The checkpoint advances once the page has been dealt with: on
        success, when the page does not exist, when it cannot be parsed, and
        when writing the result fails. It does not advance for an unsupported
        task or when the fetch itself fails (network error, timeout), so that
        page is retried on the next run instead of being skipped.

        Raises:
            SpiderTaskTypeError: ``self.task`` is not ``"B"``.
            SpiderNotFond, SpiderReFailed: see ``_doOneStepOnTaskB``.
        """
        if self.task != "B":
            raise SpiderTaskTypeError(self.task)
        try:
            res = self._doOneStepOnTaskB()
        except (SpiderNotFond, SpiderReFailed):
            # The page was fetched; retrying would give the same answer.
            self._advanceCheckpoint()
            raise
        try:
            self.out.write(res)
        finally:
            self._advanceCheckpoint()
        return res
|
||||
|
||||
if __name__ == "__main__":
    # Console driver: crawl page after page and print each result. A missing
    # page is reported and skipped; any other error ends the loop.
    checkpoint = CheckPointAndConfig("./config.json")
    crawler = Spider(checkpoint)
    while True:
        try:
            print(crawler.doOneStep())
        except SpiderNotFond as missing:
            print(str(missing))
        except (SpiderTaskTypeError, SpiderReFailed) as fatal:
            print(str(fatal))
            break
        except Exception as unknown:
            print("未知错误:\n" + str(unknown))
            break
|
||||
87
thread.py
Normal file
87
thread.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import threading
|
||||
import time
|
||||
from errors import SpiderTaskTypeError,SpiderNotFond,SpiderReFailed
|
||||
import inspect
|
||||
import ctypes
|
||||
def _async_raise(tid, exctype):
|
||||
"""raises the exception, performs cleanup if needed"""
|
||||
tid = ctypes.c_long(tid)
|
||||
if not inspect.isclass(exctype):
|
||||
exctype = type(exctype)
|
||||
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
|
||||
if res == 0:
|
||||
raise ValueError("invalid thread id")
|
||||
elif res != 1:
|
||||
# """if it returns a number greater than one, you're in trouble,
|
||||
# and you should call it again with exc=NULL to revert the effect"""
|
||||
ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
|
||||
raise SystemError("PyThreadState_SetAsyncExc failed")
|
||||
def stop_thread(thread):
    """Interrupt ``thread`` by raising ``RuntimeError`` inside it."""
    target_id = thread.ident
    _async_raise(target_id, RuntimeError)
|
||||
class TimmerThread(threading.Thread):
    """Worker thread: counts down the delay, runs one crawl step when it
    elapses, and reports progress and results to the GUI objects.

    After ``terminate()`` the accessors ``timmer()``, ``status()`` and
    ``backend()`` raise ``RuntimeError``, which makes ``run`` return the next
    time it touches one of them.
    """

    def __init__(self, timmer, status, backend):
        """Remember the collaborators.

        timmer: timer panel (``start``, ``pause``, ``sleeptime``,
            ``maxoffset``, the two progress bars and ``startStop()``).
        status: object with ``set(text)`` that displays a message.
        backend: crawler with ``getOffset()`` and ``doOneStep()``.
        """
        threading.Thread.__init__(self)
        self._timmer = timmer
        self._status = status
        self._backend = backend
        self._running = True

    def terminate(self):
        """Ask the thread to stop at its next access to a collaborator."""
        self._running = False

    def timmer(self):
        """Return the timer panel, or raise ``RuntimeError`` once terminated."""
        if self._running:
            return self._timmer
        raise RuntimeError()

    def status(self):
        """Return the status panel, or raise ``RuntimeError`` once terminated."""
        if self._running:
            return self._status
        raise RuntimeError()

    def backend(self):
        """Return the crawler, or raise ``RuntimeError`` once terminated."""
        if self._running:
            return self._backend
        raise RuntimeError()

    def _showTotalProgress(self):
        """Refresh the total bar from the crawler's current offset."""
        offset = self.backend().getOffset()
        maxoffset = self.timmer().maxoffset
        # A limit of zero counts as fully done. Dividing by it used to kill
        # this thread and leave the GUI stuck in the running state.
        ratio = offset / maxoffset if maxoffset else 1
        self.timmer().progress_totle.setProgress(ratio, str(offset))

    def _fail(self, message):
        """Show ``message`` and switch the timer back to stopped."""
        self.status().set(message)
        self.timmer().startStop()

    def run(self):
        """Tick every 20 ms until stopped, finished or terminated."""
        nowTime = 0.0
        try:
            while self._running and self.timmer().start:
                if self.backend().getOffset() > self.timmer().maxoffset:
                    self._showTotalProgress()
                    self._fail("爬取结束")
                    return
                time.sleep(0.02)
                if self.timmer().pause:
                    continue
                nowTime += 0.02
                sleeptime = self.timmer().sleeptime
                # A zero delay shows the per-step bar as full.
                ratio = nowTime / sleeptime if sleeptime else 1
                self.timmer().progress_each.setProgress(ratio, str(round(nowTime, 2)) + "s")
                if nowTime > sleeptime:
                    self._showTotalProgress()
                    nowTime = 0.0
                    try:
                        res = self.backend().doOneStep()
                        self.status().set(res)
                    except SpiderNotFond as e:
                        # A missing page is expected; report it and carry on.
                        self.status().set(str(e))
                    except (SpiderTaskTypeError, SpiderReFailed) as e:
                        self._fail(str(e))
                        return
                    except UnicodeEncodeError as e:
                        self._fail("编码错误:\n" + str(e))
                        return
                    except Exception as e:
                        self._fail("未知错误:\n" + str(e))
                        return
        except RuntimeError:
            # Raised by the accessors after terminate(), or injected from
            # outside by stop_thread(); either way the thread just ends.
            return
|
||||
87
utils.py
Normal file
87
utils.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
class _resInTextForTaskB():
|
||||
def __init__(self):
|
||||
self.id = ""
|
||||
self.title = ""
|
||||
self.en_title = ""
|
||||
self.college = ""
|
||||
self.info = ""
|
||||
self.source = ""
|
||||
def __str__(self):
|
||||
return self.display()
|
||||
def display(self):
|
||||
res = ""
|
||||
res += "id:%s\n" % self.id
|
||||
res += "title:%s\n" % self.title
|
||||
res += "en_title:%s\n" % self.en_title
|
||||
res += "college:%s\n" % self.college
|
||||
res += "info:%s\n" % self.info
|
||||
res += "source:%s" % self.source
|
||||
return res
|
||||
|
||||
class ReForTaskB():
    """Regular expressions that pull the task-B fields out of a page."""

    reId = re.compile(r'"http://win\.bupt\.edu\.cn/program\.do\?id=(\d+)"')
    reTitle = re.compile(r'<h2 style="display:inline">\s*(.*?)\s*</h2>')
    reEnTitle = re.compile(r'<div style="margin-top:-7px;overflow: hidden;white-space: nowrap;text-overflow: ellipsis;">\s*(.*?)\s*</div>')
    reCollege = re.compile(r'<h3 style="display:inline;">\s*(.*?)\s*</h3>')
    reInfo = re.compile(r'<br>\s*<div style="font-size:17px;line-height:25px;">\s*(.*?)(\s*</div>){7}', re.S)
    reSource = re.compile(r'\[\{"score":".*?","type":".*?","time":".*?","name":"(.*?)"\}')

    @staticmethod
    def _decodeJsonString(raw):
        """Decode the body of a JSON string literal (``\\uXXXX`` escapes etc.).

        Parsing it as JSON keeps characters that are already non-ASCII
        intact. The previous ``encode('utf-8').decode('unicode_escape')``
        turned such characters into mojibake. That older decoding is kept
        only as a fallback for text that is not a valid JSON string body.
        """
        try:
            return json.loads('"%s"' % raw)
        except ValueError:
            return raw.encode('utf-8').decode('unicode_escape')

    @staticmethod
    def _firstGroup(pattern, text):
        """Return group 1 of the first match of ``pattern``, or ``None``."""
        match = pattern.search(text)
        return match.group(1) if match else None

    def search(self, text):
        """Extract all fields from ``text`` into a ``_resInTextForTaskB``.

        ``id`` and ``title`` are mandatory: if either pattern does not match,
        ``AttributeError`` is raised (callers rely on that to detect an
        unparseable page). The other fields default to ``""``.
        """
        res = _resInTextForTaskB()
        # Deliberately unguarded: a missing id/title must raise.
        res.id = self.reId.search(text).group(1)
        res.title = self.reTitle.search(text).group(1)

        res.en_title = self._firstGroup(self.reEnTitle, text) or ""
        res.college = self._firstGroup(self.reCollege, text) or ""

        info = self._firstGroup(self.reInfo, text)
        if info is None:
            res.info = ""
        else:
            # Collapse every kind of line break so the field stays on one line.
            res.info = info.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

        source = self._firstGroup(self.reSource, text)
        res.source = "" if source is None else self._decodeJsonString(source)
        return res
|
||||
|
||||
class ReForTaskA:
    """Placeholder for the task-A patterns; it defines nothing yet."""

    def __init__(self):
        """Create an empty instance with no attributes."""
        return None
|
||||
|
||||
class OutPutFile():
    """Appends records to a text file, separated by single newlines."""

    def __init__(self, fileName, encoding):
        """Remember the target path and text encoding; the file is not opened.

        fileName: path of the output file.
        encoding: encoding used when writing.
        """
        self.fileName = fileName
        self.encoding = encoding

    def write(self, text):
        """Append ``text``, preceded by a newline unless the file is empty.

        A file that is missing or has zero length gets the record without a
        leading newline, so the output never starts with a blank line.
        """
        try:
            has_content = os.path.getsize(self.fileName) > 0
        except OSError:
            has_content = False  # the file does not exist yet
        if has_content:
            text = "\n" + text
        with open(self.fileName, mode='a', encoding=self.encoding) as f:
            f.write(text)
|
||||
|
||||
class CheckPointAndConfig():
    """Settings and crawl checkpoint kept in a JSON file.

    ``data`` is a plain dict; callers modify it and then call ``saveData()``.
    """

    # Values used for a new file and for keys missing from an existing one.
    _DEFAULTS = {
        "number": 2017210281,
        "outputFile": "out.txt",
        "totle": 1,
        "no": 1,
        "checkpoint": 0
    }

    def __init__(self, fileName):
        """Load ``fileName``, creating it with the defaults if it is missing."""
        self.fileName = fileName
        self.data = {}
        self.loadData()

    def loadData(self):
        """Read the file into ``data``, or create it with the defaults.

        Keys missing from an existing file are filled in from the defaults so
        later lookups cannot fail with ``KeyError``.

        Raises:
            ValueError: the file is not valid JSON or is not a JSON object.
        """
        data = dict(self._DEFAULTS)
        if os.path.exists(self.fileName):
            with open(self.fileName, 'r', encoding="utf-8") as load_f:
                loaded = json.load(load_f)
            if not isinstance(loaded, dict):
                raise ValueError("config file %s does not contain a JSON object" % self.fileName)
            data.update(loaded)
            self.data = data
        else:
            self.data = data
            self.saveData()

    def saveData(self):
        """Write ``data`` to the file without ever leaving it half-written.

        The JSON goes to a temporary sibling file first and is then moved
        over the real one. An interruption during the write therefore cannot
        destroy the previous checkpoint.
        """
        tmpName = self.fileName + ".tmp"
        try:
            with open(tmpName, "w", encoding="utf-8") as dump_f:
                json.dump(self.data, dump_f)
            os.replace(tmpName, self.fileName)
        except BaseException:
            # Do not leave a stray temporary file behind on failure.
            if os.path.exists(tmpName):
                os.remove(tmpName)
            raise
|
||||
Reference in New Issue
Block a user