Files
SpiderWithGUI/spider.py
2020-03-25 11:34:46 +08:00

67 lines
2.2 KiB
Python

import requests
import html
import unicodedata
from utils import ReForTaskB, ReForTaskA, OutPutFile, CheckPointAndConfig
from errors import SpiderNotFond, SpiderTaskTypeError, SpiderReFailed
class Spider:
    """Fetch pages from win.bupt.edu.cn one at a time and write parsed results out.

    Progress is tracked through ``config.data['checkpoint']``, which is advanced
    and saved after every step so that a later run continues with the next page.

    Args:
        config: Object exposing a ``data`` mapping (keys read here:
            ``outputFile``, ``checkpoint``, ``totle``, ``no``) and a
            ``saveData()`` method.
    """

    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }

    # Seconds to wait for the server before giving up. Without a timeout,
    # requests waits indefinitely and a stalled connection hangs the crawler.
    requestTimeout = 30

    # Task B: URL template (``%d`` is the page id) and the matcher used on the page.
    urlTaskB = "https://win.bupt.edu.cn/program.do?id=%d"
    staticReTaskB = ReForTaskB()

    # Task A: URL template and matcher; not used yet (see _doOneStepOnTaskA).
    urlTaskA = "https://win.bupt.edu.cn/project.do?next=collectlist&p=%d"
    staticReTaskA = ReForTaskA()

    def __init__(self, config):
        self.config = config
        # Second argument is presumably the output encoding -- confirm in utils.OutPutFile.
        self.out = OutPutFile(config.data["outputFile"], "gbk")
        # NOTE(review): meaning of ``b`` is not visible here; with b == 3 the
        # ``(b - 3) * 150`` term in the page-id formula is zero.
        self.b = 3
        # Only task type "B" is handled by doOneStep.
        self.task = "B"

    def getOffset(self):
        """Return ``checkpoint * totle + no - 1`` computed from ``config.data``.

        Presumably ``totle`` is the number of cooperating crawlers and ``no`` is
        this crawler's 1-based index, so the ids interleave -- TODO confirm.
        """
        data = self.config.data
        return data['checkpoint'] * data['totle'] + data['no'] - 1

    def _doOneStepOnTaskB(self):
        """Fetch and parse the task-B page for the current checkpoint.

        Returns:
            Whatever ``display()`` on the match object of ``staticReTaskB`` returns.

        Raises:
            SpiderNotFond: The page contains the site's "no such project" message.
            SpiderReFailed: An AttributeError occurred while matching/displaying
                (presumably ``search`` returned ``None`` -- verify in utils).
            requests.exceptions.RequestException: The request failed or timed out.
        """
        pid = (self.b - 3) * 150 + 1 + self.getOffset()
        r = requests.get(
            self.urlTaskB % pid,
            headers=self.headers,
            timeout=self.requestTimeout,
        )
        # Decode HTML entities, then turn bullets (U+2022) and non-breaking
        # spaces (U+00A0) into plain spaces before matching.
        text = html.unescape(r.text)
        text = text.replace('\u2022', ' ').replace('\xa0', ' ')
        # NOTE: unicodedata.normalize('NFKD', text) was tried here and is disabled.
        if '抱歉,没有该项目' in text:
            raise SpiderNotFond(pid)
        try:
            return self.staticReTaskB.search(text).display()
        except AttributeError as e:
            # Chain explicitly so the original AttributeError is reported as the cause.
            raise SpiderReFailed(pid, e) from e

    def _doOneStepOnTaskA(self):
        """Placeholder for task A; not implemented, returns ``None``."""
        pass

    def doOneStep(self):
        """Process one page, write the result to the output, and return it.

        The checkpoint is incremented and saved whether or not the step
        succeeded, so a failing page is not retried on the next call.
        NOTE(review): confirm that skipping failed pages is intended.

        Raises:
            SpiderTaskTypeError: ``self.task`` is not ``"B"``.
            SpiderNotFond, SpiderReFailed: Propagated from ``_doOneStepOnTaskB``.
        """
        try:
            if self.task != "B":
                raise SpiderTaskTypeError(self.task)
            res = self._doOneStepOnTaskB()
            self.out.write(res)
            return res
        finally:
            self.config.data['checkpoint'] += 1
            self.config.saveData()
if __name__ == "__main__":
    # Command-line driver: crawl page after page until a fatal error occurs.
    config = CheckPointAndConfig("./config.json")
    spider = Spider(config)
    while True:
        try:
            print(spider.doOneStep())
        except (SpiderTaskTypeError, SpiderReFailed) as err:
            # Unsupported task type or unparsable page: report and stop.
            print(err)
            break
        except SpiderNotFond as err:
            # Missing page: report it and carry on with the next one.
            print(err)
        except Exception as err:
            # Anything else is unexpected: report and stop.
            print("未知错误:\n" + str(err))
            break