import html
import unicodedata

import requests

from utils import ReForTaskB, ReForTaskA, OutPutFile, CheckPointAndConfig
from errors import SpiderNotFond, SpiderTaskTypeError, SpiderReFailed


class Spider():
    """Fetch program pages one at a time and append the parsed result to a file.

    Progress is tracked through ``config.data['checkpoint']``, which is
    persisted with ``config.saveData()`` after each page that has been dealt
    with, so an interrupted crawl can resume where it stopped.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }

    # Seconds to wait for the server before giving up. Without a timeout
    # ``requests.get`` can block indefinitely on a stalled connection.
    requestTimeout = 30

    def __init__(self, config):
        """Store *config* and open the output file named by ``config.data["outputFile"]``.

        Args:
            config: object exposing a ``data`` mapping and a ``saveData()``
                method (a ``CheckPointAndConfig`` in the script entry point).
        """
        self.config = config
        # NOTE(review): "gbk" is presumably the output encoding -- confirm
        # against OutPutFile.
        self.out = OutPutFile(config.data["outputFile"], "gbk")
        self.b = 3
        self.task = "B"

    def getOffset(self):
        """Return ``checkpoint * totle + no - 1`` computed from the config data.

        NOTE(review): 'totle' and 'no' look like a worker count and a 1-based
        worker index used to interleave several crawlers -- confirm against
        the config file.
        """
        data = self.config.data
        return data['checkpoint'] * data['totle'] + data['no'] - 1

    urlTaskB = "https://win.bupt.edu.cn/program.do?id=%d"
    staticReTaskB = ReForTaskB()

    def _doOneStepOnTaskB(self):
        """Download and parse the program page selected by the current offset.

        Returns:
            Whatever ``ReForTaskB().search(text).display()`` returns.

        Raises:
            SpiderNotFond: the page reports that the program does not exist.
            SpiderReFailed: parsing raised ``AttributeError``.
            requests.RequestException: the HTTP request failed or timed out.
        """
        # With self.b == 3 (set in __init__) the first term is zero.
        pid = (self.b - 3) * 150 + 1 + self.getOffset()
        r = requests.get(
            self.urlTaskB % pid,
            headers=self.headers,
            timeout=self.requestTimeout,
        )
        text = html.unescape(r.text)
        # Replace the bullet and the non-breaking space with plain spaces.
        # NOTE(review): presumably because they cannot be written to the gbk
        # output -- confirm. unicodedata.normalize('NFKD', text) was tried at
        # this point and left disabled.
        text = text.replace('\u2022', ' ').replace('\xa0', ' ')
        if '抱歉,没有该项目' in text:
            raise SpiderNotFond(pid)
        try:
            return self.staticReTaskB.search(text).display()
        except AttributeError as e:
            # Keep the original error attached as the explicit cause.
            raise SpiderReFailed(pid, e) from e

    urlTaskA = "https://win.bupt.edu.cn/project.do?next=collectlist&p=%d"
    staticReTaskA = ReForTaskA()

    def _doOneStepOnTaskA(self):
        """Placeholder for task A; not implemented and never called by doOneStep."""
        pass

    def doOneStep(self):
        """Process the next page, write the result, and advance the checkpoint.

        The checkpoint is advanced and saved when the page was dealt with:
        scraped successfully, reported missing, or failed while parsing or
        writing. It is NOT advanced when the task type is invalid or when the
        HTTP request itself failed, so that page is retried on the next run
        instead of being skipped.

        Returns:
            The parsed result that was written to the output file.

        Raises:
            SpiderTaskTypeError: ``self.task`` is not a supported task.
            SpiderNotFond, SpiderReFailed, requests.RequestException: see
                ``_doOneStepOnTaskB``.
        """
        # Reject a bad task before touching the checkpoint: nothing was
        # fetched, so no progress should be recorded.
        if self.task != "B":
            raise SpiderTaskTypeError(self.task)

        pageHandled = True
        try:
            res = self._doOneStepOnTaskB()
            self.out.write(res)
            return res
        except requests.RequestException:
            # Transport failure: the page was never retrieved.
            pageHandled = False
            raise
        finally:
            if pageHandled:
                self.config.data['checkpoint'] += 1
                self.config.saveData()


def main():
    """Crawl until a parse failure, a task-type error, or an unexpected error.

    A missing page (``SpiderNotFond``) is reported and the crawl continues
    with the next page.
    """
    c = CheckPointAndConfig("./config.json")
    s = Spider(c)
    while True:
        try:
            res = s.doOneStep()
            print(res)
        except (SpiderTaskTypeError, SpiderReFailed) as e:
            print(str(e))
            break
        except SpiderNotFond as e:
            print(str(e))
        except Exception as e:
            print("未知错误:\n" + str(e))
            break


if __name__ == "__main__":
    main()