67 lines
2.2 KiB
Python
67 lines
2.2 KiB
Python
import requests
|
|
import html
|
|
import unicodedata
|
|
from utils import ReForTaskB, ReForTaskA, OutPutFile, CheckPointAndConfig
|
|
from errors import SpiderNotFond, SpiderTaskTypeError, SpiderReFailed
|
|
class Spider:
    """Fetch one project page per step and append the parsed record to a file.

    Progress is kept in the ``config`` object handed to the constructor:
    ``config.data['checkpoint']`` is incremented after every step (successful
    or not) and persisted through ``config.saveData()``.
    """

    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }

    # Seconds to wait for the server before giving up on a request. Without a
    # timeout, requests waits indefinitely and a single stalled connection
    # would hang the whole crawl loop.
    requestTimeout = 10

    # Task B: one detail page per numeric project id.
    urlTaskB = "https://win.bupt.edu.cn/program.do?id=%d"
    staticReTaskB = ReForTaskB()

    # Task A: paginated list pages (fetching is not implemented yet).
    urlTaskA = "https://win.bupt.edu.cn/project.do?next=collectlist&p=%d"
    staticReTaskA = ReForTaskA()

    def __init__(self, config):
        """Bind the spider to its checkpoint/config object and output file.

        Args:
            config: object exposing a ``data`` mapping (keys read here and in
                the other methods: ``outputFile``, ``checkpoint``, ``totle``,
                ``no``) and a ``saveData()`` method.
        """
        self.config = config
        self.out = OutPutFile(config.data["outputFile"], "gbk")
        # Feeds the page-id formula in _doOneStepOnTaskB; with the value 3 its
        # ``(b - 3) * 150`` term is zero.
        self.b = 3
        # Only task "B" is dispatched by doOneStep.
        self.task = "B"

    def getOffset(self):
        """Return the id offset derived from the current checkpoint.

        Computed as ``checkpoint * totle + no - 1``.
        NOTE(review): presumably ``totle`` is the number of cooperating
        workers and ``no`` this worker's 1-based index, so that workers
        interleave ids -- confirm against the config file.
        """
        data = self.config.data
        return data['checkpoint'] * data['totle'] + data['no'] - 1

    def _doOneStepOnTaskB(self):
        """Download and parse the task-B page for the current checkpoint.

        Returns:
            Whatever ``staticReTaskB.search(text).display()`` produces.

        Raises:
            SpiderNotFond: the page reports that the project does not exist.
            SpiderReFailed: the parsing step raised ``AttributeError``
                (e.g. ``search`` returned ``None``).
            requests.exceptions.RequestException: network failure or timeout.
        """
        pid = (self.b - 3) * 150 + 1 + self.getOffset()
        r = requests.get(
            self.urlTaskB % pid,
            headers=self.headers,
            timeout=self.requestTimeout,
        )
        # Decode HTML entities, then turn bullets and non-breaking spaces into
        # plain spaces before matching.
        text = html.unescape(r.text)
        text = text.replace('\u2022', ' ').replace('\xa0', ' ')
        # The site answers with this "sorry, no such project" message for
        # unknown ids.
        if '抱歉,没有该项目' in text:
            raise SpiderNotFond(pid)
        try:
            return self.staticReTaskB.search(text).display()
        except AttributeError as e:
            # Chain explicitly so the original traceback stays attached.
            raise SpiderReFailed(pid, e) from e

    def _doOneStepOnTaskA(self):
        """Placeholder for task A; does nothing and returns ``None``."""
        pass

    def doOneStep(self):
        """Run one crawl step, write the result, and advance the checkpoint.

        Returns:
            The record returned by the task-specific step.

        Raises:
            SpiderTaskTypeError: ``self.task`` is not ``"B"``.
            Any exception raised by the task-specific step or by the output
            writer.
        """
        try:
            if self.task != "B":
                raise SpiderTaskTypeError(self.task)
            res = self._doOneStepOnTaskB()
            self.out.write(res)
            return res
        finally:
            # The checkpoint moves forward even when the step failed, so the
            # next call (or the next run) continues with the following id.
            # NOTE(review): this also skips ids that failed on a transient
            # network error -- confirm that is intended.
            self.config.data['checkpoint'] += 1
            self.config.saveData()
|
|
|
|
if __name__ == "__main__":
    # Script entry point: keep stepping the spider until a fatal error occurs.
    checkpoint_cfg = CheckPointAndConfig("./config.json")
    spider = Spider(checkpoint_cfg)
    keep_running = True
    while keep_running:
        try:
            record = spider.doOneStep()
            print(record)
        except (SpiderTaskTypeError, SpiderReFailed) as fatal:
            # Unsupported task type or unparsable page: report and stop.
            print(str(fatal))
            keep_running = False
        except SpiderNotFond as missing:
            # A missing project is not fatal; report it and try the next one.
            print(str(missing))
        except Exception as unexpected:
            # Anything else is unknown: report it (message text means
            # "unknown error") and stop.
            message = "未知错误:\n" + str(unexpected)
            print(message)
            keep_running = False