|
@@ -1,56 +1,139 @@
|
1
|
|
-import scrapy
|
2
|
|
-import time
|
3
|
|
-from crawl.comm.basketball import BSTResult
|
4
|
|
-
|
5
|
|
-class BasketballSpider(scrapy.Spider):
|
6
|
|
- name = "basketball-result"
|
7
|
|
-
|
8
|
|
- def start_requests(self):
|
9
|
|
- # 开奖
|
10
|
|
- today = time.strftime("%Y-%m-%d")
|
11
|
|
- url = "https://www.lottery.gov.cn/basketball/result_99.jspx?startDate="+today+"&endDate="+today+"&f_league_id=0&f_league_name=%E5%85%A8%E9%83%A8%E8%81%94%E8%B5%9B&single=off"
|
12
|
|
- yield scrapy.Request(url, self.parseResult)
|
13
|
|
-
|
14
|
|
- def parseResult(self, response):
|
15
|
|
- cssMain = ".xxsj table tr"
|
16
|
|
-
|
17
|
|
- # 获取所有比赛
|
18
|
|
- matches = response.css(cssMain)
|
19
|
|
- for node in matches[1:-1]: # 标题行忽略以及末尾一行
|
20
|
|
- prop = node.css("td")
|
21
|
|
- if len(prop) < 7:
|
22
|
|
- continue
|
23
|
|
-
|
24
|
|
- matchTime = prop[0].css('::text').get()
|
25
|
|
- matchWeek = prop[1].css('::text').get()
|
26
|
|
- league = prop[2].css('::text').get()
|
27
|
|
- team = prop[3].css('a::text').getall()
|
28
|
|
- if team is None or len(team) == 0:
|
29
|
|
- team = prop[3].css('::text').get().split('VS')
|
30
|
|
- homeTeam = team[0].strip()
|
31
|
|
- awayTeam = team[1].strip()
|
32
|
|
- single = self.isSingle(prop[3].attrib.get('class'))
|
33
|
|
- tmp = prop[4].css('::text').get()
|
34
|
|
- score = tmp.strip() if tmp is not None else ""
|
35
|
|
- tmp = prop[5].css('::text').get()
|
36
|
|
- status = tmp.strip() if tmp is not None else ""
|
37
|
|
-
|
38
|
|
- BSTResult(
|
39
|
|
- matchTime,
|
40
|
|
- matchWeek,
|
41
|
|
- league,
|
42
|
|
- homeTeam,
|
43
|
|
- awayTeam,
|
44
|
|
- single,
|
45
|
|
- score,
|
46
|
|
- status
|
47
|
|
- ).persist()
|
48
|
|
-
|
49
|
|
- def isSingle(self, eleCls):
|
50
|
|
- if eleCls is None:
|
51
|
|
- return '0'
|
52
|
|
-
|
53
|
|
- if 'dan' in eleCls:
|
54
|
|
- return '1'
|
55
|
|
- else:
|
56
|
|
- return '0'
|
|
1
|
+import scrapy
|
|
2
|
+import re
|
|
3
|
+import logging
|
|
4
|
+from crawl.comm.basketball import BSTResult
|
|
5
|
+from crawl.spiders.util import getNoneStr
|
|
6
|
+
|
|
7
|
+
|
|
8
|
class BasketballSpider(scrapy.Spider):
    """Crawl basketball match results from info.sporttery.cn.

    Each row of the result list becomes a BSTResult; finished matches are
    additionally followed to their price page so the play results can be
    attached before persisting.
    """

    name = "basketball-result"

    # All matches seen so far, keyed by the site's match id.
    # parsePrice looks the cached BSTResult back up here to attach
    # the play results before persisting.
    _matchesMap = {}

    def start_requests(self):
        # Entry point: first page of the basketball result list.
        url = 'https://info.sporttery.cn/basketball/match_result.php'
        yield scrapy.Request(url, self.parseResult, 'GET')

    def parseResult(self, response):
        """Parse one result-list page.

        Persists (or schedules a price request for) every match row,
        then follows the pagination to the next page.
        """
        cssOfMatch = '.all-wrap > .match_list .m-tab tr'
        matches = response.css(cssOfMatch)
        if matches is None:
            return
        for matchNode in matches:
            tdNodeList = matchNode.css('td')
            # Skip header/filler rows that don't carry a full match record.
            if tdNodeList is None or len(tdNodeList) < 10:
                continue

            matchTime = getNoneStr(tdNodeList[0].css('::text').get())
            matchWeek = getNoneStr(tdNodeList[1].css('::text').get())
            league = getNoneStr(tdNodeList[2].css('::text').get())
            leagueFullName = getNoneStr(tdNodeList[2].attrib.get('title'))  # full league name
            leagueName = '|'.join((league, leagueFullName))
            homeTeam = self.trimBrackets(getNoneStr(tdNodeList[3].css('.zhu::text').get()))  # home team
            awayTeam = self.trimBrackets(getNoneStr(tdNodeList[3].css('.ke::text').get()))  # away team

            # Period scores: two spans from each of cells 4 and 5 plus the
            # text of cell 6, joined with '|'. Left empty when the row has
            # no span children (match not played / no breakdown published).
            half = ''
            halfSpan = tdNodeList[4].css('span')
            if halfSpan is not None and len(halfSpan) > 0:
                part1 = getNoneStr(tdNodeList[4].css("span::text")[0].get())
                part2 = getNoneStr(tdNodeList[4].css("span::text")[1].get())
                part3 = getNoneStr(tdNodeList[5].css("span::text")[0].get())
                part4 = getNoneStr(tdNodeList[5].css("span::text")[1].get())
                part5 = getNoneStr(tdNodeList[6].css('::text').get())
                half = '|'.join((part1, part2, part3, part4, part5))

            whole = getNoneStr(tdNodeList[7].css('span::text').get())
            status = getNoneStr(tdNodeList[11].css('::text').get())

            # "无效场次" (void match) in the score column overrides the status.
            if whole == '无效场次':
                status = '无效场次'

            bstResult = BSTResult(
                matchTime,
                matchWeek,
                leagueName,
                homeTeam,
                awayTeam,
                half,
                whole,
                status,
                ''  # play results; filled in later by parsePrice
            )

            matchLink = getNoneStr(tdNodeList[12].css('a').attrib.get('href'))
            # FIX: escape the literal dot in the pattern so it cannot match
            # an arbitrary character.
            matchId = re.sub(r'^.*pool_result\.php\?id=', '', matchLink)

            # Cache the match so parsePrice can complete and persist it.
            self._matchesMap[matchId] = bstResult

            if status == '已完成':
                # Finished match: fetch its price page before persisting.
                yield scrapy.Request('https:' + matchLink, self.parsePrice, 'GET')
            else:
                logging.info("采集到数据 --> %s", bstResult.toString())
                bstResult.persist()

        # Pagination: nothing to do when the page has no pager.
        pgNodes = response.css('.m-page .u-pg2')
        if pgNodes is None or len(pgNodes) < 1:
            return

        # Total number of pages (last pager entry).
        total = pgNodes[-1].css('a::text').get()
        if total is None or total == '':
            total = 0

        # Current page number from the request URL (empty on page 1).
        # FIX: the original pattern used a character class `[?page=]*`,
        # which matches any run of the characters `? p a g e =`; use a
        # real optional group for the literal `?page=` instead.
        current = re.sub(r'^.*match_result\.php(?:\?page=)?', '', response.url)
        if current is None or current == '':
            current = 1

        if int(current) >= int(total):
            return

        # Next page.
        # FIX: the original pointed the next-page request at the *football*
        # result list; keep paginating the basketball list this spider
        # actually crawls.
        nextPG = int(current) + 1
        url = 'https://info.sporttery.cn/basketball/match_result.php?page=' + str(nextPG)
        yield scrapy.Request(url, self.parseResult, 'GET')

    def trimBrackets(self, str):
        """Strip a parenthesized suffix (e.g. a ranking) from a team name.

        Returns '' when given None.
        """
        if str is None:
            return ''

        return re.sub(r'\(.*\)', '', str)

    def parsePrice(self, response):
        """Parse the price page of a finished match.

        Attaches the four play results ('|'-joined) to the cached BSTResult
        for this match id and persists it.
        """
        logging.info("采集数据源 ---> %s", response.url)

        tableList = response.css('.kj-table')
        if tableList is None or len(tableList) < 4:
            logging.error("抓取赔率结果失败")
            return

        playRes = []

        for i, tab in enumerate(tableList):
            try:
                if i == 0 or i == 3:
                    # win/lose or winning-margin table: value in first row
                    tr = tab.css("tr")[0]
                    tdList = tr.css('td')
                    playRes.append(getNoneStr(tdList[1].css("span::text").get()).replace(' ', ''))
                elif i == 1 or i == 2:
                    # handicap win/lose or total-points table: value in last row
                    tr = tab.css("tr")[-1]
                    tdList = tr.css('td')
                    playRes.append(getNoneStr(tdList[-1].css("span::text").get()).replace(' ', ''))
                else:
                    continue
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit; a malformed table is skipped.
                continue

        matchId = re.sub(r'^.*pool_result\.php\?id=', '', response.url)
        # FIX: guard the cache lookup — a price response for a match that was
        # never recorded previously raised an unhandled KeyError.
        bstResult = self._matchesMap.get(matchId)
        if bstResult is None:
            logging.error("未找到缓存的比赛 --> %s", response.url)
            return
        bstResult.playRes = '|'.join(playRes)

        logging.info("采集到数据 --> %s", bstResult.toString())

        # persist to DB
        bstResult.persist()