张延森 committed 4 years ago
Commit 686e36a12a
24 changed files, with 372 additions and 59 deletions
  1. BIN      crawl/__pycache__/__init__.cpython-38.pyc
  2. BIN      crawl/__pycache__/settings.cpython-38.pyc
  3. BIN      crawl/comm/__pycache__/__init__.cpython-38.pyc
  4. BIN      crawl/comm/__pycache__/basketball.cpython-38.pyc
  5. BIN      crawl/comm/__pycache__/football.cpython-38.pyc
  6. BIN      crawl/comm/__pycache__/lottery.cpython-38.pyc
  7. BIN      crawl/comm/__pycache__/mydb.cpython-38.pyc
  8. +7 -2    crawl/comm/basketball.py
  9. BIN      crawl/spiders/__pycache__/__init__.cpython-38.pyc
  10. BIN     crawl/spiders/__pycache__/basketball.cpython-38.pyc
  11. BIN     crawl/spiders/__pycache__/basketball_match.cpython-38.pyc
  12. BIN     crawl/spiders/__pycache__/basketball_price.cpython-38.pyc
  13. BIN     crawl/spiders/__pycache__/basketball_result.cpython-38.pyc
  14. BIN     crawl/spiders/__pycache__/football.cpython-38.pyc
  15. BIN     crawl/spiders/__pycache__/football_match.cpython-38.pyc
  16. BIN     crawl/spiders/__pycache__/football_price.cpython-38.pyc
  17. BIN     crawl/spiders/__pycache__/football_result.cpython-38.pyc
  18. BIN     crawl/spiders/__pycache__/lottery.cpython-38.pyc
  19. BIN     crawl/spiders/__pycache__/util.cpython-38.pyc
  20. +158 -0 crawl/spiders/basketball.py
  21. +139 -56 crawl/spiders/basketball_result.py
  22. +61 -0  crawl/spiders/basketball_result.py.bak
  23. +1 -1   crawl/spiders/football_result.py
  24. +6 -0   crawl/spiders/util.py

BIN  crawl/__pycache__/__init__.cpython-38.pyc

BIN  crawl/__pycache__/settings.cpython-38.pyc

BIN  crawl/comm/__pycache__/__init__.cpython-38.pyc

BIN  crawl/comm/__pycache__/basketball.cpython-38.pyc

BIN  crawl/comm/__pycache__/football.cpython-38.pyc

BIN  crawl/comm/__pycache__/lottery.cpython-38.pyc

BIN  crawl/comm/__pycache__/mydb.cpython-38.pyc

+7 -2  crawl/comm/basketball.py

@@ -286,7 +286,10 @@ class BSTResult:
                score,

                # status
-               status
+               status,
+
+               # play-type result
+               playRes = None
                ):
    self.matchTime = matchTime
    self.matchWeek = matchWeek
@@ -296,6 +299,7 @@ class BSTResult:
    self.single = single
    self.score = score
    self.status = status
+    self.playRes = playRes

  def toString(self):
    return ';'.join((
@@ -306,7 +310,8 @@ class BSTResult:
      self.awayTeam,
      self.single,
      self.score,
-      self.status
+      self.status,
+      self.playRes
    ))

  def persist(self):
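
The new playRes parameter defaults to None, but toString() above feeds it straight into ';'.join(...), which only accepts strings. A minimal sketch of that constraint (a reduced stand-in for BSTResult, keeping just the fields involved):

# Sketch: ';'.join() raises TypeError on None, so callers must pass a
# string for playRes rather than rely on the new default.
class Result:
  def __init__(self, status, playRes=None):
    self.status = status
    self.playRes = playRes

  def toString(self):
    return ';'.join((self.status, self.playRes))

print(Result('已完成', '胜|大分').toString())  # '已完成;胜|大分'
Result('已完成').toString()  # TypeError: sequence item 1: expected str instance, NoneType found

This is presumably why the reworked basketball_result.py below passes '' for the new field instead of leaving the default in place.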

BIN  crawl/spiders/__pycache__/__init__.cpython-38.pyc

BIN  crawl/spiders/__pycache__/basketball.cpython-38.pyc

BIN  crawl/spiders/__pycache__/basketball_match.cpython-38.pyc

BIN  crawl/spiders/__pycache__/basketball_price.cpython-38.pyc

BIN  crawl/spiders/__pycache__/basketball_result.cpython-38.pyc

BIN  crawl/spiders/__pycache__/football.cpython-38.pyc

BIN  crawl/spiders/__pycache__/football_match.cpython-38.pyc

BIN  crawl/spiders/__pycache__/football_price.cpython-38.pyc

BIN  crawl/spiders/__pycache__/football_result.cpython-38.pyc

BIN  crawl/spiders/__pycache__/lottery.cpython-38.pyc

BIN  crawl/spiders/__pycache__/util.cpython-38.pyc

+158 -0  crawl/spiders/basketball.py

@@ -0,0 +1,158 @@
+import scrapy
+import re
+import logging
+from crawl.comm.basketball import BSTMatch, BSTPrice, WLOdds, WLSpreadOdds, ScoreResult, PointsResult
+from crawl.spiders.util import getNoneStr
+
+
+class BasketballSpider(scrapy.Spider):
+  name = 'basketball'
+
+  # All matches
+  _matchesMap = {}
+
+  def start_requests(self):
+    # Matches accepting bets
+    url = 'https://info.sporttery.cn/basketball/match_list.php'
+    yield scrapy.Request(url, self.parseMatch, 'GET')
+
+  def parseMatch(self, response):
+    cssOfMatches = '.all-wrap > .match_list .m-tab tr'
+
+    # All matches
+    matches = response.css(cssOfMatches)
+    # Current match
+    for node in matches:
+      tdNodeList = node.css('td')
+
+      matchWeek = getNoneStr(tdNodeList[0].css('::text').get())  # match number
+      league = getNoneStr(tdNodeList[1].css('::text').get())  # league abbreviation
+      leagueFullName = getNoneStr(tdNodeList[1].attrib.get('title'))  # full league name
+      leagueName = '|'.join((league, leagueFullName))
+      homeTeam = getNoneStr(tdNodeList[2].css('.zhu::text').get())  # home team
+      awayTeam = getNoneStr(tdNodeList[2].css('.ke::text').get())  # away team
+      matchTime = getNoneStr(tdNodeList[3].css('::text').get())  # match time
+      saleStatus = getNoneStr(tdNodeList[5].css('::text').get())  # sale status
+      wdl = self.parsePassWay(tdNodeList[6].css('div'))
+      wdls = self.parsePassWay(tdNodeList[7].css('div'))
+      score = self.parsePassWay(tdNodeList[8].css('div'))
+      points = self.parsePassWay(tdNodeList[9].css('div'))
+
+      matchLink = tdNodeList[2].css('a').attrib.get('href')
+      matchId = re.sub(r'^.*m=', '', matchLink)
+
+      # Save to database
+      match = BSTMatch(
+        matchWeek,
+        matchTime,
+        leagueName,
+        homeTeam,
+        awayTeam,
+        saleStatus,
+        wdl,
+        wdls,
+        score,
+        points
+      )
+      match.persist()
+
+      # Add to the temporary dict
+      self._matchesMap[matchId] = match
+
+      # Request the match odds
+      if saleStatus == '已开售':  # "on sale"
+        priceAPI = 'https://info.sporttery.cn/basketball/pool_result.php?id=' + matchId
+        yield scrapy.Request(priceAPI, self.parsePrice, 'GET')
+
+  def parsePassWay(self, div):
+    # Not yet on sale
+    if div is None:
+      return 'wait'
+
+    # Address
+    cls = div.attrib.get('class')
+    if cls is None:
+      return 'wait'
+
+    # Both single and parlay betting on sale
+    if "u-dan" in cls:
+      return 'pass&single'
+
+    # Only parlay betting on sale
+    elif "u-cir" in cls:
+      return 'pass'
+
+    # This play type not on sale
+    else:
+      return 'wait'
+
+  def parsePrice(self, response):
+    logging.info("Collecting from data source ---> %s" % response.url)
+
+    tableList = response.css('.kj-table')
+    if tableList is None or len(tableList) < 4:
+      logging.error("Failed to scrape odds results")
+      return
+
+    wl = None
+    wls = None
+    score = None
+    points = None
+
+    for i, tab in enumerate(tableList):
+      # Take the latest record
+      tr = tab.css("tr")[-1]
+      tdList = tr.css('td')
+
+      try:
+        if i == 0:
+          # Win/lose
+          wl = WLOdds(getNoneStr(tdList[2].css("::text").get()), getNoneStr(tdList[1].css("::text").get()))
+        elif i == 1:
+          # Handicap win/lose
+          wls = WLSpreadOdds(getNoneStr(tdList[3].css("::text").get()), getNoneStr(tdList[1].css("::text").get()), getNoneStr(tdList[2].css("::text").get()))
+        elif i == 2:
+          # Over/under
+          score = ScoreResult(getNoneStr(tdList[1].css("::text").get()), getNoneStr(tdList[3].css("::text").get()), getNoneStr(tdList[2].css("::text").get()))
+        elif i == 3:
+          # Winning margin
+          points = PointsResult(
+            getNoneStr(tdList[7].css("::text").get()),
+            getNoneStr(tdList[8].css("::text").get()),
+            getNoneStr(tdList[9].css("::text").get()),
+            getNoneStr(tdList[10].css("::text").get()),
+            getNoneStr(tdList[11].css("::text").get()),
+            getNoneStr(tdList[12].css("::text").get()),
+            getNoneStr(tdList[1].css("::text").get()),
+            getNoneStr(tdList[2].css("::text").get()),
+            getNoneStr(tdList[3].css("::text").get()),
+            getNoneStr(tdList[4].css("::text").get()),
+            getNoneStr(tdList[5].css("::text").get()),
+            getNoneStr(tdList[6].css("::text").get())
+          )
+        else:
+          continue
+      except:
+        continue
+
+    matchId = re.sub(r'^.*pool_result.php\?id=', '', response.url)
+    match = self._matchesMap[matchId]
+
+    bstprice = BSTPrice(
+      matchId,
+      match.matchWeek,
+      match.matchTime,
+      match.league,
+      match.homeTeam,
+      match.awayTeam,
+      wl,
+      wls,
+      score,
+      points
+    )
+
+    #
+    logging.info("Collected data --> %s" % bstprice.toString())
+
+    # Save to database
+    bstprice.persist()
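
For reference, a spider like this is normally launched with the Scrapy CLI (scrapy crawl basketball) from the project root. A minimal programmatic runner sketch, not part of this commit, assuming the settings module is the crawl/settings.py whose compiled .pyc appears above:

# Hypothetical runner: assumes scrapy is installed and the project
# settings resolve to crawl.settings.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from crawl.spiders.basketball import BasketballSpider

process = CrawlerProcess(get_project_settings())
process.crawl(BasketballSpider)
process.start()  # blocks until the crawl finishes

Note that in calls like scrapy.Request(url, self.parseMatch, 'GET'), the 'GET' rides along as the third positional argument, method, so it matches the default anyway.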

+139 -56  crawl/spiders/basketball_result.py

@@ -1,56 +1,139 @@
-import scrapy
-import time
-from crawl.comm.basketball import BSTResult
-
-class BasketballSpider(scrapy.Spider):
-  name = "basketball-result"
-
-  def start_requests(self):
-    # Prize results
-    today = time.strftime("%Y-%m-%d")
-    url = "https://www.lottery.gov.cn/basketball/result_99.jspx?startDate="+today+"&endDate="+today+"&f_league_id=0&f_league_name=%E5%85%A8%E9%83%A8%E8%81%94%E8%B5%9B&single=off"
-    yield scrapy.Request(url, self.parseResult)
-
-  def parseResult(self, response):
-    cssMain = ".xxsj table tr"
-
-    # Get all matches
-    matches = response.css(cssMain)
-    for node in matches[1:-1]:  # ignore the header row and the last row
-      prop = node.css("td")
-      if len(prop) < 7:
-        continue
-
-      matchTime = prop[0].css('::text').get()
-      matchWeek = prop[1].css('::text').get()
-      league = prop[2].css('::text').get()
-      team = prop[3].css('a::text').getall()
-      if team is None or len(team) == 0:
-        team = prop[3].css('::text').get().split('VS')
-      homeTeam = team[0].strip()
-      awayTeam = team[1].strip()
-      single = self.isSingle(prop[3].attrib.get('class'))
-      tmp = prop[4].css('::text').get()
-      score = tmp.strip() if tmp is not None else ""
-      tmp = prop[5].css('::text').get()
-      status = tmp.strip() if tmp is not None else ""
-
-      BSTResult(
-        matchTime,
-        matchWeek,
-        league,
-        homeTeam,
-        awayTeam,
-        single,
-        score,
-        status
-      ).persist()
-
-  def isSingle(self, eleCls):
-    if eleCls is None:
-      return '0'
-
-    if 'dan' in eleCls:
-      return '1'
-    else:
-      return '0'
+import scrapy
+import re
+import logging
+from crawl.comm.basketball import BSTResult
+from crawl.spiders.util import getNoneStr
+
+
+class BasketballSpider(scrapy.Spider):
+  name = "basketball-result"
+
+  # All matches
+  _matchesMap = {}
+
+  def start_requests(self):
+    url = 'https://info.sporttery.cn/basketball/match_result.php'
+    yield scrapy.Request(url, self.parseResult, 'GET')
+
+  def parseResult(self, response):
+    # Process the match results
+    cssOfMatch = '.all-wrap > .match_list .m-tab tr'
+    matches = response.css(cssOfMatch)
+    if matches is None: return
+    for matchNode in matches:
+      tdNodeList = matchNode.css('td')
+      if tdNodeList is None or len(tdNodeList) < 10:
+        continue
+
+      matchTime = getNoneStr(tdNodeList[0].css('::text').get())
+      matchWeek = getNoneStr(tdNodeList[1].css('::text').get())
+      league = getNoneStr(tdNodeList[2].css('::text').get())
+      leagueFullName = getNoneStr(tdNodeList[2].attrib.get('title'))  # full league name
+      leagueName = '|'.join((league, leagueFullName))
+      homeTeam = self.trimBrackets(getNoneStr(tdNodeList[3].css('.zhu::text').get()))  # home team
+      awayTeam = self.trimBrackets(getNoneStr(tdNodeList[3].css('.ke::text').get()))  # away team
+
+      half = ''
+      halfSpan = tdNodeList[4].css('span')
+      if halfSpan is not None and len(halfSpan) > 0:
+        part1 = getNoneStr(tdNodeList[4].css("span::text")[0].get())
+        part2 = getNoneStr(tdNodeList[4].css("span::text")[1].get())
+        part3 = getNoneStr(tdNodeList[5].css("span::text")[0].get())
+        part4 = getNoneStr(tdNodeList[5].css("span::text")[1].get())
+        part5 = getNoneStr(tdNodeList[6].css('::text').get())
+        half = '|'.join((part1, part2, part3, part4, part5))
+
+      whole = getNoneStr(tdNodeList[7].css('span::text').get())
+      status = getNoneStr(tdNodeList[11].css('::text').get())
+
+      if whole == '无效场次':  # "void match"
+        status = '无效场次'
+
+      bstResult = BSTResult(
+        matchTime,
+        matchWeek,
+        leagueName,
+        homeTeam,
+        awayTeam,
+        half,
+        whole,
+        status,
+        ''
+      )
+
+      matchLink = getNoneStr(tdNodeList[12].css('a').attrib.get('href'))
+      matchId = re.sub(r'^.*pool_result.php\?id=', '', matchLink)
+
+      # Add to the temporary dict
+      self._matchesMap[matchId] = bstResult
+
+      if status == '已完成':  # "finished"
+        yield scrapy.Request('https:' + matchLink, self.parsePrice, 'GET')
+      else:
+        logging.info("Collected data --> %s" % bstResult.toString())
+        bstResult.persist()
+
+    # Is there a next page?
+    pgNodes = response.css('.m-page .u-pg2')
+    if pgNodes is None or len(pgNodes) < 1: return
+
+    # Total number of pages
+    total = pgNodes[-1].css('a::text').get()
+    if total is None or total == '':
+      total = 0
+
+    # Current page number
+    current = re.sub(r'^.*match_result.php[?page=]*', '', response.url)
+    if current is None or current == '':
+      current = 1
+
+    if int(current) >= int(total): return
+
+    # Next page
+    nextPG = int(current) + 1
+    url = 'https://info.sporttery.cn/football/match_result.php?page=' + str(nextPG)
+    yield scrapy.Request(url, self.parseResult, 'GET')
+
+
+  def trimBrackets(self, str):
+    if str is None: return ''
+
+    return re.sub(r'\(.*\)', '', str)
+
+  def parsePrice(self, response):
+    logging.info("Collecting from data source ---> %s" % response.url)
+
+    tableList = response.css('.kj-table')
+    if tableList is None or len(tableList) < 4:
+      logging.error("Failed to scrape odds results")
+      return
+
+    playRes = []
+
+    for i, tab in enumerate(tableList):
+
+      try:
+        if i == 0 or i == 3:
+          # Win/lose or winning margin
+          tr = tab.css("tr")[0]
+          tdList = tr.css('td')
+          playRes.append(getNoneStr(tdList[1].css("span::text").get()).replace(' ', ''))
+        elif i == 1 or i == 2:
+          # Handicap win/lose or over/under
+          tr = tab.css("tr")[-1]
+          tdList = tr.css('td')
+          playRes.append(getNoneStr(tdList[-1].css("span::text").get()).replace(' ', ''))
+        else:
+          continue
+      except:
+        continue
+
+    matchId = re.sub(r'^.*pool_result.php\?id=', '', response.url)
+    bstResult = self._matchesMap[matchId]
+    bstResult.playRes = '|'.join(playRes)
+
+    #
+    logging.info("Collected data --> %s" % bstResult.toString())
+
+    # Save to database
+    bstResult.persist()
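
The pagination step above extracts the current page with re.sub(r'^.*match_result.php[?page=]*', '', response.url). The bracketed part is a character class, not the literal text ?page=: it strips any trailing run of the characters ?, p, a, g, e, and =, which is what lets a single pattern reduce both the bare listing URL and a ?page=N URL to just the page number. A quick sketch of that behavior:

import re

# Character-class trick from parseResult: '[?page=]*' consumes an
# optional '?page=' suffix character by character.
pattern = r'^.*match_result.php[?page=]*'

bare  = 'https://info.sporttery.cn/basketball/match_result.php'
paged = 'https://info.sporttery.cn/basketball/match_result.php?page=7'

print(repr(re.sub(pattern, '', bare)))   # ''  -> falls back to page 1
print(repr(re.sub(pattern, '', paged)))  # '7'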

+61 -0  crawl/spiders/basketball_result.py.bak

@@ -0,0 +1,61 @@
+import scrapy
+import time
+from crawl.comm.basketball import BSTResult
+
+class BasketballSpider(scrapy.Spider):
+  name = "basketball-result"
+
+  def start_requests(self):
+    # Prize results
+    today = time.strftime("%Y-%m-%d")
+    url = "https://www.lottery.gov.cn/basketball/result_99.jspx?startDate="+today+"&endDate="+today+"&f_league_id=0&f_league_name=%E5%85%A8%E9%83%A8%E8%81%94%E8%B5%9B&single=off"
+    yield scrapy.Request(url, self.parseResult)
+
+  def parseResult(self, response):
+    cssMain = ".xxsj table tr"
+
+    # Get all matches
+    matches = response.css(cssMain)
+    for node in matches[1:-1]:  # ignore the header row and the last row
+      prop = node.css("td")
+      if len(prop) < 7:
+        continue
+
+      matchTime = prop[0].css('::text').get()
+      matchWeek = prop[1].css('::text').get()
+      league = prop[2].css('::text').get()
+      team = prop[3].css('a::text').getall()
+      if team is None or len(team) == 0:
+        team = prop[3].css('::text').get().split('VS')
+      homeTeam = team[1].strip()
+      awayTeam = team[0].strip()
+      single = self.isSingle(prop[3].attrib.get('class'))
+      tmp = prop[4].css('::text').get()
+      score = tmp.strip() if tmp is not None else ""
+      tmp = prop[5].css('::text').get()
+      status = tmp.strip() if tmp is not None else ""
+
+      bstResult = BSTResult(
+        matchTime,
+        matchWeek,
+        league,
+        homeTeam,
+        awayTeam,
+        single,
+        score,
+        status
+      )
+      bstResult.persist()
+
+      # if status == '已完成':
+      #   resURI = prop[6].css('a').attrib.get('href')
+      #   yield scrapy.Request('https://www.lottery.gov.cn' + resURI, self.parseResDetail, 'GET')
+
+  def isSingle(self, eleCls):
+    if eleCls is None:
+      return '0'
+
+    if 'dan' in eleCls:
+      return '1'
+    else:
+      return '0'

+1 -1  crawl/spiders/football_result.py

@@ -67,7 +67,7 @@ class FootballSpider(scrapy.Spider):
     # Next page
     nextPG = int(current) + 1
     url = 'https://info.sporttery.cn/football/match_result.php?page=' + str(nextPG)
-    return scrapy.Request(url, self.parseResult, 'GET')
+    yield scrapy.Request(url, self.parseResult, 'GET')


  def trimBrackets(self, str):
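
This one-line fix matters because parseResult yields its other requests, which makes it a generator function: inside a generator, return scrapy.Request(...) ends iteration and the returned value is discarded by anything that simply iterates the callback, as Scrapy does, so the next-page request was never scheduled. A minimal sketch of the difference, using plain strings as stand-ins for requests:

# Stand-in demo: Scrapy iterates a callback's results, so a 'return'
# inside a generator silently drops its value while a 'yield' delivers it.
def with_return():
  yield 'item'
  return 'next-page'        # discarded by plain iteration

def with_yield():
  yield 'item'
  yield 'next-page'         # delivered like any other output

print(list(with_return()))  # ['item']
print(list(with_yield()))   # ['item', 'next-page']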

+6 -0  crawl/spiders/util.py

@@ -0,0 +1,6 @@
+
+def getNoneStr(s):
+  if s is None:
+    return ''
+  else:
+    return s.strip()
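
getNoneStr is the small null-safe helper the new spiders lean on: Scrapy's .get() returns None when a node is missing, and this keeps every downstream strip, join, and comparison working on plain strings. For example:

from crawl.spiders.util import getNoneStr

print(getNoneStr('  已完成  '))  # '已完成'  (whitespace stripped)
print(getNoneStr(None))          # ''  (instead of AttributeError on .strip())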