Your Name 4 years ago
Commit
27119280ea

+ 15
- 0
.vscode/launch.json

@@ -0,0 +1,15 @@
+{
+  // Use IntelliSense to learn about possible attributes.
+  // Hover to view descriptions of existing attributes.
+  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "Python: Current File",
+      "type": "python",
+      "request": "launch",
+      "program": "${workspaceRoot}/crawl/run.py",
+      "console": "integratedTerminal"
+    }
+  ]
+}

+ 0
- 0
crawl/__init__.py


Binary
crawl/__pycache__/__init__.cpython-38.pyc


Binary
crawl/__pycache__/settings.cpython-38.pyc


+ 0
- 0
crawl/comm/__init__.py


Binary
crawl/comm/__pycache__/__init__.cpython-38.pyc


Binary
crawl/comm/__pycache__/football.cpython-38.pyc


Binary
crawl/comm/__pycache__/football_match.cpython-38.pyc


Binary
crawl/comm/__pycache__/mydb.cpython-38.pyc


+ 452
- 0
crawl/comm/football.py

@@ -0,0 +1,452 @@
+from .mydb import MyDB
+
+class FTMatch:
+  def __init__(self,
+    # match week code
+    matchWeek,
+
+    # match time
+    matchTime,
+
+    # league
+    league,
+
+    # home team
+    homeTeam,
+
+    # away team
+    awayTeam,
+
+    # win/draw/lose
+    wdl,
+
+    # win/draw/lose with handicap
+    wdls,
+
+    # correct score
+    score,
+
+    # total goals
+    points,
+
+    # half-time/full-time
+    double
+  ):
+    self.matchWeek = matchWeek
+    self.matchTime = matchTime
+    self.league = league
+    self.homeTeam = homeTeam
+    self.awayTeam = awayTeam
+    self.wdl = wdl
+    self.wdls = wdls
+    self.score = score
+    self.points = points
+    self.double = double
+
+  def toString(self):
+    return ";".join((
+      self.matchWeek,
+      self.matchTime,
+      self.league,
+      self.homeTeam,
+      self.awayTeam,
+      self.wdl,
+      self.wdls,
+      self.score,
+      self.points,
+      self.double
+    ))
+
+  def persist(self):
+    cursor = MyDB.getCursor()
+    if cursor is None:
+      return
+
+    sql = "insert into ta_crawl_football(data_type, content) values('match', %s)"
+    cursor.execute(sql, self.toString())
+    MyDB.commit()
+    cursor.close()
+
+# odds
+class FTPrice:
+  def __init__(self,
+    # match ID
+    matchId,
+
+    # match week code
+    matchWeek,
+
+    # match time
+    matchTime,
+
+    # league
+    league,
+
+    # home team
+    homeTeam,
+
+    # away team
+    awayTeam,
+
+    # win/draw/lose
+    wdlOdds = None,
+
+    # win/draw/lose with handicap - [w, d, l, s]
+    wdlsOdds = None,
+
+    # correct score - [w(13) + d(4) + l(13)]
+    scoreResult = None,
+
+    # total goals - [0, 1, 2, 3, 4, 5, 6, >6]
+    pointsResult = None,
+
+    # half-time/full-time - [w/w, w/d, w/l, d/w, d/d, d/l, l/w, l/d, l/l]
+    doubleResult = None
+  ):
+    self.matchId = matchId
+    self.matchWeek = matchWeek
+    self.matchTime = matchTime
+    self.league = league
+    self.homeTeam = homeTeam
+    self.awayTeam = awayTeam
+    self.wdlOdds = wdlOdds
+    self.wdlsOdds = wdlsOdds
+    self.scoreResult = scoreResult
+    self.pointsResult = pointsResult
+    self.doubleResult = doubleResult
+
+  def toString(self):
+    return ";".join((
+      self.matchId,
+      self.matchWeek,
+      self.matchTime,
+      self.league,
+      self.homeTeam,
+      self.awayTeam,
+      self.wdlOdds.toString(),
+      self.wdlsOdds.toString(),
+      self.scoreResult.toString(),
+      self.pointsResult.toString(),
+      self.doubleResult.toString()
+    ))
+
+  def persist(self):
+    cursor = MyDB.getCursor()
+    if cursor is None:
+      return
+
+    sql = "insert into ta_crawl_football(data_type, content) values('price', %s)"
+    cursor.execute(sql, self.toString())
+    MyDB.commit()
+    cursor.close()
+
+# win/draw/lose
+class WDLOdds:
+  def __init__(self,
+    # win
+    win = None,
+
+    # draw
+    dead = None,
+
+    # lose
+    lose = None
+  ):
+    self.win = win
+    self.dead = dead
+    self.lose = lose
+
+  def datas(self, datas = []):
+    self.win = datas[0]
+    self.dead = datas[1]
+    self.lose = datas[2]
+
+  def toString(self):
+    return '|'.join((self.win, self.dead, self.lose))
+
+# win/draw/lose with handicap
+class WDLSpreadOdds:
+  def __init__(self,
+    # win
+    win = None,
+
+    # draw
+    dead = None,
+
+    # lose
+    lose = None,
+
+    # handicap (goal spread)
+    spread = None
+  ):
+    self.win = win
+    self.dead = dead
+    self.lose = lose
+    self.spread = spread
+
+  def datas(self, datas = []):
+    self.win = datas[0]
+    self.dead = datas[1]
+    self.lose = datas[2]
+    self.spread = datas[3]
+
+  def toString(self):
+    return '|'.join((self.win, self.dead, self.lose, self.spread))
+
+# correct score
+class ScoreResult:
+  def __init__(self,
+    w10 = None, # win 1-0
+    w20 = None, # win 2-0
+    w21 = None, # win 2-1
+    w30 = None, # win 3-0
+    w31 = None, # win 3-1
+    w32 = None, # win 3-2
+    w40 = None, # win 4-0
+    w41 = None, # win 4-1
+    w42 = None, # win 4-2
+    w50 = None, # win 5-0
+    w51 = None, # win 5-1
+    w52 = None, # win 5-2
+    w99 = None, # win, any other score
+    d00 = None, # draw 0-0
+    d11 = None, # draw 1-1
+    d22 = None, # draw 2-2
+    d33 = None, # draw 3-3
+    d99 = None, # draw, any other score
+    l01 = None, # lose 0-1
+    l02 = None, # lose 0-2
+    l12 = None, # lose 1-2
+    l03 = None, # lose 0-3
+    l13 = None, # lose 1-3
+    l23 = None, # lose 2-3
+    l04 = None, # lose 0-4
+    l14 = None, # lose 1-4
+    l24 = None, # lose 2-4
+    l05 = None, # lose 0-5
+    l15 = None, # lose 1-5
+    l25 = None, # lose 2-5
+    l99 = None # lose, any other score
+  ):
+    self.w10 = w10
+    self.w20 = w20
+    self.w21 = w21
+    self.w30 = w30
+    self.w31 = w31
+    self.w32 = w32
+    self.w40 = w40
+    self.w41 = w41
+    self.w42 = w42
+    self.w50 = w50
+    self.w51 = w51
+    self.w52 = w52
+    self.w99 = w99
+    self.d00 = d00
+    self.d11 = d11
+    self.d22 = d22
+    self.d33 = d33
+    self.d99 = d99
+    self.l01 = l01
+    self.l02 = l02
+    self.l12 = l12
+    self.l03 = l03
+    self.l13 = l13
+    self.l23 = l23
+    self.l04 = l04
+    self.l14 = l14
+    self.l24 = l24
+    self.l05 = l05
+    self.l15 = l15
+    self.l25 = l25
+    self.l99 = l99
+
+  def datas(self, datas = []):
+    self.w10 = datas[0]
+    self.w20 = datas[1]
+    self.w21 = datas[2]
+    self.w30 = datas[3]
+    self.w31 = datas[4]
+    self.w32 = datas[5]
+    self.w40 = datas[6]
+    self.w41 = datas[7]
+    self.w42 = datas[8]
+    self.w50 = datas[9]
+    self.w51 = datas[10]
+    self.w52 = datas[11]
+    self.w99 = datas[12]
+    self.d00 = datas[13]
+    self.d11 = datas[14]
+    self.d22 = datas[15]
+    self.d33 = datas[16]
+    self.d99 = datas[17]
+    self.l01 = datas[18]
+    self.l02 = datas[19]
+    self.l12 = datas[20]
+    self.l03 = datas[21]
+    self.l13 = datas[22]
+    self.l23 = datas[23]
+    self.l04 = datas[24]
+    self.l14 = datas[25]
+    self.l24 = datas[26]
+    self.l05 = datas[27]
+    self.l15 = datas[28]
+    self.l25 = datas[29]
+    self.l99 = datas[30]
+
+  def toString(self):
+    return '|'.join((
+      self.w10,
+      self.w20,
+      self.w21,
+      self.w30,
+      self.w31,
+      self.w32,
+      self.w40,
+      self.w41,
+      self.w42,
+      self.w50,
+      self.w51,
+      self.w52,
+      self.w99,
+      self.d00,
+      self.d11,
+      self.d22,
+      self.d33,
+      self.d99,
+      self.l01,
+      self.l02,
+      self.l12,
+      self.l03,
+      self.l13,
+      self.l23,
+      self.l04,
+      self.l14,
+      self.l24,
+      self.l05,
+      self.l15,
+      self.l25,
+      self.l99
+    ))
+
+# total goals
+class PointsResult:
+  def __init__(self,
+    # 0 goals in total
+    p0 = None,
+
+    # 1 goal in total
+    p1 = None,
+
+    # 2 goals in total
+    p2 = None,
+
+    # 3 goals in total
+    p3 = None,
+
+    # 4 goals in total
+    p4 = None,
+
+    # 5 goals in total
+    p5 = None,
+
+    # 6 goals in total
+    p6 = None,
+
+    # more than 6 goals in total
+    p99 = None
+  ):
+    self.p0 = p0
+    self.p1 = p1
+    self.p2 = p2
+    self.p3 = p3
+    self.p4 = p4
+    self.p5 = p5
+    self.p6 = p6
+    self.p99 = p99
+
+  def datas(self, datas = []):
+    self.p0 = datas[0]
+    self.p1 = datas[1]
+    self.p2 = datas[2]
+    self.p3 = datas[3]
+    self.p4 = datas[4]
+    self.p5 = datas[5]
+    self.p6 = datas[6]
+    self.p99 = datas[7]
+
+  def toString(self):
+    return '|'.join((
+      self.p0,
+      self.p1,
+      self.p2,
+      self.p3,
+      self.p4,
+      self.p5,
+      self.p6,
+      self.p99
+    ))
+
+# half-time/full-time
+class DoubleResult:
+  def __init__(self,
+    # win/win
+    ww = None,
+
+    # win/draw
+    wd = None,
+
+    # win/lose
+    wl = None,
+
+    # draw/win
+    dw = None,
+
+    # draw/draw
+    dd = None,
+
+    # draw/lose
+    dl = None,
+
+    # lose/win
+    lw = None,
+
+    # lose/draw
+    ld = None,
+
+    # lose/lose
+    ll = None
+  ):
+    self.ww = ww
+    self.wd = wd
+    self.wl = wl
+    self.dw = dw
+    self.dd = dd
+    self.dl = dl
+    self.lw = lw
+    self.ld = ld
+    self.ll = ll
+
+  def datas(self, datas = []):
+    self.ww = datas[0]
+    self.wd = datas[1]
+    self.wl = datas[2]
+    self.dw = datas[3]
+    self.dd = datas[4]
+    self.dl = datas[5]
+    self.lw = datas[6]
+    self.ld = datas[7]
+    self.ll = datas[8]
+
+
+  def toString(self):
+    return '|'.join((
+      self.ww,
+      self.wd,
+      self.wl,
+      self.dw,
+      self.dd,
+      self.dl,
+      self.lw,
+      self.ld,
+      self.ll
+    ))
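
Note (reviewer sketch, not part of this commit): how the value objects above serialize. The odds figures and team names below are made-up placeholders; the 'pass&single'/'pass'/'wait' strings are the values produced by parsePassWay in crawl/spiders/football_spider.py further down.

odds = WDLOdds('2.10', '3.25', '3.40')   # placeholder odds strings
print(odds.toString())                   # -> 2.10|3.25|3.40

match = FTMatch('Sat001', '2021-04-10 19:30', 'Some League', 'Home FC', 'Away FC',
                'pass&single', 'pass', 'pass', 'pass', 'wait')
print(match.toString())                  # semicolon-joined row
match.persist()                          # inserts ('match', <row>) into ta_crawl_football when MyDB is connected; otherwise a no-op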

+ 38
- 0
crawl/comm/mydb.py

@@ -0,0 +1,38 @@
+import pymysql
+import scrapy
+
+class MyDB:
+  __connected = False
+  __conn = None
+
+  @classmethod
+  def from_crawler(cls, crawler):
+    cls.connectDB(crawler.settings['DATABASE'])
+    return cls()
+
+  @classmethod
+  def connectDB(cls, dbSetting):
+    cls.__conn = pymysql.connect(
+      host = dbSetting['host'],
+      port = dbSetting['port'],
+      user = dbSetting['user'],
+      password = dbSetting['password'],
+      db = dbSetting['name'],
+      charset = 'utf8'
+    )
+    cls.__connected = True
+
+  @classmethod
+  def getCursor(cls):
+    if cls.__connected:
+      return cls.__conn.cursor()
+
+  @classmethod
+  def commit(cls):
+    if cls.__connected:
+      return cls.__conn.commit()
+
+  def __del__(self):
+    if self.__class__.__connected:
+      self.__class__.__connected = False
+      self.__class__.__conn.close()
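
Note (reviewer sketch, not part of this commit): minimal standalone use of MyDB, assuming a reachable MySQL server; the host and credentials below are placeholders. Inside Scrapy the same connection is opened automatically, since settings.py registers crawl.comm.mydb.MyDB under EXTENSIONS and from_crawler() passes the DATABASE settings to connectDB().

from crawl.comm.mydb import MyDB

MyDB.connectDB({
  'host': '127.0.0.1',    # placeholder
  'port': 3306,
  'user': 'someuser',     # placeholder
  'password': 'secret',   # placeholder
  'name': 'niucai'
})
cursor = MyDB.getCursor()
if cursor is not None:
  cursor.execute("select 1")
  print(cursor.fetchone())  # (1,)
  MyDB.commit()
  cursor.close()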

+ 12
- 0
crawl/items.py

@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class CrawlItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass

+ 103
- 0
crawl/middlewares.py

@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class CrawlSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class CrawlDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)

+ 13
- 0
crawl/pipelines.py

@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class CrawlPipeline:
+    def process_item(self, item, spider):
+        return item

+ 10
- 0
crawl/run.py

@@ -0,0 +1,10 @@
+from scrapy.cmdline import execute
+import sys
+import os
+# directory of this script
+dirpath = os.path.dirname(os.path.abspath(__file__))
+print(dirpath)
+# add it to the module search path
+sys.path.append(dirpath)
+# start the crawler; the third argument is the spider name
+execute(['scrapy','crawl','football'])
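
Note (reviewer sketch, not part of this commit): execute() runs the scrapy CLI entry point in-process with argv ['scrapy', 'crawl', 'football'], i.e. the same effect as running scrapy crawl football from the project root. An equivalent programmatic start, shown only as a sketch, would use Scrapy's CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from crawl.spiders.football_spider import FootballSpider

process = CrawlerProcess(get_project_settings())  # loads the project settings (crawl/settings.py)
process.crawl(FootballSpider)
process.start()                                   # blocks until the crawl finishes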

+ 96
- 0
crawl/settings.py

@@ -0,0 +1,96 @@
+# Scrapy settings for crawl project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'crawl'
+
+SPIDER_MODULES = ['crawl.spiders']
+NEWSPIDER_MODULE = 'crawl.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'crawl (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'crawl.middlewares.CrawlSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'crawl.middlewares.CrawlDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+EXTENSIONS = {
+    'crawl.comm.mydb.MyDB': 100,
+}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'crawl.pipelines.CrawlPipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+DATABASE = {
+  'host': 'rm-uf6z3z6jq11x653d77o.mysql.rds.aliyuncs.com',
+  'port': 3306,
+  'name': 'niucai',
+  'user': 'niucai',
+  'password': '1qaz#EDC'
+}

+ 4
- 0
crawl/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

Binary
crawl/spiders/__pycache__/__init__.cpython-38.pyc


Binary
crawl/spiders/__pycache__/football_spider.cpython-38.pyc


+ 159
- 0
crawl/spiders/football_spider.py

@@ -0,0 +1,159 @@
+import scrapy
+from crawl.comm.football import *
+
+class FootballSpider(scrapy.Spider):
+  name = "football"
+
+  def start_requests(self):
+    # matches currently open for betting
+    url = "https://www.lottery.gov.cn/football/match_list.jspx"
+    yield scrapy.Request(url, self.parseMatch)
+
+    # odds
+    url = "https://www.lottery.gov.cn/football/counter.jspx"
+    yield scrapy.Request(url, self.parseCurrent)
+
+    # draw results
+
+
+  def parseMatch(self, response):
+    cssMain = ".xxsj table table tr"
+    cssDetail = "td"
+
+    # fetch all matches
+    matches = response.css(cssMain)
+    for node in matches[1:]:  # skip the header row
+      prop = node.css(cssDetail)
+
+      # rows with fewer than 2 td cells are invalid
+      if len(prop) < 2:
+        continue
+
+      matchWeek = prop[0].css('::text').get()
+      league = prop[1].css('::text').get()
+      homeTeam = prop[2].css('.zhu::text').get()
+      awayTeam = prop[2].css('.ke::text').get()
+      matchTime = prop[3].css('::text').get()
+      wdl = self.parsePassWay(prop[6].css('img'))
+      wdls = self.parsePassWay(prop[7].css('img'))
+      score = self.parsePassWay(prop[8].css('img'))
+      points = self.parsePassWay(prop[9].css('img'))
+      double = self.parsePassWay(prop[10].css('img'))
+
+      FTMatch(
+        matchWeek,
+        matchTime,
+        league,
+        homeTeam,
+        awayTeam,
+        wdl,
+        wdls,
+        score,
+        points,
+        double
+      ).persist()
+
+  def parsePassWay(self, img):
+    # not on sale yet: css('img') yields an empty selection, not None
+    if not img:
+      return 'wait'
+
+    # image URL
+    src = img.attrib['src']
+
+    # single and parlay betting both open
+    if "ball11.png" in src:
+      return 'pass&single'
+
+    # only parlay betting open
+    elif "ball1.png" in src:
+      return 'pass'
+
+    # this play type is not on sale
+    else:
+      return 'no'
+
+
+  def parseCurrent(self, response):
+    cssMain = "#content .article .articleCon .section"
+    cssDetail = ".saishi"
+    cssOther = ".saishiCon"
+
+    # fetch all matches
+    matches = response.css(cssMain)
+    for node in matches:
+
+      # match ID
+      matchId = node.attrib['match_id']
+      matchTime = node.attrib['match_time']
+
+      # other related fields
+      details = node.css(cssDetail).css("td")
+      matchWeek = details[0].css('::text').get()
+      league = details[1].css('::text').get()
+      homeTeam = details[3].css('::text').get()
+      awayTeam = details[4].css('::text').get()
+
+      match = FTPrice(
+        matchId,
+        matchWeek,
+        matchTime,
+        league,
+        homeTeam,
+        awayTeam,
+        WDLOdds(
+          details[5].css('::text').get(),
+          details[6].css('::text').get(),
+          details[7].css('::text').get()
+        ),
+        WDLSpreadOdds(
+          details[9].css('::text').get(),
+          details[10].css('::text').get(),
+          details[11].css('::text').get(),
+          details[8].css('::text').get()
+        )
+      )
+
+      # correct score, total goals and half/full time sit in separate tables
+      otherOdds = node.css(cssOther)
+      for index, otherNode in enumerate(otherOdds):
+
+        # odds for correct score
+        if index == 0:
+          scoreResult = ScoreResult()
+          datas = []
+          lst = otherNode.css('td')
+
+          # the three tr rows of the table
+          for it in lst[1:14] + lst[15:20] + lst[21:34]:
+            datas += [it.css('strong::text').get()]
+
+          scoreResult.datas(datas)
+          match.scoreResult = scoreResult
+
+        # odds for total goals
+        elif index == 1:
+          pointsResult = PointsResult()
+          datas = []
+          lst = otherNode.css('td')
+
+          for it in lst[2:10]:
+            datas += [it.css('strong::text').get()]
+
+          pointsResult.datas(datas)
+          match.pointsResult = pointsResult
+
+        # odds for half-time/full-time
+        else:
+          doubleResult = DoubleResult()
+          datas = []
+          lst = otherNode.css('td')
+
+          for it in lst[2:11]:
+            datas += [it.css('strong::text').get()]
+
+          doubleResult.datas(datas)
+          match.doubleResult = doubleResult
+
+      # save to database
+      match.persist()
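
Note (reviewer sketch, not part of this commit): what parsePassWay returns for the two ball icons it checks. The HTML snippets are hypothetical markup, not taken from lottery.gov.cn.

from scrapy import Selector
from crawl.spiders.football_spider import FootballSpider

spider = FootballSpider()
td = Selector(text='<td><img src="/images/ball11.png"></td>')   # hypothetical markup
print(spider.parsePassWay(td.css('img')))        # -> 'pass&single'

empty = Selector(text='<td></td>')               # no icon rendered yet
print(spider.parsePassWay(empty.css('img')))     # -> 'wait' (empty selection)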

+ 11
- 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = crawl.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = crawl