📦 EqualifyEverything / crawler

📄 config.py · 59 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Identification metadata for the crawler bot: name, version, project URL and
# a short human-readable description.
# NOTE(review): presumably used to build the User-Agent / bot identity sent
# with requests — confirm against the spider code that reads it.
BOT_ID = {
    'name': 'A11yCrawler',
    'version': '2.1',
    'url': 'https://github.com/EqualifyApp/crawler',
    'description': 'The A11yCrawler is a spider system to collect urls for further testing. Contact us if I am a bad bot.',
}


# Sitemap frequency setting.
# NOTE(review): presumably the interval, in seconds, between sitemap checks
# (3600 s = 1 h) — confirm against the code that reads this value.
SITEMAP_FREQUENCY = 3_600
# Default Spider Settings
# You can override these in the spider file if desired. If no value is set in the spider file, the defaults below are used.

# Baseline Scrapy settings used when a spider does not set its own value.
# Keys are Scrapy setting names; values are the project-wide defaults.
DEFAULT_SPIDEY_SENSES = {
    "COOKIES_ENABLED": False,              # Disable cookies
    "CONCURRENT_ITEMS": 50,                # Number of concurrent items (per response) to process
    "CONCURRENT_REQUESTS": 16,             # Maximum concurrent requests
    # "DEPTH_LIMIT": 3,                    # Max depth that will be crawled. 0 for no limit
    "DNSCACHE_ENABLED": True,              # Enable DNS in-memory cache
    "DNS_TIMEOUT": 60,                     # Timeout for processing DNS queries
    "HTTPCACHE_ENABLED": False,            # Enable or disable caching
    "CONCURRENT_REQUESTS_PER_DOMAIN": 16,  # Maximum concurrent requests per domain
    "ROBOTSTXT_OBEY": True,                # Obey robots.txt rules
    "AUTOTHROTTLE_ENABLED": True,          # Enable AutoThrottle extension
    "AUTOTHROTTLE_START_DELAY": 5,         # Initial delay before AutoThrottle starts adjusting the delay
    "AUTOTHROTTLE_TARGET_CONCURRENCY": 2,  # Target concurrency for AutoThrottle
    # Logging Settings
    "AUTOTHROTTLE_DEBUG": False,           # Debug logs on Autothrottle
    "LOG_LEVEL": "INFO",                   # Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
    # "LOG_FILE": "logs/crawl.log",        # Where to save logs
    "LOG_ENABLED": True,                   # Enable logging
    # Request fingerprinting
    # Scrapy accepts only a fingerprinting algorithm version here ("2.6" or
    # "2.7"); any other value (such as a dotted class path) makes the default
    # request fingerprinter raise ValueError. A custom fingerprinter class
    # would go in REQUEST_FINGERPRINTER_CLASS instead.
    "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
}




# Definition of Spider Settings:

# COOKIES_ENABLED: Whether to enable cookies (if enabled, stores cookies between requests)
# CONCURRENT_ITEMS: Number of concurrent items (per response) to process
# CONCURRENT_REQUESTS: Maximum concurrent requests
# DEPTH_LIMIT: Max depth that will be crawled. 0 for no limit
# DNSCACHE_ENABLED: Enable DNS in-memory cache
# DNS_TIMEOUT: Timeout for processing DNS queries
# HTTPCACHE_ENABLED: Enable or disable caching
# CONCURRENT_REQUESTS_PER_DOMAIN: Maximum concurrent requests per domain
# ROBOTSTXT_OBEY: Obey robots.txt rules
# AUTOTHROTTLE_ENABLED: Enable AutoThrottle extension
# AUTOTHROTTLE_START_DELAY: Initial delay before AutoThrottle starts adjusting the delay
# AUTOTHROTTLE_TARGET_CONCURRENCY: Target concurrency for AutoThrottle
# AUTOTHROTTLE_DEBUG: Enable AutoThrottle debug logging
# LOG_LEVEL: Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
# LOG_FILE: Where to save logs
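# LOG_ENABLED: Enable logging
# REQUEST_FINGERPRINTER_IMPLEMENTATION: Request fingerprinting algorithm used by Scrapy's default request fingerprinter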