import logging
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

from .connection import get_redis_from_settings


DEFAULT_DUPEFILTER_KEY = "dupefilter:%(timestamp)s"

logger = logging.getLogger(__name__)

# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with Scrapy's default scheduler
    (see the example settings sketch at the end of this module).

    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.

        """
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        By default this uses the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.

        """
        server = get_redis_from_settings(settings)
        # XXX: This creates a one-time key, needed to support using this class
        # as a standalone dupefilter with Scrapy's default scheduler. If Scrapy
        # passed the spider on the open() method, this wouldn't be needed.
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = DEFAULT_DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool

        """
        fp = self.request_fingerprint(request)
        # This returns the number of values added, zero if already exists.
        added = self.server.sadd(self.key, fp)
        return added == 0
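
    # Sketch (assumes a reachable Redis and the default key naming above):
    # fingerprints live in a plain Redis set, so they can be inspected
    # out-of-band with any redis client; the key below is an example value.
    #
    #   import redis
    #   r = redis.StrictRedis()
    #   r.scard("dupefilter:1699999999")     # number of distinct fingerprints
    #   r.smembers("dupefilter:1699999999")  # the raw fingerprint strings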

    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        return request_fingerprint(request)
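
    # A common extension point (sketch, not part of this module): subclass and
    # override request_fingerprint() to change what counts as a duplicate. The
    # normalize_request() helper below is hypothetical:
    #
    #   class NormalizingDupeFilter(RFPDupeFilter):
    #       def request_fingerprint(self, request):
    #           return request_fingerprint(normalize_request(request))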

    def close(self, reason=''):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional

        """
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)

    def log(self, request, spider):
        """Logs given request.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider

        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
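

# Example settings sketch: using this class as a standalone dupefilter with
# Scrapy's default scheduler. DUPEFILTER_CLASS and DUPEFILTER_DEBUG are
# standard Scrapy settings; REDIS_URL is read by get_redis_from_settings().
# The URL value is an example:
#
#   DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#   DUPEFILTER_DEBUG = True
#   REDIS_URL = "redis://localhost:6379"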