Nutch教程——URLNormalizer源码详解

URL正规化(URL Normalization)对大多数网络爬虫来说是一个非常重要的流程。大部分爬虫的去重机制都是基于URL的去重,而在实际中,不同的URL有可能对应同一个网页,例如下面的几个URL: http://nutcher.org/ 、 http://nutcher.org/abc/../ 、 http://nutcher.org/./

这些URL其实都指向 http://nutcher.org/ 。在文件路径中, ../ 表示上级目录, ./ 表示当前目录,所以 /abc/../ 和 /./ 都等价于 / 。如果爬虫没有URL正规化机制,爬虫会认为 http://nutcher.org/ 、 http://nutcher.org/abc/../ 和 http://nutcher.org/./ 是三个不同的页面,而这三个URL实际都指向 http://nutcher.org/ ,因此会将 http://nutcher.org/ 爬取三次。

可以看出,URL正规化的主要目的就是防止网页的重复采集。这里我们介绍Nutch的URL正规化组件之一 urlnormalizer-basic,该组件在Nutch中对应的类是BasicURLNormalizer(源文件为BasicURLNormalizer.java)。下面我们用源码注释的方式描述urlnormalizer-basic的工作机制:

  1. package org.apache.nutch.net.urlnormalizer.basic;
  2. import java.net.URL;
  3. import java.net.MalformedURLException;
  4. // Slf4j Logging imports
  5. import org.slf4j.Logger;
  6. import org.slf4j.LoggerFactory;
  7. // Nutch imports
  8. import org.apache.nutch.net.URLNormalizer;
  9. import org.apache.hadoop.conf.Configuration;
  10. import org.apache.hadoop.conf.Configured;
  11. import org.apache.oro.text.regex.*;
  12. /**
  13. * Converts URLs to a normal form:
  14. * <ul>
  15. * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
  16. * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
  17. * </ul>
  18. * 将URL转换为正规的形式
  19. * 移除URL中类似/./和/../的部分
  20. * 移除默认端口,例如对于http协议,移除URL中指定的80端口
  21. */
  22. public class BasicURLNormalizer extends Configured implements URLNormalizer {
  23. public static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class);
  24. /*这里使用Perl规范的正则,借助了org.apache.oro.text.regex.Perl5Compiler类*/
  25. private Perl5Compiler compiler = new Perl5Compiler();
  26. private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() {
  27. protected Perl5Matcher initialValue() {
  28. return new Perl5Matcher();
  29. }
  30. };
  31. private final Rule relativePathRule;
  32. private final Rule leadingRelativePathRule;
  33. private final Rule currentPathRule;
  34. private final Rule adjacentSlashRule;
  35. private final static java.util.regex.Pattern hasNormalizablePattern = java.util.regex.Pattern.compile("/\\.?\\.?/");
  36. private Configuration conf;
  37. public BasicURLNormalizer() {
  38. try {
  39. // this pattern tries to find spots like "/xx/../" in the url, which
  40. // could be replaced by "/" xx consists of chars, different then "/"
  41. // (slash) and needs to have at least one char different from "."
  42. // 这里希望从URL中找到匹配/xx/../的部分,替换为/
  43. // xx中的每个字符不能是'/',且至少出现一个不为'.'的字符
  44. relativePathRule = new Rule();
  45. relativePathRule.pattern = (Perl5Pattern)
  46. compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)",
  47. Perl5Compiler.READ_ONLY_MASK);
  48. relativePathRule.substitution = new Perl5Substitution("/");
  49. // this pattern tries to find spots like leading "/../" in the url,
  50. // which could be replaced by "/"
  51. // 如果URL中的路径,以/../起始,将其替换为/
  52. leadingRelativePathRule = new Rule();
  53. leadingRelativePathRule.pattern = (Perl5Pattern)
  54. compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
  55. leadingRelativePathRule.substitution = new Perl5Substitution("/");
  56. // this pattern tries to find spots like "/./" in the url,
  57. // which could be replaced by "/"
  58. // 这里希望从URL中找到匹配/./的部分,替换为/
  59. currentPathRule = new Rule();
  60. currentPathRule.pattern = (Perl5Pattern)
  61. compiler.compile("(/\\./)", Perl5Compiler.READ_ONLY_MASK);
  62. currentPathRule.substitution = new Perl5Substitution("/");
  63. // this pattern tries to find spots like "xx//yy" in the url,
  64. // which could be replaced by a "/"
  65. // 这里希望从URL中找到匹配//的部分,替换为/
  66. adjacentSlashRule = new Rule();
  67. adjacentSlashRule.pattern = (Perl5Pattern)
  68. compiler.compile("/{2,}", Perl5Compiler.READ_ONLY_MASK);
  69. adjacentSlashRule.substitution = new Perl5Substitution("/");
  70. } catch (MalformedPatternException e) {
  71. throw new RuntimeException(e);
  72. }
  73. }
  74. public String normalize(String urlString, String scope)
  75. throws MalformedURLException {
  76. if ("".equals(urlString)) // permit empty
  77. return urlString;
  78. urlString = urlString.trim(); // remove extra spaces
  79. URL url = new URL(urlString);
  80. String protocol = url.getProtocol();
  81. String host = url.getHost();
  82. int port = url.getPort();
  83. String file = url.getFile();
  84. boolean changed = false;
  85. if (!urlString.startsWith(protocol)) // protocol was lowercased
  86. changed = true;
  87. if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) {
  88. if (host != null) {
  89. String newHost = host.toLowerCase(); // lowercase host
  90. if (!host.equals(newHost)) {
  91. host = newHost;
  92. changed = true;
  93. }
  94. }
  95. if (port == url.getDefaultPort()) { // uses default port 如果URL中出现端口信息,检查是否为URL对应协议默认端口
  96. port = -1; // so don't specify it
  97. changed = true;
  98. }
  99. if (file == null || "".equals(file)) { // add a slash
  100. file = "/";
  101. changed = true;
  102. }
  103. if (url.getRef() != null) { // remove the ref
  104. changed = true;
  105. }
  106. // check for unnecessary use of "/../"
  107. String file2 = substituteUnnecessaryRelativePaths(file);
  108. if (!file.equals(file2)) {
  109. changed = true;
  110. file = file2;
  111. }
  112. }
  113. if (changed)
  114. urlString = new URL(protocol, host, port, file).toString();
  115. return urlString;
  116. }
  117. private String substituteUnnecessaryRelativePaths(String file) {
  118. if (!hasNormalizablePattern.matcher(file).find())
  119. return file;
  120. String fileWorkCopy = file;
  121. int oldLen = file.length();
  122. int newLen = oldLen - 1;
  123. // All substitutions will be done step by step, to ensure that certain
  124. // constellations will be normalized, too
  125. //
  126. // URL正规化会一步一步进行,下面给出一个例子
  127. //
  128. // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
  129. // following manner:
  130. // "/aa/bb/../../cc/../foo.html"
  131. // "/aa/../cc/../foo.html"
  132. // "/cc/../foo.html"
  133. // "/foo.html"
  134. //
  135. // The normalization also takes care of leading "/../", which will be
  136. // replaced by "/", because this is a rather a sign of bad webserver
  137. // configuration than of a wanted link. For example, urls like
  138. // "http://www.foo.com/../" should return a http 404 error instead of
  139. // redirecting to "http://www.foo.com".
  140. //
  141. Perl5Matcher matcher = (Perl5Matcher)matchers.get();
  142. while (oldLen != newLen) {
  143. // substitue first occurence of "/xx/../" by "/"
  144. oldLen = fileWorkCopy.length();
  145. fileWorkCopy = Util.substitute
  146. (matcher, relativePathRule.pattern,
  147. relativePathRule.substitution, fileWorkCopy, 1);
  148. // remove leading "/../"
  149. fileWorkCopy = Util.substitute
  150. (matcher, leadingRelativePathRule.pattern,
  151. leadingRelativePathRule.substitution, fileWorkCopy, 1);
  152. // remove unnecessary "/./"
  153. fileWorkCopy = Util.substitute
  154. (matcher, currentPathRule.pattern,
  155. currentPathRule.substitution, fileWorkCopy, 1);
  156. // collapse adjacent slashes with "/"
  157. fileWorkCopy = Util.substitute
  158. (matcher, adjacentSlashRule.pattern,
  159. adjacentSlashRule.substitution, fileWorkCopy, 1);
  160. newLen = fileWorkCopy.length();
  161. }
  162. return fileWorkCopy;
  163. }
  164. /**
  165. * Class which holds a compiled pattern and its corresponding substition
  166. * string.
  167. */
  168. private static class Rule {
  169. public Perl5Pattern pattern;
  170. public Perl5Substitution substitution;
  171. }
  172. public void setConf(Configuration conf) {
  173. this.conf = conf;
  174. }
  175. public Configuration getConf() {
  176. return this.conf;
  177. }
  178. }