一、官方DataImport方式

相关配置文件

  • data-config.xml ```xml <?xml version="1.0" encoding="UTF-8" ?>


  1. - managed-schema
  2. ```xml
  3. <!-- IK分词 -->
  4. <fieldType name="text_ik" class="solr.TextField">
  5. <analyzer type="index">
  6. <tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" conf="ik.conf" useSmart="false"/>
  7. <filter class="solr.LowerCaseFilterFactory"/>
  8. </analyzer>
  9. <analyzer type="query">
  10. <tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" conf="ik.conf" useSmart="true"/>
  11. <filter class="solr.LowerCaseFilterFactory"/>
  12. </analyzer>
  13. </fieldType>
  14. <field name="content" type="text_ik" indexed="true" stored="true" omitNorms="true" multiValued="false"/>
  15. <field name="author" type="text_ik" indexed="true" stored="true"/>
  16. <field name="title" type="text_ik" indexed="true" stored="true"/>
  17. <field name="fileName" type="string" indexed="true" stored="true"/>
  18. <field name="filePath" type="string" indexed="true" stored="true" multiValued="false"/>
  19. <field name="size" type="plong" indexed="true" stored="true"/>
  20. <field name="lastModified" type="pdate" indexed="true" stored="true"/>
  21. <!-- 索引复制,联合索引 -->
  22. <field name="keyword" type="text_ik" indexed="true" stored="true" omitNorms="true" multiValued="true"/>
  23. <copyField source="author" dest="keyword" maxChars="30000"/>
  24. <copyField source="title" dest="keyword" maxChars="30000"/>
  25. <copyField source="content" dest="keyword" maxChars="30000"/>
  26. <copyField source="fileName" dest="keyword" maxChars="30000"/>
  • solrconfig.xml
    1. <!-- DataImport -->
    2. <requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
    3. <lst name="defaults">
    4. <str name="config">data-config.xml</str>
    5. </lst>
    6. </requestHandler>

coreDataImport 中选择配置的 entity ,点击执行:
image.png
导入成功后在 coreQuery 中验证是否导入:
image.png

二、使用SolrJ方式索引文档

引入配置文件

  1. <dependency>
  2. <groupId>org.springframework.boot</groupId>
  3. <artifactId>spring-boot-starter-data-solr</artifactId>
  4. <version>2.4.13</version>
  5. </dependency>
  6. <dependency>
  7. <groupId>cn.hutool</groupId>
  8. <artifactId>hutool-all</artifactId>
  9. <version>5.7.9</version>
  10. </dependency>

编写接口

  1. package com.clown.solr_test.controller;
  2. import cn.hutool.core.io.FileUtil;
  3. import lombok.AllArgsConstructor;
  4. import org.apache.solr.client.solrj.SolrClient;
  5. import org.apache.solr.client.solrj.SolrQuery;
  6. import org.apache.solr.client.solrj.SolrServerException;
  7. import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
  8. import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
  9. import org.apache.solr.client.solrj.response.QueryResponse;
  10. import org.apache.solr.common.SolrDocumentList;
  11. import org.springframework.web.bind.annotation.PostMapping;
  12. import org.springframework.web.bind.annotation.RequestParam;
  13. import org.springframework.web.bind.annotation.RestController;
  14. import org.springframework.web.multipart.MultipartFile;
  15. import java.io.File;
  16. import java.io.IOException;
  17. import java.time.LocalDateTime;
  18. import java.time.format.DateTimeFormatter;
  19. import java.util.HashMap;
  20. import java.util.Map;
  21. import java.util.Objects;
  22. import java.util.Optional;
  23. import java.util.logging.Handler;
  24. /**
  25. * @author xq
  26. * @date 2022年02月11日 10:36
  27. */
  28. @RestController
  29. @AllArgsConstructor
  30. public class SolrTestController {
  31. private final SolrClient client;
  32. /**
  33. * 查询
  34. *
  35. * @return
  36. * @throws SolrServerException
  37. * @throws IOException
  38. */
  39. @PostMapping("/query")
  40. public Object get(@RequestParam String content) throws SolrServerException, IOException {
  41. SolrQuery query = new SolrQuery();
  42. query.add("q", content);
  43. // 高亮
  44. query.setParam("hl", "true");
  45. query.setParam("hl.fl", "content");
  46. query.setParam("hl.simple.pre", "<font color=\"red\">");
  47. query.setParam("hl.simple.post", "</font>");
  48. // 以下方法也可设置高亮
  49. // query.setHighlight(true);
  50. // query.addHighlightField("content");
  51. // query.setHighlightSimplePre("<font color=\"red\">");
  52. // query.setHighlightSimplePost("</font>");
  53. QueryResponse response = client.query(query);
  54. Map<String, Object> map = new HashMap<>(1);
  55. map.put("response", response.getResults());
  56. map.put("highlighting", Optional.ofNullable(response.getHighlighting()).orElseGet(HashMap::new));
  57. return map;
  58. }
  59. /**
  60. * 索引文档
  61. *
  62. * @param multipartFile
  63. * @throws IOException
  64. * @throws SolrServerException
  65. */
  66. @PostMapping("/upload")
  67. public void put(@RequestParam MultipartFile multipartFile) throws IOException, SolrServerException {
  68. ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
  69. File file = new File(Objects.requireNonNull(multipartFile.getOriginalFilename()));
  70. FileUtil.writeFromStream(multipartFile.getInputStream(), file);
  71. String contentType = getFileContentType(file.getName());
  72. up.addFile(file, contentType);
  73. up.setParam("literal.id", file.getAbsolutePath());
  74. //up.setParam("uprefix", "ignored_");
  75. up.setParam("literal.filePath", file.getAbsolutePath());
  76. //up.setParam("literal.author", "clown");
  77. up.setParam("literal.title", file.getName());
  78. up.setParam("literal.fileName", file.getName());
  79. up.setParam("literal.size", String.valueOf(file.getTotalSpace()));
  80. up.setParam("literal.lastModified", LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")));
  81. up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
  82. client.request(up);
  83. }
  84. /**
  85. * 根据文件名获取文件的ContentType类型
  86. *
  87. * @param filename
  88. * @return
  89. */
  90. public static String getFileContentType(String filename) {
  91. String contentType = "";
  92. String prefix = filename.substring(filename.lastIndexOf(".") + 1);
  93. switch (prefix) {
  94. case "xlsx":
  95. contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
  96. break;
  97. case "pdf":
  98. contentType = "application/pdf";
  99. break;
  100. case "doc":
  101. contentType = "application/msword";
  102. break;
  103. case "txt":
  104. contentType = "text/plain";
  105. break;
  106. case "xls":
  107. contentType = "application/vnd.ms-excel";
  108. break;
  109. case "docx":
  110. contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
  111. break;
  112. case "ppt":
  113. contentType = "application/vnd.ms-powerpoint";
  114. break;
  115. case "pptx":
  116. contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
  117. break;
  118. default:
  119. contentType = "othertype";
  120. break;
  121. }
  122. return contentType;
  123. }
  124. }

通过以上方法可以看出:向 Solr 上传文件时使用 ContentStreamUpdateRequest 封装请求对象,再调用
client.request(up) 发起 POST 请求。
literal.<字段名> 参数用于为 schema(managed-schema)中已定义的字段直接赋值,例如
literal.id 的值会写入文档的唯一键 id 字段;这些字段都需要事先在 schema 中配置好。

官网请求参数详细介绍如下:

  1. Input Parameters
  2. * fmap.<source_field>=<target_field> - Maps (moves) one field name to another. Example: fmap.content=text will cause the content field normally generated by Tika to be moved to the "text" field.
  3. * boost.<fieldname>=<float> - Boost the specified field.
  4. * literal.<fieldname>=<value> - Create a field with the specified value. May be multivalued if the Field is multivalued.
  5. * uprefix=<prefix> - Prefix all fields that are not defined in the schema with the given prefix. This is very useful when combined with dynamic field definitions. Example: uprefix=ignored_ would effectively ignore all unknown fields generated by Tika given the example schema contains <dynamicField name="ignored_*" type="ignored"/>
  6. * defaultField=<Field Name> - If uprefix is not specified and a Field cannot be determined, the default field will be used.
  7. * extractOnly=true|false - Default is false. If true, return the extracted content from Tika without indexing the document. This literally includes the extracted XHTML as a string in the response. When viewing manually, it may be useful to use a response format other than XML to aid in viewing the embedded XHTML tags. See TikaExtractOnlyExampleOutput.
  8. * resource.name=<File Name> - The optional name of the file. Tika can use it as a hint for detecting mime type.
  9. * capture=<Tika XHTML NAME> - Capture XHTML elements with the name separately for adding to the Solr document. This can be useful for grabbing chunks of the XHTML into a separate field. For instance, it could be used to grab paragraphs (<p>) and index them into a separate field. Note that content is also still captured into the overall "content" field.
  10. * captureAttr=true|false - Index attributes of the Tika XHTML elements into separate fields, named after the element. For example, when extracting from HTML, Tika can return the href attributes in <a> tags as fields named "a". See the examples below.
  11. * xpath=<XPath expression> - When extracting, only return Tika XHTML content that satisfies the XPath expression. See http://tika.apache.org/1.2/parser.html for details on the format of Tika XHTML. See also TikaExtractOnlyExampleOutput.
  12. * lowernames=true|false - Map all field names to lowercase with underscores. For example, Content-Type would be mapped to content_type.
  13. * literalsOverride=true|false - Solr4.0 When true, literal field values will override other values with same field name, such as metadata and content. If false, then literal field values will be appended to any extracted data from Tika, and the resulting field needs to be multi valued. Default: true
  14. * resource.password=<password> - Solr4.0 The optional password for a password protected PDF or OOXML file. File format support depends on Tika.
  15. * passwordsFile=<file name> - Solr4.0 The optional name of a file containing file name pattern to password mappings. See chapter "Encrypted Files" below
  16. If extractOnly is true, additional input parameters:
  17. * extractFormat=xml|text - Default is xml. Controls the serialization format of the extract content. xml format is actually XHTML, like passing the -x command to the tika command line application, while text is like the -t command.
  18. ### Order of field operations
  19. 1. fields are generated by Tika or passed in as literals via literal.fieldname=value. Before Solr4.0 or if literalsOverride=false, then literals will be appended as multi-value to tika generated field.
  20. 2. if lowernames==true, fields are mapped to lower case
  21. 3. mapping rules fmap.source=target are applied
  22. 4. if uprefix is specified, any unknown field names are prefixed with that value, else if defaultField is specified, unknown fields are copied to that.

译文:

  1. 输入参数
  2. * fmap.<source_field>=<target_field> - 将一个字段名映射(移动)到另一个。示例: fmap.content=text 将导致通常由 Tika 生成的内容字段移动到“文本”字段。
  3. * boost.<fieldname>=<float> - 提升指定的字段。
  4. * literal.<fieldname>=<value> - 创建具有指定值的字段。如果字段是多值的,则可能是多值的。
  5. * uprefix=<prefix> - 使用给定前缀为架构中未定义的所有字段添加前缀。这在与动态字段定义结合使用时非常有用。示例:uprefix=ignored_ 将有效地忽略 Tika 生成的所有未知字段,因为示例模式包含 <dynamicField name="ignored_*" type="ignored"/>
  6. * defaultField=<Field Name> - 如果未指定uprefix 并且无法确定字段,则将使用默认字段。
  7. * extractOnly=true|false - 默认为 false。如果为 true,则返回从 Tika 提取的内容而不为文档编制索引。这实际上将提取的 XHTML 作为字符串包含在响应中。手动查看时,使用 XML 以外的响应格式来帮助查看嵌入的 XHTML 标记可能很有用。请参阅 TikaExtractOnlyExampleOutput
  8. * resource.name=<文件名> - 文件的可选名称。 Tika 可以将其用作检测 mime 类型的提示。
  9. * capture=<Tika XHTML NAME> - 分别捕获具有名称的 XHTML 元素以添加到 Solr 文档。这对于将 XHTML 块抓取到单独的字段中很有用。例如,它可以用来抓取段落 (<p>) 并将它们索引到一个单独的字段中。请注意,内容仍会被捕获到整个“内容”字段中。
  10. * captureAttr=true|false - Tika XHTML 元素的属性索引到单独的字段中,以元素命名。例如,从 HTML 中提取时,Tika 可以将 <a> 标记中的 href 属性作为名为“a”的字段返回。请参阅下面的示例。
  11. * xpath=<XPath 表达式> - 提取时,仅返回满足 XPath 表达式的 Tika XHTML 内容。有关 Tika XHTML 格式的详细信息,请参见 http://tika.apache.org/1.2/parser.html。另请参见 TikaExtractOnlyExampleOutput。
  12. * lowernames=true|false - 将所有字段名称映射为带下划线的小写。例如,Content-Type 将映射到 content_type
  13. * literalsOverride=true|false - Solr4.0 当为真时,文字字段值将覆盖具有相同字段名称的其他值,例如元数据和内容。如果为 false,则文字字段值将附加到从 Tika 提取的任何数据中,并且结果字段需要是多值的。默认值:真
  14. * resource.password=<password> - Solr4.0 受密码保护的 PDF OOXML 文件的可选密码。文件格式支持取决于 Tika
  15. * passwordsFile=<文件名> - Solr4.0 包含文件名模式到密码映射的文件的可选名称。请参阅下面的“加密文件”一章
  16. 如果 extractOnly 为真,则附加输入参数:
  17. * extractFormat=xml|text - 默认为 xml。控制提取内容的序列化格式。 xml 格式实际上是 XHTML,就像将 -x 命令传递给 tika 命令行应用程序一样,而 text 就像 -t 命令一样。
  18. ### 现场操作顺序
  19. 1. 字段由 Tika 生成或通过 literal.fieldname=value 作为文字传入。在 Solr4.0 之前或如果 literalsOverride=false,则文字将作为多值附加到 tika 生成的字段。
  20. 2.如果lowernames==true,则字段映射为小写
  21. 3. 应用映射规则 fmap.source=target
  22. 4. 如果指定了uprefix,任何未知的字段名称都以该值作为前缀,否则如果指定了defaultField,未知字段将被复制到那个值。

代码中存在以上实现还不够,
此请求到/update/extract 这个请求处理器在solrconfig.xml中必须有相应的配置

  1. <requestHandler name="/update/extract" startup="lazy" class="solr.extraction.ExtractingRequestHandler">
  2. <lst name="defaults">
  3. <str name="lowernames">true</str>
  4. <str name="uprefix">ignored_</str>
  5. <str name="fmap.content">content</str>
  6. </lst>
  7. </requestHandler>

并需把contrib/extraction/lib 及dist下的包全部复制到solr的 WEB-INF的lib下存放