Maven dependencies (pom.xml):

    1. <dependency>
    2. <groupId>org.apache.tika</groupId>
    3. <artifactId>tika-core</artifactId>
    4. <version>1.14</version>
    5. </dependency>
    6. <dependency>
    7. <groupId>org.apache.tika</groupId>
    8. <artifactId>tika-parsers</artifactId>
    9. <version>1.14</version>
    10. </dependency>
    1. package com.alibaba.middleware.hsf;
    2. import org.apache.tika.Tika;
    3. import org.apache.tika.exception.TikaException;
    4. import org.apache.tika.metadata.Metadata;
    5. import org.apache.tika.parser.AutoDetectParser;
    6. import org.apache.tika.sax.BodyContentHandler;
    7. import org.xml.sax.SAXException;
    8. import java.io.*;
    9. import java.net.ContentHandler;
    10. /**
    11. * Created by xiaoming.linxm on 2018/10/24.
    12. */
    13. public class TestTika {
    14. public static void main(String[] argv) throws IOException, TikaException {
    15. Tika tika=new Tika();
    16. //System.out.println(tika.parseToString(new URL("http://www.taobao.com")));
    17. // System.out.println(tika.parseToString(new File("TikaSample.class")));
    18. String[] tt=new String[]{"d:/test.doc"};
    19. for (String file : tt) {
    20. System.out.println(file);
    21. System.out.println(tika.detect(new File(file)));
    22. String text = tika.parseToString(new File(file));
    23. System.out.print(text);
    24. }
    25. }
    26. }