引言

客户需求：需要从 docx 文档中读取内容并做简单的格式化。难点在于如何读取 docx 格式，并将其转换为 PHP 可以处理的字符串。按惯例，先贴代码。

代码

  /**
   * Class Docx2Text
   *
   * Docx => String
   *
   * Reads the WordprocessingML parts stored inside a .docx archive and
   * flattens the document body into a plain-text string.
   *
   * Typical use:
   *
   *     $reader = new Docx2Text();
   *     $reader->setDocx('report.docx');
   *     $text = $reader->extract();
   *
   * Output format: every body-level block is written as its text followed by
   * a line break added by printWP(), plus one more line break added by
   * extract() when the "paragraph" transform is enabled. Tables (when the
   * "table" transform is enabled) are written row by row, with a tab after
   * each cell and a line break after each row.
   *
   * Not implemented: list bullets/numbers and chart data are not rendered in
   * the output. The "list" flag only controls whether word/numbering.xml is
   * parsed into $numberingList; the "chart" flag is stored but has no effect.
   */
  class Docx2Text
  {
      const SEPARATOR_TAB = "\t";

      /** Namespace URI of the WordprocessingML main schema (conventionally the "w:" prefix). */
      const WORDML_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

      /**
       * Parsed word/document.xml; created by extract().
       *
       * @var DOMDocument|null
       */
      private $domDocument;

      /**
       * XPath evaluator bound to $domDocument, with the "w" prefix registered.
       *
       * @var DOMXPath|null
       */
      private $xpath;

      /**
       * Raw XML of word/document.xml ('' until setDocx() has read it).
       *
       * @var string
       */
      private $_document = '';

      /**
       * Raw XML of word/numbering.xml ('' when the part is absent).
       *
       * @var string
       */
      private $_numbering = '';

      /**
       * Raw XML of word/footnotes.xml ('' when the part is absent).
       *
       * @var string
       */
      private $_footnote = '';

      /**
       * Raw XML of word/endnotes.xml ('' when the part is absent).
       *
       * @var string
       */
      private $_endnote = '';

      /**
       * Endnote text keyed by the note's w:id; null until first needed.
       *
       * @var array|null
       */
      private $endnotes;

      /**
       * Footnote text keyed by the note's w:id; null until first needed.
       *
       * @var array|null
       */
      private $footnotes;

      /**
       * Bullet/number sequences keyed by numId, then by list level (0-3).
       *
       * @var array
       */
      private $numberingList = array();

      /**
       * Whether listNumbering() has already run for the current document.
       *
       * @var bool
       */
      private $numberingLoaded = false;

      /**
       * The text content produced by the last extract() call.
       *
       * @var string
       */
      private $textOutput = '';

      /**
       * Stored "chart" transform flag (currently has no effect on the output).
       *
       * @var mixed
       */
      private $chart2text;

      /**
       * When truthy, body-level tables are rendered by table().
       *
       * @var mixed
       */
      private $table2text;

      /**
       * When truthy, word/numbering.xml is parsed into $numberingList.
       *
       * @var mixed
       */
      private $list2text;

      /**
       * When truthy, extract() appends a line break after each non-table block.
       *
       * @var mixed
       */
      private $paragraph2text;

      /**
       * When truthy, referenced footnotes are appended to their paragraph.
       *
       * @var mixed
       */
      private $footnote2text;

      /**
       * When truthy, referenced endnotes are appended to their paragraph.
       *
       * @var mixed
       */
      private $endnote2text;

      /**
       * Construct
       *
       * @param array $boolTransforms optional flags keyed by 'table', 'list',
       *                              'paragraph', 'footnote', 'endnote' and
       *                              'chart'; every flag that is missing (or
       *                              null) defaults to true
       * @access public
       */
      public function __construct($boolTransforms = array())
      {
          $this->table2text     = isset($boolTransforms['table']) ? $boolTransforms['table'] : true;
          $this->list2text      = isset($boolTransforms['list']) ? $boolTransforms['list'] : true;
          $this->paragraph2text = isset($boolTransforms['paragraph']) ? $boolTransforms['paragraph'] : true;
          $this->footnote2text  = isset($boolTransforms['footnote']) ? $boolTransforms['footnote'] : true;
          $this->endnote2text   = isset($boolTransforms['endnote']) ? $boolTransforms['endnote'] : true;
          $this->chart2text     = isset($boolTransforms['chart']) ? $boolTransforms['chart'] : true;
      }

      /**
       * Extract the content of the loaded word document and, if a file name is
       * given, also write the text to that file.
       *
       * @access public
       * @param string $filename optional path of a text file to write the result to
       *
       * @return string the extracted text
       *
       * @throws RuntimeException when no document has been loaded, when
       *                          word/document.xml cannot be parsed or has no
       *                          body, or when the output file cannot be written
       */
      public function extract($filename = '')
      {
          if (empty($this->_document)) {
              // xml content from document.xml is not available
              throw new RuntimeException('There is no content');
          }

          // Start from a clean buffer so that repeated calls do not accumulate text.
          $this->textOutput = '';

          $this->domDocument = $this->parseXml($this->_document);
          if ($this->domDocument === null) {
              throw new RuntimeException('word/document.xml is not well-formed XML');
          }

          // One evaluator per document; the prefix is registered explicitly so
          // queries do not depend on which prefixes the document itself declares.
          $this->xpath = new DOMXPath($this->domDocument);
          $this->xpath->registerNamespace('w', self::WORDML_NS);

          $body = $this->domDocument->getElementsByTagNameNS(self::WORDML_NS, 'body')->item(0);
          if ($body === null) {
              throw new RuntimeException('word/document.xml has no w:body element');
          }

          foreach ($body->childNodes as $child) {
              // Whitespace/comment nodes and the section properties carry no text.
              if ($child->nodeType !== XML_ELEMENT_NODE || $this->isWordElement($child, 'sectPr')) {
                  continue;
              }
              if ($this->table2text && $this->isWordElement($child, 'tbl')) {
                  // tables are split into rows and tab-separated cells
                  $this->textOutput .= $this->table($child) . $this->separator();
              } else {
                  // anything else is flattened like a paragraph
                  $this->textOutput .= $this->printWP($child) . ($this->paragraph2text ? $this->separator() : '');
              }
          }

          if (!empty($filename) && file_put_contents($filename, $this->textOutput) === false) {
              throw new RuntimeException('Unable to write extracted text to ' . $filename);
          }

          return $this->textOutput;
      }

      /**
       * Load a .docx file: read every XML part this class uses and close the archive.
       *
       * Any state cached from a previously loaded document is discarded.
       *
       * @access public
       * @param string $filename path of the .docx file
       *
       * @throws RuntimeException when the file cannot be opened as a ZIP archive
       */
      public function setDocx($filename)
      {
          $zip = new ZipArchive();
          $status = $zip->open($filename);
          if ($status !== true) {
              throw new RuntimeException('Failed to open docx archive: ' . $filename);
          }

          // Read everything up front so the archive handle can be released immediately.
          $this->_document  = $this->readPart($zip, 'word/document.xml');
          $this->_numbering = $this->readPart($zip, 'word/numbering.xml');
          $this->_footnote  = $this->readPart($zip, 'word/footnotes.xml');
          $this->_endnote   = $this->readPart($zip, 'word/endnotes.xml');
          $zip->close();

          $this->domDocument     = null;
          $this->xpath           = null;
          $this->endnotes        = null;
          $this->footnotes       = null;
          $this->numberingList   = array();
          $this->numberingLoaded = false;
          $this->textOutput      = '';
      }

      /**
       * Read one archive entry, mapping "entry not found" to an empty string.
       *
       * @access private
       * @param ZipArchive $zip  an opened archive
       * @param string     $name entry name inside the archive
       * @return string
       */
      private function readPart(ZipArchive $zip, $name)
      {
          $content = $zip->getFromName($name);

          return $content === false ? '' : $content;
      }

      /**
       * Parse an XML string without network access and without emitting warnings.
       *
       * @access private
       * @param string $xml non-empty XML source
       * @return DOMDocument|null the parsed document, or null when $xml is not well-formed
       */
      private function parseXml($xml)
      {
          $dom = new DOMDocument();
          $previous = libxml_use_internal_errors(true);
          $loaded = $dom->loadXML($xml, LIBXML_NONET);
          libxml_clear_errors();
          libxml_use_internal_errors($previous);

          return $loaded ? $dom : null;
      }

      /**
       * Tell whether a node is the WordprocessingML element with the given local name.
       *
       * @access private
       * @param DOMNode $node
       * @param string  $localName element name without prefix, e.g. 'tbl'
       * @return bool
       */
      private function isWordElement($node, $localName)
      {
          return $node->nodeType === XML_ELEMENT_NODE
              && $node->namespaceURI === self::WORDML_NS
              && $node->localName === $localName;
      }

      /**
       * Collect the text of every note element of one notes part.
       *
       * @access private
       * @param string $xml     raw XML of the part ('' when the part is absent)
       * @param string $tagName local name of the note element ('footnote' or 'endnote')
       * @return array note text keyed by the note's w:id (empty when the part
       *               is absent or not well-formed)
       */
      private function parseNotes($xml, $tagName)
      {
          $notes = array();
          if (empty($xml)) {
              return $notes;
          }
          $dom = $this->parseXml($xml);
          if ($dom === null) {
              return $notes;
          }
          foreach ($dom->getElementsByTagNameNS(self::WORDML_NS, $tagName) as $note) {
              // textContent yields decoded text; strip_tags() on serialized XML
              // would leave entities such as "&amp;" in the result.
              $notes[$note->getAttributeNS(self::WORDML_NS, 'id')] = trim($note->textContent);
          }

          return $notes;
      }

      /**
       * Fill $endnotes from endnotes.xml (done once per loaded document).
       *
       * @access private
       */
      private function loadEndNote()
      {
          if ($this->endnotes === null) {
              $this->endnotes = $this->parseNotes($this->_endnote, 'endnote');
          }
      }

      /**
       * Fill $footnotes from footnotes.xml (done once per loaded document).
       *
       * @access private
       */
      private function loadFootNote()
      {
          if ($this->footnotes === null) {
              $this->footnotes = $this->parseNotes($this->_footnote, 'footnote');
          }
      }

      /**
       * Extract the styles of the lists from numbering.xml into $numberingList.
       *
       * For every w:num, the format of its abstract definition is looked up:
       * 'decimal' yields the numbers 1..10 for each of four levels, anything
       * else yields fixed bullet characters per level.
       *
       * @access private
       */
      private function listNumbering()
      {
          $this->numberingLoaded = true;
          if (empty($this->_numbering)) {
              return;
          }
          $dom = $this->parseXml($this->_numbering);
          if ($dom === null) {
              return;
          }
          // there is only one numbering tag in numbering.xml
          $root = $dom->getElementsByTagNameNS(self::WORDML_NS, 'numbering')->item(0);
          if ($root === null) {
              return;
          }

          $formats = array();       // abstractNumId => first w:numFmt value found in its levels
          $numToAbstract = array(); // numId => abstractNumId
          foreach ($root->childNodes as $child) {
              if ($this->isWordElement($child, 'abstractNum')) {
                  $abstractId = $child->getAttributeNS(self::WORDML_NS, 'abstractNumId');
                  foreach ($child->childNodes as $level) {
                      if (!$this->isWordElement($level, 'lvl')) {
                          continue;
                      }
                      foreach ($level->childNodes as $property) {
                          if ($this->isWordElement($property, 'numFmt')) {
                              $formats[$abstractId] = $property->getAttributeNS(self::WORDML_NS, 'val');
                              break 2; // only the first format of the definition is used
                          }
                      }
                  }
              } elseif ($this->isWordElement($child, 'num')) {
                  foreach ($child->childNodes as $reference) {
                      if ($this->isWordElement($reference, 'abstractNumId')) {
                          // Keyed by numId: several lists may share one abstract definition.
                          $numId = $child->getAttributeNS(self::WORDML_NS, 'numId');
                          $numToAbstract[$numId] = $reference->getAttributeNS(self::WORDML_NS, 'val');
                      }
                  }
              }
          }

          // once the kind of each list is known, prepare the bullets it will use
          foreach ($numToAbstract as $numId => $abstractId) {
              if (isset($formats[$abstractId]) && $formats[$abstractId] === 'decimal') {
                  // decimal lists are numbered
                  for ($level = 0; $level < 4; $level++) {
                      $this->numberingList[$numId][$level] = range(1, 10);
                  }
              } else {
                  // every other format falls back to fixed bullet characters
                  $this->numberingList[$numId][0] = array_fill(0, 17, '*');
                  $this->numberingList[$numId][1] = array_fill(0, 12, chr(175));
                  $this->numberingList[$numId][2] = array_fill(0, 12, chr(237));
                  $this->numberingList[$numId][3] = array_fill(0, 11, chr(248));
              }
          }
      }

      /**
       * Extract the content of a w:p tag (or of any other block-level element).
       *
       * The result is the element's text, then "[note text] " for every
       * footnote/endnote referenced by a run that is a direct child of the
       * element (when the matching transform is enabled), then one line break.
       *
       * Must only be called while extract() is running, because it relies on
       * the XPath evaluator that extract() creates.
       *
       * @access private
       * @param DOMNode $node element belonging to $domDocument
       * @return string
       */
      private function printWP($node)
      {
          if ($this->list2text && !$this->numberingLoaded) {
              // numbering.xml is parsed once; bullets are not rendered in the output
              $this->listNumbering();
          }

          $ret = $this->toText($node);

          // insert the referenced endnotes after the text content
          if ($this->endnote2text) {
              $this->loadEndNote();
              $ret .= $this->referencedNotes($node, 'w:r/w:endnoteReference', $this->endnotes);
          }
          // insert the referenced footnotes after the text content
          if ($this->footnote2text) {
              $this->loadFootNote();
              $ret .= $this->referencedNotes($node, 'w:r/w:footnoteReference', $this->footnotes);
          }

          return $ret . $this->separator();
      }

      /**
       * Render the notes referenced from a node as "[text] [text] ...".
       *
       * References to ids that are not present in $notes are skipped.
       *
       * @access private
       * @param DOMNode $node  context node for the query
       * @param string  $query XPath (relative to $node) selecting the reference elements
       * @param array   $notes note text keyed by w:id
       * @return string
       */
      private function referencedNotes($node, $query, $notes)
      {
          $rendered = '';
          $references = $this->xpath->query($query, $node);
          if ($references === false) {
              return $rendered;
          }
          foreach ($references as $reference) {
              $id = $reference->getAttributeNS(self::WORDML_NS, 'id');
              if (isset($notes[$id])) {
                  $rendered .= '[' . $notes[$id] . '] ';
              }
          }

          return $rendered;
      }

      /**
       * return a text end of line
       *
       * @access private
       * @return string
       */
      private function separator()
      {
          return "\r\n";
      }

      /**
       * Extract the content of a table node from document.xml and return it as text.
       *
       * Each w:tc cell contributes the output of printWP() for its content
       * elements followed by a tab; each w:tr row is terminated by a line
       * break. Table/cell property elements and non-row children produce no
       * output.
       *
       * @access private
       * @param DOMNode $node the w:tbl element
       *
       * @return string
       */
      private function table($node)
      {
          $output = '';
          foreach ($node->childNodes as $row) {
              if (!$this->isWordElement($row, 'tr')) {
                  continue; // w:tblPr, w:tblGrid, whitespace: not a table line
              }
              foreach ($row->childNodes as $cell) {
                  if (!$this->isWordElement($cell, 'tc')) {
                      continue;
                  }
                  foreach ($cell->childNodes as $content) {
                      if ($content->nodeType !== XML_ELEMENT_NODE || $this->isWordElement($content, 'tcPr')) {
                          continue; // cell properties hold formatting only
                      }
                      $output .= $this->printWP($content);
                  }
                  $output .= self::SEPARATOR_TAB;
              }
              $output .= $this->separator();
          }

          return $output;
      }

      /**
       * Return the trimmed, concatenated text of a node and all its descendants.
       *
       * @access private
       * @param DOMNode $node
       *
       * @return string
       */
      private function toText($node)
      {
          return trim($node->textContent);
      }
  }
  // Create the extractor with every transform left at its default.
  $extractor = new Docx2Text();
  // Load the .docx file to read.
  $extractor->setDocx('./1.docx');
  // Keep the extracted document content in a variable.
  $plainText = $extractor->extract();
  // Debug output.
  var_dump($plainText);

小结

代码中处理docx的类来自这里
其实 docx 本质上是一个 ZIP 压缩包，里面的内容以 XML 文件的形式保存（正文位于 word/document.xml）。

================================================================================
【文章来源】https://blog.csdn.net/hldh214/article/details/51549866