|
| 1 | +<?php |
| 2 | +/** |
| 3 | + * DocX Parser |
| 4 | + * |
| 5 | + * In selectors, write the XML namespace separator as | instead of : (for example, w|p selects <w:p> elements). |
| 6 | + * |
| 7 | + * |
| 8 | + * @author Emily Brand |
| 9 | + * @license LGPL The GNU Lesser GPL (LGPL) or an MIT-like license. |
| 10 | + * @see http://www.urbandictionary.com/ |
| 11 | + */ |
| 12 | + |
| 13 | +use QueryPath\CSS\ParseException; |
| 14 | +use QueryPath\DOMQuery; |
| 15 | +use QueryPath\Exception; |
| 16 | + |
| 17 | +require_once __DIR__ . '/../../vendor/autoload.php'; |
| 18 | + |
try {
    // Load test.docx, walk every paragraph (<w:p>) and print each of its
    // runs (<w:r>) with basic HTML formatting, one paragraph per line.
    /** @var DOMQuery $paragraph */
    foreach (qp(docx2text('test.docx'), 'w|p') as $paragraph) {
        /** @var DOMQuery $run */
        foreach ($paragraph->find('w|r') as $run) {
            echo format($run);
        }

        echo '<br />';
    }
} catch (Exception $e) {
    // `Exception` here is the imported QueryPath\Exception.
    die($e->getMessage());
} catch (RuntimeException $e) {
    // docx2text() throws the global RuntimeException, which is not a
    // QueryPath\Exception and would otherwise escape as an uncaught error.
    die($e->getMessage());
}
| 33 | + |
/**
 * Render one run as an HTML fragment: the escaped text of its <w:t> node(s)
 * followed by a single space, wrapped in <u> and/or <b> when the run is
 * underlined and/or bold.
 *
 * @param DOMQuery $qp The run to render
 *
 * @return string HTML-safe fragment
 * @throws ParseException
 * @throws Exception
 */
function format(DOMQuery $qp): string
{
    // The text comes straight from the document and is echoed into an HTML
    // page by the caller, so it must be escaped before any tags are added.
    $text = htmlspecialchars(
        $qp->find('w|t')->text(),
        ENT_QUOTES | ENT_SUBSTITUTE,
        'UTF-8'
    ) . ' ';

    if (checkUnderline($qp)) {
        $text = sprintf('<u>%s</u>', $text);
    }

    if (checkBold($qp)) {
        $text = sprintf('<b>%s</b>', $text);
    }

    return $text;
}
| 52 | + |
/**
 * Look for the <w:rPr><w:b></w:rPr> node to determine if the text is bolded.
 *
 * <w:b> is an on/off toggle: without a w:val attribute it means "bold", but
 * w:val="0", "false" or "off" explicitly switches bold off, so the presence
 * of the element alone is not enough.
 *
 * @param DOMQuery $qp The run to inspect
 *
 * @return bool True when the run carries an enabled <w:b> property
 * @throws ParseException
 * @throws Exception
 */
function checkBold(DOMQuery $qp): bool
{
    $bold = $qp->children('w|rPr')->children('w|b');

    if ($bold->count() === 0) {
        return false;
    }

    // NOTE(review): relies on DOMQuery::attr() resolving the qualified
    // attribute name "w:val" on the first match - confirm against QueryPath.
    $value = strtolower((string) $bold->attr('w:val'));

    return !in_array($value, ['0', 'false', 'off'], true);
}
| 68 | + |
/**
 * Look for the <w:rPr><w:u></w:rPr> node to determine if the text is underlined.
 *
 * <w:u w:val="none"/> explicitly removes the underline, so that value is
 * treated as "not underlined"; any other value (or no w:val at all) counts
 * as underlined.
 *
 * @param DOMQuery $qp The run to inspect
 *
 * @return bool True when the run carries a <w:u> property other than "none"
 * @throws ParseException
 * @throws Exception
 */
function checkUnderline(DOMQuery $qp): bool
{
    $underline = $qp->children('w|rPr')->children('w|u');

    if ($underline->count() === 0) {
        return false;
    }

    // NOTE(review): relies on DOMQuery::attr() resolving the qualified
    // attribute name "w:val" on the first match - confirm against QueryPath.
    $value = strtolower((string) $underline->attr('w:val'));

    return $value !== 'none';
}
| 84 | + |
/**
 * Read the main document part (word/document.xml) out of a .docx archive.
 *
 * Despite the name, the return value is the raw XML of that entry, not plain
 * text; the caller is expected to parse it.
 *
 * @param string $archiveFile The path to the .docx file to extract information from
 *
 * @return string The contents of word/document.xml
 * @throws RuntimeException If ZipArchive is unavailable, the archive cannot be
 *                          opened, or word/document.xml is missing or unreadable
 */
function docx2text(string $archiveFile): string
{
    $dataFile = 'word/document.xml';

    if (!class_exists('ZipArchive', false)) {
        throw new RuntimeException('ZipArchive extension must be enabled to parse .docx files');
    }

    $zip = new ZipArchive();
    // Open received archive file
    if (true !== $zip->open($archiveFile)) {
        throw new RuntimeException('Could not open the file using ZipArchive: ' . $zip->getStatusString());
    }

    try {
        // Search for the docx data file; an archive without it is not a
        // usable .docx, so report that instead of returning an empty string.
        $index = $zip->locateName($dataFile);
        if ($index === false) {
            throw new RuntimeException(sprintf('The archive does not contain %s', $dataFile));
        }

        $data = $zip->getFromIndex($index);
        if ($data === false) {
            throw new RuntimeException(
                sprintf('Could not read %s from the archive: %s', $dataFile, $zip->getStatusString())
            );
        }

        return $data;
    } finally {
        // Always release the archive handle, including on the error paths above.
        $zip->close();
    }
}
0 commit comments