Skip to content

Commit ee2e99f

Browse files
committed
Continue refactoring examples
1 parent e77c8f0 commit ee2e99f

File tree

10 files changed

+2316
-4160
lines changed

10 files changed

+2316
-4160
lines changed
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
<?php
2+
/**
3+
* DocX Parser
4+
*
5+
 * In selectors, separate an XML namespace prefix from the element name with | instead of : (e.g. w|p matches <w:p>)
6+
*
7+
*
8+
* @author Emily Brand
9+
* @license LGPL The GNU Lesser GPL (LGPL) or an MIT-like license.
10+
* @see http://www.urbandictionary.com/
11+
*/
12+
13+
use QueryPath\CSS\ParseException;
14+
use QueryPath\DOMQuery;
15+
use QueryPath\Exception;
16+
17+
require_once __DIR__ . '/../../vendor/autoload.php';
18+
19+
try {
    // Load test.docx, then walk every paragraph (<w:p>) and print each of its
    // runs (<w:r>) with basic formatting, one paragraph per output line.
    foreach (qp(docx2text('test.docx'), 'w|p') as $paragraph) {
        /** @var DOMQuery $paragraph */
        foreach ($paragraph->find('w|r') as $run) {
            /** @var DOMQuery $run */
            echo format($run);
        }

        echo '<br />';
    }
} catch (Exception $e) {
    // "Exception" is the imported QueryPath\Exception: selector/parsing failures.
    die($e->getMessage());
} catch (RuntimeException $e) {
    // docx2text() signals failures with the global RuntimeException, which the
    // QueryPath exception type above does not cover.
    die($e->getMessage());
}
33+
34+
/**
 * Build the HTML fragment for a single run node.
 *
 * Reads the text of the run's <w:t> descendants, escapes it for HTML output,
 * appends a trailing space and wraps the result in <u> and/or <b> tags
 * according to checkUnderline() and checkBold(). When both apply, <b> is the
 * outer tag.
 *
 * @param DOMQuery $qp The run (<w:r>) to render
 *
 * @return string HTML-safe text, optionally wrapped in <u> and/or <b>
 * @throws ParseException
 * @throws Exception
 */
function format(DOMQuery $qp): string
{
    // The text comes straight from the document and is echoed into an HTML
    // page by the caller, so it must be escaped before any tags are added.
    $text = htmlspecialchars($qp->find('w|t')->text(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . ' ';

    if (checkUnderline($qp)) {
        $text = sprintf('<u>%s</u>', $text);
    }

    if (checkBold($qp)) {
        $text = sprintf('<b>%s</b>', $text);
    }

    return $text;
}
52+
53+
/**
 * Tell whether a run is marked as bold.
 *
 * A run counts as bold when its direct <w:rPr> child contains a direct <w:b>
 * child. Only the presence of that element is tested; its attributes are not
 * inspected.
 *
 * @param DOMQuery $qp The run to inspect
 *
 * @return bool True when at least one <w:rPr>/<w:b> element exists
 * @throws ParseException
 * @throws Exception
 */
function checkBold(DOMQuery $qp): bool
{
    $runProperties = $qp->children('w|rPr');
    $boldMarkers = $runProperties->children('w|b');

    return $boldMarkers->count() > 0;
}
68+
69+
/**
 * Tell whether a run is marked as underlined.
 *
 * A run counts as underlined when its direct <w:rPr> child contains a direct
 * <w:u> child. Only the presence of that element is tested; its attributes
 * are not inspected.
 *
 * @param DOMQuery $qp The run to inspect
 *
 * @return bool True when at least one <w:rPr>/<w:u> element exists
 * @throws ParseException
 * @throws Exception
 */
function checkUnderline(DOMQuery $qp): bool
{
    $runProperties = $qp->children('w|rPr');
    $underlineMarkers = $runProperties->children('w|u');

    return $underlineMarkers->count() > 0;
}
84+
85+
/**
 * Read the main document part (word/document.xml) out of a .docx archive.
 *
 * Despite the function name, the returned string is the raw content of that
 * archive entry (XML), not plain text; the caller is expected to parse it.
 *
 * @param string $archiveFile The path to the .docx file to extract information from
 *
 * @return string The contents of word/document.xml
 * @throws RuntimeException If the ZipArchive class is unavailable, the archive cannot be
 *                          opened, or word/document.xml is missing or cannot be read
 */
function docx2text(string $archiveFile): string
{
    $dataFile = 'word/document.xml';

    if (!class_exists('ZipArchive', false)) {
        throw new RuntimeException('ZipArchive extension must be enabled to parse .docx files');
    }

    $zip = new ZipArchive();
    // Open received archive file
    if (true !== $zip->open($archiveFile)) {
        throw new RuntimeException('Could not open the file using ZipArchive: ' . $zip->getStatusString());
    }

    try {
        // Search for the docx data file; an archive without it is not a usable
        // document, so report that instead of silently returning an empty string.
        $index = $zip->locateName($dataFile);
        if ($index === false) {
            throw new RuntimeException('Could not find ' . $dataFile . ' in the archive: ' . $archiveFile);
        }

        $data = $zip->getFromIndex($index);
        if ($data === false) {
            throw new RuntimeException('Could not read ' . $dataFile . ' from the archive: ' . $archiveFile);
        }

        return $data;
    } finally {
        // Always release the archive handle, including on the error paths above
        $zip->close();
    }
}

0 commit comments

Comments
 (0)