-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathProductScraper.php
244 lines (213 loc) · 6.33 KB
/
ProductScraper.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
<?php
/**
 * Configuration parameters
 *
 * HANDLERS_LOCATION is the directory prefix (including the trailing slash)
 * that is prepended to "<class>.php" when a Handler class file is included.
 *
 * The constant is only defined when the including application has not already
 * defined it, so the location can be overridden before this file is loaded
 * and no "constant already defined" notice is raised in that case.
 */
if (!defined('HANDLERS_LOCATION'))
{
    define('HANDLERS_LOCATION', 'Handlers/');
}
/**
 * Autoload for Handler classes
 *
 * Includes HANDLERS_LOCATION . $class . '.php' when an unknown class is
 * requested. The loader is registered with spl_autoload_register() instead of
 * being declared as the magic __autoload() function, because __autoload() is
 * ignored as soon as any other SPL autoloader is registered, is deprecated in
 * PHP 7.2 and is a fatal error in PHP 8.
 *
 * @param string $class name of the class PHP is trying to load
 *
 * @return void
 */
function productScraperAutoload($class)
{
    // The class name becomes part of an include path, so only plain
    // identifiers are accepted. Namespaced names and anything containing
    // path characters are left for other autoloaders to handle.
    if (!is_string($class) || preg_match('/^[A-Za-z_][A-Za-z0-9_]*$/', $class) !== 1)
    {
        return;
    }
    // A missing file simply means "no such Handler", so the include is
    // deliberately best-effort and its warning is suppressed.
    @include(HANDLERS_LOCATION . $class . '.php');
}
spl_autoload_register('productScraperAutoload');
/**
 * Product scraper
 *
 * This class has one public static method, getInfo(), which takes a product
 * url, scrapes title, price info, description, images and a normalized url
 * from the page, and returns them in an array.
 *
 * Custom Handlers must implement:
 * 1. XPath queries for all scraping functions
 * 2. parsing for the DOMNodeList returned by getDescription()
 * 3. their own url normalization
 * 4. setting their own value for $imageWidthThreshold
 * 5. any postprocessing of scraped values
 *
 * getInfo() instantiates a Handler without constructor arguments and calls
 * its customScraper($xpath, $urlComponents) method.
 */
class ProductScraper
{
    /**
     * Loads up product page and either calls site specific Handler, or uses default
     * Handler to scrape page.
     *
     * The Handler returns an array (values NULL if failed) containing:
     * - [0] Title of object
     * - [1] Price of object
     * - [2] Description of object
     * - [3] Array() of likely product images
     * - [4] Normalized url
     *
     * When the page cannot be downloaded the scrape runs against an empty
     * document, so the default scraper yields NULL title and price, the
     * "not found" description text and an empty image list.
     *
     * @param string $url
     *
     * @return array
     */
    public static function getInfo($url)
    {
        // parse_url() returns FALSE for seriously malformed urls
        $urlComponents = parse_url($url);
        if (!is_array($urlComponents))
        {
            $urlComponents = array();
        }

        $pageData = self::getPage($url);
        $pageDOM = new DOMDocument();
        if (is_string($pageData) && $pageData !== '')
        {
            // Real-world markup is rarely valid; collect libxml's complaints
            // internally instead of emitting a warning for each of them.
            $previousSetting = libxml_use_internal_errors(TRUE);
            $pageDOM->loadHTML($pageData);
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }
        else
        {
            // Download failed or returned nothing: loadHTML() rejects empty
            // input on PHP 8, so keep the document empty and scrape that.
            $pageData = '';
        }
        $xpath = new DOMXPath($pageDOM);

        /*
         * Use domain/subdomain Handlers if they exist, else use default handler
         */
        $host = isset($urlComponents['host']) ? $urlComponents['host'] : '';
        $handlerName = self::findHandlerName($host);
        if ($handlerName !== NULL)
        {
            $handler = new $handlerName();
            return $handler->customScraper($xpath, $urlComponents);
        }

        //Unlike custom Handlers, we pass the page string data to the default handler b/c it parses price with text processing
        return self::defaultScraper($xpath, $urlComponents, $pageData);
    }

    /**
     * Maps a host name to the name of an available Handler class.
     *
     * The full host with dots replaced by underscores is tried first
     * ("www.example.com" => "www_example_com"); if no such Handler exists the
     * leading label is stripped and the remainder is tried ("example_com").
     *
     * @param string $host
     *
     * @return string|null Handler class name, or NULL when the default scraper should be used
     */
    private static function findHandlerName($host)
    {
        $candidate = preg_replace('/[.]/', '_', $host);
        if (self::isUsableHandler($candidate))
        {
            return $candidate;
        }

        $candidate = preg_replace('/\b[a-z0-9A-Z]+_/', '', $candidate);
        if (self::isUsableHandler($candidate))
        {
            return $candidate;
        }

        return NULL;
    }

    /**
     * Tells whether $name denotes a class that can act as a Handler.
     *
     * The name is derived from the url's host, so it is checked to be a plain
     * identifier before it reaches class_exists() (and thereby the
     * autoloader), and the class must provide customScraper() so that an
     * unrelated class that happens to share its name with a host is never
     * instantiated.
     *
     * @param mixed $name
     *
     * @return bool
     */
    private static function isUsableHandler($name)
    {
        return is_string($name)
            && preg_match('/^[A-Za-z_][A-Za-z0-9_]*$/', $name) === 1
            && class_exists($name)
            && method_exists($name, 'customScraper');
    }

    /**
     * Downloads the page at $url with cURL, following redirects and sending a
     * Chrome user agent string.
     *
     * @param string $url
     *
     * @return string|false page body, or FALSE when the transfer could not be set up or failed
     */
    protected static function getPage($url)
    {
        $curlHandle = curl_init($url);
        if ($curlHandle === FALSE)
        {
            return FALSE;
        }
        // user agent is set to Chrome
        $userAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.A.B.C Safari/525.13";
        curl_setopt($curlHandle, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curlHandle, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($curlHandle, CURLOPT_USERAGENT, $userAgent);
        $pageData = curl_exec($curlHandle);
        // release the handle; before PHP 8 it stays open until closed explicitly
        curl_close($curlHandle);
        return $pageData;
    }

    /**
     * Returns the value of the first node matched by the query.
     *
     * @param DOMXPath $xpath
     * @param string $xpathQuery
     *
     * @return string|null NULL when the query matches no node
     */
    protected static function getTitle($xpath, $xpathQuery)
    {
        $titleNodes = $xpath->evaluate($xpathQuery);
        if (!($titleNodes instanceof DOMNodeList) || $titleNodes->length === 0)
        {
            return NULL;
        }
        // could use textContent too, but nodeValue is DOM level 1
        return $titleNodes->item(0)->nodeValue;
    }

    /**
     * For getting the price, we just query and return a DOMNodeList because
     * we don't know what kind of processing the Handlers need to do.
     *
     * @param DOMXPath $xpath
     * @param string $xpathQuery
     *
     * @return DOMNodeList
     */
    protected static function getPrice($xpath, $xpathQuery)
    {
        return $xpath->evaluate($xpathQuery);
    }

    /**
     * For getting the description, we just query and return a DOMNodeList because
     * we don't know what kind of processing the Handlers need to do.
     *
     * @param DOMXPath $xpath
     * @param string $xpathQuery
     *
     * @return DOMNodeList
     */
    protected static function getDescription($xpath, $xpathQuery)
    {
        return $xpath->evaluate($xpathQuery);
    }

    /**
     * Collects the src attribute of every matched element whose width
     * attribute is greater than $imageWidthThreshold.
     *
     * @param DOMXPath $xpath
     * @param string $xpathQuery
     * @param int $imageWidthThreshold
     *
     * @return array
     */
    protected static function getImages($xpath, $xpathQuery, $imageWidthThreshold)
    {
        $productImages = array();
        $allImages = $xpath->evaluate($xpathQuery);
        if (!($allImages instanceof DOMNodeList))
        {
            return $productImages;
        }
        foreach ($allImages as $image)
        {
            if (!($image instanceof DOMElement))
            {
                continue;
            }
            // Compare numerically: the attribute is a string such as "", "300"
            // or "50px", and a string-vs-int comparison would rank "50px"
            // above 200 under PHP 8's comparison rules.
            $imageWidth = (float) $image->getAttribute('width');
            if ($imageWidth > $imageWidthThreshold)
            {
                $productImages[] = $image->getAttribute('src');
            }
        }
        return $productImages;
    }

    /**
     * default url normalization function
     *
     * Builds host + path (+ '?' + query), with a leading "www." removed from
     * the host. Only the host, path and query components are used.
     *
     * @param array $urlComponents components as returned by parse_url()
     *
     * @return string
     */
    protected static function defaultNormalize($urlComponents)
    {
        // parse_url() omits keys for components the url does not have
        $host = isset($urlComponents['host']) ? $urlComponents['host'] : '';
        $path = isset($urlComponents['path']) ? $urlComponents['path'] : '';
        $query = isset($urlComponents['query']) ? (string) $urlComponents['query'] : '';

        // the dot is escaped so that only a literal "www." prefix is removed
        $normalizedUrl = preg_replace('/^www\./i', '', $host) . $path;
        // compare against '' rather than testing truthiness, so a query of "0" keeps its '?'
        if ($query !== '')
        {
            $normalizedUrl .= '?' . $query;
        }
        return $normalizedUrl;
    }

    /**
     * This default scraper goes for the most general implementation:
     * - $title contains the contents of the <title> tag in <head>
     * - $price contains the first $ amount in the page
     * - $description contains the contents of the description meta tag
     * - $productImages finds where <img> width attribute > $imageWidthThreshold
     * - $normalizedUrl just strips 'http://www.'
     *
     * @param DOMXPath $xpath
     * @param array $urlComponents components as returned by parse_url()
     * @param string $pageData raw page source, used for the price search
     *
     * @return array
     */
    private static function defaultScraper($xpath, $urlComponents, $pageData)
    {
        $title = self::getTitle($xpath, '/html/head/title');
        $price = self::defaultGetPrice($pageData);

        $description = "a description metatag has not been found";
        $metaNodes = self::getDescription($xpath, '/html/head/meta');
        if ($metaNodes instanceof DOMNodeList)
        {
            foreach ($metaNodes as $metaNode)
            {
                // the name is matched case-insensitively; when several
                // description tags exist the last one wins
                if ($metaNode instanceof DOMElement
                    && strcasecmp($metaNode->getAttribute('name'), 'description') === 0)
                {
                    $description = $metaNode->getAttribute('content');
                }
            }
        }

        $imageWidthThreshold = 200;
        $productImages = self::getImages($xpath, '/html/body/descendant::img', $imageWidthThreshold);
        $normalizedUrl = self::defaultNormalize($urlComponents);

        //assemble return array
        return array($title, $price, $description, $productImages, $normalizedUrl);
    }

    /**
     * Finds the first dollar amount with two decimals in the page source,
     * e.g. "$12.34" or "$1,299.99".
     *
     * @param string $pageData
     *
     * @return string|null the matched text including the '$' sign, or NULL when the page has no such amount
     */
    private static function defaultGetPrice($pageData)
    {
        $priceMatches = array();
        if (preg_match('/\$\d+(?:,\d{3})*\.\d\d/', (string) $pageData, $priceMatches) === 1)
        {
            return $priceMatches[0];
        }
        return NULL;
    }
}
?>