Skip to content
This repository has been archived by the owner on Sep 24, 2018. It is now read-only.

Commit

Permalink
Merge pull request #17 from rfussien/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
rfussien authored Apr 7, 2017
2 parents 56f5c33 + 1523947 commit fdee41f
Show file tree
Hide file tree
Showing 21 changed files with 3,730 additions and 265 deletions.
140 changes: 74 additions & 66 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,47 +62,48 @@ $ composer require rfussien/leboncoin-crawler
(new Lbc\GetFrom)->search('<search_result_url>', true);
```
*example of output*:
```json
{
"page": 2,
"links": {
"current": "http://www.leboncoin.fr/ventes_immobilieres/offres/basse_normandie/calvados/?pe=11&sqs=10&ros=5&ret=1&f=p&o=2",
"previous": "http://www.leboncoin.fr/ventes_immobilieres/offres/basse_normandie/calvados/?pe=11&sqs=10&ros=5&ret=1&f=p&o=1",
"next": "http://www.leboncoin.fr/ventes_immobilieres/offres/basse_normandie/calvados/?pe=11&sqs=10&ros=5&ret=1&f=p&o=3"
},
"total_ads": 604,
"total_page": 18,
"category": "ventes_immobilieres",
"location": null,
"search_area": "basse_normandie",
"sort_by": "date",
"type": "part",
"ads": {
"602701721": {
"id": "602701721",
"title": "Maison F7 à EVRECY",
"price": 200000,
"url": "http://www.leboncoin.fr/ventes_immobilieres/602701721.htm?ca=4_s",
"created_at": "2015-01-10 19:01",
"thumb": "http://193.164.197.40/thumbs/808/808ae4f91c5bf1871b96f16bccb3751eeb0baec4.jpg",
"nb_image": 3,
"placement": "Evrecy / Calvados",
"type": "part"
},
[...]
"755560430": {
"id": "755560430",
"title": "Maison Atypique ( Esprit Loft ) 145 m2",
"price": 243000,
"url": "http://www.leboncoin.fr/ventes_immobilieres/755560430.htm?ca=4_s",
"created_at": "2015-01-11 19:01",
"thumb": "http://193.164.196.60/thumbs/aa3/aa336ba634f7e5f43b6c016358afa2510e42aa0b.jpg",
"nb_image": 3,
"placement": "Caen / Calvados",
"type": "part"
}
}
}
```php
[
'page' => 2,
'links' => [
'current' => 'https://www.leboncoin.fr/ventes_immobilieres/offres/basse_normandie/?o=2&sqs=12&ret=1&location=Caen%2014000',
'previous' => 'https://www.leboncoin.fr/ventes_immobilieres/offres/basse_normandie/?o=1&sqs=12&ret=1&location=Caen%2014000',
'next' => 'https://www.leboncoin.fr/ventes_immobilieres/offres/basse_normandie/?o=3&sqs=12&ret=1&location=Caen%2014000',
],
'total_ads' => 466,
'total_page' => 14,
'ads_per_page' => 35,
'category' => 'ventes_immobilieres',
'location' => 'Caen 14000',
'search_area' => 'basse_normandie',
'sort_by' => 'date',
'type' => 'all',
'ads' => [
1117890265 => [
'id' => '1117890265',
'titre' => 'Maison 7 pièces 243 m²',
'is_pro' => true,
'prix' => 490000,
'url' => 'https://www.leboncoin.fr/ventes_immobilieres/1117890265.htm',
'created_at' => '2017-04-06',
'images_thumbs' => 'https://img1.leboncoin.fr/ad-thumb/fdf29ab66506b52f5768c509cbd4c9940035b220.jpg',
'nb_image' => '10',
'placement' => 'Caen / Calvados',
],
[...],
1116940130 => [
'id' => '1116940130',
'titre' => 'Maison de ville 5 pièces 121 m²',
'is_pro' => true,
'prix' => 338000,
'url' => 'https://www.leboncoin.fr/ventes_immobilieres/1116940130.htm',
'created_at' => '2017-04-04',
'images_thumbs' => 'https://img2.leboncoin.fr/ad-thumb/2bb09136b010d9009f0d5542c8699ede3f6bedfd.jpg',
'nb_image' => '4',
'placement' => 'Caen / Calvados',
],
],
]
```

### Get the structured data from an ad
Expand All @@ -114,33 +115,40 @@ $ composer require rfussien/leboncoin-crawler
```

*example of output*:
```json
{
"id": "602701721",
"category": "ventes_immobilieres",
"thumbs": [
"http://193.164.197.40/thumbs/808/808ae4f91c5bf1871b96f16bccb3751eeb0baec4.jpg",
"http://193.164.196.60/thumbs/1b4/1b40871304534d25c99c7b3baeda07c16c8b48cd.jpg",
"http://193.164.196.30/thumbs/152/15251eb4128758c6d0c44523b6733ee9d5ea3749.jpg"
```php
[
'id' => '1072097995',
'category' => 'ventes_immobilieres',
'images_thumbs' => [
0 => 'https://img0.leboncoin.fr/ad-thumb/6c3962c95d1be2367d8b30f8cc1c04317be61cae.jpg',
1 => 'https://img5.leboncoin.fr/ad-thumb/9346546557dc1cf9eafc0249c8f80e27530ec36f.jpg',
2 => 'https://img6.leboncoin.fr/ad-thumb/f0e61ab47f008ae101c0ed03e3023d34ee37df5f.jpg',
3 => 'https://img4.leboncoin.fr/ad-thumb/60a4a187064407bc792b421189e66f87e1a2425c.jpg',
4 => 'https://img5.leboncoin.fr/ad-thumb/d34a4ef9545e60ae88169acbe4858608ba01e8a9.jpg',
],
'images' => [
0 => 'https://img0.leboncoin.fr/ad-image/6c3962c95d1be2367d8b30f8cc1c04317be61cae.jpg',
1 => 'https://img5.leboncoin.fr/ad-image/9346546557dc1cf9eafc0249c8f80e27530ec36f.jpg',
2 => 'https://img6.leboncoin.fr/ad-large/f0e61ab47f008ae101c0ed03e3023d34ee37df5f.jpg',
3 => 'https://img4.leboncoin.fr/ad-image/60a4a187064407bc792b421189e66f87e1a2425c.jpg',
4 => 'https://img5.leboncoin.fr/ad-image/d34a4ef9545e60ae88169acbe4858608ba01e8a9.jpg',
],
"pictures": [
"http://193.164.197.40/images/808/808ae4f91c5bf1871b96f16bccb3751eeb0baec4.jpg",
"http://193.164.196.60/images/1b4/1b40871304534d25c99c7b3baeda07c16c8b48cd.jpg",
"http://193.164.196.30/images/152/15251eb4128758c6d0c44523b6733ee9d5ea3749.jpg"
'properties' => [
'titre' => 'Maison 11 pièces 450 m²',
'created_at' => '2017-02-18',
'is_pro' => 1,
'prix' => 1185000,
'ville' => 'Bayeux',
'cp' => '14400',
'type_de_bien' => 'Maison',
'pieces' => 11,
'surface' => 450,
'reference' => '394348',
'ges' => 'C (de 11 à 20)',
'classe_energie' => 'C (de 91 à 150)',
],
"title": "Maison F7 à EVRECY",
"cp": "14210",
"city": "Evrecy",
"price": 200000,
"criterias": {
"type_de_bien": "Maison",
"pieces": "7",
"surface": "140 m2",
"ges": "F (de 56 à 80)",
"classe_energie": "D (de 151 à 230)"
},
"description": "Baisse de prix pour une maison à [...] sur un terrain de 576 m². AGENCE S'ABSTENIR."
}
'description' => 'Vente Maison/villa 11 piècesI@D France - [...]3562178Référence annonce : 394348',
]
```

There are a bunch of other features if you dig a bit into the sources.
Expand Down
94 changes: 56 additions & 38 deletions src/Crawler/AdCrawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
use Lbc\Filter\CpSanitizer;
use Lbc\Filter\DefaultSanitizer;
use Lbc\Filter\KeySanitizer;
use Lbc\Filter\PriceSanitizer;
use Lbc\Parser\AdUrlParser;
use League\Uri\Schemes\Http;
use Symfony\Component\DomCrawler\Crawler;
Expand All @@ -17,6 +16,11 @@
*/
class AdCrawler extends CrawlerAbstract
{
/**
* @var AdUrlParser
*/
protected $url;

/**
* @param $url
* @return AdUrlParser
Expand Down Expand Up @@ -54,41 +58,40 @@ public function getPictures(Crawler $node = null)
{
$node = $node ?: $this->node;

$images = [];
$images_thumbs = [];
$images = [
'images_thumbs' => [],
'images' => [],
];

$node
->filter('.adview_main script')
->each(function (Crawler $crawler) use (&$images, &$images_thumbs) {
preg_match_all(
->each(function (Crawler $crawler) use (&$images) {
if (preg_match_all(
'#//img.+.leboncoin.fr/.*\.jpg#',
$crawler->html(),
$matches
);

if (count($matches[0]) > 0) {
)) {
foreach ($matches[0] as $image) {
if (preg_match('/thumb/', $image)) {
array_push(
$images_thumbs,
(string)Http::createFromString($image)
->withScheme($this->sheme)
);
} else {
array_push(
$images,
$images['images_thumbs'],
(string)Http::createFromString($image)
->withScheme($this->sheme)
);

continue;
}

array_push(
$images['images'],
(string)Http::createFromString($image)
->withScheme($this->sheme)
);
}
}
});

return [
'images' => $images,
'images_thumbs' => $images_thumbs,
];
return $images;
}

/**
Expand All @@ -102,11 +105,16 @@ public function getProperties(Crawler $node = null)
{
$node = $node ?: $this->node;

$properties = [];

$properties['title'] = DefaultSanitizer::clean(
$this->node->filter('h1')->text()
);
$properties = [
'titre' => DefaultSanitizer::clean(
$node->filter('h1')->text()
),
'created_at' => $node
->filter('*[itemprop=availabilityStarts]')
->first()
->attr('content'),
'is_pro' => ($node->filter('.ispro')->count()),
];

$node->filter('h2')
->each(function (Crawler $crawler) use (&$properties) {
Expand All @@ -130,31 +138,41 @@ public function getProperties(Crawler $node = null)
*/
public function getDescription(Crawler $node = null)
{
return ['description' => $this->node->filter("p#description")->text()];
$node = $node ?: $this->node;

return [
'description' => $this->getFieldValue(
$node->filter("p[itemprop=description]"),
null
)
];
}

/**
* Transform the properties name into a snake_case string
* Transform the properties name into a snake_case string and sanitize
* the value
*
* @param string $key
* @param string $value
* @return array
*/
private function sanitize($key, $value)
{
$key = KeySanitizer::clean($key);

switch ($key) {
case 'prix':
return ['price' => PriceSanitizer::clean($value)];
break;
case 'ville':
return [
'city' => CitySanitizer::clean($value),
'cp' => CpSanitizer::clean($value),
];
break;
default:
return [$key => DefaultSanitizer::clean($value)];
if ($key == 'ville') {
return [
'ville' => CitySanitizer::clean($value),
'cp' => CpSanitizer::clean($value),
];
}

$filterName = 'Lbc\\Filter\\' . ucfirst($key) . 'Sanitizer';

if (!class_exists($filterName)) {
$filterName = 'Lbc\\Filter\\DefaultSanitizer';
}

return [$key => call_user_func("$filterName::clean", $value)];
}
}
43 changes: 38 additions & 5 deletions src/Crawler/CrawlerAbstract.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

namespace Lbc\Crawler;

use function foo\func;
use Lbc\Filter\DefaultSanitizer;
use Symfony\Component\DomCrawler\Crawler;

/**
Expand All @@ -10,6 +12,11 @@
*/
abstract class CrawlerAbstract
{
/**
* @var
*/
protected $url;

/**
* @var string
*/
Expand All @@ -20,11 +27,6 @@ abstract class CrawlerAbstract
*/
protected $node;

/**
* @var
*/
protected $url;

/**
* CrawlerAbstract constructor.
* @param Crawler $node
Expand Down Expand Up @@ -55,6 +57,37 @@ public function getUrlParser()
return $this->url;
}

/**
 * Read a value out of a crawler node, falling back to a default when the
 * node matches nothing.
 *
 * The Crawler method named by $funcName is invoked on $node with
 * $funcParam as its only argument, and the result is handed to $callback.
 * When no callback is given, the raw value is cleaned with
 * DefaultSanitizer::clean().
 *
 * @param Crawler       $node         Node the value is read from
 * @param mixed         $defaultValue Returned untouched when $node is empty
 * @param \Closure|null $callback     Post-processing applied to the raw value
 * @param string        $funcName     Name of the Crawler method to call
 * @param string        $funcParam    Argument passed to that method
 *
 * @return mixed
 */
protected function getFieldValue(
    Crawler $node,
    $defaultValue,
    $callback = null,
    $funcName = 'text',
    $funcParam = ''
) {
    // Nothing matched: the caller-supplied fallback is returned as-is,
    // without going through the callback.
    if (!$node->count()) {
        return $defaultValue;
    }

    $rawValue = $node->$funcName($funcParam);

    // Loose comparison on purpose: any value that compares equal to null
    // selects the default sanitizer.
    if ($callback == null) {
        return DefaultSanitizer::clean($rawValue);
    }

    return $callback($rawValue);
}

/**
* @param $url
* @return mixed
Expand Down
Loading

0 comments on commit fdee41f

Please sign in to comment.