Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds xpath_alias, sample re commands #83

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions .idea/SelectorLib.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/discord.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

201 changes: 201 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion selectorlib/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

"""Top-level package for selectorlib."""
"""Top-level package for Selectorlib."""

__author__ = """scrapehero"""
__email__ = '[email protected]'
Expand Down
6 changes: 3 additions & 3 deletions selectorlib/cli.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# -*- coding: utf-8 -*-

"""Console script for selectorlib."""
"""Console script for Selectorlib."""
import sys
import click


@click.command()
def main(args=None):
"""Console script for selectorlib."""
"""Console script for Selectorlib."""
click.echo("Replace this message by putting your code into "
"selectorlib.cli.main")
"Selectorlib.cli.main")
click.echo("See click documentation at http://click.pocoo.org/")
return 0

Expand Down
15 changes: 13 additions & 2 deletions selectorlib/selectorlib.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
import re
import parsel
import yaml
import inspect


def extract_field(element, item_type, attribute=None, formatter=None):
if item_type == 'Text':
texts = [i.strip() for i in element.xpath('.//text()').getall() if i.strip()]
Expand Down Expand Up @@ -35,7 +35,7 @@ def __init__(self, config, formatters=None):
def from_yaml_string(cls, yaml_string: str, formatters=None):
"""create `Extractor` object from yaml string

>>> yaml_string = '''
>>> yaml_string = ''
title:
css: "h1"
type: Text
Expand Down Expand Up @@ -77,13 +77,19 @@ def extract(self, html: str, base_url: str = None):
def _extract_selector(self, field_config, parent_parser):
if field_config.get("xpath") is not None:
elements = parent_parser.xpath(field_config['xpath'])
if len(elements) == 0:
if field_config.get("xpath_alias") is not None:
elements = parent_parser.xpath(field_config['alias'])

else:
css = field_config['css']
if css == '':
elements = [parent_parser]
else:
elements = parent_parser.css(field_config['css'])

item_type = field_config.get('type', 'Text')
# print(field_config) # Returns all fields
if not elements:
return None
values = []
Expand All @@ -100,6 +106,11 @@ def _extract_selector(self, field_config, parent_parser):
value = extract_field(element, item_type, **kwargs)

if field_config.get('multiple') is not True:
if 're' in field_config:
pattern = re.compile(f'{field_config.get("re")}')
regex = re.sub(pattern, '', value)
return regex

return value
else:
values.append(value)
Expand Down