Skip to content

Commit 2915e0a

Browse files
Merge pull request #1 from scrapinghub/items
Add AutoExtract Article and Product items
2 parents 3b479af + 590831d commit 2915e0a

File tree

9 files changed

+373
-0
lines changed

9 files changed

+373
-0
lines changed

.travis.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
language: python
2+
branches:
3+
only:
4+
- master
5+
- /^\d\.\d+$/
6+
- /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/
7+
matrix:
8+
include:
9+
- python: 3.6
10+
env: TOXENV=py36
11+
- python: 3.7
12+
env: TOXENV=py37
13+
- python: 3.8
14+
env: TOXENV=py38
15+
- python: 3.8
16+
env: TOXENV=mypy
17+
install:
18+
- pip install -U tox codecov
19+
script: tox
20+
after_success:
21+
- codecov

autoextract_poet/__init__.py

Whitespace-only changes.

autoextract_poet/items.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
from typing import Dict, List, Optional
2+
3+
import attr
4+
5+
6+
@attr.s(auto_attribs=True, slots=True)
class Item:
    """Base class for items built from plain dictionaries.

    Subclasses declare their fields as attrs attributes; the alternate
    constructors below turn dictionaries (or lists of them) into instances.
    """

    @classmethod
    def from_dict(cls, item: Optional[Dict]):
        """Build an instance of ``cls`` from a dictionary of field values.

        :param item: mapping of field names to values, or a falsy value.
        :return: a ``cls`` instance, or ``None`` when ``item`` is empty/None.
        :raises TypeError: if ``item`` contains a key that is not a field of
            ``cls`` (raised by the generated ``__init__``).
        """
        return cls(**item) if item else None  # type: ignore

    @classmethod
    def from_list(cls, items: Optional[List[Dict]]):
        """Build a list of ``cls`` instances from a list of dictionaries.

        Falsy entries are skipped. ``None`` is accepted and treated as an
        empty list, because callers obtain ``items`` with ``dict.get`` and a
        key that is present with a null value yields ``None`` rather than
        the supplied default.

        :param items: list of field dictionaries, or ``None``.
        :return: list of ``cls`` instances (possibly empty).
        """
        return [cls.from_dict(item) for item in (items or []) if item]
16+
17+
18+
# Offer item: four optional string fields (price, currency, availability,
# regularPrice), each defaulting to None. Instances are created through the
# from_dict / from_list constructors inherited from Item.
@attr.s(auto_attribs=True, slots=True)
19+
class Offer(Item):
20+
21+
price: Optional[str] = None
22+
currency: Optional[str] = None
23+
availability: Optional[str] = None
24+
regularPrice: Optional[str] = None
25+
26+
27+
# Breadcrumb item: an optional name and an optional link, both strings
# defaulting to None. Used as the element type of the ``breadcrumbs`` lists
# on Article and Product.
@attr.s(auto_attribs=True, slots=True)
28+
class Breadcrumb(Item):
29+
30+
name: Optional[str] = None
31+
link: Optional[str] = None
32+
33+
34+
@attr.s(auto_attribs=True, slots=True)
class Rating(Item):
    """Aggregate rating item (rating value, best rating and review count).

    Every field is optional and defaults to ``None``, matching the other
    optional items in this module, so a payload that omits one of the keys
    can still be loaded with ``from_dict`` instead of raising ``TypeError``.
    """

    ratingValue: Optional[float] = None
    bestRating: Optional[float] = None
    reviewCount: Optional[int] = None
40+
41+
42+
# AdditionalProperty item: a required name/value pair of strings (no
# defaults, so both must be supplied when constructing an instance).
@attr.s(auto_attribs=True, slots=True)
43+
class AdditionalProperty(Item):
44+
45+
name: str
46+
value: str
47+
48+
49+
# GTIN item: a required type/value pair of strings (no defaults, so both
# must be supplied). The sample product fixture uses type "ean13".
@attr.s(auto_attribs=True, slots=True)
50+
class GTIN(Item):
51+
52+
type: str
53+
value: str
54+
55+
56+
@attr.s(auto_attribs=True, slots=True)
class Article(Item):
    """Article item; every field is optional and defaults to ``None``."""

    headline: Optional[str] = None
    datePublished: Optional[str] = None
    datePublishedRaw: Optional[str] = None
    dateModified: Optional[str] = None
    dateModifiedRaw: Optional[str] = None
    author: Optional[str] = None
    authorsList: Optional[List[str]] = None
    inLanguage: Optional[str] = None
    breadcrumbs: Optional[List[Breadcrumb]] = None
    mainImage: Optional[str] = None
    images: Optional[List[str]] = None
    description: Optional[str] = None
    articleBody: Optional[str] = None
    articleBodyHtml: Optional[str] = None
    articleBodyRaw: Optional[str] = None
    videoUrls: Optional[List[str]] = None
    audioUrls: Optional[List[str]] = None
    probability: Optional[float] = None
    canonicalUrl: Optional[str] = None
    url: Optional[str] = None

    @classmethod
    def from_dict(cls, item: Optional[Dict]):
        """Build an ``Article`` from a dictionary, converting breadcrumbs.

        The input dictionary is not modified; a shallow copy is made and its
        ``breadcrumbs`` entry is replaced by a list of ``Breadcrumb`` items.

        :param item: mapping of field names to values, or a falsy value.
        :return: an ``Article``, or ``None`` when ``item`` is empty/None.
        :raises TypeError: if ``item`` contains an unknown key.
        """
        if not item:
            return None

        # ``or []`` (rather than a ``get`` default) also covers a key that is
        # present with a null value, which would otherwise reach from_list
        # as None.
        new_item = dict(
            item,
            breadcrumbs=Breadcrumb.from_list(item.get("breadcrumbs") or []),
        )

        return super().from_dict(new_item)
91+
92+
93+
@attr.s(auto_attribs=True, slots=True)
class Product(Item):
    """Product item; every field is optional and defaults to ``None``."""

    name: Optional[str] = None
    offers: Optional[List[Offer]] = None
    sku: Optional[str] = None
    gtin: Optional[List[GTIN]] = None
    mpn: Optional[str] = None
    brand: Optional[str] = None
    breadcrumbs: Optional[List[Breadcrumb]] = None
    mainImage: Optional[str] = None
    images: Optional[List[str]] = None
    description: Optional[str] = None
    probability: Optional[float] = None
    url: Optional[str] = None
    additionalProperty: Optional[List[AdditionalProperty]] = None
    aggregateRating: Optional[Rating] = None

    @classmethod
    def from_dict(cls, item: Optional[Dict]):
        """Build a ``Product`` from a dictionary, converting nested items.

        The input dictionary is not modified; a shallow copy is made and the
        nested entries (``additionalProperty``, ``aggregateRating``,
        ``breadcrumbs``, ``gtin``, ``offers``) are replaced by the
        corresponding item objects.

        :param item: mapping of field names to values, or a falsy value.
        :return: a ``Product``, or ``None`` when ``item`` is empty/None.
        :raises TypeError: if ``item`` contains an unknown key.
        """
        if not item:
            return None

        # ``or []`` (rather than a ``get`` default) also covers keys that are
        # present with a null value, which would otherwise reach from_list
        # as None.
        new_item = dict(
            item,
            additionalProperty=AdditionalProperty.from_list(
                item.get("additionalProperty") or []),
            aggregateRating=Rating.from_dict(item.get("aggregateRating")),
            breadcrumbs=Breadcrumb.from_list(item.get("breadcrumbs") or []),
            gtin=GTIN.from_list(item.get("gtin") or []),
            offers=Offer.from_list(item.get("offers") or []),
        )

        return super().from_dict(new_item)

setup.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from setuptools import setup, find_packages
2+
3+
4+
# Read the long description with a context manager so the file handle is
# closed deterministically, and with an explicit encoding so the result does
# not depend on the platform's default locale.
with open('README.rst', encoding='utf-8') as readme_file:
    long_description = readme_file.read()


setup(
    name='autoextract-poet',
    version='0.0.1',
    description='web-poet definitions for AutoExtract API',
    long_description=long_description,
    long_description_content_type="text/x-rst",
    author='Scrapinghub',
    # NOTE(review): this value looks like an e-mail placeholder rather than
    # a real address - confirm against the repository before releasing.
    author_email='[email protected]',
    url='https://github.com/scrapinghub/autoextract-poet',
    packages=find_packages(exclude=['tests',]),
    install_requires=[
        'attrs',
        'web-poet',
    ],
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Natural Language :: English',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
    ],
)

tests/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import json
2+
import os
3+
4+
5+
def load_fixture(name):
    """Load and parse a JSON fixture stored next to this module.

    :param name: file name inside the ``fixtures`` directory that sits in
        the same directory as this module.
    :return: the decoded JSON document.
    :raises OSError: if the fixture file cannot be opened.
    :raises json.JSONDecodeError: if the file is not valid JSON.
    """
    path = os.path.join(os.path.dirname(__file__), "fixtures", name)
    # JSON interchange files are UTF-8; do not rely on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

tests/fixtures/sample_article.json

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"article": {
3+
"headline": "Article headline",
4+
"datePublished": "2019-06-19T00:00:00",
5+
"datePublishedRaw": "June 19, 2019",
6+
"dateModified": "2019-06-21T00:00:00",
7+
"dateModifiedRaw": "June 21, 2019",
8+
"author": "Article author",
9+
"authorsList": [
10+
"Article author"
11+
],
12+
"inLanguage": "en",
13+
"breadcrumbs": [
14+
{
15+
"name": "Level 1",
16+
"link": "http://example.com"
17+
}
18+
],
19+
"mainImage": "http://example.com/image.png",
20+
"images": [
21+
"http://example.com/image.png"
22+
],
23+
"description": "Article summary",
24+
"articleBody": "Article body ...",
25+
"articleBodyHtml": "<article><p>Article body ... </p> ... </article>",
26+
"articleBodyRaw": "<div id=\"an-article\">Article body ...",
27+
"videoUrls": [
28+
"https://example.com/video.mp4"
29+
],
30+
"audioUrls": [
31+
"https://example.com/audio.mp3"
32+
],
33+
"probability": 0.95,
34+
"canonicalUrl": "https://example.com/article/article-about-something",
35+
"url": "https://example.com/article?id=24"
36+
},
37+
"webPage": {
38+
"inLanguages": [
39+
{"code": "en"},
40+
{"code": "es"}
41+
]
42+
},
43+
"query": {
44+
"id": "1564747029122-9e02a1868d70b7a3",
45+
"domain": "example.com",
46+
"userQuery": {
47+
"pageType": "article",
48+
"url": "http://example.com/article?id=24"
49+
}
50+
}
51+
}

tests/fixtures/sample_product.json

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
{
2+
"product": {
3+
"name": "Product name",
4+
"offers": [
5+
{
6+
"price": "42",
7+
"currency": "USD",
8+
"availability": "InStock"
9+
}
10+
],
11+
"sku": "product sku",
12+
"mpn": "product mpn",
13+
"gtin": [
14+
{
15+
"type": "ean13",
16+
"value": "978-3-16-148410-0"
17+
}
18+
],
19+
"brand": "product brand",
20+
"breadcrumbs": [
21+
{
22+
"name": "Level 1",
23+
"link": "http://example.com"
24+
}
25+
],
26+
"mainImage": "http://example.com/image.png",
27+
"images": [
28+
"http://example.com/image.png"
29+
],
30+
"description": "product description",
31+
"aggregateRating": {
32+
"ratingValue": 4.5,
33+
"bestRating": 5.0,
34+
"reviewCount": 31
35+
},
36+
"additionalProperty": [
37+
{
38+
"name": "property 1",
39+
"value": "value of property 1"
40+
}
41+
],
42+
"probability": 0.95,
43+
"url": "https://example.com/product"
44+
},
45+
"webPage": {
46+
"inLanguages": [
47+
{"code": "en"},
48+
{"code": "es"}
49+
]
50+
},
51+
"query": {
52+
"id": "1564747029122-9e02a1868d70b7a2",
53+
"domain": "example.com",
54+
"userQuery": {
55+
"pageType": "product",
56+
"url": "https://example.com/product"
57+
}
58+
}
59+
}

tests/test_items.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import pytest
2+
3+
from autoextract_poet.items import (
4+
Offer,
5+
Breadcrumb,
6+
Rating,
7+
AdditionalProperty,
8+
GTIN,
9+
Article,
10+
Product,
11+
)
12+
13+
from tests import load_fixture
14+
15+
example_product_result = load_fixture("sample_product.json")
16+
example_article_result = load_fixture("sample_article.json")
17+
18+
19+
@pytest.mark.parametrize(
    "cls, data",
    [(Offer, offer) for offer in example_product_result["product"]["offers"]] +  # type: ignore
    [(Breadcrumb, breadcrumb) for breadcrumb in example_product_result["product"]["breadcrumbs"]] +  # type: ignore
    [(AdditionalProperty, additionalProperty) for additionalProperty in example_product_result["product"]["additionalProperty"]] +  # type: ignore
    [(GTIN, gtin) for gtin in example_product_result["product"]["gtin"]] +  # type: ignore
    [(Rating, example_product_result["product"]["aggregateRating"])] +  # type: ignore
    [(Product, example_product_result["product"])] +  # type: ignore
    [(Article, example_article_result["article"])]  # type: ignore
)  # type: ignore
def test_item(cls, data):
    """Check that ``cls.from_dict`` round-trips fixture data and is strict.

    Each field of the built item must equal the fixture value (nested
    fields are compared against their item-object form), and unknown
    attributes/keys must be rejected.
    """
    # Nested fields are stored as item objects, so the raw fixture value is
    # converted the same way before comparing.
    converters = {
        'breadcrumbs': Breadcrumb.from_list,
        'offers': Offer.from_list,
        'additionalProperty': AdditionalProperty.from_list,
        'gtin': GTIN.from_list,
        'aggregateRating': Rating.from_dict,
    }

    item = cls.from_dict(data)
    for key, raw in data.items():
        convert = converters.get(key)
        expected = raw if convert is None else convert(raw)
        assert getattr(item, key) == expected

    # Slotted classes reject attributes that were not declared:
    # AttributeError: 'cls' object has no attribute 'foo'
    with pytest.raises(AttributeError):
        item.foo = "bar"

    # Unknown keyword passed straight to the constructor:
    # TypeError: __init__() got an unexpected argument 'foo'
    with pytest.raises(TypeError):
        cls(**data, foo="bar")

    # Unknown key passed through from_dict:
    # TypeError: __init__() got an unexpected argument 'foo'
    extended = {**data, "foo": "bar"}
    with pytest.raises(TypeError):
        cls.from_dict(extended)

tox.ini

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[tox]
2+
envlist = py36,py37,py38,mypy
3+
4+
[testenv]
5+
deps =
6+
pytest
7+
pytest-cov
8+
9+
commands =
10+
py.test \
11+
--cov-report=term --cov-report=html --cov-report= --cov=autoextract_poet \
12+
--doctest-modules \
13+
{posargs:autoextract_poet tests}
14+
15+
[testenv:mypy]
16+
deps =
17+
mypy==0.782
18+
19+
commands = mypy --ignore-missing-imports --no-warn-no-return autoextract_poet tests

0 commit comments

Comments
 (0)