Skip to content

Commit d2fc277

Browse files
authored
Merge pull request #2 from scrapinghub/page-inputs
Add Page Inputs
2 parents 2915e0a + 2b3bf60 commit d2fc277

File tree

6 files changed

+226
-118
lines changed

6 files changed

+226
-118
lines changed

autoextract_poet/page_inputs.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from typing import ClassVar, Generic, Optional, TypeVar
2+
3+
import attr
4+
5+
from autoextract_poet.items import (
6+
Article,
7+
Item,
8+
Product,
9+
)
10+
11+
# Type variable for the item type of a data container; bounded by Item.
T = TypeVar("T", bound=Item)
12+
13+
14+
@attr.s(auto_attribs=True)
class _AutoExtractData(Generic[T]):
    """Base container for AutoExtract data.

    Providers should not use this class directly; they should use a
    derived class such as AutoExtractArticleData instead.

    API responses are wrapped in a JSON array (this is to facilitate
    query batching), but a single response is what is received here.

    https://doc.scrapinghub.com/autoextract.html#responses
    """

    # Key of ``data`` that holds the item payload; set by derived classes.
    item_key: ClassVar[str]

    # Response data, as a dict.
    data: dict

    @property
    def item_class(self):
        """Return the type argument of the first parametrized base class."""
        parametrized_base = self.__orig_bases__[0]
        return parametrized_base.__args__[0]

    def to_item(self) -> Optional[T]:
        """Build an item from ``data[item_key]`` via ``item_class.from_dict``."""
        build_item = self.item_class.from_dict
        payload = self.data[self.item_key]
        return build_item(payload)
38+
39+
40+
@attr.s(auto_attribs=True)
class AutoExtractArticleData(_AutoExtractData[Article]):
    """Container for AutoExtract Article data.

    Parametrizes the base container with the Article item type and sets
    "article" as the key under which the item payload is looked up.

    https://doc.scrapinghub.com/autoextract/article.html
    """

    item_key = "article"
48+
49+
50+
@attr.s(auto_attribs=True)
class AutoExtractProductData(_AutoExtractData[Product]):
    """Container for AutoExtract Product data.

    Parametrizes the base container with the Product item type and sets
    "product" as the key under which the item payload is looked up.

    https://doc.scrapinghub.com/autoextract/product.html
    """

    item_key = "product"

tests/__init__.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
import json
22
import os
33

4+
from autoextract_poet.items import (
5+
AdditionalProperty,
6+
Breadcrumb,
7+
Item,
8+
GTIN,
9+
Offer,
10+
Rating,
11+
)
12+
413

514
def load_fixture(name):
615
path = os.path.join(
@@ -9,3 +18,23 @@ def load_fixture(name):
918
)
1019
with open(path, 'r') as f:
1120
return json.loads(f.read())
21+
22+
23+
def item_equals_dict(item: Item, data: dict) -> bool:
    """Return True when each entry of data equals the matching item attribute.

    Values stored under the keys additionalProperty, aggregateRating,
    breadcrumbs, gtin and offers are first converted with the matching
    class's ``from_list``/``from_dict`` and then compared; all other
    values are compared as they are. Returns False at the first mismatch.
    """

    def expected_value(name, raw):
        # Turn nested raw structures into the objects the item is compared to.
        if name == 'additionalProperty':
            return AdditionalProperty.from_list(raw)
        if name == 'aggregateRating':
            return Rating.from_dict(raw)
        if name == 'breadcrumbs':
            return Breadcrumb.from_list(raw)
        if name == 'gtin':
            return GTIN.from_list(raw)
        if name == 'offers':
            return Offer.from_list(raw)
        return raw

    for name, raw in data.items():
        expected = expected_value(name, raw)
        if getattr(item, name) != expected:
            return False

    return True

tests/fixtures/sample_article.json

Lines changed: 51 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,57 @@
1-
{
1+
[
2+
{
23
"article": {
3-
"headline": "Article headline",
4-
"datePublished": "2019-06-19T00:00:00",
5-
"datePublishedRaw": "June 19, 2019",
6-
"dateModified": "2019-06-21T00:00:00",
7-
"dateModifiedRaw": "June 21, 2019",
8-
"author": "Article author",
9-
"authorsList": [
10-
"Article author"
11-
],
12-
"inLanguage": "en",
13-
"breadcrumbs": [
14-
{
15-
"name": "Level 1",
16-
"link": "http://example.com"
17-
}
18-
],
19-
"mainImage": "http://example.com/image.png",
20-
"images": [
21-
"http://example.com/image.png"
22-
],
23-
"description": "Article summary",
24-
"articleBody": "Article body ...",
25-
"articleBodyHtml": "<article><p>Article body ... </p> ... </article>",
26-
"articleBodyRaw": "<div id=\"an-article\">Article body ...",
27-
"videoUrls": [
28-
"https://example.com/video.mp4"
29-
],
30-
"audioUrls": [
31-
"https://example.com/audio.mp3"
32-
],
33-
"probability": 0.95,
34-
"canonicalUrl": "https://example.com/article/article-about-something",
35-
"url": "https://example.com/article?id=24"
4+
"headline": "Article headline",
5+
"datePublished": "2019-06-19T00:00:00",
6+
"datePublishedRaw": "June 19, 2019",
7+
"dateModified": "2019-06-21T00:00:00",
8+
"dateModifiedRaw": "June 21, 2019",
9+
"author": "Article author",
10+
"authorsList": [
11+
"Article author"
12+
],
13+
"inLanguage": "en",
14+
"breadcrumbs": [
15+
{
16+
"name": "Level 1",
17+
"link": "http://example.com"
18+
}
19+
],
20+
"mainImage": "http://example.com/image.png",
21+
"images": [
22+
"http://example.com/image.png"
23+
],
24+
"description": "Article summary",
25+
"articleBody": "Article body ...",
26+
"articleBodyHtml": "<article><p>Article body ... </p> ... </article>",
27+
"articleBodyRaw": "<div id=\"an-article\">Article body ...",
28+
"videoUrls": [
29+
"https://example.com/video.mp4"
30+
],
31+
"audioUrls": [
32+
"https://example.com/audio.mp3"
33+
],
34+
"probability": 0.95,
35+
"canonicalUrl": "https://example.com/article/article-about-something",
36+
"url": "https://example.com/article?id=24"
3637
},
3738
"webPage": {
38-
"inLanguages": [
39-
{"code": "en"},
40-
{"code": "es"}
41-
]
39+
"inLanguages": [
40+
{
41+
"code": "en"
42+
},
43+
{
44+
"code": "es"
45+
}
46+
]
4247
},
4348
"query": {
44-
"id": "1564747029122-9e02a1868d70b7a3",
45-
"domain": "example.com",
46-
"userQuery": {
47-
"pageType": "article",
48-
"url": "http://example.com/article?id=24"
49-
}
49+
"id": "1564747029122-9e02a1868d70b7a3",
50+
"domain": "example.com",
51+
"userQuery": {
52+
"pageType": "article",
53+
"url": "http://example.com/article?id=24"
54+
}
5055
}
51-
}
56+
}
57+
]

tests/fixtures/sample_product.json

Lines changed: 59 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,65 @@
1-
{
1+
[
2+
{
23
"product": {
3-
"name": "Product name",
4-
"offers": [
5-
{
6-
"price": "42",
7-
"currency": "USD",
8-
"availability": "InStock"
9-
}
10-
],
11-
"sku": "product sku",
12-
"mpn": "product mpn",
13-
"gtin": [
14-
{
15-
"type": "ean13",
16-
"value": "978-3-16-148410-0"
17-
}
18-
],
19-
"brand": "product brand",
20-
"breadcrumbs": [
21-
{
22-
"name": "Level 1",
23-
"link": "http://example.com"
24-
}
25-
],
26-
"mainImage": "http://example.com/image.png",
27-
"images": [
28-
"http://example.com/image.png"
29-
],
30-
"description": "product description",
31-
"aggregateRating": {
32-
"ratingValue": 4.5,
33-
"bestRating": 5.0,
34-
"reviewCount": 31
35-
},
36-
"additionalProperty": [
37-
{
38-
"name": "property 1",
39-
"value": "value of property 1"
40-
}
41-
],
42-
"probability": 0.95,
43-
"url": "https://example.com/product"
4+
"name": "Product name",
5+
"offers": [
6+
{
7+
"price": "42",
8+
"currency": "USD",
9+
"availability": "InStock"
10+
}
11+
],
12+
"sku": "product sku",
13+
"mpn": "product mpn",
14+
"gtin": [
15+
{
16+
"type": "ean13",
17+
"value": "978-3-16-148410-0"
18+
}
19+
],
20+
"brand": "product brand",
21+
"breadcrumbs": [
22+
{
23+
"name": "Level 1",
24+
"link": "http://example.com"
25+
}
26+
],
27+
"mainImage": "http://example.com/image.png",
28+
"images": [
29+
"http://example.com/image.png"
30+
],
31+
"description": "product description",
32+
"aggregateRating": {
33+
"ratingValue": 4.5,
34+
"bestRating": 5.0,
35+
"reviewCount": 31
36+
},
37+
"additionalProperty": [
38+
{
39+
"name": "property 1",
40+
"value": "value of property 1"
41+
}
42+
],
43+
"probability": 0.95,
44+
"url": "https://example.com/product"
4445
},
4546
"webPage": {
46-
"inLanguages": [
47-
{"code": "en"},
48-
{"code": "es"}
49-
]
47+
"inLanguages": [
48+
{
49+
"code": "en"
50+
},
51+
{
52+
"code": "es"
53+
}
54+
]
5055
},
5156
"query": {
52-
"id": "1564747029122-9e02a1868d70b7a2",
53-
"domain": "example.com",
54-
"userQuery": {
55-
"pageType": "product",
56-
"url": "https://example.com/product"
57-
}
57+
"id": "1564747029122-9e02a1868d70b7a2",
58+
"domain": "example.com",
59+
"userQuery": {
60+
"pageType": "product",
61+
"url": "https://example.com/product"
62+
}
5863
}
59-
}
64+
}
65+
]

tests/test_items.py

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
import pytest
22

33
from autoextract_poet.items import (
4-
Offer,
5-
Breadcrumb,
6-
Rating,
74
AdditionalProperty,
8-
GTIN,
95
Article,
6+
Breadcrumb,
7+
GTIN,
8+
Offer,
109
Product,
10+
Rating,
1111
)
1212

13-
from tests import load_fixture
13+
from tests import load_fixture, item_equals_dict
1414

15-
example_product_result = load_fixture("sample_product.json")
16-
example_article_result = load_fixture("sample_article.json")
15+
example_article_result = load_fixture("sample_article.json")[0]
16+
example_product_result = load_fixture("sample_product.json")[0]
1717

1818

1919
@pytest.mark.parametrize(
@@ -28,19 +28,7 @@
2828
) # type: ignore
2929
def test_item(cls, data):
3030
item = cls.from_dict(data)
31-
for key, value in data.items():
32-
if key == 'breadcrumbs':
33-
value = Breadcrumb.from_list(value)
34-
if key == 'offers':
35-
value = Offer.from_list(value)
36-
if key == 'additionalProperty':
37-
value = AdditionalProperty.from_list(value)
38-
if key == 'gtin':
39-
value = GTIN.from_list(value)
40-
if key == 'aggregateRating':
41-
value = Rating.from_dict(value)
42-
43-
assert getattr(item, key) == value
31+
assert item_equals_dict(item, data)
4432

4533
# AttributeError: 'cls' object has no attribute 'foo'
4634
with pytest.raises(AttributeError):

tests/test_page_inputs.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import pytest
2+
3+
from autoextract_poet.page_inputs import (
4+
AutoExtractArticleData,
5+
AutoExtractProductData,
6+
)
7+
8+
from tests import load_fixture, item_equals_dict
9+
10+
# Parsed article fixture; the test below uses its first element.
example_article_result = load_fixture("sample_article.json")
11+
# Parsed product fixture; the test below uses its first element.
example_product_result = load_fixture("sample_product.json")
12+
13+
14+
@pytest.mark.parametrize("cls, results", [
    (AutoExtractArticleData, example_article_result),
    (AutoExtractProductData, example_product_result),
])
def test_response_data(cls, results):
    """Check that a page input built from a response yields a matching item."""
    first_response = results[0]
    page_input = cls(first_response)

    item = page_input.to_item()

    # The item must be of the class the page input is parametrized with ...
    assert isinstance(item, page_input.item_class)
    # ... and must match the payload stored under the class's item key.
    assert item_equals_dict(item, first_response[cls.item_key])

0 commit comments

Comments
 (0)