The file redflagdeals_scraper.py introduces new bu… (sentence truncated in extraction) — see Hardeepex/scrapegost#7.
Calling Assistant API to generate plan ✓
Code Snippets Found
This is based on the results of the Searching step.
docs/examples/tutorial/redflagdeals_scraper.py:4-39
4
# Schema for rows on the deals index page; the CSS preprocessor narrows
# extraction to each listing container.
listings_scraper = SchemaScraper(
    {
        "url": "url",
        "title": "str",
        "image": "str",
        "dealer": "str",
        "comments_count": "int",
    },
    extra_preprocessors=[CSS("div.list_item")],
)

# Schema for an individual deal page; only the primary content pane is fed
# to the scraper.
deal_scraper = SchemaScraper(
    {
        "title": "str",
        "url": "url",
        "price": "float",
        "regular_price": "float",
        "details": "str",
    },
    extra_preprocessors=[CSS("div.primary_content")],
)

# Fetch the listings index, then follow each listing's URL and collect the
# per-deal details.
listings = listings_scraper("https://www.redflagdeals.com/deals/").data
deal_data = [deal_scraper(listing["url"]).data for listing in listings]
37
38# Save the scraped data to a JSON file
39with open("redflagdeals_data.json", "w") as f:
tests/test_scraper.py:4-38
4
def test_tostr():
    # Serializing a parsed document should reproduce the input markup.
    fragment = "<html><body>ventura</body></html>"
    tree = lxml.html.fromstring(fragment)
    assert scrapers._tostr(tree) == fragment
9
10
def test_chunk_tags():
    # Five <li> tags chunked under a 12-token budget for "gpt-4": the long
    # "three..." tag cannot share a chunk, so it ends up alone in the middle.
    markup = [
        "<li>one</li>",
        "<li>two</li>",
        "<li>three is very long and will get its own spot</li>",
        "<li>four</li>",
        "<li>five</li>",
    ]
    tags = [lxml.html.fromstring(m) for m in markup]
    chunks = scrapers._chunk_tags(tags, 12, "gpt-4")
    assert len(chunks) == 3
    for word, chunk_idx in (
        ("one", 0),
        ("two", 0),
        ("three", 1),
        ("four", 2),
        ("five", 2),
    ):
        assert word in chunks[chunk_idx]
26
27
def test_parse_html():
    # A raw HTML fragment is parsed directly (no URL fetch); the leading
    # space inside the span survives the round-trip unchanged.
    fragment = "<span> ventura</span>"
    parsed = scrapers._parse_url_or_html(fragment)
    assert scrapers._tostr(parsed) == "<span> ventura</span>"
33
34
35def test_parse_url():
36 # test that requests are made
37 url = "https://www.example.com"
38 doc = scrapers._parse_url_or_html(url)
tests/test_schemascraper.py:5-41
5
def test_apply_preprocessors_default():
    # With no extra preprocessors, the pipeline drops the <script> tag and
    # returns the remaining content as a single wrapped node.
    doc = lxml.html.fromstring(
        "<html><script src='example.js' /><body>"
        "<span>1</span><span>2</span><span>3</span>"
        "</body></html>"
    )
    scraper = SchemaScraper({})
    nodes = scraper._apply_preprocessors(doc, [])
    assert len(nodes) == 1
    expected = "<div><body><span>1</span><span>2</span><span>3</span></body></div>"
    assert _tostr(nodes[0]) == expected
19
20
def test_apply_preprocessors_constructor():
    # A CSS("span") preprocessor supplied via the constructor selects each
    # <span> as its own node.
    doc = lxml.html.fromstring(
        "<html><script src='example.js' /><body>"
        "<span>1</span><span>2</span><span>3</span>"
        "</body></html>"
    )
    scraper = SchemaScraper({}, extra_preprocessors=[CSS("span")])
    nodes = scraper._apply_preprocessors(doc, [])
    assert len(nodes) == 3
    assert _tostr(nodes[0]) == "<span>1</span>"
31
32
33def test_apply_preprocessors_extra():
34 html = lxml.html.fromstring(
35 "<html><script src='example.js' /><body>"
36 "<span>1</span><span>2</span><span>3</span>"
37 "</body></html>"
38 )
39 schema = SchemaScraper({})
40 nodes = schema._apply_preprocessors(html, [CSS("span")])
41 assert len(nodes) == 3