The new business logic introduced in "redflagdeals..." (Hardeepex/scrapegost#7)


Started 7 months ago (probably frozen) using GPT-3.5


Calling Assistant API to generate plan

Code Snippets Found

This is based on the results of the Searching step.

docs/examples/tutorial/redflagdeals_scraper.py:4-39 
# Define the SchemaScraper for the main page and listings
listings_scraper = SchemaScraper(
    {
        "url": "url",
        "title": "str",
        "image": "str",
        "dealer": "str",
        "comments_count": "int",
    },
    extra_preprocessors=[CSS("div.list_item")],
)

# Define the SchemaScraper for the single deal pages
deal_scraper = SchemaScraper(
    {
        "title": "str",
        "url": "url",
        "price": "float",
        "regular_price": "float",
        "details": "str",
    },
    extra_preprocessors=[CSS("div.primary_content")],
)

# Scrape data from the website
response = listings_scraper("https://www.redflagdeals.com/deals/")
listings = response.data

deal_data = []
for listing in listings:
    response = deal_scraper(listing["url"])
    deal_data.append(response.data)

# Save the scraped data to a JSON file
with open("redflagdeals_data.json", "w") as f:
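
The excerpt covers only lines 4-39 of the file, so it cuts off mid-statement at the open() call. A plausible continuation, assuming the file serializes deal_data with the standard-library json module (whose import would sit in the unshown lines 1-3; the indent value is a guess):

import json

# Write the collected deal records out as JSON
with open("redflagdeals_data.json", "w") as f:
    json.dump(deal_data, f, indent=2)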
tests/test_scraper.py:4-38 
def test_tostr():
    html = "<html><body>ventura</body></html>"
    doc = lxml.html.fromstring(html)
    assert scrapers._tostr(doc) == html


def test_chunk_tags():
    html = [
        lxml.html.fromstring("<li>one</li>"),
        lxml.html.fromstring("<li>two</li>"),
        lxml.html.fromstring("<li>three is very long and will get its own spot</li>"),
        lxml.html.fromstring("<li>four</li>"),
        lxml.html.fromstring("<li>five</li>"),
    ]
    chunks = scrapers._chunk_tags(html, 12, "gpt-4")
    assert len(chunks) == 3
    assert "one" in chunks[0]
    assert "two" in chunks[0]
    assert "three" in chunks[1]
    assert "four" in chunks[2]
    assert "five" in chunks[2]


def test_parse_html():
    # spaces are collapsed
    html = "<span>    ventura</span>"
    doc = scrapers._parse_url_or_html(html)
    assert scrapers._tostr(doc) == "<span> ventura</span>"


def test_parse_url():
    # test that requests are made
    url = "https://www.example.com"
    doc = scrapers._parse_url_or_html(url)
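
test_parse_url is likewise truncated at line 38 of the file, just before its assertion. A hedged guess at the missing check, assuming the test verifies that the fetched page was actually parsed (the real assertion is not shown in the excerpt):

# example.com reliably serves a page containing "Example Domain"
assert "Example Domain" in scrapers._tostr(doc)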

Relevant Directory Tree

This is the subtree of the repository that contains the retrieved files, along with other potentially relevant files and directories.

docs/
  changelog.md
  examples/
    cli.log
    cli.sh
    episode_scraper_2.log
    pydantic_example
    pydantic_example.log
    pydantic_example.py
    tutorial/
      episode_scraper_1.log
      episode_scraper_1.py
      episode_scraper_2.log
      episode_scraper_2.py
      episode_scraper_3.log
      episode_scraper_3.py
      episode_scraper_4.log
      episode_scraper_4.py
      episode_scraper_5.log
      episode_scraper_5.py
      list_scraper_v1.py
      list_scraper_v2.log
      list_scraper_v2.py
      redflagdeals_scraper.py
      tutorial_final.py
    yoyodyne.py
  tutorial.md
  usage.md
tests/
  live/
    test_live.py
    test_live_adversarial.py
    test_live_cli.py
    test_live_pagination.py
    test_nudge_live.py
  test_apicall.py
  test_hallucination.py
  test_json_postprocessor.py
  test_pagination.py
  test_preprocessors.py
  test_pydantic.py
  test_schemascraper.py
  test_scraper.py
  test_utils.py
  testutils.py