The file redflagdeals_scraper.py introduces new bu… (sentence truncated in extraction) — see Hardeepex/scrapegost#7.
Calling Assistant API to generate plan ✓
Code Snippets Found
This is based on the results of the Searching step.
docs/examples/tutorial/redflagdeals_scraper.py:4-39
4
# Schema for rows on the deals index page; the CSS preprocessor narrows
# extraction to each listing container.
listings_scraper = SchemaScraper(
    {
        "url": "url",
        "title": "str",
        "image": "str",
        "dealer": "str",
        "comments_count": "int",
    },
    extra_preprocessors=[CSS("div.list_item")],
)

# Schema for an individual deal page; only the primary content pane is fed
# to the scraper.
deal_scraper = SchemaScraper(
    {
        "title": "str",
        "url": "url",
        "price": "float",
        "regular_price": "float",
        "details": "str",
    },
    extra_preprocessors=[CSS("div.primary_content")],
)

# Fetch the listings index, then follow each listing's URL and collect the
# per-deal details.
listings = listings_scraper("https://www.redflagdeals.com/deals/").data
deal_data = [deal_scraper(listing["url"]).data for listing in listings]
37
38# Save the scraped data to a JSON file
39with open("redflagdeals_data.json", "w") as f:
tests/test_scraper.py:4-38
4
def test_tostr():
    # Serializing a parsed document should reproduce the input markup.
    fragment = "<html><body>ventura</body></html>"
    tree = lxml.html.fromstring(fragment)
    assert scrapers._tostr(tree) == fragment
9
10
def test_chunk_tags():
    # Five <li> tags chunked under a 12-token budget for "gpt-4": the long
    # "three..." tag cannot share a chunk, so it ends up alone in the middle.
    markup = [
        "<li>one</li>",
        "<li>two</li>",
        "<li>three is very long and will get its own spot</li>",
        "<li>four</li>",
        "<li>five</li>",
    ]
    tags = [lxml.html.fromstring(m) for m in markup]
    chunks = scrapers._chunk_tags(tags, 12, "gpt-4")
    assert len(chunks) == 3
    for word, chunk_idx in (
        ("one", 0),
        ("two", 0),
        ("three", 1),
        ("four", 2),
        ("five", 2),
    ):
        assert word in chunks[chunk_idx]
26
27
def test_parse_html():
    # A raw HTML fragment is parsed directly (no URL fetch); the leading
    # space inside the span survives the round-trip unchanged.
    fragment = "<span> ventura</span>"
    parsed = scrapers._parse_url_or_html(fragment)
    assert scrapers._tostr(parsed) == "<span> ventura</span>"
33
34
35def test_parse_url():
36 # test that requests are made
37 url = "https://www.example.com"
38 doc = scrapers._parse_url_or_html(url)
tests/test_schemascraper.py:5-41
5
def test_apply_preprocessors_default():
    # With no extra preprocessors, the pipeline drops the <script> tag and
    # returns the remaining content as a single wrapped node.
    doc = lxml.html.fromstring(
        "<html><script src='example.js' /><body>"
        "<span>1</span><span>2</span><span>3</span>"
        "</body></html>"
    )
    scraper = SchemaScraper({})
    nodes = scraper._apply_preprocessors(doc, [])
    assert len(nodes) == 1
    expected = "<div><body><span>1</span><span>2</span><span>3</span></body></div>"
    assert _tostr(nodes[0]) == expected
19
20
def test_apply_preprocessors_constructor():
    # A CSS("span") preprocessor supplied via the constructor selects each
    # <span> as its own node.
    doc = lxml.html.fromstring(
        "<html><script src='example.js' /><body>"
        "<span>1</span><span>2</span><span>3</span>"
        "</body></html>"
    )
    scraper = SchemaScraper({}, extra_preprocessors=[CSS("span")])
    nodes = scraper._apply_preprocessors(doc, [])
    assert len(nodes) == 3
    assert _tostr(nodes[0]) == "<span>1</span>"
31
32
33def test_apply_preprocessors_extra():
34 html = lxml.html.fromstring(
35 "<html><script src='example.js' /><body>"
36 "<span>1</span><span>2</span><span>3</span>"
37 "</body></html>"
38 )
39 schema = SchemaScraper({})
40 nodes = schema._apply_preprocessors(html, [CSS("span")])
41 assert len(nodes) == 3