Sweep
The recent commit introduced new business logic in... (Hardeepex/scrapegost#7)


Started 7 months ago (probably frozen) using GPT-3.5


Calling Assistant API to generate plan

Code Snippets Found

This is based on the results of the Searching step.

docs/examples/tutorial/redflagdeals_scraper.py:4-39 
# Define the SchemaScraper for the main page and listings
listings_scraper = SchemaScraper(
    {
        "url": "url",
        "title": "str",
        "image": "str",
        "dealer": "str",
        "comments_count": "int",
    },
    extra_preprocessors=[CSS("div.list_item")],
)

# Define the SchemaScraper for the single deal pages
deal_scraper = SchemaScraper(
    {
        "title": "str",
        "url": "url",
        "price": "float",
        "regular_price": "float",
        "details": "str",
    },
    extra_preprocessors=[CSS("div.primary_content")],
)

# Scrape data from the website
response = listings_scraper("https://www.redflagdeals.com/deals/")
listings = response.data

deal_data = []
for listing in listings:
    response = deal_scraper(listing["url"])
    deal_data.append(response.data)

# Save the scraped data to a JSON file
with open("redflagdeals_data.json", "w") as f:
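
The snippet is cut off at file line 39, so the body of the `with open(...)` block is not shown (nor are the imports above line 4). A plausible completion, offered only as a hedged sketch of how the file likely ends, would serialize `deal_data` with `json.dump`:

# Hedged sketch of the unseen remainder of the file; `json` is assumed
# to be imported alongside SchemaScraper and CSS in the truncated header.
import json

with open("redflagdeals_data.json", "w") as f:
    json.dump(deal_data, f, indent=2)  # exact dump arguments are a guess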
docs/changelog.md:9-32 
## 0.5.1 - 2023-06-13

* Improve type annotations and remove some ignored errors.
* Support for new OpenAI models announced June 13th 2023.
* Improved support for model fallbacks. Now if a request has 6k tokens and the model list looks like `['gpt-3.5-turbo', 'gpt-3.5-turbo-16k']`, the 16k model will be used automatically since the default 4k model will not be able to handle the request.

## 0.5.0 - 2023-06-06

* Restore `PaginatedSchemaScraper` and add documentation for pagination.
* Documentation improvements.
* Small quality-of-life improvements such as better `pydantic` schema support and
  more useful error messages.

## 0.4.4 - 2023-03-31

* Deactivate `HallucinationCheck` by default, it is overly aggressive and needs more work to be useful without raising false positives.
* Bugfix for postprocessors parameter behavior not overriding defaults.

## 0.4.2 - 2023-03-26

* Fix type bug with JSON nudging.
* Improve `HallucinationCheck` to handle more cases.
* More tests!
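
The model-fallback behavior noted under 0.5.1 is easiest to see in code. A minimal sketch, assuming the `models` keyword that the changelog's own example implies:

# Sketch of the 0.5.1 fallback: with both models listed, a request too
# large for gpt-3.5-turbo's 4k context is routed to the 16k variant.
from scrapeghost import SchemaScraper  # import path assumed

scraper = SchemaScraper(
    {"title": "str", "body": "str"},
    models=["gpt-3.5-turbo", "gpt-3.5-turbo-16k"],  # `models` kwarg assumed
)
# A ~6k-token page exceeds the first model's context, so the scraper
# would fall back to gpt-3.5-turbo-16k for that request automatically.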
tests/test_json_postprocessor.py:8-48 
def test_json_already_processed():
    # already processed
    jpp = JSONPostprocessor()
    r = Response(data={})
    with pytest.raises(PostprocessingError):
        jpp(r, scraper=SchemaScraper({"name": "string"}))


def test_json_no_nudge():
    # single quotes, trailing commas
    bad_json = "{'name': 'phil', }"
    jpp = JSONPostprocessor(nudge=False)
    r = Response(data=bad_json)
    with pytest.raises(InvalidJSON):
        jpp(r, scraper=SchemaScraper({"name": "string"}))
    assert "False" in str(jpp)


def test_json_nudge():
    # single quotes, trailing commas
    bad_json = "{'name': 'phil', }"
    jpp = JSONPostprocessor(nudge=True)
    r = Response(data=bad_json)
    with patch_create() as create:
        create.side_effect = lambda **kwargs: _mock_response(content='{"name": "phil"}')
        repaired = jpp(r, scraper=SchemaScraper({"name": "string"}))
    assert len(repaired.api_responses) == 1
    assert repaired.data == {"name": "phil"}


def test_nudge_fails():
    # single quotes, trailing commas
    bad_json = "{'name': 'phil', }"
    jpp = JSONPostprocessor(nudge=True)
    r = Response(data=bad_json)
    with pytest.raises(InvalidJSON):
        with patch_create() as create:
            create.side_effect = lambda **kwargs: _mock_response(
                content='{"name": "phil'
            )
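
The last test is cut off at file line 48, before its closing call and assertion. Outside the test harness, the nudge flow these tests exercise would look roughly like the sketch below; the import paths are assumptions, since the file's own imports (lines 1-7) are not in the retrieved range:

# Minimal sketch of the nudge flow, mirroring the tests above.
from scrapeghost import SchemaScraper  # import path assumed
from scrapeghost.postprocessors import JSONPostprocessor  # path assumed
from scrapeghost.responses import Response  # path assumed

scraper = SchemaScraper({"name": "string"})
jpp = JSONPostprocessor(nudge=True)

# Malformed JSON (single quotes, trailing comma), as in the tests.
r = Response(data="{'name': 'phil', }")

# With nudge=True the postprocessor asks the model to repair the JSON;
# on success .data holds the parsed dict, otherwise InvalidJSON is raised.
repaired = jpp(r, scraper=scraper)
print(repaired.data)  # {'name': 'phil'} if the nudge succeeds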
tests/test_preprocessors.py:5-29 
def test_clean_html():
    doc = lxml.html.fromstring(
        "<html><body style='background: blue;'><script>alert('hello')</script>"
        "<noscript>here</noscript></body></html>"
    )
    tags = CleanHTML()(doc)
    assert len(tags) == 1
    doc = tags[0]
    assert _tostr(doc) == "<div><body><noscript>here</noscript></body></div>"


def test_select_tags_css():
    doc = lxml.html.fromstring(
        "<html><body><p>one</p><p>two</p><p>three</p></body></html>"
    )
    tags = CSS("p")(doc)
    assert len(tags) == 3


def test_select_tags_xpath():
    doc = lxml.html.fromstring(
        "<html><body><p>one</p><p>two</p><p>three</p></body></html>"
    )
    tags = XPath("//p")(doc)
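
The final assertion of test_select_tags_xpath falls outside the retrieved range. What both selector tests verify is that `CSS` and `XPath` preprocessors are callables mapping an lxml node to a list of matching nodes; a small sketch (import path assumed):

# CSS and XPath select the same three <p> nodes from this document.
import lxml.html
from scrapeghost import CSS, XPath  # import path assumed

doc = lxml.html.fromstring(
    "<html><body><p>one</p><p>two</p><p>three</p></body></html>"
)
assert len(CSS("p")(doc)) == 3
assert len(XPath("//p")(doc)) == 3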
tests/test_schemascraper.py:5-41 
def test_apply_preprocessors_default():
    html = lxml.html.fromstring(
        "<html><script src='example.js' /><body>"
        "<span>1</span><span>2</span><span>3</span>"
        "</body></html>"
    )
    schema = SchemaScraper({})
    nodes = schema._apply_preprocessors(html, [])
    assert len(nodes) == 1
    assert (
        _tostr(nodes[0])
        == "<div><body><span>1</span><span>2</span><span>3</span></body></div>"
    )


def test_apply_preprocessors_constructor():
    html = lxml.html.fromstring(
        "<html><script src='example.js' /><body>"
        "<span>1</span><span>2</span><span>3</span>"
        "</body></html>"
    )
    schema = SchemaScraper({}, extra_preprocessors=[CSS("span")])
    nodes = schema._apply_preprocessors(html, [])
    assert len(nodes) == 3
    assert _tostr(nodes[0]) == "<span>1</span>"


def test_apply_preprocessors_extra():
    html = lxml.html.fromstring(
        "<html><script src='example.js' /><body>"
        "<span>1</span><span>2</span><span>3</span>"
        "</body></html>"
    )
    schema = SchemaScraper({})
    nodes = schema._apply_preprocessors(html, [CSS("span")])
    assert len(nodes) == 3
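
Taken together, the three tests describe the preprocessing pipeline: a default cleaning pass always runs (stripping `<script>` and wrapping the result), and extra preprocessors can be supplied either in the constructor or per call. A condensed sketch of that contract, mirroring the tests' use of the private helper:

# Each call gets a freshly parsed tree, as in the tests, since cleaning
# may mutate the document in place.
import lxml.html
from scrapeghost import SchemaScraper, CSS  # import path assumed

def fresh():
    return lxml.html.fromstring(
        "<html><script src='example.js' /><body>"
        "<span>1</span><span>2</span><span>3</span>"
        "</body></html>"
    )

# Default pipeline: cleaning only, one wrapped node comes back.
assert len(SchemaScraper({})._apply_preprocessors(fresh(), [])) == 1

# Constructor-level extras split the page into the three <span> nodes...
scraper = SchemaScraper({}, extra_preprocessors=[CSS("span")])
assert len(scraper._apply_preprocessors(fresh(), [])) == 3

# ...and per-call extras behave the same way.
assert len(SchemaScraper({})._apply_preprocessors(fresh(), [CSS("span")])) == 3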

Relevant Directory Tree

This is the subtree of the repository that contains the retrieved files, along with other potentially relevant files and directories.

docs/
  changelog.md
  examples/
    tutorial/
      episode_scraper_1.log
      episode_scraper_1.py
      episode_scraper_2.log
      episode_scraper_2.py
      episode_scraper_3.log
      episode_scraper_3.py
      episode_scraper_4.log
      episode_scraper_4.py
      episode_scraper_5.log
      episode_scraper_5.py
      list_scraper_v1.py
      list_scraper_v2.log
      list_scraper_v2.py
      redflagdeals_scraper.py
      tutorial_final.py
  openai.md
  tutorial.md
  usage.md
tests/
  live/
    test_live.py
    test_live_adversarial.py
    test_live_cli.py
    test_live_pagination.py
    test_nudge_live.py
  test_apicall.py
  test_hallucination.py
  test_json_postprocessor.py
  test_pagination.py
  test_preprocessors.py
  test_pydantic.py
  test_schemascraper.py
  test_scraper.py
  test_utils.py
  testutils.py