bovine.clients.web

This module contains methods to parse a webpage and turn it into an ActivityPub object.

`RobotFileDeniesAccess`

Bases: Exception

Used to indicate that robots.txt does not allow the user agent to access the URL being queried.

Source code in bovine/bovine/clients/web/__init__.py

class RobotFileDeniesAccess(Exception):
    """Raised when the rules in robots.txt forbid the user agent from
    requesting the URL in question."""

`WebPage` `dataclass`

Class to capture loading webpages and transforming their content into objects more usable in the Fediverse.

Parameters:

Name	Type	Description	Default
`url`	`str`	URL of the webpage	required

Source code in bovine/bovine/clients/web/__init__.py

@dataclass
class WebPage:
    """Class to capture loading webpages and transforming their
    content into objects more usable in the Fediverse.

    :param url: URL of the webpage
    :param text: Body of the webpage; set by :meth:`fetch`
    :param linked_ld: JSON documents referenced through ``alternate``
        links of type ``application/ld+json``; appended to by
        :meth:`fetch` when called with ``fetch_linked_ld=True``"""

    url: str
    text: str | None = None
    linked_ld: list = field(default_factory=list)

    async def fetch(
        self, session: aiohttp.ClientSession | None = None, fetch_linked_ld=False
    ):
        """Fetches the webpage and stores its body in ``text``.

        :param session: Session used for all requests. If ``None``, a
            temporary session is created and closed afterwards.
        :param fetch_linked_ld: If true, also download the documents
            announced as ``alternate`` links of type
            ``application/ld+json`` and append them to ``linked_ld``.
        :raises RobotFileDeniesAccess: if robots.txt disallows the URL
        """
        if session is not None:
            await self._fetch_with_session(session, fetch_linked_ld=fetch_linked_ld)
            return
        async with aiohttp.ClientSession() as owned:
            await self._fetch_with_session(owned, fetch_linked_ld=fetch_linked_ld)

    async def _fetch_with_session(self, session, fetch_linked_ld):
        """Checks robots.txt, then downloads the page and, if requested,
        the linked JSON-LD documents using ``session``.

        :raises RobotFileDeniesAccess: if robots.txt disallows the URL
        """
        async with session.get(
            self.robots_url, headers={"user-agent": BOVINE_CLIENT_NAME}
        ) as response:
            robots = RobotFileParser()
            robots.parse((await response.text("utf-8")).split("\n"))

            if not robots.can_fetch(BOVINE_CLIENT_NAME, self.url):
                raise RobotFileDeniesAccess()
        async with session.get(
            self.url, headers={"accept": "text/html", "user-agent": BOVINE_CLIENT_NAME}
        ) as response:
            self.text = await response.text("utf-8")
            if not fetch_linked_ld:
                return

            # Without the default, getall() raises KeyError when the
            # response announces no rel="alternate" link at all.
            alternates = response.links.getall("alternate", [])

            for target in self._linked_ld_urls(alternates):
                # A distinct name keeps the page response from being shadowed.
                async with session.get(
                    target,
                    headers={
                        "accept": "application/ld+json",
                        "user-agent": BOVINE_CLIENT_NAME,
                    },
                ) as linked_response:
                    self.linked_ld.append(await linked_response.json())

    @staticmethod
    def _linked_ld_urls(links) -> list[str]:
        """Returns the URLs (as strings) of those links whose ``type``
        is ``application/ld+json``; links without a ``type`` are skipped."""
        return [
            str(link["url"])
            for link in links
            if link.get("type") == "application/ld+json"
        ]

    @staticmethod
    def _as_int(raw) -> int | None:
        """Converts ``raw`` to an int; returns None if that is not possible."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    @cached_property
    def soup(self):
        """The page ``text`` parsed by BeautifulSoup with the lxml parser.

        The result is cached, so ``text`` needs to be set (e.g. through
        :meth:`fetch`) before the first access."""
        return BeautifulSoup(self.text, features="lxml")

    @cached_property
    def jsonld(self) -> dict | list:
        """List of the json-ld documents found for the page: first the
        content of the ``<script type="application/ld+json">`` tags
        (empty results are dropped), then the entries of ``linked_ld``.

        Usage for json-ld contained in a page

        ```python
        page = WebPage(
            "https://www.allrecipes.com/recipe/263822/pasta-alla-norma-eggplant-pasta/"
        )
        await page.fetch()
        print(page.jsonld[0][0])
        ```

        For json-ld contained in the link header

        ```python
        page = WebPage('https://www.wikidata.org/wiki/Q76')
        await page.fetch(fetch_linked_ld=True)
        print(page.jsonld[0][0])
        ```
        """
        scripts = self.soup.find_all("script", attrs={"type": "application/ld+json"})

        embedded = [safe_json_loads(tag.text) for tag in scripts]

        return [x for x in embedded if x] + self.linked_ld

    def meta_content_for_property(self, value: str) -> str | None:
        """Returns the ``content`` attribute of the first ``<meta>`` tag
        whose ``property`` equals ``value``, or None if there is no such tag.

        :param value: value of the ``property`` attribute, e.g. ``og:title``
        """
        tag = self.soup.find("meta", attrs={"property": value})
        if tag:
            return tag.get("content")
        return None

    def meta_content_for_property_int(self, value: str) -> int | None:
        """Like :meth:`meta_content_for_property`, with the content
        converted to an int.

        :param value: value of the ``property`` attribute,
            e.g. ``og:image:width``
        :return: the integer content; None if the tag is missing, has no
            ``content`` attribute, or the content is not an integer
        """
        tag = self.soup.find("meta", attrs={"property": value})
        if tag:
            # Pages are untrusted input: a missing or malformed content
            # attribute yields None instead of raising.
            return self._as_int(tag.get("content"))
        return None

    @cached_property
    def open_graph_page(self) -> dict:
        """Creates an ActivityPub Page object from the Open Graph data

        The ``og:image*`` properties are turned into an Image object that
        is attached as the icon of the page. The ``source`` of the page
        refers to ``url`` with media type ``text/html``."""
        image = Object(
            type="Image",
            url=self.meta_content_for_property("og:image"),
            name=self.meta_content_for_property("og:image:alt"),
            height=self.meta_content_for_property_int("og:image:height"),
            width=self.meta_content_for_property_int("og:image:width"),
            media_type=self.meta_content_for_property("og:image:type"),
        )
        page = Object(
            type="Page",
            name=self.meta_content_for_property("og:title"),
            url=self.meta_content_for_property("og:url"),
            summary=self.meta_content_for_property("og:description"),
            source={"url": self.url, "mediaType": "text/html"},
        )

        page.icon = image.build()

        return with_activitystreams_context(page.build())

    @cached_property
    def robots_url(self):
        """URL of the robots.txt file: scheme and host of ``url``
        followed by ``/robots.txt``."""
        parsed = urlparse(self.url)
        return f"{parsed.scheme}://{parsed.netloc}/robots.txt"

`jsonld: dict | list` `cached` `property`

Usage for json-ld contained in a page

page = WebPage(
    "https://www.allrecipes.com/recipe/263822/pasta-alla-norma-eggplant-pasta/"
)
await page.fetch()
print(page.jsonld[0][0])

For json-ld contained in the link header

page = WebPage('https://www.wikidata.org/wiki/Q76')
await page.fetch(fetch_linked_ld=True)
print(page.jsonld[0][0])

`open_graph_page: dict` `cached` `property`

Creates an ActivityPub Page object from the Open Graph data

`fetch(session=None, fetch_linked_ld=False)` `async`

Fetches the webpage and transforms its content using BeautifulSoup

Source code in bovine/bovine/clients/web/__init__.py

async def fetch(
    self, session: aiohttp.ClientSession | None = None, fetch_linked_ld=False
):
    """Downloads the webpage through ``session``.

    When no session is passed, a temporary ``aiohttp.ClientSession``
    is opened for the download and closed again afterwards."""
    if session is not None:
        await self._fetch_with_session(session, fetch_linked_ld=fetch_linked_ld)
        return
    async with aiohttp.ClientSession() as owned:
        await self._fetch_with_session(owned, fetch_linked_ld=fetch_linked_ld)

bovine.clients.web

RobotFileDeniesAccess

WebPage dataclass

jsonld: dict | list cached property

open_graph_page: dict cached property

fetch(session=None, fetch_linked_ld=False) async

`RobotFileDeniesAccess`

`WebPage` `dataclass`

`jsonld: dict | list` `cached` `property`

`open_graph_page: dict` `cached` `property`

`fetch(session=None, fetch_linked_ld=False)` `async`