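"""GitBook document loader: load a single GitBook page or all pages listed in a sitemap."""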
import warnings
from typing import Any, AsyncIterator, Iterator, List, Optional, Set, Union
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.web_base import WebBaseLoader


class GitbookLoader(BaseLoader):
    """Load `GitBook` data.

    This loader can either:

    1. load a single page, or
    2. load all pages listed in the sitemap, handling nested sitemap indexes.

    When `load_all_paths=True`, the loader parses XML sitemaps and requires the
    `lxml` package to be installed (`pip install lxml`).
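
    Example (illustrative sketch; the URLs below are placeholders, not a real
    site):

        .. code-block:: python

            from langchain_community.document_loaders import GitbookLoader

            # Load a single page
            loader = GitbookLoader("https://example.gitbook.io/docs/intro")
            docs = loader.load()

            # Or crawl every page listed in the site's sitemap
            loader = GitbookLoader(
                "https://example.gitbook.io/docs",
                load_all_paths=True,
            )
            docs = loader.load()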
    """

    def __init__(
        self,
        web_page: str,
        load_all_paths: bool = False,
        base_url: Optional[str] = None,
        content_selector: str = "main",
        continue_on_failure: bool = False,
        show_progress: bool = True,
        *,
        sitemap_url: Optional[str] = None,
        allowed_domains: Optional[Set[str]] = None,
    ):
        """Initialize with web page and whether to load all paths.

        Args:
            web_page: The web page to load or the starting point from which
                relative paths are discovered.
            load_all_paths: If set to True, all pages listed in the sitemap
                are loaded instead of only `web_page`. Requires the `lxml`
                package.
            base_url: If `load_all_paths` is True, the relative paths are
                appended to this base url. Defaults to `web_page`.
            content_selector: The CSS selector for the content to load.
                Defaults to "main".
            continue_on_failure: Whether to continue loading the sitemap if an
                error occurs loading a url, emitting a warning instead of
                raising an exception. Setting this to True makes the loader
                more robust, but may also result in missing data.
                Default: False.
            show_progress: Whether to show a progress bar while loading.
                Default: True.
            sitemap_url: Custom sitemap URL to use when `load_all_paths` is
                True. Defaults to "{base_url}/sitemap.xml".
            allowed_domains: Optional set of allowed domains to fetch from.
                If None (default), the loader restricts crawling to the domain
                of the `web_page` URL to prevent potential SSRF
                vulnerabilities. Provide an explicit set (e.g.,
                {"example.com", "docs.example.com"}) to allow crawling across
                multiple domains. Use with caution in server environments
                where users might control the input URLs.
        """
        self.base_url = base_url or web_page
        if self.base_url.endswith("/"):
            self.base_url = self.base_url[:-1]

        self.web_page = web_page
        self.load_all_paths = load_all_paths
        self.content_selector = content_selector
        self.continue_on_failure = continue_on_failure
        self.show_progress = show_progress
        self.allowed_domains = allowed_domains

        # If allowed_domains is not specified, extract domain from web_page as default
        if self.allowed_domains is None:
            initial_domain = urlparse(web_page).netloc
            if initial_domain:
                self.allowed_domains = {initial_domain}

        # Determine the starting URL (either a sitemap or a direct page)
        if load_all_paths:
            self.start_url = sitemap_url or f"{self.base_url}/sitemap.xml"
        else:
            self.start_url = web_page

        # Validate that the start_url is allowed
        if not self._is_url_allowed(self.start_url):
            raise ValueError(
                f"Domain in {self.start_url} is not in the allowed domains list: "
                f"{self.allowed_domains}"
            )

    def _is_url_allowed(self, url: str) -> bool:
        """Check if a URL has an allowed scheme and domain."""
        # self.allowed_domains is always set by __init__, either explicitly
        # or derived from web_page. If it is somehow still None here, that
        # indicates an initialization issue, so denying is safer.
        if self.allowed_domains is None:
            return False  # Should not happen if init worked

        try:
            parsed = urlparse(url)

            # 1. Validate the scheme: only plain http/https is allowed
            if parsed.scheme not in ("http", "https"):
                return False

            # 2. Validate the domain: the netloc must be non-empty and an
            #    exact member of the allowed set (no suffix matching).
            if not parsed.netloc:
                return False
            return parsed.netloc in self.allowed_domains
        except Exception:  # Catch potential urlparse errors
            return False

    def _safe_add_url(
        self, url_list: List[str], url: str, url_type: str = "URL"
    ) -> bool:
        """Safely add a URL to a list if it's from an allowed domain.

        Args:
            url_list: The list to add the URL to
            url: The URL to add
            url_type: Type of URL for warning message (e.g., "sitemap", "content")

        Returns:
            bool: True if URL was added, False if skipped
        """
        if self._is_url_allowed(url):
            url_list.append(url)
            return True
        else:
            warnings.warn(f"Skipping disallowed {url_type} URL: {url}")
            return False

    def _create_web_loader(self, url_or_urls: Union[str, List[str]]) -> WebBaseLoader:
        """Create a new WebBaseLoader instance for the given URL(s).

        This ensures each operation gets its own isolated WebBaseLoader.
        """
        return WebBaseLoader(
            web_path=url_or_urls,
            continue_on_failure=self.continue_on_failure,
            show_progress=self.show_progress,
        )

    def _is_sitemap_index(self, soup: BeautifulSoup) -> bool:
        """Check if the soup contains a sitemap index."""
        return soup.find("sitemapindex") is not None

    def _extract_sitemap_urls(self, soup: BeautifulSoup) -> List[str]:
        """Extract sitemap URLs from a sitemap index."""
        sitemap_tags = soup.find_all("sitemap")
        urls: List[str] = []
        for sitemap in sitemap_tags:
            loc = sitemap.find("loc")
            if loc and loc.text:
                self._safe_add_url(urls, loc.text, "sitemap")
        return urls

    def _process_sitemap(
        self,
        soup: BeautifulSoup,
        processed_urls: Set[str],
        web_loader: Optional[WebBaseLoader] = None,
    ) -> List[str]:
        """Process a sitemap, handling both direct content URLs and sitemap indexes.

        Args:
            soup: The BeautifulSoup object of the sitemap
            processed_urls: Set of already processed URLs to avoid cycles
            web_loader: WebBaseLoader instance to reuse for all requests,
                created if None
        """
        # Create a loader if not provided
        if web_loader is None:
            web_loader = self._create_web_loader(self.start_url)

        # If it's a sitemap index, recursively process each sitemap URL
        if self._is_sitemap_index(soup):
            sitemap_urls = self._extract_sitemap_urls(soup)
            all_content_urls = []

            for sitemap_url in sitemap_urls:
                if sitemap_url in processed_urls:
                    warnings.warn(
                        f"Skipping already processed sitemap URL: {sitemap_url}"
                    )
                    continue

                processed_urls.add(sitemap_url)
                try:
                    # Temporarily override the web_path of the loader
                    original_web_paths = web_loader.web_paths
                    web_loader.web_paths = [sitemap_url]

                    # Reuse the same loader for the next sitemap,
                    # explicitly use lxml-xml
                    sitemap_soup = web_loader.scrape(parser="lxml-xml")

                    # Restore original web_paths
                    web_loader.web_paths = original_web_paths

                    # Recursive call with the same loader
                    content_urls = self._process_sitemap(
                        sitemap_soup, processed_urls, web_loader
                    )
                    all_content_urls.extend(content_urls)
                except Exception as e:
                    if self.continue_on_failure:
                        warnings.warn(f"Error processing sitemap {sitemap_url}: {e}")
                    else:
                        raise

            return all_content_urls
        else:
            # It's a content sitemap, so extract content URLs
            return self._get_paths(soup)

    async def _aprocess_sitemap(
        self,
        soup: BeautifulSoup,
        base_url: str,
        processed_urls: Set[str],
        web_loader: Optional[WebBaseLoader] = None,
    ) -> List[str]:
        """Async version of _process_sitemap.

        Args:
            soup: The BeautifulSoup object of the sitemap
            base_url: The base URL for relative paths
            processed_urls: Set of already processed URLs to avoid cycles
            web_loader: WebBaseLoader instance to reuse for all requests,
                created if None
        """
        # Create a loader if not provided
        if web_loader is None:
            web_loader = self._create_web_loader(self.start_url)

        # If it's a sitemap index, recursively process each sitemap URL
        if self._is_sitemap_index(soup):
            sitemap_urls = self._extract_sitemap_urls(soup)
            all_content_urls = []

            # Filter out already processed URLs
            new_urls = [url for url in sitemap_urls if url not in processed_urls]

            if not new_urls:
                return []

            # Update the web_paths of the loader to fetch all sitemaps at once
            original_web_paths = web_loader.web_paths
            web_loader.web_paths = new_urls

            # Use the same WebBaseLoader's ascrape_all for efficient parallel
            # fetching, explicitly use lxml-xml
            soups = await web_loader.ascrape_all(new_urls, parser="lxml-xml")

            # Restore original web_paths
            web_loader.web_paths = original_web_paths

            for sitemap_url, sitemap_soup in zip(new_urls, soups):
                processed_urls.add(sitemap_url)
                try:
                    # Recursive call with the same loader
                    content_urls = await self._aprocess_sitemap(
                        sitemap_soup, base_url, processed_urls, web_loader
                    )
                    all_content_urls.extend(content_urls)
                except Exception as e:
                    if self.continue_on_failure:
                        warnings.warn(f"Error processing sitemap {sitemap_url}: {e}")
                    else:
                        raise

            return all_content_urls
        else:
            # It's a content sitemap, so extract content URLs
            return self._get_paths(soup)

    def lazy_load(self) -> Iterator[Document]:
        """Fetch text from one single GitBook page or recursively from sitemap."""
        if not self.load_all_paths:
            # Simple case: load a single page
            temp_loader = self._create_web_loader(self.web_page)
            soup = temp_loader.scrape()
            doc = self._get_document(soup, self.web_page)
            if doc:
                yield doc
        else:
            # Get initial sitemap using the recursive method
            temp_loader = self._create_web_loader(self.start_url)
            # Explicitly use lxml-xml for parsing the initial sitemap
            soup_info = temp_loader.scrape(parser="lxml-xml")

            # Process sitemap(s) recursively to get all content URLs
            processed_urls: Set[str] = set()
            relative_paths = self._process_sitemap(soup_info, processed_urls)

            if not relative_paths and self.show_progress:
                warnings.warn(f"No content URLs found in sitemap at {self.start_url}")

            # Filter content URLs against the allowed domains
            urls: List[str] = []
            for url in relative_paths:
                # URLs are already absolute from _get_paths
                self._safe_add_url(urls, url, "content")

            if not urls:
                return

            # Create a loader for content pages
            content_loader = self._create_web_loader(urls)

            # Use WebBaseLoader to fetch all pages
            soup_infos = content_loader.scrape_all(urls)

            for soup_info, url in zip(soup_infos, urls):
                doc = self._get_document(soup_info, url)
                if doc:
                    yield doc

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Asynchronously fetch text from GitBook page(s)."""
        if not self.load_all_paths:
            # Simple case: load a single page asynchronously
            temp_loader = self._create_web_loader(self.web_page)
            soups = await temp_loader.ascrape_all([self.web_page])
            soup_info = soups[0]
            doc = self._get_document(soup_info, self.web_page)
            if doc:
                yield doc
        else:
            # Get initial sitemap - web_loader will be created in _aprocess_sitemap
            temp_loader = self._create_web_loader(self.start_url)
            # Explicitly use lxml-xml for parsing the initial sitemap
            soups = await temp_loader.ascrape_all([self.start_url], parser="lxml-xml")
            soup_info = soups[0]

            # Process sitemap(s) recursively to get all content URLs
            processed_urls: Set[str] = set()
            relative_paths = await self._aprocess_sitemap(
                soup_info, self.base_url, processed_urls
            )

            if not relative_paths and self.show_progress:
                warnings.warn(f"No content URLs found in sitemap at {self.start_url}")

            # Filter content URLs against the allowed domains
            urls: List[str] = []
            for url in relative_paths:
                # URLs are already absolute from _get_paths
                self._safe_add_url(urls, url, "content")

            if not urls:
                return

            # Create a loader for content pages
            content_loader = self._create_web_loader(urls)

            # Use WebBaseLoader's ascrape_all for efficient parallel fetching
            soup_infos = await content_loader.ascrape_all(urls)

            for soup_info, url in zip(soup_infos, urls):
                maybe_doc = self._get_document(soup_info, url)
                if maybe_doc is not None:
                    yield maybe_doc

    def _get_document(
        self, soup: Any, custom_url: Optional[str] = None
    ) -> Optional[Document]:
        """Fetch content from page and return Document."""
        page_content_raw = soup.find(self.content_selector)
        if not page_content_raw:
            return None
        content = page_content_raw.get_text(separator="\n").strip()
        title_if_exists = page_content_raw.find("h1")
        title = title_if_exists.text if title_if_exists else ""
        metadata = {"source": custom_url or self.web_page, "title": title}
        return Document(page_content=content, metadata=metadata)

    def _get_paths(self, soup: Any) -> List[str]:
        """Fetch all URLs in the sitemap."""
        urls = []
        for loc in soup.find_all("loc"):
            if loc.text:
                # Keep the full URL rather than just the path,
                # to preserve domain information
                urls.append(loc.text)
        return urls