
Langchain

Langchain bindings for the supermat core module. SupermatRetriever is a drop-in replacement for a Langchain VectorStore.

SupermatRetriever

Bases: BaseRetriever

Supermat Langchain Custom Retriever. This wraps any Langchain VectorStore and overrides the document retrieval methods to make them work with Supermat. NOTE: Currently this only works on text chunks.

```python
from pathlib import Path

from supermat.langchain.bindings import SupermatRetriever
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# NOTE: FileProcessor is part of supermat, but its import path is not shown on
# this page; adjust the import to match your installation.
pdf_file_path = Path("path/to/document.pdf")  # placeholder path

retriever = SupermatRetriever(
    parsed_docs=FileProcessor.process_file(pdf_file_path),
    document_name=pdf_file_path.stem,
    vector_store=Chroma(
        embedding_function=HuggingFaceEmbeddings(
            model_name="thenlper/gte-base",
        )
    ),
)
```

Args:
    parsed_docs (ParsedDocumentType): The supermat parsed documents.
    vector_store (VectorStore): The vector store used to store the document chunks.
    vector_store_retriver_kwargs (dict[str, Any], optional): VectorStore kwargs used during initialization. Defaults to {}.
    max_chunk_length (int, optional): Maximum chunk length in characters. NOTE: This should be based on tokens instead. Defaults to 8000.
    store_sentences (bool, optional): Store sentence-level chunks in the vector store; they are converted back to paragraph-level chunks before being sent to the LLM. Defaults to False.
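Once constructed, the retriever is queried like any other Langchain retriever. A minimal sketch (the query string and result handling below are illustrative, not from the supermat docs):

```python
# Standard BaseRetriever entry point; returns langchain_core Documents.
docs = retriever.invoke("What does the contract say about termination?")

for doc in docs:
    # Each result carries Supermat metadata, including the structure-based
    # citation_id assigned when the documents were indexed.
    print(doc.metadata["citation_id"], doc.page_content[:80])
```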

Source code in supermat/langchain/bindings.py
class SupermatRetriever(BaseRetriever):
    """
    Supermat Langchain Custom Retriever.
    This wraps any Langchain VectorStore and overrides the document retrieval methods to make them work with Supermat.
    NOTE: Currently this only works on Text chunks.


    ``` python
    from supermat.langchain.bindings import SupermatRetriever
    from langchain_chroma import Chroma
    from langchain_huggingface import HuggingFaceEmbeddings

    retriever = SupermatRetriever(
        parsed_docs=FileProcessor.process_file(pdf_file_path),
        document_name=pdf_file_path.stem,
        vector_store=Chroma(
            embedding_function=HuggingFaceEmbeddings(
                model_name="thenlper/gte-base",
            )
        ),
    )
    ```
    Args:
        parsed_docs (ParsedDocumentType): The supermat parsed documents.
        vector_store (VectorStore): The vector store used to store the document chunks.
        vector_store_retriver_kwargs (dict[str, Any], optional): `VectorStore` kwargs used during initialization.
            Defaults to `{}`.
        max_chunk_length (int, optional): Max character length. NOTE: This needs to be based on tokens instead.
            Defaults to 8000.
        store_sentences (bool, optional): Store sentence level chunks in vector store
            which will then be converted to paragraphs before sending to LLM. Defaults to False.
    """

    parsed_docs: ParsedDocumentType = Field(exclude=True, strict=False, repr=False)
    vector_store: VectorStore
    vector_store_retriver_kwargs: dict[str, Any] = {}
    max_chunk_length: int = 8000
    store_sentences: bool = False

    @cached_property
    def vector_store_retriver(self) -> VectorStoreRetriever:
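        # Built once and cached; exposes the configured vector store as a retriever.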
        return self.vector_store.as_retriever(**self.vector_store_retriver_kwargs)

    def _create_document_index(self) -> tuple[dict[str, int], dict[int, str]]:
        documents = {
            chunk.document
            for chunk in self.parsed_docs
            # NOTE: we assume that all chunks have document
            if chunk.document is not None
        }
        # NOTE: we want the document id to start with 1, since 0 means all in structure id.
        document_index_map = {document: doc_id for doc_id, document in enumerate(documents, 1)}
        index_document_map = dict(zip(document_index_map.values(), document_index_map.keys()))
        return document_index_map, index_document_map

    def _add_doc_id(self, document_index_map: dict[str, int]):
        """
        Mutates current `parsed_docs` to include document id in the chunk structure id.
        This is a temporary solution.
        Currently, the parsed documents do not include document as part of the structure id.
        We include document id in the relevant retrieved documents for now.
        TODO (@legendof-selda): Include document id as part of structure id in `ParsedDocumentType`.

        Args:
            document_index_map (dict[str, int]): 'document' name to index mapping.

        """
        for chunk in self.parsed_docs:
            doc_index = document_index_map[chunk.document]
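            # e.g. with doc_index 4, a structure id like "1.2.3" becomes "4.1.2.3"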
            chunk.structure = f"{doc_index}.{chunk.structure}"

        return self.parsed_docs

    def model_post_init(self, __context: Any):
        super().model_post_init(__context)
        # TODO (@legendof-selda): integrate the chunker class here instead.
        # TODO (@legendof-selda): Build reverse lookups to get higher level sections easily from parsed_docs.
        self._document_index_map, self._index_document_map = self._create_document_index()
        self._add_doc_id(self._document_index_map)
        # NOTE: Currently paragraph chunks seemed to work best instead of sentence.
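        # Stores one Document per sentence when store_sentences is True, otherwise one per text chunk.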
        self.vector_store.add_documents(
            [
                Document(
                    sentence.text,
                    metadata=dict(
                        document=chunk.document,
                        structure=sentence.structure,
                        # properties=chunk.properties,
                        key=",".join(sentence.key),
                        citation_id=sentence.structure,
                    ),
                )
                for chunk in self.parsed_docs
                if isinstance(chunk, BaseTextChunk)
                for sentence in (chunk.sentences if chunk.sentences else [chunk])
                if isinstance(sentence, BaseTextChunk)
            ]
            if self.store_sentences
            else [
                Document(
                    chunk.text,
                    metadata=dict(
                        document=chunk.document,
                        structure=chunk.structure,
                        # properties=chunk.properties,
                        key=",".join(chunk.key),
                        citation_id=chunk.structure,
                    ),
                )
                for chunk in self.parsed_docs
                if isinstance(chunk, BaseTextChunk)
            ]
        )

    def _get_higher_section(self, documents: list[Document]) -> list[Document]:
        """Utility to convert lower level structure (eg. sentences) to a higher level structure (eg. paragraphs).
        We return only unique documents back.
        Eg. If there are 3 sentences of the same paragraph, we only want a single paragraph document back.

        Args:
            documents (list[Document]): Relevant documents retrieved from the vector store.

        Returns:
            list[Document]: Relevant documents from the vector store, but converted to a higher level structure.
        """
        # TODO (@legendof-selda): Refactor to make use of inverse lookups for faster higher structure retrieval.
        return [
            Document(
                # TODO (@legendof-selda): this max chunk clipping is only a temp solution
                # ideally the intelligent chunker class will take care of this based on token length.
                chunk.text[: self.max_chunk_length],
                metadata=dict(
                    document=chunk.document,
                    # properties=chunk.properties,
                    key=",".join(chunk.key),
                    citation_id=chunk.structure,
                ),
            )
            # This is in paragraph level.
            for chunk in self.parsed_docs
            if isinstance(chunk, BaseTextChunk)
            and any(
                chunk.has_subsection(doc.metadata.get("structure", ""))
                # In sentence level.
                for doc in documents
            )
        ]

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> list[Document]:
        documents = self.vector_store_retriver._get_relevant_documents(query, run_manager=run_manager)
        if self.store_sentences:
            documents = self._get_higher_section(documents)
        return documents

    async def _aget_relevant_documents(
        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
    ) -> list[Document]:
        documents = await self.vector_store_retriver._aget_relevant_documents(query, run_manager=run_manager)
        if self.store_sentences:
            documents = self._get_higher_section(documents)
        return documents
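
Because SupermatRetriever subclasses BaseRetriever, it can also be piped directly into an LCEL chain. The sketch below is illustrative and not part of the supermat API: llm stands for any Langchain chat model, and the prompt text is an assumption.

```python
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

prompt = ChatPromptTemplate.from_template(
    "Answer using only the context below.\n\nContext:\n{context}\n\nQuestion: {question}"
)

def format_docs(docs):
    # Prefix each chunk with its Supermat citation_id so answers can cite
    # structure ids back to the source document.
    return "\n\n".join(f"[{d.metadata['citation_id']}] {d.page_content}" for d in docs)

# `llm` is any Langchain chat model (e.g. ChatOpenAI()); it is an assumption here.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

answer = chain.invoke("What does the contract say about termination?")
```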