
Langchain

Langchain bindings for the supermat core module. SupermatRetriever is a drop-in replacement for a Langchain VectorStore.

SupermatRetriever

Bases: BaseRetriever

Supermat Langchain Custom Retriever. This wraps any Langchain VectorStore and overrides the document retrieval methods to make them work with Supermat. NOTE: Currently this only works on text chunks.

```python
from pathlib import Path

from supermat.langchain.bindings import SupermatRetriever
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# NOTE: FileProcessor is part of supermat, but its import path is not shown on
# this page; adjust the import to match your installation.
pdf_file_path = Path("path/to/document.pdf")  # placeholder path

retriever = SupermatRetriever(
    parsed_docs=FileProcessor.process_file(pdf_file_path),
    document_name=pdf_file_path.stem,
    vector_store=Chroma(
        embedding_function=HuggingFaceEmbeddings(
            model_name="thenlper/gte-base",
        )
    ),
)
```

Args:
    parsed_docs (ParsedDocumentType): The supermat parsed documents.
    vector_store (VectorStore): The vector store used to store the document chunks.
    vector_store_retriver_kwargs (dict[str, Any], optional): VectorStore kwargs used during initialization. Defaults to {}.
    max_chunk_length (int, optional): Maximum chunk length in characters. NOTE: This should be based on tokens instead. Defaults to 8000.
    store_sentences (bool, optional): Store sentence-level chunks in the vector store; they are converted back to paragraph-level chunks before being sent to the LLM. Defaults to False.
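Once constructed, the retriever is queried like any other Langchain retriever. A minimal sketch (the query string and result handling below are illustrative, not from the supermat docs):

```python
# Standard BaseRetriever entry point; returns langchain_core Documents.
docs = retriever.invoke("What does the contract say about termination?")

for doc in docs:
    # Each result carries Supermat metadata, including the structure-based
    # citation_id assigned when the documents were indexed.
    print(doc.metadata["citation_id"], doc.page_content[:80])
```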

Source code in supermat/langchain/bindings.py
class SupermatRetriever(BaseRetriever):
    """
    Supermat Langchain Custom Retriever.
    This wraps any Langchain VectorStore and overrides the document retrieval methods to make them work with Supermat.
    NOTE: Currently this only works on Text chunks.


    ``` python
    from supermat.langchain.bindings import SupermatRetriever
    from langchain_chroma import Chroma
    from langchain_huggingface import HuggingFaceEmbeddings

    retriever = SupermatRetriever(
        parsed_docs=FileProcessor.process_file(pdf_file_path),
        document_name=pdf_file_path.stem,
        vector_store=Chroma(
            embedding_function=HuggingFaceEmbeddings(
                model_name="thenlper/gte-base",
            )
        ),
    )
    ```
    Args:
        parsed_docs (ParsedDocumentType): The supermat parsed documents.
        vector_store (VectorStore): The vector store used to store the document chunks.
        vector_store_retriver_kwargs (dict[str, Any], optional): `VectorStore` kwargs used during initialization.
            Defaults to `{}`.
        max_chunk_length (int, optional): Max character length. NOTE: This needs to be based on tokens instead.
            Defaults to 8000.
        store_sentences (bool, optional): Store sentence level chunks in vector store
            which will then be converted to paragraphs before sending to LLM. Defaults to False.
    """

    parsed_docs: ParsedDocumentType = Field(exclude=True, strict=False, repr=False)
    vector_store: VectorStore
    vector_store_retriver_kwargs: dict[str, Any] = {}
    max_chunk_length: int = 8000
    store_sentences: bool = False

    @cached_property
    def vector_store_retriver(self) -> VectorStoreRetriever:
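        # Built once and cached; exposes the configured vector store as a retriever.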
        return self.vector_store.as_retriever(**self.vector_store_retriver_kwargs)

    def _create_document_index(self) -> tuple[dict[str, int], dict[int, str]]:
        documents = {
            chunk.document
            for chunk in self.parsed_docs
            # NOTE: we assume that all chunks have document
            if chunk.document is not None
        }
        # NOTE: we want the document id to start with 1, since 0 means all in structure id.
        document_index_map = {document: doc_id for doc_id, document in enumerate(documents, 1)}
        index_document_map = dict(zip(document_index_map.values(), document_index_map.keys()))
        return document_index_map, index_document_map

    def _add_doc_id(self, document_index_map: dict[str, int]):
        """
        Mutates current `parsed_docs` to include document id in the chunk structure id.
        This is a temporary solution.
        Currently, the parsed documents do not include document as part of the structure id.
        We include document id in the relevant retrieved documents for now.
        TODO (@legendof-selda): Include document id as part of structure id in `ParsedDocumentType`.

        Args:
            document_index_map (dict[str, int]): 'document' name to index mapping.

        """
        for chunk in self.parsed_docs:
            doc_index = document_index_map[chunk.document]
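            # e.g. with doc_index 4, a structure id like "1.2.3" becomes "4.1.2.3"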
            chunk.structure = f"{doc_index}.{chunk.structure}"

        return self.parsed_docs

    def model_post_init(self, __context: Any):
        super().model_post_init(__context)
        # TODO (@legendof-selda): integrate the chunker class here instead.
        # TODO (@legendof-selda): Build reverse lookups to get higher level sections easily from parsed_docs.
        self._document_index_map, self._index_document_map = self._create_document_index()
        self._add_doc_id(self._document_index_map)
        # NOTE: Currently paragraph chunks seemed to work best instead of sentence.
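        # Stores one Document per sentence when store_sentences is True, otherwise one per text chunk.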
        self.vector_store.add_documents(
            [
                Document(
                    sentence.text,
                    metadata=dict(
                        document=chunk.document,
                        structure=sentence.structure,
                        # properties=chunk.properties,
                        key=",".join(sentence.key),
                        citation_id=sentence.structure,
                    ),
                )
                for chunk in self.parsed_docs
                if isinstance(chunk, BaseTextChunk)
                for sentence in (chunk.sentences if chunk.sentences else [chunk])
                if isinstance(sentence, BaseTextChunk)
            ]
            if self.store_sentences
            else [
                Document(
                    chunk.text,
                    metadata=dict(
                        document=chunk.document,
                        structure=chunk.structure,
                        # properties=chunk.properties,
                        key=",".join(chunk.key),
                        citation_id=chunk.structure,
                    ),
                )
                for chunk in self.parsed_docs
                if isinstance(chunk, BaseTextChunk)
            ]
        )

    def _get_higher_section(self, documents: list[Document]) -> list[Document]:
        """Utility to convert lower level structure (eg. sentences) to a higher level structure (eg. paragraphs).
        We return only unique documents back.
        Eg. If there are 3 sentences of the same paragraph, we only want a single paragraph document back.

        Args:
            documents (list[Document]): Relevant documents retrieved from the vector store.

        Returns:
            list[Document]: Relevant documents from the vector store, but converted to a higher level structure.
        """
        # TODO (@legendof-selda): Refactor to make use of inverse lookups for faster higher structure retrieval.
        return [
            Document(
                # TODO (@legendof-selda): this max chunk clipping is only a temp solution
                # ideally the intelligent chunker class will take care of this based on token length.
                chunk.text[: self.max_chunk_length],
                metadata=dict(
                    document=chunk.document,
                    # properties=chunk.properties,
                    key=",".join(chunk.key),
                    citation_id=chunk.structure,
                ),
            )
            # This is in paragraph level.
            for chunk in self.parsed_docs
            if isinstance(chunk, BaseTextChunk)
            and any(
                chunk.has_subsection(doc.metadata.get("structure", ""))
                # In sentence level.
                for doc in documents
            )
        ]

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> list[Document]:
        documents = self.vector_store_retriver._get_relevant_documents(query, run_manager=run_manager)
        if self.store_sentences:
            documents = self._get_higher_section(documents)
        return documents

    async def _aget_relevant_documents(
        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
    ) -> list[Document]:
        documents = await self.vector_store_retriver._aget_relevant_documents(query, run_manager=run_manager)
        if self.store_sentences:
            documents = self._get_higher_section(documents)
        return documents
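
Because SupermatRetriever subclasses BaseRetriever, it can also be piped directly into an LCEL chain. The sketch below is illustrative and not part of the supermat API: llm stands for any Langchain chat model, and the prompt text is an assumption.

```python
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

prompt = ChatPromptTemplate.from_template(
    "Answer using only the context below.\n\nContext:\n{context}\n\nQuestion: {question}"
)

def format_docs(docs):
    # Prefix each chunk with its Supermat citation_id so answers can cite
    # structure ids back to the source document.
    return "\n\n".join(f"[{d.metadata['citation_id']}] {d.page_content}" for d in docs)

# `llm` is any Langchain chat model (e.g. ChatOpenAI()); it is an assumption here.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

answer = chain.invoke("What does the contract say about termination?")
```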