integrations-python/langchain/langchain_vectorize/retrievers.py at 4d0adf64f5f82873ffb6f82f1a07d5f61d148ed1 · vectorize-io/integrations-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
"""Vectorize LangChain retrievers."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Literal, Optional

from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from typing_extensions import override
from vectorize_client.api.pipelines_api import PipelinesApi
from vectorize_client.api_client import ApiClient
from vectorize_client.configuration import Configuration
from vectorize_client.models.retrieve_documents_request import RetrieveDocumentsRequest

if TYPE_CHECKING:
    from langchain_core.callbacks import CallbackManagerForRetrieverRun
    from langchain_core.runnables import RunnableConfig
    from vectorize_client.models.document import Document as VectorizeDocument

# Attribute names read off each Vectorize document by
# ``VectorizeRetriever._convert_document`` and stored under the same keys in
# the resulting LangChain ``Document.metadata`` dict.
_METADATA_FIELDS = {
    "relevancy",
    "chunk_id",
    "total_chunks",
    "origin",
    "origin_id",
    "similarity",
    "source",
    "unique_source",
    "source_display_name",
    "pipeline_id",
    "org_id",
}
# Unique sentinel meaning "argument not supplied". ``invoke`` compares against
# it by identity so an omitted override can be told apart from an explicit
# falsy value such as ``False`` or ``[]``. It is also the placeholder value of
# ``_pipelines`` until ``model_post_init`` assigns the real API object.
_NOT_SET = object()


class VectorizeRetriever(BaseRetriever):
    """Vectorize retriever.

    Setup:
        Install package ``langchain-vectorize``

        .. code-block:: bash

            pip install -U langchain-vectorize

    Init args:
        api_token: str
            The Vectorize API token.
        environment: Literal["prod", "dev", "local", "staging"]
            The Vectorize API environment. Defaults to "prod".
        organization: Optional[str]
            The Vectorize organization ID. Defaults to None.
        pipeline_id: Optional[str]
            The Vectorize pipeline ID. Defaults to None.
        num_results: int
            Number of documents to return. Defaults to 5.
        rerank: bool
            Whether to rerank the results. Defaults to False.
        metadata_filters: list[dict[str, Any]]
            The metadata filters to apply when retrieving the documents. Defaults to [].

    Instantiate:
        .. code-block:: python

            from langchain_vectorize import VectorizeRetriever

            retriever = VectorizeRetriever(
                api_token="xxxxx", organization="1234", pipeline_id="5678"
            )

    Usage:
        .. code-block:: python

            query = "what year was breath of the wild released?"
            retriever.invoke(query)

    Use within a chain:
        .. code-block:: python

            from langchain_core.output_parsers import StrOutputParser
            from langchain_core.prompts import ChatPromptTemplate
            from langchain_core.runnables import RunnablePassthrough
            from langchain_openai import ChatOpenAI

            prompt = ChatPromptTemplate.from_template(
                \"\"\"Answer the question based only on the context provided.

            Context: {context}

            Question: {question}\"\"\"
            )

            llm = ChatOpenAI(model="gpt-4o")

            def format_docs(docs):
                return "\n\n".join(doc.page_content for doc in docs)

            chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )

            chain.invoke("how many units did breath of the wild sell in 2020")
    """  # noqa: D301

    api_token: str
    """The Vectorize API token."""
    environment: Literal["prod", "dev", "local", "staging"] = "prod"
    """The Vectorize API environment."""
    organization: Optional[str] = None  # noqa: UP007
    """The Vectorize organization ID."""
    pipeline_id: Optional[str] = None  # noqa: UP007
    """The Vectorize pipeline ID."""
    num_results: int = 5
    """The number of documents to return."""
    rerank: bool = False
    """Whether to rerank the results."""
    metadata_filters: list[dict[str, Any]] = []
    """The metadata filters to apply when retrieving the documents."""

    # Placeholder until ``model_post_init`` assigns the real pipelines API object.
    _pipelines: PipelinesApi = _NOT_SET  # type: ignore[assignment]

    @override
    def model_post_init(self, /, context: Any) -> None:
        """Create the pipelines API client for the configured environment.

        Picks the API host from ``environment`` and stores a ``PipelinesApi``
        bound to it in ``_pipelines``. For the "local" environment the API
        token is additionally handed to ``ApiClient`` as an
        ``x-lambda-api-key`` header.

        Args:
            context: Post-init context supplied by the caller; not used here.
        """
        header_name = None
        header_value = None
        if self.environment == "prod":
            host = "https://api.vectorize.io/v1"
        elif self.environment == "dev":
            host = "https://api-dev.vectorize.io/v1"
        elif self.environment == "local":
            host = "http://localhost:3000/api"
            header_name = "x-lambda-api-key"
            header_value = self.api_token
        else:
            # Any remaining value is treated as "staging".
            host = "https://api-staging.vectorize.io/v1"
        # NOTE(review): ``debug=True`` is hard-coded for every environment;
        # presumably it enables verbose client/HTTP logging -- confirm this is
        # intended outside of local development.
        api = ApiClient(
            Configuration(host=host, access_token=self.api_token, debug=True),
            header_name,
            header_value,
        )
        self._pipelines = PipelinesApi(api)

    @staticmethod
    def _convert_document(document: VectorizeDocument) -> Document:
        """Convert a Vectorize document into a LangChain ``Document``.

        Args:
            document: The document returned by the Vectorize API.

        Returns:
            A ``Document`` carrying the Vectorize document's ``id``, its
            ``text`` as ``page_content`` and every attribute named in
            ``_METADATA_FIELDS`` as metadata.
        """
        metadata = {field: getattr(document, field) for field in _METADATA_FIELDS}
        return Document(id=document.id, page_content=document.text, metadata=metadata)

    @override
    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        organization: str | None = None,
        pipeline_id: str | None = None,
        num_results: int | None = None,
        rerank: bool | None = None,
        metadata_filters: list[dict[str, Any]] | None = None,
    ) -> list[Document]:
        """Retrieve the documents matching ``query`` from a Vectorize pipeline.

        Every keyword override falls back to the value configured on the
        retriever when it is not supplied.

        Args:
            query: The question to send to the pipeline.
            run_manager: Callback manager for the retriever run; not used here.
            organization: Organization ID override. Empty/None means "use the
                retriever's ``organization``".
            pipeline_id: Pipeline ID override. Empty/None means "use the
                retriever's ``pipeline_id``".
            num_results: Number of results override; None means "use the
                retriever's ``num_results``".
            rerank: Rerank override; None means "use the retriever's
                ``rerank``".
            metadata_filters: Metadata filters override; None means "use the
                retriever's ``metadata_filters``".

        Returns:
            The retrieved documents converted to LangChain ``Document`` objects.

        Raises:
            ValueError: If no organization or no pipeline ID is available from
                either the call or the retriever.
        """
        # Only ``None`` means "not supplied": an explicit ``rerank=False`` or
        # ``metadata_filters=[]`` must win over the instance default, which an
        # ``x or self.x`` fallback would silently discard.
        request = RetrieveDocumentsRequest(  # type: ignore[call-arg]
            question=query,
            num_results=self.num_results if num_results is None else num_results,
            rerank=self.rerank if rerank is None else rerank,
            metadata_filters=(
                self.metadata_filters if metadata_filters is None else metadata_filters
            ),
        )
        # For the identifiers an empty string is never a usable value, so a
        # truthiness fallback is the intended behavior here.
        organization_ = organization or self.organization
        if not organization_:
            msg = (
                "Organization must be set either at initialization "
                "or in the invoke method."
            )
            raise ValueError(msg)
        pipeline_id_ = pipeline_id or self.pipeline_id
        if not pipeline_id_:
            msg = (
                "Pipeline ID must be set either at initialization "
                "or in the invoke method."
            )
            raise ValueError(msg)

        response = self._pipelines.retrieve_documents(
            organization_, pipeline_id_, request
        )
        return [self._convert_document(doc) for doc in response.documents]

    @override
    def invoke(
        self,
        input: str,
        config: RunnableConfig | None = None,
        *,
        organization: str = "",
        pipeline_id: str = "",
        num_results: int = _NOT_SET,  # type: ignore[assignment]
        rerank: bool = _NOT_SET,  # type: ignore[assignment]
        metadata_filters: list[dict[str, Any]] = _NOT_SET,  # type: ignore[assignment]
        **_kwargs: Any,
    ) -> list[Document]:
        """Invoke the retriever to get relevant documents.

        Main entry point for retriever invocations.

        Args:
            input: The query string.
            config: Configuration for the retriever. Defaults to None.
            organization: The organization to retrieve documents from.
                If set, overrides the organization set at the initialization of the
                retriever.
            pipeline_id: The pipeline ID to retrieve documents from.
                If set, overrides the pipeline ID set at the initialization of the
                retriever.
            num_results: The number of results to retrieve.
                If set, overrides the number of results set at the initialization of
                the retriever.
            rerank: Whether to rerank the retrieved documents.
                If set, overrides the reranking set at the initialization of the
                retriever.
            metadata_filters: The metadata filters to apply when retrieving documents.
                If set, overrides the metadata filters set at the initialization of the
                retriever.

        Returns:
            List of relevant documents.

        Examples:

            .. code-block:: python

                query = "what year was breath of the wild released?"
                docs = retriever.invoke(query, num_results=2)
        """
        # Forward only the overrides the caller actually supplied, so omitted
        # ones fall back to the retriever's configured values downstream.
        if organization:
            _kwargs["organization"] = organization
        if pipeline_id:
            _kwargs["pipeline_id"] = pipeline_id
        if num_results is not _NOT_SET:
            _kwargs["num_results"] = num_results
        if rerank is not _NOT_SET:
            _kwargs["rerank"] = rerank
        if metadata_filters is not _NOT_SET:
            _kwargs["metadata_filters"] = metadata_filters

        return super().invoke(input, config, **_kwargs)