-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathretrievers.py
More file actions
246 lines (207 loc) · 8.49 KB
/
retrievers.py
File metadata and controls
246 lines (207 loc) · 8.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
"""Vectorize LangChain retrievers."""
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Literal, Optional
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from typing_extensions import override
from vectorize_client.api.pipelines_api import PipelinesApi
from vectorize_client.api_client import ApiClient
from vectorize_client.configuration import Configuration
from vectorize_client.models.retrieve_documents_request import RetrieveDocumentsRequest
if TYPE_CHECKING:
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.runnables import RunnableConfig
from vectorize_client.models.document import Document as VectorizeDocument
# Attribute names that `VectorizeRetriever._convert_document` reads from each
# Vectorize document (via `getattr`) and copies into the resulting LangChain
# `Document.metadata` dict.
_METADATA_FIELDS = {
"relevancy",
"chunk_id",
"total_chunks",
"origin",
"origin_id",
"similarity",
"source",
"unique_source",
"source_display_name",
"pipeline_id",
"org_id",
}
# Unique sentinel used as a default value to mean "argument not supplied".
# `VectorizeRetriever.invoke` compares against it with `is` so that falsy but
# legitimate values (e.g. `False`, `[]`) can still be passed through; it is also
# the placeholder for `VectorizeRetriever._pipelines` before `model_post_init`
# assigns the real API client.
_NOT_SET = object()
class VectorizeRetriever(BaseRetriever):
    """Vectorize retriever.

    Setup:
        Install package ``langchain-vectorize``

        .. code-block:: bash

            pip install -U langchain-vectorize

    Init args:
        api_token: str
            The Vectorize API token.
        environment: Literal["prod", "dev", "local", "staging"]
            The Vectorize API environment. Defaults to "prod".
        organization: Optional[str]
            The Vectorize organization ID. Defaults to None.
        pipeline_id: Optional[str]
            The Vectorize pipeline ID. Defaults to None.
        num_results: int
            Number of documents to return. Defaults to 5.
        rerank: bool
            Whether to rerank the results. Defaults to False.
        metadata_filters: list[dict[str, Any]]
            The metadata filters to apply when retrieving the documents.
            Defaults to [].

    Instantiate:
        .. code-block:: python

            from langchain_vectorize import VectorizeRetriever

            retriever = VectorizeRetriever(
                api_token="xxxxx", organization="1234", pipeline_id="5678"
            )

    Usage:
        .. code-block:: python

            query = "what year was breath of the wild released?"
            retriever.invoke(query)

    Use within a chain:
        .. code-block:: python

            from langchain_core.output_parsers import StrOutputParser
            from langchain_core.prompts import ChatPromptTemplate
            from langchain_core.runnables import RunnablePassthrough
            from langchain_openai import ChatOpenAI

            prompt = ChatPromptTemplate.from_template(
                \"\"\"Answer the question based only on the context provided.

            Context: {context}

            Question: {question}\"\"\"
            )

            llm = ChatOpenAI(model="gpt-4o")

            def format_docs(docs):
                return "\\n\\n".join(doc.page_content for doc in docs)

            chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )

            chain.invoke("how many units did breath of the wild sell in 2020")
    """  # noqa: D301

    api_token: str
    """The Vectorize API token."""
    environment: Literal["prod", "dev", "local", "staging"] = "prod"
    """The Vectorize API environment."""
    organization: Optional[str] = None  # noqa: UP007
    """The Vectorize organization ID."""
    pipeline_id: Optional[str] = None  # noqa: UP007
    """The Vectorize pipeline ID."""
    num_results: int = 5
    """The number of documents to return."""
    rerank: bool = False
    """Whether to rerank the results."""
    metadata_filters: list[dict[str, Any]] = []
    """The metadata filters to apply when retrieving the documents."""

    # Placeholder until `model_post_init` builds the real client.
    _pipelines: PipelinesApi = _NOT_SET  # type: ignore[assignment]

    @override
    def model_post_init(self, /, context: Any) -> None:
        """Build the Vectorize pipelines client for the configured environment.

        Selects the API host from ``self.environment`` and stores a
        ``PipelinesApi`` instance on ``self._pipelines``. For the ``"local"``
        environment the API token is additionally sent in the
        ``x-lambda-api-key`` header.

        Args:
            context: Post-init context supplied by the caller; unused here.
        """
        header_name = None
        header_value = None
        if self.environment == "prod":
            host = "https://api.vectorize.io/v1"
        elif self.environment == "dev":
            host = "https://api-dev.vectorize.io/v1"
        elif self.environment == "local":
            host = "http://localhost:3000/api"
            header_name = "x-lambda-api-key"
            header_value = self.api_token
        else:
            # Only "staging" remains in the `environment` Literal.
            host = "https://api-staging.vectorize.io/v1"
        # NOTE(review): `debug=True` is passed unconditionally, for every
        # environment including "prod" -- confirm this is intended, since
        # client debug output may include request details.
        api = ApiClient(
            Configuration(host=host, access_token=self.api_token, debug=True),
            header_name,
            header_value,
        )
        self._pipelines = PipelinesApi(api)

    @staticmethod
    def _convert_document(document: VectorizeDocument) -> Document:
        """Convert a Vectorize document into a LangChain ``Document``.

        Args:
            document: The document returned by the Vectorize API.

        Returns:
            A ``Document`` whose ``id`` and ``page_content`` come from
            ``document.id`` and ``document.text``, and whose metadata holds
            every attribute named in ``_METADATA_FIELDS``.
        """
        metadata = {field: getattr(document, field) for field in _METADATA_FIELDS}
        return Document(id=document.id, page_content=document.text, metadata=metadata)

    @override
    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        organization: str | None = None,
        pipeline_id: str | None = None,
        num_results: int | None = None,
        rerank: bool | None = None,
        metadata_filters: list[dict[str, Any]] | None = None,
    ) -> list[Document]:
        """Retrieve the documents relevant to ``query`` from a pipeline.

        Each optional argument overrides the matching value set at
        initialization when it is provided.

        Args:
            query: The query string.
            run_manager: Callback manager for the retriever run; unused here.
            organization: Organization ID override. An empty value falls back
                to ``self.organization``.
            pipeline_id: Pipeline ID override. An empty value falls back to
                ``self.pipeline_id``.
            num_results: Number of results override. ``None`` or ``0`` falls
                back to ``self.num_results``.
            rerank: Rerank override. Only ``None`` falls back to
                ``self.rerank``, so an explicit ``False`` is honored.
            metadata_filters: Metadata filters override. Only ``None`` falls
                back to ``self.metadata_filters``, so an explicit empty list
                is honored.

        Returns:
            The retrieved documents converted to LangChain ``Document``s.

        Raises:
            ValueError: If no organization or no pipeline ID is available from
                either the arguments or the instance.
        """
        request = RetrieveDocumentsRequest(  # type: ignore[call-arg]
            question=query,
            # `or` keeps the existing behavior of treating 0 like "not provided".
            num_results=num_results or self.num_results,
            # Compare against None rather than using `or`: `False` and `[]`
            # are legitimate overrides and must not be replaced by the
            # instance defaults.
            rerank=self.rerank if rerank is None else rerank,
            metadata_filters=(
                self.metadata_filters if metadata_filters is None else metadata_filters
            ),
        )
        organization_ = organization or self.organization
        if not organization_:
            msg = (
                "Organization must be set either at initialization "
                "or in the invoke method."
            )
            raise ValueError(msg)
        pipeline_id_ = pipeline_id or self.pipeline_id
        if not pipeline_id_:
            msg = (
                "Pipeline ID must be set either at initialization "
                "or in the invoke method."
            )
            raise ValueError(msg)
        response = self._pipelines.retrieve_documents(
            organization_, pipeline_id_, request
        )
        return [self._convert_document(doc) for doc in response.documents]

    @override
    def invoke(
        self,
        input: str,
        config: RunnableConfig | None = None,
        *,
        organization: str = "",
        pipeline_id: str = "",
        num_results: int = _NOT_SET,  # type: ignore[assignment]
        rerank: bool = _NOT_SET,  # type: ignore[assignment]
        metadata_filters: list[dict[str, Any]] = _NOT_SET,  # type: ignore[assignment]
        **_kwargs: Any,
    ) -> list[Document]:
        """Invoke the retriever to get relevant documents.

        Main entry point for retriever invocations.

        Args:
            input: The query string.
            config: Configuration for the retriever. Defaults to None.
            organization: The organization to retrieve documents from.
                If set, overrides the organization set at the initialization of the
                retriever.
            pipeline_id: The pipeline ID to retrieve documents from.
                If set, overrides the pipeline ID set at the initialization of the
                retriever.
            num_results: The number of results to retrieve.
                If set, overrides the number of results set at the initialization of
                the retriever.
            rerank: Whether to rerank the retrieved documents.
                If set, overrides the reranking set at the initialization of the
                retriever.
            metadata_filters: The metadata filters to apply when retrieving documents.
                If set, overrides the metadata filters set at the initialization of the
                retriever.

        Returns:
            List of relevant documents.

        Examples:
            .. code-block:: python

                query = "what year was breath of the wild released?"
                docs = retriever.invoke(query, num_results=2)
        """
        # Forward only the overrides the caller actually supplied. The
        # sentinel (rather than truthiness) lets falsy values such as
        # `rerank=False` or `metadata_filters=[]` pass through.
        if organization:
            _kwargs["organization"] = organization
        if pipeline_id:
            _kwargs["pipeline_id"] = pipeline_id
        if num_results is not _NOT_SET:
            _kwargs["num_results"] = num_results
        if rerank is not _NOT_SET:
            _kwargs["rerank"] = rerank
        if metadata_filters is not _NOT_SET:
            _kwargs["metadata_filters"] = metadata_filters
        return super().invoke(input, config, **_kwargs)