```python
from langchain.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
```
```python
loaders = [
    TextLoader("paul_graham_essay.txt"),
    TextLoader("state_of_the_union.txt"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)
```
```python
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=OpenAIEmbeddings()
)
```
```python
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
```
```python
# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
```
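To populate the retriever, each full document gets a unique ID; smaller child chunks are embedded into the vector store carrying that ID in their metadata, while the full documents are stored in the byte store under the same IDs. A minimal sketch of that indexing step (the child `chunk_size` of 400 is an assumption):

```python
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Splitter to create the smaller child chunks
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

sub_docs = []
for i, doc in enumerate(docs):
    _id = doc_ids[i]
    _sub_docs = child_text_splitter.split_documents([doc])
    # Tag each child chunk with its parent document's ID
    for _doc in _sub_docs:
        _doc.metadata[id_key] = _id
    sub_docs.extend(_sub_docs)

# Child chunks are embedded; full documents are stored under the same IDs
retriever.vectorstore.add_documents(sub_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))
```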
A similarity search against the vector store returns the small child chunks, each carrying its parent's ID; for a query such as "justice breyer", the top result looks like this:

```
Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': '064eca46-a4c4-4789-8e3b-583f9597e54f', 'source': 'state_of_the_union.txt'})
```
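The retriever itself goes one step further: it looks up the `doc_id` of each matching chunk in the byte store and returns the full parent document instead. A usage sketch (the query string is an assumption):

```python
# The retriever resolves matched child chunks back to their parent documents
retrieved_docs = retriever.invoke("justice breyer")

# The parent is the full large chunk, not the small child chunk
print(len(retrieved_docs[0].page_content))
```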
```python
import getpass
import os
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
```
```python
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")
```
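The chain below returns structured output, so it needs a schema for the model to fill in. The exact class is not shown in this section; a minimal definition consistent with its usage (`.questions` as a list of strings) would be:

```python
from typing import List

from pydantic import BaseModel, Field


class HypotheticalQuestions(BaseModel):
    """Hypothetical questions that a document could be used to answer."""

    questions: List[str] = Field(..., description="List of questions")
```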
```python
chain = (
    {"doc": lambda x: x.page_content}
    # Only asking for 3 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        "Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
    )
    | ChatOpenAI(max_retries=0, model="gpt-4o").with_structured_output(
        HypotheticalQuestions
    )
    | (lambda x: x.questions)
)

hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})
```
```python
# The vectorstore to use to index the hypothetical questions
vectorstore = Chroma(
    collection_name="hypo-questions", embedding_function=OpenAIEmbeddings()
)
# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in docs]
```
```python
question_docs = []
for i, question_list in enumerate(hypothetical_questions):
    question_docs.extend(
        [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for s in question_list
        ]
    )
```
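As in the earlier setup, the last step is to embed the question documents and store the full parents under their shared IDs:

```python
# Embed the hypothetical questions; retrieval resolves hits to the parent docs
retriever.vectorstore.add_documents(question_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))
```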