Jun-07-2023, 06:02 PM
I am trying to use langchain to query a pdf document with chatgpt.
import os
import openai
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YourAPIKey')
# Initialize OpenAI API
openai.api_key = OPENAI_API_KEY
# Load the PDF documents
loader = PyPDFLoader("ALJDecision.pdf")
Don't really understand what is going wrong here.
data = loader.load()
print(f'You have {len(data)} document(s) in your data')
if len(data) >= 31:
print(f'There are {len(data[30].page_content)} characters in your document')
else:
print("Data does not have an element at index 30")
# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
print(f'Now you have {len(texts)} documents')
# Index the documents
index = {}
for i, t in enumerate(texts):
response = openai.Completion.create(
engine="davinci",
prompt=t.page_content,
max_tokens=64,
temperature=0.5,
top_p=1.0,
n=1,
stop=None
)
text = response.choices[0].get("text") # Get the generated text
index[str(i)] = text
# Query the index
query = "Why did the judge deny this claim for social security disability?"
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": query}],
max_tokens=128,
temperature=0.5,
top_p=1.0,
n=1,
stop=None
)
query_text = response.choices[0].get("message").get("content") # Get the generated text
# Perform the search
results = []
for doc_id, doc_content in index.items():
if query_text in doc_content:
results.append(doc_id)
# Print the results
for doc_id in results:
print(doc_id)
Error:You have 19 document(s) in your data
Data does not have an element at index 30
Now you have 31 documents
