Graphs excel at representing and storing heterogeneous and interconnected information in a structured way, effortlessly capturing complex relationships and attributes across diverse data types. In contrast, vector databases often struggle with this kind of structured information, since their strength lies in handling unstructured data through high-dimensional vectors. In your RAG application, you can get the best of both worlds by combining structured graph data with vector search over unstructured text. That is what we will demonstrate in this blog post.
os.environ["OPENAI_API_KEY"] = "sk-" os.environ["NEO4J_URI"] = "bolt://localhost:7687" os.environ["NEO4J_USERNAME"] = "neo4j" os.environ["NEO4J_PASSWORD"] = "password" graph = Neo4jGraph()
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import TokenTextSplitter

# Read the Wikipedia article
raw_documents = WikipediaLoader(query="Elizabeth I").load()

# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
documents = text_splitter.split_documents(raw_documents[:3])
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview")
llm_transformer = LLMGraphTransformer(llm=llm)

# Extract graph data
graph_documents = llm_transformer.convert_to_graph_documents(documents)

# Store to Neo4j
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)
You can define which LLM you want the knowledge graph construction chain to use. At the moment, we support only function-calling models from OpenAI and Mistral; however, we plan to expand the LLM selection in the future. In this example, we are using the latest GPT-4. Note that the quality of the generated graph depends significantly on the model you are using; in theory, you always want to use the most capable one. The LLM graph transformers return graph documents, which can be imported into Neo4j through the add_graph_documents method. The baseEntityLabel parameter assigns an additional __Entity__ label to each node, which improves indexing and query performance and is what the entity full-text index below is built on, while include_source links each extracted entity back to the document chunk it was mentioned in.
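If you want to sanity-check what the transformer produced before relying on it, a graph document exposes the extracted nodes and relationships as plain lists. A minimal sketch (the exact entities you will see depend on the article text and the model):

# Peek at the first extracted graph document;
# contents vary with the source text and the LLM used
print(graph_documents[0].nodes[:5])
print(graph_documents[0].relationships[:5])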
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings

vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
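As a quick smoke test (not part of the pipeline itself), you can query the freshly built hybrid index directly; the question below is only an example, and the returned chunks depend on what was ingested:

# Retrieve the most similar document chunks for an example question
docs = vector_index.similarity_search("Who is Elizabeth I?")
print(docs[0].page_content[:200])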
from typing import List

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Extract entities from text
class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

entity_chain = prompt | llm.with_structured_output(Entities)
entity_chain.invoke({"question": "Where was Amelia Earhart born?"}).names # ['Amelia Earhart']
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text
    search. It processes the input string by splitting it into words and
    appending a similarity threshold (~2 changed characters) to each word,
    then combines them using the AND operator. Useful for mapping entities
    from user questions to database values, and allows for some misspellings.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()
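For example, an entity name extracted from a question is mapped to a fuzzy Lucene query like this (a hypothetical call; the output follows directly from the function above):

generate_full_text_query("Elizabeth I")
# 'Elizabeth~2 AND I~2'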
# Fulltext index query
def structured_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned in the question
    """
    result = ""
    entities = entity_chain.invoke({"question": question})
    for entity in entities.names:
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        result += "\n".join([el['output'] for el in response])
    return result
print(structured_retriever("Who is Elizabeth I?")) # Elizabeth I - BORN_ON -> 7 September 1533 # Elizabeth I - DIED_ON -> 24 March 1603 # Elizabeth I - TITLE_HELD_FROM -> Queen Of England And Ireland # Elizabeth I - TITLE_HELD_UNTIL -> 17 November 1558 # Elizabeth I - MEMBER_OF -> House Of Tudor # Elizabeth I - CHILD_OF -> Henry Viii # and more...
def retriever(question: str):
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    final_data = f"""Structured data:
{structured_data}
Unstructured data:
{"#Document ".join(unstructured_data)}
    """
    return final_data
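The final chain below also references a _search_query runnable that is not defined in the snippets shown here. Based on the example outputs further down, where a follow-up question is rewritten into a standalone one, it can be sketched as a branch that condenses the chat history together with the follow-up question when a history is present, and otherwise passes the question through unchanged. The prompt wording and the extra ChatOpenAI instance below are assumptions, not taken from the original code:

from typing import List, Tuple

from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableBranch, RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI

# Prompt that rewrites a follow-up question into a standalone question
# (wording is an assumption; adjust as needed)
_condense_template = """Given the following conversation and a follow up question,
rephrase the follow up question to be a standalone question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_condense_template)

def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    # Turn (human, ai) tuples into message objects for the prompt
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

_search_query = RunnableBranch(
    # If chat history is present, condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))),
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Otherwise just pass the question through unchanged
    RunnableLambda(lambda x: x["question"]),
)

With chat history present, the first branch produces a standalone question such as "When was Elizabeth I born?", which is what the retriever receives in the last example below.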
template = """Answer the question based only on the following context: {context} Question: {question} """ prompt = ChatPromptTemplate.from_template(template) chain = ( RunnableParallel( { "context": _search_query | retriever, "question": RunnablePassthrough(), } ) | prompt | llm | StrOutputParser() )
chain.invoke({"question": "Which house did Elizabeth I belong to?"}) # Search query: Which house did Elizabeth I belong to? # 'Elizabeth I belonged to the House of Tudor.'
chain.invoke(
    {
        "question": "When was she born?",
        "chat_history": [("Which house did Elizabeth I belong to?", "House Of Tudor")],
    }
)
# Search query: When was Elizabeth I born?
# 'Elizabeth I was born on 7 September 1533.'
The code is available at: