RAGs to Riches: Using RAG for Solving Zero-Shot Multi-label Classification of Documents ¶
Introduction ¶
Retrieval Augmented Generation (RAG) is one of the most popular techniques for applying large language models (LLMs) and offers an easy win for companies looking to get on board with the latest developments in artificial intelligence (AI). However, apart from productivity tools like chatbots and Q&A services, RAG has struggled to produce use cases that drive value in the same way as traditional machine learning and natural language processing. While Q&A services do improve companies' efficiency by putting more information at employees' fingertips, that value can be hard to quantify.
In this project, I offer a RAG-powered solution that drives value in a more direct and measurable way, by acting as a zero-shot multi-label classifier.
Overview of Retrieval Augmented Generation (RAG) ¶
RAG is, at its core, just a bit of clever prompting where you ask a question of an LLM while simultaneously providing the LLM with the information it needs to answer the question.
You can imagine handing a person a history textbook and asking them "What year was George Washington born?" The person would be able to look the answer up in the textbook, and this task would be even easier for them if we only provided them with the chunks of text from the book that spoke about George Washington. This is exactly what a RAG system does.
The first step of RAG is creating a knowledge bank, or vector store. In the context of the example above, this would mean taking the history book and creating a searchable database from its text. This is achieved by taking chunks of the text, vectorizing the chunks with the encoding side of a transformer model, and storing the embeddings in a vector database, where records can be retrieved based on vector similarity.
Next, when a user asks a question about the source material, the question is embedded using the same model used to embed the source material, and a similarity search is executed against the knowledge bank, retrieving the chunks of text that pertain to the question that's being asked.
The question, along with the context retrieved from the knowledge bank, are then sent to an LLM, which responds with the answer to the question based on the provided context.
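To make the retrieval step concrete, here's a deliberately simplified sketch of the idea. It stands in for the transformer embeddings with a toy bag-of-words vectorizer, purely for illustration; the real pipeline later in this post uses an Ollama embedding model and FAISS.

import numpy as np

def embed(text, vocab):
    """Toy stand-in for a transformer embedding: a bag-of-words count vector."""
    words = text.lower().split()
    return np.array([words.count(w) for w in vocab], dtype=float)

def retrieve(question, chunks, k=2):
    """Return the k chunks most similar to the question by cosine similarity."""
    vocab = sorted({w for chunk in chunks for w in chunk.lower().split()})
    chunk_vecs = [embed(chunk, vocab) for chunk in chunks]
    q_vec = embed(question, vocab)

    def cosine(a, b):
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(a @ b / denom) if denom else 0.0

    scores = [cosine(q_vec, v) for v in chunk_vecs]
    ranked = sorted(range(len(chunks)), key=lambda i: scores[i], reverse=True)
    return [chunks[i] for i in ranked[:k]]

# The retrieved chunks would then be pasted into the LLM prompt as context.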
RAG as a Document Classifier ¶
One of the most popular uses of RAG within industry is providing users with a way of engaging with large repositories of text information: think along the lines of giving employees the ability to ask questions about HR documents, benefits, or standard operating procedures. You could also envision legal departments being able to engage with long and complex documents. Typically, knowledge banks are composed of corpora of many documents, allowing answers to be compiled from diverse sets of sources, but by creating smaller knowledge banks from single documents and asking specific and intentional questions, one can transform RAG into a robust zero-shot classifier.
Imagine a simple classification use case where you want to classify a book as "Romance" or "Sci-Fi." You could create a knowledge bank from the book and then ask a prompt like, "Which genre best applies to this book: Romance or Sci-Fi?" Now, this wouldn't be the best use of LLM resources, since it could easily be done with classical ML (or even by just looking at the cover of the book). But imagine you have a set of questions that require deeper knowledge of the book's text: Is there a plot twist? Is it appropriate for teens? Does it contain any illicit themes? Questions like these would require someone to spend hours reading the book in full to ascertain the answers, but a RAG application could answer them in less than a minute (probably).
Now imagine a similar use case applied to a company's contracts. Does the contract contain a lease agreement? Is the payment structure based on progress milestones or on payment dates? Does it contain any rebates or warranties? These are questions a legal analyst could spend a significant amount of time trying to answer by reading and searching through the contract. Using RAG to address such a use case could reduce this time to essentially nothing. And with a RAG system's ability to provide the sources from which answers were obtained, the legal analyst would have the ability to double-check any answer the LLM provided.
Each of these questions could be framed as a binary classification problem solvable, zero-shot, using RAG.
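Concretely, each yes/no question pairs with one binary label, so the contract example above could be framed as something like this (the labels and wording here are purely illustrative):

# Illustrative only: each binary label maps to a yes/no question about the contract.
contract_label_questions = {
    'Lease Agreement': 'Does the contract contain a lease agreement?',
    'Milestone-based Payments': 'Is the payment structure based on progress milestones?',
    'Rebates or Warranties': 'Does the contract contain any rebates or warranties?',
}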
About the Data ¶
As the data set for this project, we'll use the Jupyter Notebooks of my personal data science projects, rendered as HTML. The task is multi-label classification: assigning to each project, from a list of possible concepts, the machine learning concepts that relate to it.
Packages and Setup ¶
This project will mostly depend on LangChain for orchestrating the document processing, the creation of the knowledge bank, and the prompting. To keep things cheap, we'll use open-source models served locally through Ollama (Llama 3) for our embeddings and LLM, and FAISS as an in-memory vector database.
import os
import multiprocessing
import time
# LLMs
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
# Text prep & vector store
from langchain_community.document_loaders import BSHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
# RAG
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
Basic RAG Pipeline ¶
Let's start by building a simple RAG pipeline that allows questions to be asked of my Animal Crossing time-series clustering project. We'll read in the HTML document, split it into chunks, and create the knowledge bank.
First, let's set the path for where the project HTML is stored and load in the LLMs.
# projects directory
DOC_PATH = '/home/nastory/repos/nigelstorydata_flask/nigelstorydata/templates/'
# load Ollama models
llm = Ollama(model='llama3', temperature=0)
embedding = OllamaEmbeddings(model='llama3')
Let's preview the text.
test_file = os.path.join(DOC_PATH, 'acnh.html')
with open(test_file, 'r') as f:
txt = f.read()
print(txt[:200])
<title>Animal Crossing New Horizons, the Stalk Market</title> <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"><
We'll use LangChain's BeautifulSoup HTML loader to read in and parse the project file. Then we'll split the project text into 1,000-character chunks with a 100-character overlap between chunks, and use the Ollama embedding model to create our FAISS vector database.
loader = BSHTMLLoader(test_file)
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=100
)
documents = splitter.split_documents(docs)
vector_store = FAISS.from_documents(documents, embedding)
Now we'll create the retriever, which will allow us to run similarity searches against our vector database.
retriever = vector_store.as_retriever()
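If we want to sanity-check retrieval before asking the LLM anything, we can query the retriever directly and peek at the chunks it returns (the query below is just an example):

# Preview the chunks the retriever finds for a sample query (illustrative only).
sample_docs = retriever.invoke("What clustering techniques are used in this project?")
for doc in sample_docs:
    print(doc.page_content[:120], '...')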
Next we'll create our prompt template. This is perhaps the most important step in the RAG process, as it allows us to determine the behavior of our LLM and pass through the questions and the context chunks retrieved by the retriever. Through our prompting, we can determine what kind of classifier our application will behave as later on. For now, we'll just have it behave as a Q&A service.
prompt_template = """
You are a question answer service. Given the provided context, which comes
from a machine learning project composed in Jupyter Notebooks by Nigel Story,
answer the question below.
<context>
{context}
</context>
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(prompt_template)
Now we'll compose our retriever, prompt, and LLM model into a pipeline into which we can pass our questions.
rag_chain = (
{'context': retriever, 'question': RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
Let's ask our model a question!
res = rag_chain.invoke("what are some of the ML techniques used in this project?")
print(res)
Based on the provided context from the Jupyter Notebooks, some Machine Learning (ML) techniques used in this project include:

1. **Random Forest Classification**: This is a popular ensemble learning method that combines multiple decision trees to improve predictive accuracy.
2. **Artificial Neural Network (ANN) classifier**: A type of feedforward neural network designed for classification tasks.

Additionally, the context mentions the use of:

3. **LabelEncoder()**: A pre-processing technique used to convert categorical labels into numerical representations.
4. **to_categorical()**: A function used to convert integer labels into one-hot encoded vectors.

These ML techniques are likely used in combination with other tools and libraries (e.g., scikit-learn, TensorFlow) to analyze and predict the trends of turnip prices in Animal Crossing New Horizons.
From this answer, one can definitely see the benefits of RAG. Without having to even look at this project, you could get a good idea of what it's about and which ML techniques were used.
Now let's formalize this pipeline and turn it into a zero-shot classifier.
RAG as a Zero-Shot Document Multi-label Classifier ¶
Using the same process as above, let's rework the prompting to create a multi-label classifier rather than a Q&A service. Multi-label classification is a more challenging use case than typical classification tasks, requiring more data and more advanced ML techniques to develop models. But piggy-backing off of pre-trained LLMs allows us to get reliable multi-label predictions without any additional training or even any additional data, i.e., zero-shot.
In the code cell below, we'll define a RAG pipeline that will allow a user to load in an HTML document along with a list of possible labels to be assigned to the document, and the LLM will decide which labels best apply to the document. For my projects, I'll try to label them according to machine learning concepts that are used within the projects.
def load_html_document(file_path):
"""Load in a project HTML document.
"""
loader = BSHTMLLoader(file_path)
doc = loader.load()
return doc
def chunk_document(doc, chunk_size=1000, chunk_overlap=100):
"""Split a document into text chunks for embedding.
"""
splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap
)
documents = splitter.split_documents(doc)
return documents
def rag_retriever(documents, embedding):
"""Create the vector database and the retriever
to execute RAG.
"""
vector_store = FAISS.from_documents(documents, embedding)
retriever = vector_store.as_retriever()
return retriever
def get_rag_prompt(classifier_prompt):
"""Create LangChain prompt template object.
"""
prompt = ChatPromptTemplate.from_template(classifier_prompt)
return prompt
def generate_questions(labels):
"""Generate questions to pass through the RAG pipeline
based on the user-provided possible labels.
"""
question_template = "Does this project relate to {} in a significant way?"
labels_questions = [(label, question_template.format(label)) for label in labels]
return labels_questions
def zero_shot_multi_label(file_path, labels, llm, embedding):
"""Ingest an HTML document and a list of possible labels and
execute zero-shot multi-label classification.
"""
classifier_prompt = """
You are a helpful yes or no answer service. Given the provided context, which comes
from a machine learning project composed in Jupyter Notebooks by Nigel Story,
answer the question below. Take your time and find the correct answer from
the context. Only respond with "Yes" or "No".
<context>
{context}
</context>
Question: {question}
"""
text = load_html_document(file_path)
documents = chunk_document(text, chunk_size=1000, chunk_overlap=100)
retriever = rag_retriever(documents, embedding)
prompt = get_rag_prompt(classifier_prompt)
rag_chain = (
{'context': retriever, 'question': RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
labels_questions = generate_questions(labels)
start = time.time()
preds = [(q[0], rag_chain.invoke(q[1])) for q in labels_questions]
print(f"time to process prompts: {time.time() - start}s")
return preds
def filter_preds(preds):
    """Filter binary outputs from LLM to only
    return the multi-label predictions.
    """
    # match leniently in case the model pads "Yes" with whitespace or punctuation
    return [p[0] for p in preds if p[1].strip().lower().startswith('yes')]
Now that the pipeline's built, we'll define a somewhat arbitrary list of concepts that could apply to my personal Python projects.
possible_labels = [
'SQL',
'Classification',
'Clustering',
'Regression',
'Web Development',
'Image Analytics',
'Anomaly Detection',
'Simulation',
'NLP'
]
Let's run the prediction pipeline and look at the results.
The project we're running through the pipeline uses a variety of techniques and technologies, but at its core, it's a clustering analysis followed by classification.
acnh_preds = zero_shot_multi_label(test_file, possible_labels, llm, embedding)
time to process prompts: 13.631207942962646s
filter_preds(acnh_preds)
['Classification', 'Clustering', 'Regression', 'Simulation']
The model captured the Classification and Clustering categories, and it added the Regression and Simulation categories. These last two are ones I had forgotten about, but there are aspects of both used in the project. So, a great result that, funnily enough, did better than I might have done just from memory!
However, compared to classical ML, 13 seconds is pretty slow for a single prediction, which is why this technique would mainly benefit complex classification tasks on large documents or corpora. But let's see if we can speed things up a little with parallel processing.
Parallel Prompting Implementation ¶
We can speed up the processing time a bit, and help the approach scale to larger label sets, by using parallel processing to execute the prompt created for each label.
def pool_invoke(label, prompt, rag_chain):
'''Global picklable func for use in Pool processing.
'''
response = rag_chain.invoke(prompt)
return label, response
def parallel_prompt_zero_shot_multi_label(file_path, labels, llm, embedding, n_jobs=1):
"""Parallelized zero-shot multi-label document classifier.
"""
classifier_prompt = """
You are a helpful yes or no answer service. Given the provided context, which comes
from a machine learning project composed in Jupyter Notebooks by Nigel Story,
answer the question below. Take your time and find the correct answer from
the context. Only respond with "Yes" or "No".
<context>
{context}
</context>
Question: {question}
"""
text = load_html_document(file_path)
documents = chunk_document(text, chunk_size=1000, chunk_overlap=100)
retriever = rag_retriever(documents, embedding)
prompt = get_rag_prompt(classifier_prompt)
rag_chain = (
{'context': retriever, 'question': RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
labels_questions = generate_questions(labels)
start = time.time()
    if n_jobs == -1:
        # note: using a nested function as a Process target relies on the fork
        # start method (the Unix default); it can't be pickled under spawn
        def mp_invoke(label, prompt, pred_dict):
            result = rag_chain.invoke(prompt)
            pred_dict[label] = result
# use max possible processes
manager = multiprocessing.Manager()
pred_dict = manager.dict()
jobs = []
for question in labels_questions:
p = multiprocessing.Process(target=mp_invoke, args=(question[0], question[1], pred_dict))
jobs.append(p)
p.start()
for p in jobs:
p.join()
results = pred_dict.items()
    else:
        # use the specified number of worker processes; apply_async submits all
        # prompts up front so they run concurrently (pool.apply would block on each)
        with multiprocessing.Pool(processes=n_jobs) as pool:
            async_results = [
                pool.apply_async(pool_invoke, args=(question[0], question[1], rag_chain))
                for question in labels_questions
            ]
            results = [r.get() for r in async_results]
print(f"time to process prompts: {time.time() - start}s")
return results
parallel_acnh_preds = parallel_prompt_zero_shot_multi_label(test_file, possible_labels, llm, embedding, n_jobs=-1)
time to process prompts: 11.637054204940796s
filter_preds(parallel_acnh_preds)
['Classification', 'Clustering', 'Regression', 'Simulation']
As expected, the time savings from parallel prompting are pretty marginal. A more effective use of parallelization would likely be in creating the vector stores and retrievers, which I hope to explore in future projects.
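As a rough, unbenchmarked sketch of that idea, the per-document vector stores could be built concurrently before any prompting happens. Since the embedding calls are network requests to the Ollama server, a thread pool avoids the pickling issues that worker processes would introduce; build_retriever below is a hypothetical helper that reuses the functions defined earlier.

from concurrent.futures import ThreadPoolExecutor

def build_retriever(file_path):
    """Hypothetical helper: load, chunk, and index a single document."""
    doc = load_html_document(file_path)
    chunks = chunk_document(doc)
    return file_path, rag_retriever(chunks, embedding)

def build_retrievers_concurrently(file_paths, n_workers=4):
    """Build one FAISS retriever per document, overlapping the embedding calls."""
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        return dict(executor.map(build_retriever, file_paths))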
Conclusion ¶
This approach to RAG -- using it as a vehicle for accomplishing complex classification tasks on large documents or corpora -- would provide businesses with value similar to the savings produced by classical ML, but the development time needed to produce those savings is drastically reduced. Rather than months of feature engineering, model selection, hyperparameter tuning, and training, an AI engineer could very quickly spin up a RAG-based zero-shot model that produces reliable results without much effort. This is where I see a lot of the savings from AI coming from in the future.
In follow-up projects, I'd like to examine cross-contamination of source materials when creating larger vector stores and classifying documents within the store, rather than creating a new vector store for each document. This could produce significant time savings and would be worth comparing to the parallel creation of vector stores for individual documents.