Exoneration documents, the records acquired during legal processes that seek to rectify miscarriages of justice, offer invaluable insights into wrongful conviction cases. In particular, they illuminate the roles and actions of law enforcement personnel. Yet the sheer volume and lack of structure of these documents pose challenges for researchers, lawyers, and advocates dedicated to transparency and justice.
In 2022, the Innocence Project New Orleans (IPNO) launched the Louisiana Law Enforcement Accountability Database (LLEAD), a consolidation of data from over 500 law enforcement agencies in Louisiana. To date, LLEAD hosts details of over 40,000 allegations of misconduct spanning 194 agencies across 48 of Louisiana's 64 parishes, making it the first state-wide database of its kind. LLEAD is already an essential tool for exoneration work, and including wrongful conviction information in the database would make it even more useful. For example, in Orleans Parish, Louisiana, 78% of wrongful convictions have been linked to law enforcement's failure to share exculpatory evidence with the defense, a rate more than double the national average.
Given this backdrop, we seek to make these collections searchable and useful for lawyers, advocates, and community members to better investigate patterns of police misconduct and corruption. To do so, we rely on a multi-stage process:
Metadata Compilation: We started by compiling a comprehensive CSV index. This structured approach forms the foundation of our file management system, enabling file retrieval and basic deduplication. The metadata we organize in this step includes:
Page classification: The documents in the collection are varied, representing all documents produced or acquired in the course of an exoneration case, with case timelines going back decades. After some internal review and discussions with the IPNO case management team, we narrowed our focus to three types of documents: reports, transcripts, and testimonies.
Page classification involves building a classification model to categorize files (or page sequences within files) into these different types of documents. One approach is to fine-tune a pretrained convolutional neural network to label thumbnail images of document pages. Using thumbnails is advantageous because they are smaller files, resulting in faster processing and reduced computational resource consumption. This makes them an effective approach for retrieving specific types of documents from disorganized collections, as described in Evaluation of Deep Convolutional Nets for Document Image Classification and Retrieval. In order to use this technique, we needed training data and a pretrained model. To quickly assemble a training data set for our page classifier, we started by noticing that in many cases the file name indicated the document type. These documents were scanned by many people at different times, so we could not rely on this heuristic for comprehensive categorization of documents, but there was more than enough there to jumpstart our training process. We collected our initial training data by probing filenames for specific search terms, and reviewing and confirming that we had inferred the correct document types from the filenames. Once we had training data, we used FastAI to fine-tune the ResNet34
architecture, pretrained on ImageNet, to identify reports, transcripts, and testimonies based on page thumbnails. With the trained classifier, we were able to measure generalization performance on documents that couldn't be classified via filename, and we were also better able to target additional training data, for example by reviewing pages where the classifier had low confidence about its prediction.
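As a rough illustration of this step, the sketch below fine-tunes a ResNet34 on page thumbnails with fastai (v2); the directory layout, label names, and training schedule are assumptions rather than our exact configuration.
from fastai.vision.all import *

def train_page_classifier(thumbnail_dir="data/thumbnails"):
    # Assumes thumbnails are organized into one folder per document type,
    # e.g. report/, transcript/, testimony/ (hypothetical layout)
    dls = ImageDataLoaders.from_folder(
        thumbnail_dir, valid_pct=0.2, seed=42, item_tfms=Resize(224)
    )
    # Fine-tune a ResNet34 pretrained on ImageNet to label page thumbnails
    learn = vision_learner(dls, resnet34, metrics=accuracy)
    learn.fine_tune(3)
    return learn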
Information Extraction: Currently, we're engaged in extracting structured information from the documents we've identified, and that work is the focus of the current post. Our goal is to extract structured information related to each police officer or prosecutor mentioned in the documents, such as their names, ranks, and roles in the wrongful conviction.
Deduplication: The previous step leaves us with many distinct mentions, but some individuals are mentioned many times, within the same case or across cases. Here we rely on HRDAG's extensive experience with database deduplication to create a unique index of officers and prosecutors involved in wrongful convictions, along with a record of the role or roles each played in the wrongful conviction.
Cross-referencing: In the final stage, we'll cross-reference the officer names and roles we've extracted with the Louisiana Law Enforcement Accountability Database (LLEAD.co). This stage will assist us in identifying other individuals linked with the implicated officers, such as their partners, those co-accused in misconduct complaints, or those co-involved in use-of-force incidents. The list of officers associated with previous wrongful conviction cases can then be cross-referenced with the IPNO's internal data on potential wrongful convictions with the aim of uncovering new instances of wrongful convictions.
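As a preview of what that cross-referencing might look like, here is a minimal sketch; the file names, column names, and exact-match join are placeholders, and the production matching will build on the deduplication work described above.
import pandas as pd

def cross_reference_with_llead(extracted_path="output/extracted_officers.csv",
                               llead_path="data/llead_personnel.csv"):
    # Both file names and column names here are hypothetical
    extracted = pd.read_csv(extracted_path)   # e.g. officer_name, role, case_id
    llead = pd.read_csv(llead_path)           # e.g. officer_name, agency, uid
    # Normalize names before joining; real matching would be fuzzier than this
    extracted["name_key"] = extracted["officer_name"].str.lower().str.strip()
    llead["name_key"] = llead["officer_name"].str.lower().str.strip()
    return extracted.merge(llead, on="name_key", how="left", suffixes=("", "_llead"))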
A primary task in our process is extracting officer information from documents – specifically, the officer's name and the role the officer played in the wrongful conviction. The extraction of such information is crucial for understanding the dynamics and potential lapses that led to the conviction. Given the importance of this task, it's essential to approach it with a methodology that ensures accuracy and comprehensiveness.
We initially considered a regex-based solution for this extraction task. Regular expressions, or regexes, are powerful tools for pattern matching within text. However, as we delved deeper into our data, we realized that the complexity and variability of the content rendered regex less than ideal. While regex excels at identifying specific patterns within text, it often struggles with variations in language and nuances that commonly appear in natural language texts, such as police reports and court transcripts.
Consider the text string from a court transcript reading, "Sergeant Ruiz was mentioned as being involved in the joint investigation with Detective Martin Venezia regarding the Seafood City burglary and the murder of Kathy Ulfers." Such a sentence poses challenges for regex due to its inability to capture semantic context. Without understanding the broader narrative, regex cannot infer that Sergeant Ruiz acted as a lead detective in Kathy Ulfers' murder case.
To further highlight the limitations of regex in handling such tasks, we designed a simple baseline model. Instead of attempting to capture the full scope of officer information extraction, this model focuses solely on extracting officer names as a starting point. This choice was intentional; by narrowing down the task, we hoped to provide a clear example of the strengths and weaknesses of regex in the context of real-world data.
import re

# Baseline: match a law-enforcement title followed by one or two capitalized words
pattern = re.compile(
    r"(detective|sergeant|lieutenant|captain|corporal|deputy|criminalist|technician|investigator"
    r"|det\.|sgt\.|lt\.|cpt\.|cpl\.|dty\.|tech\.|dr\.)\s+([A-Z][A-Za-z]*(\s[A-Z][A-Za-z]*)?)",
    re.IGNORECASE,
)
After implementing our baseline model, we tested its performance on two different sets of data: police reports and court transcripts.
Police Reports Results:
Precision (0.845): Among the instances our model predicted as officer names, 84.5% were indeed officer names. High precision suggests the model is quite reliable in its positive predictions.
Recall (0.518): Our model identified only 51.8% of the actual officer names present in the police reports. Lower recall means that while our predictions are accurate, we are missing a significant number of true officer names.
F1 score (0.614): The F1 score harmonizes precision and recall, giving us a balanced view of the model's performance. At 0.614, it suggests there is room for improvement, especially in capturing more true positives without sacrificing precision.
F-beta score (0.549): This score is a weighted harmonic mean of precision and recall that gives more weight to recall. A score of 0.549 further emphasizes the model's challenges in identifying all true positives.
Court Transcripts Results:
Precision: Similar to the police reports, our model displayed high precision on court transcripts, indicating its reliability in positive predictions.
Recall: However, recall is notably lower on the court transcripts, meaning our model missed more than half of the actual officer names present in these documents.
F1 score: The F1 score for court transcripts is lower than that for police reports, suggesting a more pronounced trade-off between precision and recall in this dataset.
F-beta score: Once again, the F-beta score underscores the need to improve recall without compromising precision.
While our regex-based baseline model exhibits high precision on both datasets, it struggles notably with recall. This indicates that while the names it identifies as officers are likely correct, it misses a substantial number of actual officer names present in the documents. These findings further emphasize the challenges of using regex alone for such a complex task and underscore the need for more advanced techniques that can capture the nuances and variations in language.
An alternative approach is to prompt a generative language model with the document text along with a query describing our required output. One challenge with this approach is that the documents we're processing may be hundreds of pages long, whereas generative models limit the length of the prompt you can supply. We needed a way to pull out of each document just the chunks of text where the relevant officer information appears, so we could provide a more helpful prompt.
We split the problem into two steps: identifying the relevant chunks of text content, and then extracting structured officer information from those chunks. We use LangChain, a framework for composing language-model pipelines, to manage this process, with OpenAI's GPT-3.5-Turbo-16k as the language model powering the pipeline.
For the first step, identifying the relevant chunks of text within the larger document, we used the approach outlined in Precise Zero-Shot Dense Retrieval without Relevance Labels. This approach splits our information retrieval task into multiple steps:
Here is the method we use to generate hypothetical embeddings. The resulting object can be used to embed chunks of text, enabling efficient similarity search over them.
# LangChain imports used throughout this post (2023-era LangChain API)
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.embeddings import OpenAIEmbeddings

PROMPT_TEMPLATE_HYDE = PromptTemplate(
input_variables=["question"],
template="""
You're an AI assistant specializing in criminal justice research.
Your main focus is on identifying the names and providing detailed context of mention for each law enforcement personnel.
This includes police officers, detectives, deputies, lieutenants, sergeants, captains, technicians, coroners, investigators, patrolmen, and criminalists,
as described in court transcripts.
Be aware that the titles "Detective" and "Officer" might be used interchangeably.
Be aware that the titles "Technician" and "Tech" might be used interchangeably.
Question: {question}
Roles and Responses:""",
)
def generate_hypothetical_embeddings():
    # LLM that writes the hypothetical documents for HyDE
    llm = OpenAI()
    prompt = PROMPT_TEMPLATE_HYDE
    llm_chain = LLMChain(llm=llm, prompt=prompt)

    # Base embeddings applied to the hypothetical documents
    base_embeddings = OpenAIEmbeddings()

    embeddings = HypotheticalDocumentEmbedder(
        llm_chain=llm_chain, base_embeddings=base_embeddings
    )
    return embeddings
The process_single_document function converts an input document into a vector database of chunks. It uses LangChain's RecursiveCharacterTextSplitter to split documents into chunks of 500 characters, maintaining an overlap of 250 characters to ensure contextual continuity.
There are times when the model might inadvertently identify names without clear ties to law enforcement personnel. By cross-referencing the model's output with LLEAD, we believe we will be able to filter out many such false positives. (Some law enforcement personnel mentioned in the documents will be absent from LLEAD, but our current focus is on officers we can track using LLEAD.) On the other hand, we have no way of recovering officer mentions that are not picked up by our extraction process. In light of this, when evaluating the model we are more interested in maximizing recall, ensuring we identify as many genuine law enforcement mentions as we can. To quantify this focus on recall, we employ the F-beta score (with β=2), which weighs recall twice as heavily as precision.
We tested the model using chunk sizes of 2000, 1000, and 500, with corresponding overlaps of 1000, 500, and 250. Based on our evaluations, the optimal configuration is a chunk size of 500 with an overlap of 250. After segmentation, the text is transformed into a high-dimensional space using precomputed embeddings from our hypothetical document embedder. The FAISS.from_documents function aids in this transformation, constructing an indexed document database designed for similarity searches.
import logging

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

logger = logging.getLogger(__name__)

def process_single_document(file_path, embeddings):
    logger.info(f"Processing document: {file_path}")

    # JSONLoader reads the OCR'd document text
    loader = JSONLoader(file_path)
    text = loader.load()
    logger.info(f"Text loaded from document: {file_path}")

    # Split the text into overlapping chunks and index them for similarity search
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=250)
    docs = text_splitter.split_documents(text)
    db = FAISS.from_documents(docs, embeddings)
    return db
In the following sections, we define the core function get_response_from_query(db, query). This function serves as the backbone of our information extraction process, taking in a document database and a query, and returning the system's response to the query.
The process begins by setting up the relevant parameters. We use a prompt template to guide the query and a role template to define the roles we're interested in. We set the temperature parameter to 0 to maximize the determinism of our responses. The k parameter is set to 20, a decision guided by the F-beta score results from our testing phase, instructing the system to select and concatenate the top 20 relevant text chunks from the document corpus. These chunks are then reordered by similarity score to maximize the model's performance: as suggested in the paper Lost in the Middle: How Language Models Use Long Contexts, current language models perform best on retrieval tasks when the relevant data sits at the beginning or end of the context window rather than in the middle.
The relevant chunks of text are then passed to the LLMChain class of the LangChain module as part of the 'run' method. In addition to relevant chunks, the 'run' method also receives the PromptTemplate, RoleTemplate, and the original query.
The LLMChain processes these inputs and generates a structured response to the initial query.
PROMPT_TEMPLATE_MODEL = PromptTemplate(
input_variables=["roles" ,"question", "docs"],
template="""
As an AI assistant, my role is to meticulously analyze court transcripts, consider traditional officer roles, and extract information about law enforcement personnel.
Query: {question}
Transcripts: {docs}
Roles: {roles}
The response will contain:
1) The name of an officer, detective, deputy, lieutenant,
sergeant, captain, coroner, investigator, criminalist, patrolman, or technician -
if an individual's name is not associated with one of these titles, they do not work in law enforcement.
Please prefix the name with "Officer Name: ".
For example, "Officer Name: John Smith".
2) If available, provide an in-depth description of the context of their mention.
If the context induces ambiguity regarding the individual's employment in law enforcement,
remove the individual.
Please prefix this information with "Officer Context: ".
3) Review the context to discern the role of the officer.
Please prefix this information with "Officer Role: "
For example, the column "Officer Role: Lead Detective" will be filled with a value of 1 for officers who were the lead detective.
""",
)
ROLE_TEMPLATE = """
US-IPNO-Exonerations: Model Evaluation Guide
Roles:
Lead Detective
• Coordinates with other detectives and law enforcement officers on the case.
• Liaises with the prosecutor's office, contributing to legal strategy and court proceedings.
Crime Lab Analyst:
• Analyzes various types of evidence gathered during an investigation, including but not limited to DNA, fingerprints, blood samples, and drug substances.
• Prepares detailed reports outlining the findings of their analyses.
"""
from langchain.chat_models import ChatOpenAI

def get_response_from_query(db, query):
    # Set up the parameters
    prompt = PROMPT_TEMPLATE_MODEL
    roles = ROLE_TEMPLATE
    temperature = 0
    k = 20

    # Perform the similarity search
    doc_list = db.similarity_search_with_score(query, k=k)

    # Sort chunks by relevance score, then reorder them so the most relevant chunks
    # appear at the beginning of the context and the least relevant in the middle,
    # following "Lost in the Middle"
    docs = sorted(doc_list, key=lambda x: x[1], reverse=True)
    third = len(docs) // 3
    highest_third = docs[:third]
    middle_third = docs[third:2*third]
    lowest_third = docs[2*third:]
    highest_third = sorted(highest_third, key=lambda x: x[1], reverse=True)
    middle_third = sorted(middle_third, key=lambda x: x[1], reverse=True)
    lowest_third = sorted(lowest_third, key=lambda x: x[1], reverse=True)
    docs = highest_third + lowest_third + middle_third
    docs_page_content = " ".join([d[0].page_content for d in docs])

    # Create an instance of the OpenAI chat model, using the temperature set above
    llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=temperature)

    # Create an instance of the LLMChain
    chain = LLMChain(llm=llm, prompt=prompt)

    # Run the LLMChain and print the response
    response = chain.run(roles=roles, question=query, docs=docs_page_content)
    print(response)

    # Return the response
    return response
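To tie these pieces together, here is a hedged end-to-end sketch using the example query discussed below; the file path is illustrative rather than an actual path from our collection.
embeddings = generate_hypothetical_embeddings()
db = process_single_document("data/convictions/transcripts/example.json", embeddings)

query = (
    "Identify individuals, by name, with the specific titles of officers, sergeants, "
    "lieutenants, captains, detectives, homicide officers, and crime lab personnel in "
    "the transcript. Specifically, provide the context of their mention related to "
    "key events in the case, if available."
)
response = get_response_from_query(db, query)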
For additional context, see the following inputs and outputs:
Query
"Identify individuals, by name, with the specific titles of officers, sergeants, lieutenants, captains, detectives, homicide officers, and crime lab personnel in the transcript. Specifically, provide the context of their mention related to key events in the case, if available."
Relevant Document
(1 of 20 documents identified by the FAISS similarity search as relevant)
Martin Venezia, New Orleans police sergeant. A 16 .01 Sergeant Venezia, where are you assigned now? : - A Second Police District. 13 . And in October, September of 1979 and in Q 19 September and October of 1980, where were you assigned? :1 Homicide division. A. And how long have you been on the police department right now? Thirteen and a half years. A Officer Venezia, when did you or did you ever take over the investigation of ... Cathy Ulfers' murder? A", metadata={'source': '../../data/convictions/transcripts/iterative\(C) Det. Martin Venezia Testimony - Trial One.docx'
Response from the Model
Officer Name: Sergeant Martin Venezia
Officer Context: Sergeant Martin Venezia, formerly assigned to the Homicide Division, took over the investigation of Cather Ulfers murder.
Officer Role: Lead Detective
In our effort to optimize the model's capability to extract officer names from documents, we evaluated it on various parameters. The following tests were run using GPT-4.
Preprocessing Parameters:
Model-specific Parameters:
For evaluating our model's performance, we utilized the F-beta score as our primary metric. Unlike the F1 score, which gives equal weight to precision (correctness) and recall (completeness), the F-beta score allows for differential weighting. We designed our score to weigh recall twice as much as precision, reflecting the importance of accurately spotting relevant information, even if it means occasionally flagging some irrelevant content.
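For reference, this is the score we use; the example values below come from the best police-report configuration in the summary table that follows.
def f_beta(precision, recall, beta=2.0):
    # Weighted harmonic mean of precision and recall; beta=2 counts recall
    # twice as heavily as precision
    return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)

f_beta(0.981308, 0.840000)  # ~0.865, matching the top police-report row below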
Based on our evaluations, our model performed best with a chunk size of 500, an overlap of 250, k = 20, HyDE embeddings enabled, and a temperature of 1.
For police reports, the F-beta score reached 0.864909, while for transcripts, the F-beta score peaked at 0.813397.
Although larger chunk sizes, such as 1000 and 2000, might offer advantages for certain applications, they resulted in lower F-beta scores in our tests. Similarly, larger overlaps of 500 and 1000 reduced performance, despite the potential for more context. The consistent advantage of incorporating HyDE embeddings was evident, underscoring their value to our model.
Another key observation concerned the temperature parameter, which controls the model's level of randomness. With the temperature set to 1, we generally saw higher F-beta scores, especially for identifying officer names in police reports. As we move to the next phase, extracting detailed context about each officer's role within the document, the handling of this parameter will be crucial, because a high temperature can skew results or generate "hallucinated" content.
import pandas as pd

def read_summary():
    # Load the parameter-sweep results and sort by F-beta score
    summary = pd.read_excel("data/overall-summary-with-F1-Fbeta.xlsx")
    summary = summary.sort_values("F_beta", ascending=False)
    return summary
read_summary()
| | chunk_size | chunk_overlap | temperature | k | hyde | filetype | FN | FP | TP | n_files | precision | recall | F1 | F_beta |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 500 | 250 | 1 | 20 | 1 | police-report | 20 | 2 | 105 | 5 | 0.981308 | 0.840000 | 0.905172 | 0.864909 |
| 2 | 2000 | 1000 | 1 | 5 | 0 | police-report | 12 | 32 | 71 | 5 | 0.689320 | 0.855422 | 0.763441 | 0.816092 |
| 0 | 500 | 250 | 1 | 20 | 1 | transcript | 3 | 27 | 34 | 4 | 0.557377 | 0.918919 | 0.693878 | 0.813397 |
| 1 | 500 | 250 | 0 | 20 | 1 | police-report | 6 | 56 | 60 | 5 | 0.517241 | 0.909091 | 0.659341 | 0.789474 |
| 8 | 2000 | 1000 | 0 | 5 | 0 | police-report | 15 | 13 | 54 | 5 | 0.805970 | 0.782609 | 0.794118 | 0.787172 |
| 3 | 2000 | 1000 | 1 | 5 | 1 | transcript | 3 | 11 | 17 | 3 | 0.607143 | 0.850000 | 0.708333 | 0.787037 |
| 6 | 1000 | 500 | 0 | 10 | 1 | transcript | 15 | 31 | 57 | 6 | 0.647727 | 0.791667 | 0.712500 | 0.757979 |
| 10 | 2000 | 1000 | 0 | 5 | 1 | transcript | 22 | 18 | 60 | 7 | 0.769231 | 0.731707 | 0.750000 | 0.738916 |
| 7 | 2000 | 1000 | 0 | 5 | 1 | police-report | 13 | 37 | 49 | 5 | 0.569767 | 0.790323 | 0.662162 | 0.733533 |
| 12 | 1000 | 500 | 1 | 10 | 1 | police-report | 34 | 10 | 78 | 5 | 0.886364 | 0.696429 | 0.780000 | 0.727612 |
| 11 | 2000 | 1000 | 1 | 5 | 1 | police-report | 37 | 19 | 86 | 5 | 0.819048 | 0.699187 | 0.754386 | 0.720268 |
| 9 | 500 | 250 | 0 | 20 | 1 | transcript | 19 | 29 | 53 | 6 | 0.646341 | 0.736111 | 0.688312 | 0.716216 |
| 5 | 1000 | 500 | 0 | 10 | 1 | police-report | 13 | 70 | 61 | 5 | 0.465649 | 0.824324 | 0.595122 | 0.714286 |
| 14 | 2000 | 1000 | 0 | 5 | 0 | transcript | 44 | 36 | 50 | 9 | 0.581395 | 0.531915 | 0.555556 | 0.541126 |
| 13 | 1000 | 500 | 1 | 10 | 1 | transcript | 16 | 32 | 19 | 4 | 0.372549 | 0.542857 | 0.441860 | 0.497382 |
After evaluating the model across these parameters, the next phase delved into understanding the model's behavior over iterative runs. Due to the stochastic nature of generative text models, a single document can yield diverse outputs when processed multiple times with the same parameters. This highlighted the challenge of identifying an optimal number of iterations, a balance that ensures comprehensive extraction while remaining cost-efficient. In the interest of cost efficiency, the following tests were run using GPT-3.5-Turbo-16K; the decline in performance relative to the results above can be attributed to this change in model.
To address this, we employed two distinct query strategies:
Multiple Queries Approach: This strategy used six unique queries, each crafted to extract specific facets of the required information. The queries are as follows:
Query 1: Identify individuals, by name, with the specific titles of officers, sergeants, lieutenants, captains, detectives, homicide officers, and crime lab personnel in the transcript. Specifically, provide the context of their mention related to key events in the case, if available.
Query 2: List individuals, by name, directly titled as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel mentioned in the transcript. Provide the context of their mention in terms of any significant decisions they made or actions they took.
Query 3: Locate individuals, by name, directly referred to as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Explain the context of their mention in relation to their interactions with other individuals in the case.
Query 4: Highlight individuals, by name, directly titled as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Describe the context of their mention, specifically noting any roles or responsibilities they held in the case.
Query 5: Outline individuals, by name, directly identified as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Specify the context of their mention in terms of any noteworthy outcomes or results they achieved.
Query 6: Pinpoint individuals, by name, directly labeled as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Provide the context of their mention, particularly emphasizing any significant incidents or episodes they were involved in.
Singular Query Approach: This method employed a single comprehensive query, designed to holistically capture all the desired information facets. We ran the document through the same query repeatedly, which produced slightly different responses each time, and then collected the results together. The query is:
In the context of police reports, a detailed analysis of the singular query method showed that its performance improved with each iteration up to the 4th, with a marked increase in the F-beta score at each step. Past that point, however, the incremental gains diminished, indicating we had hit the point of diminishing returns.
The analysis of court transcripts, on the other hand, offered a more nuanced picture: both the singular and the six-query methods exhibited an upward trend in their performance metrics through the 6th iteration.
Analyzing the results from both police reports and court transcripts gave us confidence in the singular query method. It consistently balanced performance against computational and cost demands, with diminishing gains beyond the 4th iteration in both datasets. Based on this analysis, we selected the singular query strategy, run over 4 iterations, for all document types.
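For concreteness, here is a rough sketch of the singular query strategy over several iterations; it assumes the get_response_from_query function above plus a hypothetical parse_officer_names helper that pulls the "Officer Name:" lines out of a response.
def run_singular_query(db, query, n_iterations=4):
    # The same query yields slightly different responses on each run,
    # so we union the extracted officer names across iterations
    officers = set()
    for _ in range(n_iterations):
        response = get_response_from_query(db, query)
        officers.update(parse_officer_names(response))  # parse_officer_names is hypothetical
    return officers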
Currently, GPT-4's pricing is $0.03 per 1K tokens for inputs and $0.06 per 1K tokens for outputs. In contrast, GPT-3.5-Turbo-4K is priced at $0.0015 per 1K tokens for inputs and $0.002 per 1K tokens for outputs, which makes GPT-4 roughly 20x as expensive for inputs and 30x as expensive for outputs. Given these cost considerations, coupled with the challenges our existing GPT-3.5-Turbo-16K model faced in extracting officer details from documents in our FAISS similarity database, we've pivoted our focus toward the GPT-3.5-Turbo-4K model. While GPT-3.5-Turbo-4K itself isn't new, the capability to fine-tune it was introduced by OpenAI in August 2023, offering a promising avenue for improvement.
In order to address our model's shortcomings, we found ways to efficiently generate additional training data. Using document samples, we extracted details about individual law enforcement officers, their contexts, and roles. Recognizing the potential of GPT-4, we leveraged its capabilities to craft training documents that closely resembled our real-world challenges. We provided GPT-4 with sample documents based on authentic data, enabling it to produce outputs with the stylistic nuances we often encounter—like poor OCR quality, fragmented sentences, inconsistent capitalization, and syntactic inconsistencies.
Here's an example of the training data we generated using GPT-4:
{
  "messages": [
    { "role": "system", "content": "..." },
    { "role": "user", "content": "..." },
    { "role": "assistant", "content": "..." }
  ]
}
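For context, here is a hedged sketch of submitting such examples for fine-tuning with the 2023-era openai Python SDK; the JSONL file name is a placeholder.
import openai

# Upload a JSONL file in which each line is a {"messages": [...]} record like the one above
training_file = openai.File.create(
    file=open("data/fine_tune_training.jsonl", "rb"),
    purpose="fine-tune",
)

# Launch a fine-tuning job on top of GPT-3.5-Turbo
job = openai.FineTuningJob.create(
    training_file=training_file.id,
    model="gpt-3.5-turbo",
)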
In our fine-tuning experiments, we worked with four dataset sizes: 25, 50, 75, and 100 examples. Analyzing the outcomes, a clear trend emerged: as we increased the dataset size, the model's performance improved incrementally. Even with the constraints of the 4k token limit, which led us to adjust our K parameter from 25 to 15, our model exhibited differentiated performance across document types. It surpassed the GPT-3.5-Turbo-16k model when processing court transcripts and matched its efficiency for police reports. However, as promising as these strides are, they haven't yet reached the capabilities of GPT-4 (See appendix for GPT-4 results).
The results from our current fine-tuning experiments with GPT models provide valuable insights into the potential and limitations of AI in data extraction tasks. Our observations underscore the significance of dataset size and quality, as well as the implications of token constraints on model performance. As we move forward:
We will delve deeper into the interplay between token limits and extraction accuracy, particularly in documents with varying complexities. We'll further investigate the optimal balance between training data volume and model efficiency, exploring potential diminishing returns or inflection points. Given the differentiated performance across document types, our research will also focus on domain-specific fine-tuning to optimize extraction from court transcripts, police reports, and other legal documents.