from IPython.display import Image
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from ast import literal_eval
Image(filename='../imgs/banner.png')
# import tables
def read_format(csvf):
    df = pd.read_csv(csvf)
    df['proportion_matched'] = df.percentage_matched.apply(lambda x: round(x/100, 2))
    metrics = ['precision', 'recall', 'f1_score', 'f_beta_score']
    for metric in metrics:
        df[metric] = df[metric].apply(lambda x: round(x, 2))
    return df
files = {
    # allContext (iterate over all of the content within the document with one api call)
    'fullcontext_sonnet': '../data/allContext/results_claude-3-5-sonnet-20240620.csv',
    'fullcontext_haiku': '../data/allContext/results_claude-3-haiku-20240307.csv',
    'fullcontext_opus': '../data/allContext/results_claude-3-opus-20240229.csv',
    'fullcontext_nemo': '../data/allContext/results_open-mistral-nemo.csv',
    # allPages (iterate over each page with one api call per page)
    'allpages_haiku': '../data/allPages/results_claude-3-haiku-20240307.csv',
    'allpages_sonnet': '../data/allPages/results_claude-3-5-sonnet-20240620.csv',
    'allpages_mixtral_7b': '../data/allPages/Mixtral-8x7B-Instruct-v0.1.csv',
    'allpages_mixtral_22b': '../data/allPages/Mixtral-8x22B-Instruct-v0.1.csv',
    # ner (use Named Entity Recognition to calculate the number of entities on each page as a preprocessing step.
    # The script then processes the document in three separate iterations.
    # In each iteration, it focuses on a different top fraction of the pages containing the most entities:
    # first the top 1/4th, then the top 1/2, and finally the top 3/4ths of the document.
    # For each fraction, the script identifies the pages with the highest number of entities and then iterates over each of those pages,
    # similar to the allPages script, to extract the entities.)
    'ner_25_haiku': '../data/ner/results_claude-3-haiku-20240307-25per.csv',
    'ner_50_haiku': '../data/ner/results_claude-3-haiku-20240307-50per.csv',
    'ner_75_haiku': '../data/ner/results_claude-3-haiku-20240307-75per.csv',
    'ner_25_sonnet': '../data/ner/results_claude-3-5-sonnet-20240620-25per.csv',
    'ner_50_sonnet': '../data/ner/results_claude-3-5-sonnet-20240620-50per.csv',
    'ner_75_sonnet': '../data/ner/results_claude-3-5-sonnet-20240620-75per.csv',
    'ner_25_mixtral_7b': '../data/ner/Mixtral-8x7B-Instruct-v0.1-25per.csv',
    'ner_50_mixtral_7b': '../data/ner/Mixtral-8x7B-Instruct-v0.1-50per.csv',
    'ner_75_mixtral_7b': '../data/ner/Mixtral-8x7B-Instruct-v0.1-75per.csv',
    'ner_25_mixtral_22b': '../data/ner/Mixtral-8x22B-Instruct-v0.1-25per.csv',
    'ner_50_mixtral_22b': '../data/ner/Mixtral-8x22B-Instruct-v0.1-50per.csv',
    'ner_75_mixtral_22b': '../data/ner/Mixtral-8x22B-Instruct-v0.1-75per.csv',
    # Experimental vision model results
    'haiku_vision_allpages': '../data/Vision/results_claude-3-haiku-20240307.csv',
    'sonnet_vision_allpages': '../data/Vision/results_claude-3-5-sonnet-20240620.csv',
}
dfs = []
for label, path in files.items():
    df = read_format(csvf=path)
    df['analysis_type'] = label
    dfs.append(df)

full = pd.concat(dfs)
full = full.drop(columns=[
    col for col in full.columns
    if ('unnamed' in col.lower()) or (('file' in col.lower()) and ('name' in col.lower()))
], errors='ignore')
full = full.rename(columns={"total_ground_truth": "n_entities"})
frozen = pd.read_parquet("../data/frozen/results.parquet")
assert full.equals(frozen), "Results data do not match frozen/expected data."
assert (full.proportion_matched == full.recall).all()
# minor formatting of results
index = ['model', 'analysis_type',]
maincols = [
'filetype',
'token_count', 'n_entities',
'true_positives', 'false_positives',
'recall','precision', 'f1_score', 'f_beta_score',
'matched_names', 'unmatched_names',
]
main_windex = index + maincols
df = full[main_windex]
df_names = df.copy()
Introduction
In the previous chapter of this series, we explored the application of large language models (LLMs) for structured information extraction from wrongful conviction case files using retrieval augmented generation (RAG). However, recent advancements in LLM technology have necessitated a re-evaluation of our information extraction pipeline. Models like Gemini 1.5 Pro with its one million token context window and Claude 3 Opus/Sonnet/Haiku with a 200k context window now allow entire documents to fit within a single context window, potentially eliminating the need for retrieving specific document pages. Furthermore, the emergence of cost-effective yet high-performing models like Claude Haiku makes it feasible to iterate over every page in a document, rather than attempting to extract only the most relevant pieces for analysis.
This follow-up chapter aims to reconsider the fundamentals of our information extraction pipeline and explore the impact of both larger proprietary models and emerging open-source alternatives on our research. Our approach of evaluating LLMs in legal research by focusing on entity extraction aligns with recent computational law research suggesting that AI might be more effective in narrow, well-defined legal applications [1]. By concentrating on the specific task of identifying police officers in wrongful conviction case files, we can assess LLM performance in a constrained legal context that is more amenable to large-scale evaluation due to its relative simplicity. This targeted approach differs from more complex tasks, such as writing legal briefs, attempted by generative AI legal research tools like Lexis+ AI and Casetext, which recent studies have shown to be susceptible to high rates of hallucination [2]. Unlike generative tasks where correctness can be subjective, identifying specific entities allows for clear, binary assessments of performance (correct identification vs. incorrect or missed identification), enabling more robust, quantifiable, and familiar evaluation metrics. Recall, for example, measures the proportion of police officers that were correctly identified by each model, which is both conceptually and practically tangible for the purpose of model evaluation.
While our approach may still produce false positives, entity extraction tasks are generally less susceptible to the negative effects of hallucination, as these errors can often be identified and mitigated through careful prompt engineering and post-processing. In this experiment, we used ground truth labels identified through regex patterns to match the names extracted by each LLM to expected values, where matches are determined by a Jaro-Winkler similarity score above 80%. This focused approach allows us to develop techniques that can potentially extend to other entities (e.g., witnesses, victims, locations, evidence, legal precedents) and legal document types, while providing a more robust framework for assessing and improving AI performance in legal applications.
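To make the matching step concrete, here is a minimal sketch of the name-matching logic, assuming the jellyfish library as the source of the Jaro-Winkler implementation (the actual pipeline may use a different library or helper functions): an extracted name counts as a match if its similarity to any ground-truth name exceeds 0.8.

import jellyfish

def match_names(extracted_names, ground_truth_names, threshold=0.8):
    """Illustrative sketch: split extracted names into matched/unmatched sets
    by comparing each one against the ground-truth labels with Jaro-Winkler."""
    matched, unmatched = set(), set()
    for name in extracted_names:
        best = max(
            (jellyfish.jaro_winkler_similarity(name.lower(), truth.lower())
             for truth in ground_truth_names),
            default=0.0,
        )
        (matched if best > threshold else unmatched).add(name)
    return matched, unmatched

# A slight misspelling such as 'jon dillman' still scores above the 0.8 threshold
# against the ground-truth label 'john dillmann'.
match_names({"martin venezia", "jon dillman"}, {"martin venezia", "john dillmann"})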
Analysis Types
Our analysis compares three entity extraction strategies for pulling the names of police officers from case files, applied to both the larger proprietary Claude models and Mixtral, an emerging open-source alternative. Each strategy offers unique advantages and potential trade-offs in processing legal documents (a simplified sketch of the three approaches follows the list below):
All Pages: Iterates over each page in the document sequentially, with one API call per page. Tested with Claude 3 Haiku, Claude 3.5 Sonnet, and Mixtral models (7B and 22B variants).
Named Entity Recognition (NER) Based Filtering: Preprocesses the document to identify the pages with the highest concentration of entities, then analyzes only those high-density pages. Tested with varying percentages (25%, 50%, and 75%) of the most entity-rich pages using Claude 3 Haiku, Claude 3.5 Sonnet, and Mixtral models (7B and 22B variants).
Full Context: Processes the entire document in a single call, utilizing the full context window capabilities of Claude 3 Haiku, Claude 3.5 Sonnet, Claude 3 Opus, and Mistral NeMo.
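To make the differences concrete, here is a simplified sketch of the three strategies. The extract_officers function is a hypothetical placeholder for the prompt-and-parse step against a given model, and using spaCy for the NER preprocessing is an assumption for illustration rather than necessarily the tooling behind the results above.

import math
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed NER model for the preprocessing step

def extract_officers(text, model):
    # Hypothetical placeholder: send the extraction prompt for `text` to `model`
    # and parse the response into a set of officer names.
    return set()

def full_context(pages, model):
    # One API call over the entire document.
    return extract_officers("\n".join(pages), model)

def all_pages(pages, model):
    # One API call per page; union the names extracted from each page.
    names = set()
    for page in pages:
        names |= extract_officers(page, model)
    return names

def ner_filtered(pages, model, fraction=0.25):
    # Rank pages by named-entity count, keep the most entity-rich fraction,
    # then iterate over those pages as in all_pages.
    counts = [(i, len(nlp(page).ents)) for i, page in enumerate(pages)]
    top_n = max(1, math.ceil(len(pages) * fraction))
    keep = {i for i, _ in sorted(counts, key=lambda x: x[1], reverse=True)[:top_n]}
    return all_pages([p for i, p in enumerate(pages) if i in keep], model)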
df.sample(3).T
 | 10 | 0 | 12
---|---|---|---
model | claude-3-5-sonnet-20240620 | claude-3-haiku-20240307 | claude-3-haiku-20240307 |
analysis_type | sonnet_vision_allpages | ner_25_haiku | allpages_haiku |
filetype | report | report | transcript |
token_count | 5 | 5871 | 18718 |
n_entities | 31 | 11 | 4 |
true_positives | 15 | 6 | 4 |
false_positives | 10 | 1 | 31 |
recall | 0.48 | 0.55 | 1.0 |
precision | 0.6 | 0.86 | 0.11 |
f1_score | 0.54 | 0.67 | 0.21 |
f_beta_score | 0.5 | 0.59 | 0.39 |
matched_names | {'mark mccrarey', 'kerry granderson', 'martin ... | {'raymond loosemore', 'jefferson', 'spong', 'g... | {'martin venezia', 'ursin', 'little', 'ruiz'} |
unmatched_names | {'gerald kuhn', 'ignacious tanner', 'rob micha... | {'g. noble', "harry o'neal", 'thomas sanders',... | set() |
Results
Model performance was evaluated based on recall scores. Recall measures the proportion of police officers that were correctly identified by each model. To account for the stochastic nature of these models, we conducted a repeated analysis on a single representative configuration: Claude 3 Haiku with the all pages approach. We chose this configuration because Claude Haiku's recall falls between the Mixtral models and the other Claude models in this analysis, and because the all pages approach is the most comprehensive. Over five runs, we observed a margin of error of ±0.029. This estimate is specific to one configuration, since repeating every model and approach five times was cost-prohibitive, but we expect it to be a reasonable approximation of the margin of error across the other models and approaches.
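As a rough illustration of how a margin of error like ±0.029 can be derived, the sketch below computes a 95% confidence half-width from five per-run recall scores. The run values here are placeholders rather than our actual measurements, and the t-interval is one common choice, not necessarily the exact method used.

import numpy as np
from scipy import stats

# Placeholder recall scores from five repeated runs (illustrative only).
run_recalls = np.array([0.91, 0.89, 0.93, 0.90, 0.92])

mean_recall = run_recalls.mean()
sem = stats.sem(run_recalls)  # standard error of the mean
margin_of_error = stats.t.ppf(0.975, df=len(run_recalls) - 1) * sem  # 95% CI half-width

print(f"recall = {mean_recall:.3f} ± {margin_of_error:.3f}")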
Our findings across the different approaches and models are as follows:
All Pages Approach:
- Claude 3.5 Sonnet performed best, with a score of 0.93.
- Claude 3 Haiku followed closely with 0.91.
- Mixtral models, while scoring lower, showed promising results:
- Mixtral 7B achieved 0.76
- Mixtral 22B scored 0.69
NER-Based Approach:
- A clear trend emerged: performance generally improved as the percentage of analyzed pages increased:
- For Claude 3.5 Sonnet:
- 25% of pages: 0.76
- 50% of pages: 0.88
- 75% of pages: 0.92
- Similarly for Claude 3 Haiku:
- 25% of pages: 0.71
- 50% of pages: 0.83
- 75% of pages: 0.86
- Mixtral models followed the same pattern, though with lower overall scores:
- Mixtral 7B:
- 25% of pages: 0.55
- 50% of pages: 0.69
- 75% of pages: 0.71
- Mixtral 22B:
- 25% of pages: 0.58
- 50% of pages: 0.66
- 75% of pages: 0.68
Full Context Approach:
- This method yielded varied results across models:
- Claude 3 Haiku: 0.57
- Claude 3 Opus: 0.63
- Claude 3.5 Sonnet: 0.48
- Mistral NeMo: 0.20
Note on Vision Models: As an exploratory test, we also performed this analysis using vision-capable models. The recall scores for the all pages approach were:
- Claude 3.5 Sonnet (Vision): 0.67
- Claude 3 Haiku (Vision): 0.65
These results suggest potential for vision models in document analysis tasks. Future research will focus more extensively on leveraging and optimizing vision models for entity extraction.
results_df = df.groupby("analysis_type").agg({
"precision": "mean",
"recall": "mean",
"f1_score": "mean",
"f_beta_score": "mean"
}).sort_values("recall", ascending=False)
metrics = ['precision', 'recall', 'f1_score', 'f_beta_score']
for metric in metrics:
    results_df[metric] = results_df[metric].apply(lambda x: round(x, 2))
results_df
analysis_type | precision | recall | f1_score | f_beta_score
---|---|---|---|---
allpages_sonnet | 0.49 | 0.93 | 0.61 | 0.75 |
ner_75_sonnet | 0.51 | 0.92 | 0.63 | 0.76 |
allpages_haiku | 0.29 | 0.91 | 0.42 | 0.59 |
ner_50_sonnet | 0.54 | 0.88 | 0.64 | 0.75 |
ner_75_haiku | 0.32 | 0.86 | 0.44 | 0.59 |
ner_50_haiku | 0.35 | 0.83 | 0.46 | 0.60 |
ner_25_sonnet | 0.58 | 0.76 | 0.63 | 0.69 |
allpages_mixtral_7b | 0.18 | 0.76 | 0.28 | 0.42 |
ner_75_mixtral_7b | 0.21 | 0.71 | 0.30 | 0.44 |
ner_25_haiku | 0.45 | 0.71 | 0.50 | 0.59 |
allpages_mixtral_22b | 0.26 | 0.69 | 0.34 | 0.46 |
ner_50_mixtral_7b | 0.25 | 0.69 | 0.34 | 0.46 |
ner_75_mixtral_22b | 0.31 | 0.68 | 0.40 | 0.51 |
sonnet_vision_allpages | 0.45 | 0.67 | 0.51 | 0.57 |
ner_50_mixtral_22b | 0.35 | 0.66 | 0.43 | 0.52 |
haiku_vision_allpages | 0.37 | 0.65 | 0.45 | 0.54 |
fullcontext_opus | 0.79 | 0.63 | 0.69 | 0.65 |
ner_25_mixtral_22b | 0.46 | 0.58 | 0.47 | 0.51 |
fullcontext_haiku | 0.72 | 0.57 | 0.62 | 0.58 |
ner_25_mixtral_7b | 0.31 | 0.55 | 0.37 | 0.44 |
fullcontext_sonnet | 0.82 | 0.48 | 0.58 | 0.51 |
fullcontext_nemo | 0.49 | 0.20 | 0.25 | 0.22 |
Addressing Precision and F1 Scores
While our analysis shows high recall scores across the different models with the all pages method, it's important to note that the F1 scores are relatively low due to poor precision (high false positive rates). However, this is not a significant concern in our context due to our prompting strategy. Our approach requires the model to provide a comprehensive profile of each identified individual, making it easy to filter out false positives. Here's an example of our prompting template:
template = """
As an AI assistant, my role is to meticulously analyze criminal justice documents and extract information about law enforcement personnel.
Query: {question}
Documents: {docs}
The response will contain:
1) The name of a law enforcement personnel. Law enforcement personnel can be identified by searching for these name prefixes: ofcs., officers, sergeants, sgts., lieutenants, lts., captains, cpts., commanders, sheriffs, deputies, dtys., detectives, dets., inspectors, technicians, analysts, coroners.
Please prefix the name with "Officer Name: ".
For example, "Officer Name: John Smith".
2) If available, provide an in-depth description of the context of their mention.
If the context induces ambiguity regarding the individual's employment in law enforcement, please make this clear in your response.
Please prefix this information with "Officer Context: ".
3) Review the context to discern the role of the officer. For example, Lead Detective (Homicide Division), Supervising Officer (Crime Lab), Detective, Officer on Scene, Arresting Officer, Crime Lab Analyst
Please prefix this information with "Officer Role: "
For example, "Officer Role: Lead Detective"
The full response should follow the format below, with no prefixes such as 1., 2., 3., a., b., c., etc.:
Officer Name: John Smith
Officer Context: Mentioned as someone who was present during a search, along with other detectives from different units.
Officer Role: Patrol Officer
Officer Name:
Officer Context:
Officer Role:
- Do not include any prefixes
- Only derive responses from factual information found within the police reports.
- If the context of an identified person's mention is not clear in the report, provide their name and note that the context is not specified.
"""
Our prompt instructs the model to provide an "in-depth description of the context of their mention" and to "discern the role of the officer," which helps in accurately categorizing individuals. Consider this output example taken from our results:
Officer Name: Thomas Burns
Officer Context: Mentioned in the obituaries as the father of Jamie Burns, who was the victim in the case.
Officer Role: Not a law enforcement officer
Here, the model initially extracted Thomas Burns as a potential law enforcement officer, demonstrating the tendency to over-extract names. However, our prompting technique mitigates this issue by requiring additional context. By leveraging our prompt's instruction to "only derive responses from factual information found within the police reports" and to make it clear "if the context induces ambiguity regarding the individual's employment in law enforcement," we enable the model to provide crucial contextual information. Because this gives us a clear mechanism to identify non-officers in the final results, the low F1 scores caused by poor precision are less concerning in practice: we can filter out non-law-enforcement personnel during post-processing.
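As a minimal sketch of that post-processing step, assuming the response follows the Officer Name / Officer Context / Officer Role format defined in the prompt, entries whose role or context flags them as non-officers can be dropped before scoring:

import re

OFFICER_BLOCK = re.compile(
    r"Officer Name:\s*(?P<name>.*?)\s*"
    r"Officer Context:\s*(?P<context>.*?)\s*"
    r"Officer Role:\s*(?P<role>.*?)(?=Officer Name:|$)",
    re.DOTALL,
)

def parse_officer_blocks(response):
    """Split an LLM response into name/context/role records."""
    return [{k: v.strip() for k, v in m.groupdict().items()}
            for m in OFFICER_BLOCK.finditer(response)]

def filter_non_officers(records):
    """Drop entries the model itself flags as not being law enforcement."""
    return [r for r in records
            if "not a law enforcement" not in r["role"].lower()
            and "not a law enforcement" not in r["context"].lower()]

response = (
    "Officer Name: Thomas Burns\n"
    "Officer Context: Mentioned in the obituaries as the father of Jamie Burns.\n"
    "Officer Role: Not a law enforcement officer\n"
    "Officer Name: Martin Venezia\n"
    "Officer Context: Identified as the detective leading the homicide investigation.\n"
    "Officer Role: Lead Detective\n"
)
print(filter_non_officers(parse_officer_blocks(response)))  # keeps only Martin Venezia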
Key Insight
The all pages analysis approach consistently outperformed other methods across all tested models. The full context approach, despite its theoretical potential, demonstrated significant limitations in practice. This is likely due to the many known issues associated with use of the full context window [3], including degraded retrieval performance with increasing context length, impaired reasoning capabilities over multiple facts, and susceptibility to information overload. Given that the NER-based approaches are essentially derivatives of the all pages method, and considering the superior performance of the all pages models, the rest of this post will focus exclusively on this approach.
While the Mixtral models scored lower than their Claude counterparts, their performance is notable given Mixtral's ability to run locally, offering cost-effectiveness and data privacy benefits. Importantly, these scores were achieved using prompts originally designed for Claude, suggesting potential for further improvement with Mixtral-specific optimizations. This combination of solid performance, local execution capability, and room for optimization makes Mixtral models a compelling option for certain use cases, particularly when balancing performance against data privacy considerations.
While the Mixtral models offer unique advantages in terms of local execution and data privacy, the Claude models were clearly the most performant. In particular, Claude 3 Haiku achieved an overall recall score of 0.912 (or 212 out of 232 entities correctly identified), on par with Claude 3.5 Sonnet, despite being a much smaller model.
To put this in perspective, Claude 3 Haiku correctly identified 16 out of 17 entities in a 227-page transcript document at a cost of \$0.12, significantly lower than the costs of Claude 3.5 Sonnet (\$1.53) and Claude 3 Opus (\$7.65). A human performing the same task would take approximately 5 hours, assuming an average reading speed of 225 words per minute and additional time for entity identification. At a rate of \$30 per hour, the human labor cost would be around \$150. This stark contrast—5 hours and \$150 for a human versus near-instantaneous processing and \$0.12 for Claude 3 Haiku—underscores the efficiency and cost-effectiveness of AI in such tasks.
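The back-of-the-envelope arithmetic behind the human-time estimate looks roughly like this; treating the 227-page transcript as the ~88,800-token document in the per-document results below, and using an assumed ratio of about 0.75 words per token:

# Rough human-effort estimate for the 227-page transcript (assumed to be the
# ~88,800-token document in the per-document results table below).
token_count = 88_804
words = token_count * 0.75          # assumed ~0.75 words per token
reading_hours = words / 225 / 60    # at 225 words per minute (reading time only)
labor_cost = reading_hours * 30     # at an assumed $30/hour

print(f"~{reading_hours:.1f} hours of reading, ~${labor_cost:.0f} in labor")
# Entity identification would add time on top of this, versus ~$0.12 in API cost for Claude 3 Haiku.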
While the dollar cost comparison is useful for illustrating the advantages of large language models, the true value lies in the dramatic reallocation of human time and effort away from menial tasks and towards more creative work. We consider the entity extraction task of the LLM to be "well-defined" and menial because it consists of answering the same questions over and over: (1) Is this text a name? (2) If so, what is the role of the named entity? A human could be expected to perform well at this task, but the same amount of effort could be more effective when applied to something more complex. For instance, reviewing the context of an officer's mention to assess whether their actions reveal bias against a client, or incorporating this case information with other case files featuring the same officer.
haiku_tbl = df[df.analysis_type.str.contains("allpages_haiku")]
haiku_tbl[maincols].sort_values("recall", ascending=False).reset_index(drop=True)
 | filetype | token_count | n_entities | true_positives | false_positives | recall | precision | f1_score | f_beta_score | matched_names | unmatched_names
---|---|---|---|---|---|---|---|---|---|---|---
0 | report | 20475 | 13 | 13 | 39 | 1.00 | 0.25 | 0.40 | 0.62 | {'accardo', 'lawrence hingle', 'mangana', 'm. ... | set() |
1 | transcript | 18718 | 4 | 4 | 31 | 1.00 | 0.11 | 0.21 | 0.39 | {'martin venezia', 'ursin', 'little', 'ruiz'} | set() |
2 | transcript | 1899 | 4 | 4 | 6 | 1.00 | 0.40 | 0.57 | 0.77 | {'john treadaway', "harry o'neal", 'mason spon... | set() |
3 | transcript | 22254 | 6 | 6 | 35 | 1.00 | 0.15 | 0.26 | 0.46 | {'martin venezia', 'little', 'jerry ursin', 's... | set() |
4 | report | 11696 | 10 | 10 | 16 | 1.00 | 0.38 | 0.56 | 0.76 | {'martin venezia', 'henry kirsch', 'charles li... | set() |
5 | report | 7488 | 11 | 11 | 11 | 1.00 | 0.50 | 0.67 | 0.83 | {'john dillmann', 'tim sevzeneau', 'fred danta... | set() |
6 | transcript | 101239 | 10 | 10 | 62 | 1.00 | 0.14 | 0.24 | 0.45 | {'quinton', 'milton weaver', 'george serio', '... | set() |
7 | report | 10589 | 25 | 25 | 11 | 1.00 | 0.69 | 0.82 | 0.92 | {'gary sallienger', 'nate addison', 'lasalle r... | set() |
8 | report | 14348 | 12 | 12 | 17 | 1.00 | 0.41 | 0.59 | 0.78 | {'hilton cox', 'gebbia', 'allen tidwell', 'wal... | set() |
9 | transcript | 41487 | 3 | 3 | 27 | 1.00 | 0.10 | 0.18 | 0.36 | {'barrett morton', 'garner', 'kenneth leary'} | set() |
10 | transcript | 87361 | 8 | 8 | 54 | 1.00 | 0.13 | 0.23 | 0.43 | {'garrett', 'stewart', 'a sison', 'herman cade... | set() |
11 | report | 22021 | 31 | 30 | 36 | 0.97 | 0.45 | 0.62 | 0.79 | {'kerry granderson', 'michael fejka', 'byron a... | {'kerry farve'} |
12 | transcript | 88804 | 17 | 16 | 52 | 0.94 | 0.24 | 0.38 | 0.59 | {'ray miller', 'john miller', 'lambert', 'john... | {'tracy'} |
13 | report | 26172 | 13 | 12 | 39 | 0.92 | 0.24 | 0.38 | 0.58 | {'denour j', 'jerry hall', 'ralph peperone', '... | {'o’neal'} |
14 | transcript | 102827 | 32 | 29 | 165 | 0.91 | 0.15 | 0.26 | 0.45 | {'davillier', 'gebbia', 'walley goodey', 'char... | {'lynn anderson', 'doug gremillion', 'woodall'} |
15 | report | 7337 | 5 | 4 | 9 | 0.80 | 0.31 | 0.44 | 0.61 | {'ralph sacks', 'john morse', 'j. whitehurst',... | {'hanet crackum'} |
16 | transcript | 7932 | 5 | 4 | 13 | 0.80 | 0.24 | 0.36 | 0.54 | {'john dillman', 'sison', 'michae rice', 'john... | {'james ducose'} |
17 | report | 5871 | 11 | 8 | 6 | 0.73 | 0.57 | 0.64 | 0.69 | {'g. noble', 'raymond loosemore', 'jefferson',... | {'thomas sanders', "harry o'neal", 'patricia f... |
18 | report | 3748 | 11 | 3 | 26 | 0.27 | 0.10 | 0.15 | 0.21 | {'white', 'serio', 'anthony keeton'} | {'weaver', 'quinton', 'anderson', 'mosley', 'c... |
Additional Insights
Building on our initial findings, we aimed to better understand what caused the performance differences between Claude and Mixtral models in the all pages method. Our additional analysis looks at two key areas:
- Entity Complexity: An examination of how Claude and Mixtral models differ in handling single-word versus multi-word entities.
- Feature Importance: An evaluation of which input characteristics most significantly influence the performance of Claude and Mixtral models.
Entity Complexity
Claude Models Performance:
- Claude 3.5 Sonnet showed the best overall performance, with the lowest percentage of unmatched single-word entities (18.2%) and a very low percentage of unmatched multi-word entities (4.5%).
- Claude 3 Haiku performed similarly well, with a slightly higher unmatched percentage for single-word entities (20.0%) and the same 4.5% for multi-word entities.
Mixtral Models Performance:
- Both Mixtral models showed higher percentages of unmatched entities compared to the Claude models.
- Mixtral 7B performed better than Mixtral 22B in both single-word and multi-word entity extraction.
- Mixtral 22B had the highest percentage of unmatched entities among the text-based models for both single-word (36.4%) and multi-word (22.2%) categories.
Single-Word vs. Multi-Word Entity Extraction:
- All models generally performed better in extracting multi-word entities compared to single-word entities, as evidenced by the lower percentage of unmatched multi-word entities across all models.
- This trend was particularly pronounced in the Claude models, which showed a significant performance gap between single-word and multi-word entity extraction.
Model Size and Performance:
- Interestingly, the larger Mixtral 22B model underperformed compared to its smaller 7B counterpart. This suggests that larger model size doesn't always correlate with better performance in specific tasks like entity extraction.
- In contrast, the more advanced Claude 3.5 Sonnet slightly outperformed Claude 3 Haiku in single-word entity extraction, while performing equally well in multi-word entity extraction.
Vision Models Performance:
- The vision-based models showed notably higher percentages of unmatched entities compared to their text-only counterparts.
- Claude 3.5 Sonnet Vision performed slightly better than Claude 3 Haiku Vision in single-word entity extraction (37.0% vs. 40.7% unmatched), while both had identical performance for multi-word entities (32.3% unmatched).
def analyze_entity_characteristics(df):
    results = []
    for analysis_type, group in df.groupby('analysis_type'):
        all_entities = []
        matched_entities = []
        unmatched_entities = []
        for _, row in group.iterrows():
            matched = set(literal_eval(row['matched_names']))
            unmatched = set(literal_eval(row['unmatched_names']))
            all_entities.extend(matched.union(unmatched))
            matched_entities.extend(matched)
            unmatched_entities.extend(unmatched)

        def categorize_entities(entities):
            single_word = [e for e in entities if len(e.split()) == 1]
            multi_word = [e for e in entities if len(e.split()) > 1]
            return single_word, multi_word

        all_single, all_multi = categorize_entities(all_entities)
        matched_single, matched_multi = categorize_entities(matched_entities)
        total_single = len(all_single)
        total_multi = len(all_multi)
        unmatched_single = total_single - len(matched_single)
        unmatched_multi = total_multi - len(matched_multi)
        pct_unmatched_single = round(unmatched_single / total_single, 3) * 100 if total_single > 0 else 0
        pct_unmatched_multi = round(unmatched_multi / total_multi, 3) * 100 if total_multi > 0 else 0
        results.append({
            'Analysis Type': analysis_type,
            'total_single_word_entities': total_single,
            'total_multi_word_entities': total_multi,
            'unmatched_single_word_entities': unmatched_single,
            'unmatched_multi_word_entities': unmatched_multi,
            'pct_unmatched_single_word': pct_unmatched_single,
            'pct_unmatched_multi_word': pct_unmatched_multi
        })
    return pd.DataFrame(results)
# df is the combined per-document results DataFrame built above
analysis_comparison_df = analyze_entity_characteristics(df)
analysis_comparison_df.to_csv("../data/output/compare.csv")
analysis_comparison_df = analysis_comparison_df[analysis_comparison_df["Analysis Type"].str.contains("allpages")]
analysis_comparison_df.T
 | 0 | 1 | 2 | 3 | 8 | 21
---|---|---|---|---|---|---
Analysis Type | allpages_haiku | allpages_mixtral_22b | allpages_mixtral_7b | allpages_sonnet | haiku_vision_allpages | sonnet_vision_allpages |
total_single_word_entities | 55 | 55 | 55 | 55 | 54 | 54 |
total_multi_word_entities | 176 | 176 | 176 | 176 | 167 | 167 |
unmatched_single_word_entities | 11 | 20 | 16 | 10 | 22 | 20 |
unmatched_multi_word_entities | 8 | 39 | 27 | 8 | 54 | 54 |
pct_unmatched_single_word | 20.0 | 36.4 | 29.1 | 18.2 | 40.7 | 37.0 |
pct_unmatched_multi_word | 4.5 | 22.2 | 15.3 | 4.5 | 32.3 | 32.3 |
Feature Importance Analysis
To gain insight into what makes the named entity recognition task easier or harder for different LLMs, we conducted a feature importance analysis by fitting a random forest regressor model to predict LLM performance based on features of the data, such as the percentage of single-word entities. To interpret the results of this analysis, we've established an absolute threshold of 0.3333 for feature importance scores. This threshold represents the weighted mean of the importance scores for our three features across all LLMs and document types, with weights based on each random forest model's R² score. This approach ensures that features from better-performing regression models have a stronger influence on the threshold. The R² score, ranging from 0 to 1, indicates how well each random forest model fits the data, with higher scores suggesting better predictive performance. Features with importance scores above the 0.3333 threshold are considered crucial in predicting LLM performance, while those below are less influential. Generally, an R² score above 0.7 is considered good, above 0.8 is very good, and above 0.9 is excellent for these regression models.
Haiku: Reports Documents (R² = 0.88)
- Average entity length: 0.47
- Percent single-word entities: 0.32
- Percent multi-word entities: 0.21
Haiku: Transcript Documents (R² = 0.43)
- Percent single-word entities: 0.34
- Average entity length: 0.36
- Percent multi-word entities: 0.3
Sonnet 3.5: Report Documents (R² = 0.88)
- Percent single-word entities: 0.48
- Average entity length: 0.42
- Percent multi-word entities: 0.10
Sonnet 3.5: Transcript Documents (R² = 0.74)
- Percent single-word entities: 0.36
- Percent multi-word entities: 0.32
- Average entity length: 0.32
Mixtral 7B: Report Documents (R² = 0.83)
- Percent single-word entities: 0.38
- Percent multi-word entities: 0.32
- Average entity length: 0.30
Mixtral 7B: Transcript Documents (R² = 0.65)
- Average entity length: 0.37
- Percent multi-word entities: 0.35
- Percent single-word entities: 0.29
Mixtral 22B: Report Documents (R² = 0.81)
- Average entity length: 0.43
- Percent single-word entities: 0.3
- Percent multi-word entities: 0.28
Mixtral 22B: Transcript Documents (R² = 0.4)
- Percent single-word entities: 0.37
- Average entity length: 0.36
- Percent multi-word entities: 0.27
Haiku Vision: Report Documents (R² = 0.8)
- Percent single-word entities: 0.44
- Percent multi-word entities: 0.30
- Average entity length: 0.26
Haiku Vision: Transcript Documents (R² = 0.69)
- Percent single-word entities: 0.36
- Average entity length: 0.35
- Percent multi-word entities: 0.3
Sonnet 3.5 Vision: Report Documents (R² = 0.82)
- Average entity length: 0.41
- Percent single-word entities: 0.3
- Percent multi-word entities: 0.3
Sonnet Vision: Transcript Documents (R² = 0.6)
- Percent single-word entities: 0.34
- Percent multi-word entities: 0.34
- Average entity length: 0.32
The R² scores for most models in this analysis fall in the good to very good range for report documents, while transcript documents show more varied performance. We examined three features: average entity length, the percentage of single-word entities, and the percentage of multi-word entities. These features often exceed or approach the 0.3333 threshold, indicating their significance in predicting the f-beta score. Across most models, the percentage of single-word entities tends to have high feature importance, particularly for report documents. However, there's notable variation between models and document types. For instance, Haiku and Sonnet models often show higher importance for average entity length in report documents, while Mixtral models show a more balanced distribution across features. This suggests that while all three features are significant, their relative impact on the f-beta score prediction differs between models and document types.
def calculate_entity_complexity_features(row):
    matched = set(literal_eval(row['matched_names']))
    unmatched = set(literal_eval(row['unmatched_names']))
    all_entities = matched.union(unmatched)
    if all_entities:
        avg_entity_length = np.mean([len(entity.split()) for entity in all_entities])
        pct_multi_word = sum(len(entity.split()) > 1 for entity in all_entities) / len(all_entities)
        pct_single_word = sum(len(entity.split()) == 1 for entity in all_entities) / len(all_entities)
    else:
        avg_entity_length = 0
        pct_multi_word = 0
        pct_single_word = 0
    return pd.Series({
        'avg_entity_length': avg_entity_length,
        'pct_multi_word_entities': pct_multi_word,
        'pct_single_word_entities': pct_single_word
    })
def analyze_data(data):
    complexity_features = data.apply(calculate_entity_complexity_features, axis=1)
    data = pd.concat([data, complexity_features], axis=1)
    data['entities_per_token'] = data['n_entities'] / data['token_count']
    X = data[['avg_entity_length', 'pct_multi_word_entities', 'pct_single_word_entities']]
    y = data['f_beta_score']
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)
    predictions = rf.predict(X)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    r2 = r2_score(y, predictions)
    importances = rf.feature_importances_
    feature_names = X.columns
    feature_importances = {name: importance for name, importance in zip(feature_names, importances)}
    return rmse, r2, feature_importances, data
def process_data(data):
    results = []
    for analysis_type, group in data.groupby('analysis_type'):
        if 'filetype' in group.columns:
            doc_types = group['filetype'].unique()
        else:
            doc_types = ['combined']
        for doc_type in doc_types:
            if doc_type != 'combined':
                filtered_data = group[group['filetype'] == doc_type]
            else:
                filtered_data = group
            if not filtered_data.empty:
                rmse, r2, feature_importances, updated_data = analyze_data(filtered_data)
                result = {
                    'Analysis_Type': analysis_type,
                    'Document_Type': doc_type,
                    'Average_F_Beta_Score': round(updated_data['f_beta_score'].mean(), 2),
                    'Average_RMSE': round(rmse, 2),
                    'Average_R2': round(r2, 2),
                    'Average_Entities_Per_Token': round(updated_data['entities_per_token'].mean(), 2),
                    'Average_Entity_Length': round(updated_data['avg_entity_length'].mean(), 2),
                    'Percent_Multi_Word_Entities': round(updated_data['pct_multi_word_entities'].mean(), 2),
                    **feature_importances
                }
                # Extract NER percentage if it's an NER analysis type
                if 'ner' in analysis_type:
                    result['NER_Percentage'] = analysis_type.split('_')[1]
                else:
                    result['NER_Percentage'] = 'N/A'
                results.append(result)
    return results
def calculate_weighted_mean_feature_importances(grouped_results_df):
    weighted_feature_importances = {
        'Average Entity Length': [],
        'Percent Multi-Word Entities': [],
        'Percent Single-Word Entities': []
    }
    weights = []
    for index, row in grouped_results_df.iterrows():
        weight = row['Average_R2']
        weighted_feature_importances['Average Entity Length'].append(row['avg_entity_length'] * weight)
        weighted_feature_importances['Percent Multi-Word Entities'].append(row['pct_multi_word_entities'] * weight)
        weighted_feature_importances['Percent Single-Word Entities'].append(row['pct_single_word_entities'] * weight)
        weights.append(weight)
    total_weight = sum(weights)
    weighted_mean_feature_importances = {
        feature: sum(values) / total_weight
        for feature, values in weighted_feature_importances.items()
    }
    return weighted_mean_feature_importances
results = process_data(df_names)
results_df = pd.DataFrame(results)
groupby_columns = ['Analysis_Type', 'Document_Type', 'NER_Percentage']
grouped_results_df = results_df.groupby(groupby_columns).mean().reset_index()
grouped_results_df = grouped_results_df[grouped_results_df["Analysis_Type"].str.contains("allpages")]
for col in ('avg_entity_length', 'pct_multi_word_entities', 'pct_single_word_entities'):
    grouped_results_df[col] = grouped_results_df[col].apply(lambda x: round(x, 2))
weighted_mean_feature_importances = calculate_weighted_mean_feature_importances(grouped_results_df)
print("Weighted Mean Feature Importances:")
for feature, importance in weighted_mean_feature_importances.items():
    print(f"{feature}: {importance:.2f}")
# Calculate and print the absolute threshold
absolute_threshold = sum(weighted_mean_feature_importances.values()) / len(weighted_mean_feature_importances)
print(f"\nAbsolute Threshold for Importance: {absolute_threshold:.2f}")
grouped_results_df
Weighted Mean Feature Importances:
Average Entity Length: 0.32
Percent Multi-Word Entities: 0.32
Percent Single-Word Entities: 0.36

Absolute Threshold for Importance: 0.33
 | Analysis_Type | Document_Type | NER_Percentage | Average_F_Beta_Score | Average_RMSE | Average_R2 | Average_Entities_Per_Token | Average_Entity_Length | Percent_Multi_Word_Entities | avg_entity_length | pct_multi_word_entities | pct_single_word_entities
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | allpages_haiku | report | N/A | 0.68 | 0.05 | 0.91 | 0.00 | 1.80 | 0.78 | 0.30 | 0.28 | 0.42 |
1 | allpages_haiku | transcript | N/A | 0.49 | 0.04 | 0.86 | 0.00 | 1.70 | 0.69 | 0.40 | 0.33 | 0.28 |
2 | allpages_mixtral_22b | report | N/A | 0.59 | 0.05 | 0.87 | 0.00 | 1.80 | 0.78 | 0.26 | 0.34 | 0.40 |
3 | allpages_mixtral_22b | transcript | N/A | 0.32 | 0.08 | 0.74 | 0.00 | 1.70 | 0.69 | 0.34 | 0.34 | 0.33 |
4 | allpages_mixtral_7b | report | N/A | 0.53 | 0.07 | 0.86 | 0.00 | 1.80 | 0.78 | 0.25 | 0.29 | 0.46 |
5 | allpages_mixtral_7b | transcript | N/A | 0.29 | 0.06 | 0.83 | 0.00 | 1.70 | 0.69 | 0.30 | 0.27 | 0.42 |
6 | allpages_sonnet | report | N/A | 0.78 | 0.05 | 0.88 | 0.00 | 1.80 | 0.78 | 0.34 | 0.31 | 0.35 |
7 | allpages_sonnet | transcript | N/A | 0.73 | 0.04 | 0.87 | 0.00 | 1.70 | 0.69 | 0.38 | 0.34 | 0.28 |
16 | haiku_vision_allpages | report | N/A | 0.54 | 0.11 | 0.81 | inf | 1.79 | 0.77 | 0.31 | 0.41 | 0.28 |
17 | haiku_vision_allpages | transcript | N/A | 0.54 | 0.08 | 0.77 | 0.87 | 1.70 | 0.69 | 0.27 | 0.28 | 0.45 |
42 | sonnet_vision_allpages | report | N/A | 0.57 | 0.11 | 0.81 | inf | 1.76 | 0.74 | 0.35 | 0.35 | 0.30 |
43 | sonnet_vision_allpages | transcript | N/A | 0.58 | 0.09 | 0.70 | 1.15 | 1.73 | 0.73 | 0.29 | 0.33 | 0.37 |
Conclusion
While LLMs demonstrate impressive capabilities in entity extraction, it's important to recognize the unique strengths of human analysis in information processing. Humans prioritize information based on its utility for decision-making, assigning higher value to details that increase their insight and understanding of a situation [4]. In the context of legal document analysis, human readers excel at intuitively identifying and focusing on the most crucial information, even if they might miss some less critical entities (e.g., extracting 16 out of 17 entities from a 185-page document). Although LLMs initially lack this intuitive prioritization, we can design iterative pipelines that approximate human-like behavior. By implementing a multi-pass approach, first using LLMs for broad entity extraction and then refining the focus based on the initial findings, we can simulate the human ability to home in on critical information. This approach combines the speed and consistency of LLMs with a more nuanced, human-like discernment of importance, enabling efficient processing of vast document corpora while ensuring the most valuable information is identified and prioritized.
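A minimal sketch of such a multi-pass pipeline, with both helper functions left as hypothetical placeholders for the prompt-and-parse steps described above, might look like this:

def broad_extraction(pages, model):
    # Hypothetical first pass: recall-oriented, all-pages extraction returning candidate names.
    return set()

def focused_review(page, candidates, model):
    # Hypothetical second pass: re-read the page and keep only candidates whose
    # context clearly indicates law enforcement involvement.
    return set(candidates)

def multi_pass_pipeline(pages, model):
    # Pass 1: cast a wide net over every page.
    candidates = broad_extraction(pages, model)
    # Pass 2: precision-oriented review, revisiting only pages that mention a
    # candidate, to approximate a human's prioritization of crucial information.
    confirmed = set()
    for page in pages:
        mentioned = {name for name in candidates if name.lower() in page.lower()}
        if mentioned:
            confirmed |= focused_review(page, mentioned, model)
    return confirmed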
Resources
[1] The Future of Computational Law. Cross-disciplinary Research in Computational Law (CRCL), 2(2) (https://journalcrcl.org/crcl/article/view/62/28).
[2] Hallucination-Free? Assessing the Reliability of Leading AI Legal Research Tools (https://arxiv.org/pdf/2405.20362).
[3] Multi Needle in a Haystack (https://blog.langchain.dev/multi-needle-in-a-haystack/).
[4] Information overload in the information age: a review of the literature from business administration, business psychology, and related disciplines with a bibliometric approach and framework development (https://link.springer.com/article/10.1007/s40685-018-0069-z).