Skip to content
Permalink
33e3dcea4b
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
1081 lines (861 sloc) 46.9 KB
# extracting elements -----------------------------------------------------
extract_elements_v2 <- function(s) {
# Extract elements enclosed within backticks using regex
pattern <- "`(.*?)`"
elements <- regmatches(s, gregexpr(pattern, s))[[1]]
# Remove the backticks from the matched elements
elements <- gsub("`", "", elements)
return(elements)
}
# hallucination identification ------------------------------------------------
id_hallucinations<-function(trial_df,matches_df){
### Deprecated; use id_hallucinations_v2 ###
# takes the raw input from Nafis and returns the counts of each type of
# hallucination for each trial row
# extracts the model from its corresponding column for each of the models
gptzs_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-zs',
matches=matches_df$gpt4o_zs_gen_matches)
gptts_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-ts',
matches=matches_df$gpt4o_ts_gen_matches)
llamazs_matches<-data.frame(trial_id=matches_df$NCTId,
model='llama3-70b-in-zs',
matches=matches_df$llama3_70b_it_zs_gen_matches)
llamats_matches<-data.frame(trial_id=matches_df$NCTId,
model='llama3-70b-in-ts',
matches=matches_df$llama3_70b_it_ts_gen_matches)
# combine the above; essentially transferred from wide to long form
matches<-rbind(gptzs_matches,gptts_matches,llamazs_matches,llamats_matches)
# remove the trials used for three shot prompting, convert from Json, then
# throw out the old matches column
matches_parsed <- matches %>%
filter(!trial_id %in% c("NCT00000620", "NCT01483560", "NCT04280783")) %>%
mutate(new_matches = lapply(matches, fromJSON)) %>%
select(trial_id, model, new_matches)
# keep the original df in case this part messes everything up (mostly for
# debugging, no longer needed as it works fine)
matches_parsed_test<-matches_parsed
# ok, bear with me here
# loop through each row of the matches df
#
# The if conditions account for if any of the lists are empty; otherwise, it
# returns NA values which mess with the later code chunks
for (ind in 1:nrow(matches_parsed)){
# extract the matched reference features into its own column
matches_parsed_test$matched_reference_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$matched_features)>0){
matches_parsed$new_matches[[ind]]$matched_features[,1]} else {list()}
# extract the matched candidate features into its own column
matches_parsed_test$matched_candidate_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$matched_features)>0){
matches_parsed$new_matches[[ind]]$matched_features[,2]}else{list()}
# extract the remaining reference features into its own column
matches_parsed_test$remaining_reference_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$remaining_reference_features)>0){
matches_parsed$new_matches[[ind]]$remaining_reference_features}else{
list()}
# extract the remaining candidate features into its own column
matches_parsed_test$remaining_candidate_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$remaining_candidate_features)>0){
matches_parsed$new_matches[[ind]]$remaining_candidate_features}else{list()
}
# concatenate a sequence of NAs to separate the remaining candidate features
# from the remaining reference features (i.e. make it look more like what
# the class had originally for the matches)
matches_parsed_test$reference[[ind]]=as.list(c(
matches_parsed_test$matched_reference_features[[ind]],
matches_parsed_test$remaining_reference_features[[ind]],
rep(NA,length(matches_parsed_test$remaining_candidate_features[[ind]]))))
matches_parsed_test$candidate[[ind]]=as.list(c(
matches_parsed_test$matched_candidate_features[[ind]],
rep(NA,length(matches_parsed_test$remaining_reference_features[[ind]])),
matches_parsed_test$remaining_candidate_features[[ind]]))
}
# just take the columns with the trial id, generative model, reference feature
# list, and candidate feature list (the ones we just created with the NAs),
# then expand it out and sort by trial id
full_matches<-matches_parsed_test %>%
select(trial_id,model,reference,candidate) %>%
unnest(c(reference,candidate)) %>%
arrange(trial_id)
# going from wide to long form for the trial info dataframe
# remove the trial group (as that was not in the original class data in this
# table) and all generated columns
trial_gptzs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
# identify model as gpt 0 shot
trial_gptzs$model='gpt4-omni-zs'
# re-add the gpt 0 shot generated results
trial_gptzs$candidate=trial_df$gpt4o_zs_gen
# same as above but for gpt 3 shot
trial_gptts<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
trial_gptts$model='gpt4-omni-ts'
trial_gptts$candidate=trial_df$gpt4o_ts_gen
# same as above but for llama 0 shot
trial_llamazs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
trial_llamazs$model='llama3-70b-in-zs'
trial_llamazs$candidate=trial_df$llama3_70b_it_zs_gen
# same as above but for llama 3 shot
trial_llamats<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
trial_llamats$model='llama3-70b-in-ts'
trial_llamats$candidate=trial_df$llama3_70b_it_ts_gen
# combine the above; it is now long form :)
new_trial_df<-rbind(trial_gptzs,trial_gptts,trial_llamazs,trial_llamats)
# this is to differentiate between CT-Pub and CT-Repo; the true reference
# features are stored in different column names between the two
#
# In both cases, take the trial id, reference feature list, candidate feature
# list, and generative model columns, and remove the trials used for 3 shot
# prompting
if ('Paper_BaselineMeasures_Corrected' %in% colnames(new_trial_df)){
trial_features<-new_trial_df %>%
dplyr::select(NCTId,Paper_BaselineMeasures_Corrected,candidate,model) %>%
dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
colnames(trial_features)<-c('trial_id','true_ref_features',
'true_can_features','model')
} else {
trial_features<-new_trial_df %>%
dplyr::select(NCTId,API_BaselineMeasures_Corrected,candidate,model) %>%
dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
colnames(trial_features)<-c('trial_id','true_ref_features',
'true_can_features','model')
}
# remove factors from matches df; it was giving me some issues when trying to
# get rid of NAs, so this fixed that
full_matches<-data.frame(matrix(unlist(full_matches),nrow=nrow(full_matches)),
stringsAsFactors=FALSE)
colnames(full_matches)<-c('trial_id','model','reference','candidate')
# extract the reference features for each trial according to the evaluator
eval_ref_features <- full_matches %>%
dplyr::select(trial_id,model,reference) %>%
dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
drop_na()
# surround these features in backticks and add a comma and space after each
eval_ref_features$reference<-paste0("`",eval_ref_features$reference,"`, ")
# roll up the evaluators reference feature list into a df with 1 row for each
# trial instance
eval_ref_features<-eval_ref_features %>%
dplyr::group_by(trial_id,model) %>%
dplyr::mutate(match_ref_features=paste0(reference,collapse="")) %>%
dplyr::select(trial_id,model,match_ref_features) %>%
dplyr::distinct()
# combine the dfs with the true reference features and evaluator-reported
# reference features
reference_features<-merge(trial_features,eval_ref_features)
# loop through each row of this df to count each of the 3 types of
# hallucinations
for (i in 1:nrow(reference_features)){
# calculate addition hallucinations by counting how many reference features
# the evaluator reported, counting how many of the features the evaluator
# reported are in the true feature list, then finding the difference between
# those two numbers
reference_features$num_pos_halls[i]<-length(as.list(extract_elements_v2(
reference_features$match_ref_features[[i]]))[[1]])-sum(as.list(
extract_elements_v2(reference_features$match_ref_features[[i]])[[1]])
%in% as.list(extract_elements_v2(
reference_features$true_ref_features[[i]]))[[1]])
# calculate removal hallucinations by counting how many true reference
# features there were, counting how many true reference features were
# reported by the evaluator, then finding teh difference between those two
# numbers
reference_features$num_neg_halls[i]<-length(as.list(extract_elements_v2(
reference_features$true_ref_features[[i]]))[[1]])-sum(as.list(
extract_elements_v2(reference_features$true_ref_features[[i]])[[1]])
%in% as.list(extract_elements_v2(
reference_features$match_ref_features[[i]]))[[1]])
# calculate the multi-match hallucinations
# create a table of counts for each true reference feature
true_count=table(extract_elements_v2(
reference_features$true_ref_features[[i]]))
# create a table of counts for each reference feature according to the
# evaluator
match_count=table(extract_elements_v2(
reference_features$match_ref_features[[i]]))
# initialize the multi-match hallucination counter
multi_halls=c()
# loop through each true reference feature
for (feat in extract_elements_v2(
reference_features$true_ref_features[[i]])){
# calculate the multi-match hallucinations for that feature by counting
# how many times it appears in the true reference feature list, counting
# how many times it appears in the evaluators reference feature list, and
# finding the difference between those two numbers. If the difference is
# negative, that is a negative hallucination, not a multi-match, so set
# those to 0 to count correctly
multi_halls[feat]=max(sum(as.numeric(match_count[feat])-as.numeric(
true_count[feat])),0,na.rm=TRUE)
}
# the number of multi-match hallucinations for the trial is the sum of the
# multi-match hallucinations for each of its features
reference_features$num_multi_halls[[i]]=sum(multi_halls)
}
# the above returned the multi-match hallucinations as a list, which is not
# ideal, so convert it to a number
reference_features$num_multi_halls=as.numeric(
reference_features$num_multi_halls)
return(reference_features)
}
id_hallucinations_v2<-function(trial_df,matches_df,models){
# takes the raw input from Nafis and returns the counts of each type of
# hallucination for each trial row
#
# Things to add: models as input
# extract hallucinated features themselves, not just counts
# eval models
# This assumes the column names of the generated features/matched features/etc
# are the model name itself, but this is not exactly the case. In the matches
# dataframe from Nafis, they were i.e. 'gpt4o_zs_gen_matches' for
# 'gpt4-omni-zs'; similar for the trials dataframe
#
# Have not tested this!
rows <- nrow(matches_df)*nrow(models)
matches1<-data.frame(trial_id=character(N),
model=character(N),
matches=character(N))
for (iter in nrow(models)){
matches1$trial_id[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
matches_df$NCTId
matches1$model[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
models[iter]
matches1$matches[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
matches_df[,models[iter]]
}
# extracts the model from its corresponding column for each of the models
gptzs_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-zs',
matches=matches_df$gpt4o_zs_gen_matches)
gptts_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-ts',
matches=matches_df$gpt4o_ts_gen_matches)
llamazs_matches<-data.frame(trial_id=matches_df$NCTId,
model='llama3-70b-in-zs',
matches=matches_df$llama3_70b_it_zs_gen_matches)
llamats_matches<-data.frame(trial_id=matches_df$NCTId,
model='llama3-70b-in-ts',
matches=matches_df$llama3_70b_it_ts_gen_matches)
# combine the above; essentially transferred from wide to long form
matches<-rbind(gptzs_matches,gptts_matches,llamazs_matches,llamats_matches)
# remove the trials used for three shot prompting, convert from Json, then
# throw out the old matches column
matches_parsed <- matches %>%
filter(!trial_id %in% c("NCT00000620", "NCT01483560", "NCT04280783")) %>%
mutate(new_matches = lapply(matches, fromJSON)) %>%
select(trial_id, model, new_matches)
# keep the original df in case this part messes everything up (mostly for
# debugging, no longer needed as it works fine)
matches_parsed_test<-matches_parsed
# ok, bear with me here
# loop through each row of the matches df
#
# The if conditions account for if any of the lists are empty; otherwise, it
# returns NA values which mess with the later code chunks
for (ind in 1:nrow(matches_parsed)){
# extract the matched reference features into its own column
matches_parsed_test$matched_reference_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$matched_features)>0){
matches_parsed$new_matches[[ind]]$matched_features[,1]} else {list()}
# extract the matched candidate features into its own column
matches_parsed_test$matched_candidate_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$matched_features)>0){
matches_parsed$new_matches[[ind]]$matched_features[,2]}else{list()}
# extract the remaining reference features into its own column
matches_parsed_test$remaining_reference_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$remaining_reference_features)>0){
matches_parsed$new_matches[[ind]]$remaining_reference_features}else{
list()}
# extract the remaining candidate features into its own column
matches_parsed_test$remaining_candidate_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$remaining_candidate_features)>0){
matches_parsed$new_matches[[ind]]$remaining_candidate_features}else{list()
}
# concatenate a sequence of NAs to separate the remaining candidate features
# from the remaining reference features (i.e. make it look more like what
# the class had originally for the matches)
matches_parsed_test$reference[[ind]]=as.list(c(
matches_parsed_test$matched_reference_features[[ind]],
matches_parsed_test$remaining_reference_features[[ind]],
rep(NA,length(matches_parsed_test$remaining_candidate_features[[ind]]))))
matches_parsed_test$candidate[[ind]]=as.list(c(
matches_parsed_test$matched_candidate_features[[ind]],
rep(NA,length(matches_parsed_test$remaining_reference_features[[ind]])),
matches_parsed_test$remaining_candidate_features[[ind]]))
}
# just take the columns with the trial id, generative model, reference feature
# list, and candidate feature list (the ones we just created with the NAs),
# then expand it out and sort by trial id
full_matches<-matches_parsed_test %>%
select(trial_id,model,reference,candidate) %>%
unnest(c(reference,candidate)) %>%
arrange(trial_id)
# going from wide to long form for the trial info dataframe
# remove the trial group (as that was not in the original class data in this
# table) and all generated columns
#
#
#
#
# Need to do here what I did for the matches df!!
rows <- nrow(trial_df)*nrow(models)
new_trial_df1<-select(trial_df,-any_of(models))
new_trial_df1<-do.call("rbind",
replicate(nrow(models),trial_df,simplify=FALSE))
for (iter in nrow(models)){
new_trial_df1$model[((iter-1)*nrow(new_trial_df1)+1):(iter*nrow(
new_trial_df1))]<-models[iter]
new_trial_df1$candidate[((iter-1)*nrow(matches_df)+1):(iter*nrow(
new_trial_df1))]<-trial_df[,models[iter]]
}
trial_gptzs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
# identify model as gpt 0 shot
trial_gptzs$model='gpt4-omni-zs'
# re-add the gpt 0 shot generated results
trial_gptzs$candidate=trial_df$gpt4o_zs_gen
# same as above but for gpt 3 shot
trial_gptts<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
trial_gptts$model='gpt4-omni-ts'
trial_gptts$candidate=trial_df$gpt4o_ts_gen
# same as above but for llama 0 shot
trial_llamazs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
trial_llamazs$model='llama3-70b-in-zs'
trial_llamazs$candidate=trial_df$llama3_70b_it_zs_gen
# same as above but for llama 3 shot
trial_llamats<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
trial_llamats$model='llama3-70b-in-ts'
trial_llamats$candidate=trial_df$llama3_70b_it_ts_gen
# combine the above; it is now long form :)
new_trial_df<-rbind(trial_gptzs,trial_gptts,trial_llamazs,trial_llamats)
# this is to differentiate between CT-Pub and CT-Repo; the true reference
# features are stored in different column names between the two
#
# In both cases, take the trial id, reference feature list, candidate feature
# list, and generative model columns, and remove the trials used for 3 shot
# prompting
if ('Paper_BaselineMeasures_Corrected' %in% colnames(new_trial_df)){
trial_features<-new_trial_df %>%
dplyr::select(NCTId,Paper_BaselineMeasures_Corrected,candidate,model) %>%
dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
colnames(trial_features)<-c('trial_id','true_ref_features',
'true_can_features','model')
} else {
trial_features<-new_trial_df %>%
dplyr::select(NCTId,API_BaselineMeasures_Corrected,candidate,model) %>%
dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
colnames(trial_features)<-c('trial_id','true_ref_features',
'true_can_features','model')
}
# remove factors from matches df; it was giving me some issues when trying to
# get rid of NAs, so this fixed that
full_matches<-data.frame(matrix(unlist(full_matches),nrow=nrow(full_matches)),
stringsAsFactors=FALSE)
colnames(full_matches)<-c('trial_id','model','reference','candidate')
# extract the reference features for each trial according to the evaluator
eval_ref_features <- full_matches %>%
dplyr::select(trial_id,model,reference) %>%
dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
drop_na()
# surround these features in backticks and add a comma and space after each
eval_ref_features$reference<-paste0("`",eval_ref_features$reference,"`, ")
# roll up the evaluators reference feature list into a df with 1 row for each
# trial instance
eval_ref_features<-eval_ref_features %>%
dplyr::group_by(trial_id,model) %>%
dplyr::mutate(match_ref_features=paste0(reference,collapse="")) %>%
dplyr::select(trial_id,model,match_ref_features) %>%
dplyr::distinct()
# extract the candidate features for each trial according to the evaluator
eval_can_features <- full_matches %>%
dplyr::select(trial_id,model,candidate) %>%
dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
drop_na()
# surround these features in backticks and add a comma and space after each
eval_can_features$candidate<-paste0("`",eval_can_features$candidate,"`, ")
# roll up the evaluators candidate feature list into a df with 1 row for each
# trial instance
eval_can_features<-eval_can_features %>%
dplyr::group_by(trial_id,model) %>%
dplyr::mutate(match_can_features=paste0(candidate,collapse="")) %>%
dplyr::select(trial_id,model,match_can_features) %>%
dplyr::distinct()
# combine the dfs with the true features and evaluator-reported features
features<-merge(merge(trial_features,eval_ref_features),eval_can_features)
# loop through each row of this df to count each of the 3 types of
# hallucinations
for (i in 1:nrow(features)){
# calculate addition hallucinations by counting how many reference features
# the evaluator reported, counting how many of the reference features the
# evaluator reported are in the true reference feature list, then finding
# the difference between those two numbers, then doing the same thing for
# the candidate features, and summing those 2 final numbers
# features$num_pos_halls[i]<-(length(extract_elements_v2(
# features$match_ref_features[[i]]))-sum(extract_elements_v2(
# features$match_ref_features[[i]]) %in% extract_elements_v2(
# features$true_ref_features[[i]])))+(length(extract_elements_v2(
# features$match_can_features[[i]]))-sum(extract_elements_v2(
# features$match_can_features[[i]]) %in% extract_elements_v2(
# features$true_can_features[[i]])))
features$num_add_halls[i]<-sum(!(extract_elements_v2(
features$match_ref_features[[i]]) %in% extract_elements_v2(
features$true_ref_features[[i]])))+sum(!(extract_elements_v2(
features$match_can_features[[i]]) %in% extract_elements_v2(
features$true_can_features[[i]])))
# calculate removal hallucinations by counting how many true reference
# features there were, counting how many true reference features were
# reported by the evaluator, then finding the difference between those two
# numbers, then doing the same thing for the candidate features, and summing
# those 2 final numbers
# features$num_neg_halls[i]<-(length(extract_elements_v2(
# features$true_ref_features[[i]]))-sum(extract_elements_v2(
# features$true_ref_features[[i]]) %in% extract_elements_v2(
# features$match_ref_features[[i]])))+(length(extract_elements_v2(
# features$true_can_features[[i]]))-sum(extract_elements_v2(
# features$true_can_features[[i]]) %in% extract_elements_v2(
# features$match_can_features[[i]])))
#
features$num_rem_halls[i]<-sum(!(extract_elements_v2(
features$true_ref_features[[i]]) %in% extract_elements_v2(
features$match_ref_features[[i]])))+sum(!(extract_elements_v2(
features$true_can_features[[i]]) %in% extract_elements_v2(
features$match_can_features[[i]])))
# calculate the multi-match hallucinations
# create a table of counts for each true reference feature
true_ref_count=table(extract_elements_v2(features$true_ref_features[[i]]))
# create a table of counts for each reference feature according to the
# evaluator
match_ref_count=table(extract_elements_v2(features$match_ref_features[[i]]))
# initialize the reference multi-match hallucination counter
multi_halls_ref=c()
# loop through each true reference feature
for (feat1 in extract_elements_v2(features$true_ref_features[[i]])){
# calculate the multi-match hallucinations for that feature by counting
# how many times it appears in the true reference feature list, counting
# how many times it appears in the evaluators reference feature list, and
# finding the difference between those two numbers. If the difference is
# negative, that is a negative hallucination, not a multi-match, so set
# those to 0 to count correctly
multi_halls_ref[feat1]=max(sum(as.numeric(
match_ref_count[feat1])-as.numeric(true_ref_count[feat1])),0,na.rm=TRUE)
}
# create a table of counts for each true candidate feature
true_can_count=table(extract_elements_v2(features$true_can_features[[i]]))
# create a table of counts for each candidate feature according to the
# evaluator
match_can_count=table(extract_elements_v2(features$match_can_features[[i]]))
# initialize the reference multi-match hallucination counter
multi_halls_can=c()
# loop through each true reference feature
for (feat2 in extract_elements_v2(features$true_can_features[[i]])){
# calculate the multi-match hallucinations for that feature by counting
# how many times it appears in the true candidate feature list, counting
# how many times it appears in the evaluators candidate feature list, and
# finding the difference between those two numbers. If the difference is
# negative, that is a negative hallucination, not a multi-match, so set
# those to 0 to count correctly
multi_halls_can[feat2]=max(sum(as.numeric(
match_can_count[feat1])-as.numeric(true_can_count[feat2])),0,na.rm=TRUE)
}
# the number of multi-match hallucinations for the trial is the sum of the
# multi-match hallucinations for each of its features
features$num_multi_halls[[i]]=sum(multi_halls_ref,multi_halls_can)
features$total_halls[i]=sum(features$num_add_halls[i],
features$num_rem_halls[i],
as.numeric(features$num_multi_halls[i]))
features$num_ref[i]<-length(extract_elements_v2(
features$true_ref_features[i]))
features$num_can[i]<-length(extract_elements_v2(
features$true_can_features[i]))
}
# the above returned the multi-match hallucinations as a list, which is not
# ideal, so convert it to a number
features$num_multi_halls=as.numeric(features$num_multi_halls)
return(features)
}
# NEW PLAN: make 2 functions: 1) restructure our input data into a more
# generic, useful form
# 2) remove hallucinations from generic data
# structure
RemoveHallucinations_v1<-function(Matches,ReferenceList,CandidateList){
# Matches should be a list containing the matches, with Matches[1] being from
# the reference list and Matches[2] being from the candidate list
# ReferenceList should be the true reference feature list
# CandidateList should be the true candidate feature list
#
# Currently, this extracts all true (non-hallucinated) matches and all
# addition match hallucinations (just the hallucinated feature, not the whole
# match) and calculates the corrected metrics
TrueMatches<-as.data.frame(matrix(nrow=nrow(Matches),ncol=2))
colnames(TrueMatches)=colnames(Matches)
Hallucinations<-rep(NA,2*nrow(Matches))
for (iter in 1:nrow(Matches)){
if (Matches[iter,1]%in%ReferenceList){
if (Matches[iter,2]%in%CandidateList){
TrueMatches[iter,]=Matches[iter,]
} else {
Hallucinations[iter]=Matches[iter,2]
}
} else {
Hallucinations[nrow(Matches)+iter]=Matches[iter,1]
if (!(Matches[iter,2]%in%CandidateList)){
Hallucinations[iter]=Matches[iter,2]
}
}
}
TrueMatches<-TrueMatches %>%
na.omit()
Hallucinations<-Hallucinations[!is.na(Hallucinations)]
precision<-nrow(TrueMatches)/length(CandidateList)
recall<-nrow(TrueMatches)/length(ReferenceList)
f1<-2*precision*recall/(precision+recall)
result<-list(TrueMatches=TrueMatches,Hallucinations=Hallucinations,
precision=precision,recall=recall,f1=f1)
return(result)
}
RemoveHallucinations_v2<-function(Matches,ReferenceList,CandidateList){
# Matches should be a list containing the matches, with Matches[1] being from
# the reference list and Matches[2] being from the candidate list
# ReferenceList should be the true reference feature list
# CandidateList should be the true candidate feature list
#
# Currently, this extracts all true (non-hallucinated) matches, all addition
# match hallucinations (just the hallucinated feature, not the whole match),
# and all multi-match hallucinations (again, just the hallucinated feature),
# and calculates the corrected metrics.
# count the number of times each feature appears in each list; useful for
# multi-match hallucination identification
Rtab<-as.data.frame(table(ReferenceList))
Ctab<-as.data.frame(table(CandidateList))
MRtab<-as.data.frame(table(Matches[,1]))
MCtab<-as.data.frame(table(Matches[,2]))
# Extract the matches in which both the reference feature and candidate
# feature are real original features
TrueMatches<-Matches[(Matches[,1]%in%ReferenceList)&
(Matches[,2]%in%CandidateList),,drop=FALSE]
# Extract the addition hallucinations i.e. all the matched features which were
# not in the original lists
AHallucinations<-c(Matches[!(Matches[,1]%in%ReferenceList),1],
Matches[!(Matches[,2]%in%CandidateList),2])
# initialize empty vectors for the indices in which multi-match hallucinations
# occur...
Hindices<-c()
# ...and for the hallucinations themselves
MHallucinations<-c()
# loop through the rows of the matches
if (length(TrueMatches)>0){
for (Riter in 1:nrow(TrueMatches)){
feat<-TrueMatches[Riter,1]
if (MRtab$Freq[MRtab$Var1==feat]>Rtab$Freq[Rtab$ReferenceList==feat]){
MRtab$Freq[MRtab$Var1==feat]=MRtab$Freq[MRtab$Var1==feat]-1
MHallucinations<-c(MHallucinations,feat)
Hindices<-c(Hindices,Riter)
}
}
for (Citer in 1:nrow(TrueMatches)){
feat<-TrueMatches[Riter,2]
if (MCtab$Freq[MCtab$Var1==feat]>Ctab$Freq[Ctab$CandidateList==feat]){
MCtab$Freq[MCtab$Var1==feat]=MCtab$Freq[MCtab$Var1==feat]-1
MHallucinations<-c(MHallucinations,feat)
Hindices<-c(Hindices,Citer)
}
}
if (length(Hindices)>0){
TrueMatches<-TrueMatches[-Hindices,,drop=FALSE]
}
}
Hallucinations<-c(AHallucinations,MHallucinations)
precision<-max(nrow(TrueMatches),0,na.rm=TRUE)/length(CandidateList)
recall<-max(nrow(TrueMatches),0,na.rm=TRUE)/length(ReferenceList)
f1<-max(2*precision*recall/(precision+recall),0,na.rm=TRUE)
UnmatchedReferenceFeature<-ReferenceList[!(ReferenceList%in%TrueMatches[,1])]
UnmatchedCandidateFeature<-CandidateList[!(CandidateList%in%TrueMatches[,2])]
result<-list(TrueMatches=TrueMatches,Hallucinations=Hallucinations,
UnmatchedReferenceFeature=UnmatchedReferenceFeature,
UnmatchedCandidateFeature=UnmatchedCandidateFeature,
precision=precision,recall=recall,f1=f1)
return(result)
}
id_hallucinations_v3<-function(trial_df,matches_df,gen_models){
# takes the raw input from Nafis and returns the counts of each type of
# hallucination for each trial row
#
# Things to add: models as input
# extract hallucinated features themselves, not just counts
# eval models
# This assumes the column names of the generated features/matched features/etc
# are the model name itself, but this is not exactly the case. In the matches
# dataframe from Nafis, they were i.e. 'gpt4o_zs_gen_matches' for
# 'gpt4-omni-zs'; similar for the trials dataframe
#
# Have not tested this!
rows<-nrow(matches_df)*length(gen_models)
matches1<-data.frame(trial_id=character(rows),
model=character(rows),
matchings=character(rows))
for (iter in nrow(gen_models)){
matches1$trial_id[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
matches_df$NCTId
matches1$model[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
gen_models[iter]
matches1$matchings[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
matches_df[,gen_models[iter]]
}
# # extracts the model from its corresponding column for each of the models
# gptzs_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-zs',
# matches=matches_df$gpt4o_zs_gen_matches)
#
# gptts_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-ts',
# matches=matches_df$gpt4o_ts_gen_matches)
#
# llamazs_matches<-data.frame(trial_id=matches_df$NCTId,
# model='llama3-70b-in-zs',
# matches=matches_df$llama3_70b_it_zs_gen_matches)
#
# llamats_matches<-data.frame(trial_id=matches_df$NCTId,
# model='llama3-70b-in-ts',
# matches=matches_df$llama3_70b_it_ts_gen_matches)
#
# # combine the above; essentially transferred from wide to long form
# matches<-rbind(gptzs_matches,gptts_matches,llamazs_matches,llamats_matches)
# remove the trials used for three shot prompting, convert from Json, then
# throw out the old matches column
matches_parsed <- matches1 %>%
filter(!trial_id %in% c("NCT00000620", "NCT01483560", "NCT04280783")) %>%
mutate(new_matches = lapply(matchings, fromJSON)) %>%
select(trial_id, model, new_matches)
# keep the original df in case this part messes everything up (mostly for
# debugging, no longer needed as it works fine)
matches_parsed_test<-matches_parsed
# ok, bear with me here
# loop through each row of the matches df
#
# The if conditions account for if any of the lists are empty; otherwise, it
# returns NA values which mess with the later code chunks
for (ind in 1:nrow(matches_parsed)){
# extract the matched reference features into its own column
matches_parsed_test$matched_reference_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$matched_features)>0){
matches_parsed$new_matches[[ind]]$matched_features[,1]} else {list()}
# extract the matched candidate features into its own column
matches_parsed_test$matched_candidate_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$matched_features)>0){
matches_parsed$new_matches[[ind]]$matched_features[,2]}else{list()}
# extract the remaining reference features into its own column
matches_parsed_test$remaining_reference_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$remaining_reference_features)>0){
matches_parsed$new_matches[[ind]]$remaining_reference_features}else{
list()}
# extract the remaining candidate features into its own column
matches_parsed_test$remaining_candidate_features[[ind]]=if(length(
matches_parsed$new_matches[[ind]]$remaining_candidate_features)>0){
matches_parsed$new_matches[[ind]]$remaining_candidate_features}else{list()
}
# concatenate a sequence of NAs to separate the remaining candidate features
# from the remaining reference features (i.e. make it look more like what
# the class had originally for the matches)
matches_parsed_test$reference[[ind]]=as.list(c(
matches_parsed_test$matched_reference_features[[ind]],
matches_parsed_test$remaining_reference_features[[ind]],
rep(NA,length(matches_parsed_test$remaining_candidate_features[[ind]]))))
matches_parsed_test$candidate[[ind]]=as.list(c(
matches_parsed_test$matched_candidate_features[[ind]],
rep(NA,length(matches_parsed_test$remaining_reference_features[[ind]])),
matches_parsed_test$remaining_candidate_features[[ind]]))
}
# just take the columns with the trial id, generative model, reference feature
# list, and candidate feature list (the ones we just created with the NAs),
# then expand it out and sort by trial id
full_matches<-matches_parsed_test %>%
select(trial_id,model,reference,candidate) %>%
unnest(c(reference,candidate)) %>%
arrange(trial_id)
# going from wide to long form for the trial info dataframe
# remove the trial group (as that was not in the original class data in this
# table) and all generated columns
#
#
#
#
# (Need to do here what I did for the matches df) done!!
new_trial_df1<-select(trial_df,-any_of(gen_models))
new_trial_df1<-do.call("rbind",
replicate(length(gen_models),new_trial_df1,simplify=FALSE))
for (iter in nrow(gen_models)){
new_trial_df1$model[((iter-1)*nrow(new_trial_df1)+1):(iter*nrow(
new_trial_df1))]<-gen_models[iter]
new_trial_df1$candidate[((iter-1)*nrow(new_trial_df1)+1):(iter*nrow(
new_trial_df1))]<-trial_df[,gen_models[iter]]
}
# trial_gptzs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
# llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
# # identify model as gpt 0 shot
# trial_gptzs$model='gpt4-omni-zs'
# # re-add the gpt 0 shot generated results
# trial_gptzs$candidate=trial_df$gpt4o_zs_gen
#
# # same as above but for gpt 3 shot
# trial_gptts<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
# llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
# trial_gptts$model='gpt4-omni-ts'
# trial_gptts$candidate=trial_df$gpt4o_ts_gen
#
# # same as above but for llama 0 shot
# trial_llamazs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
# llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
# trial_llamazs$model='llama3-70b-in-zs'
# trial_llamazs$candidate=trial_df$llama3_70b_it_zs_gen
#
# # same as above but for llama 3 shot
# trial_llamats<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
# llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
# trial_llamats$model='llama3-70b-in-ts'
# trial_llamats$candidate=trial_df$llama3_70b_it_ts_gen
#
# # combine the above; it is now long form :)
# new_trial_df<-rbind(trial_gptzs,trial_gptts,trial_llamazs,trial_llamats)
# this is to differentiate between CT-Pub and CT-Repo; the true reference
# features are stored in different column names between the two
#
# In both cases, take the trial id, reference feature list, candidate feature
# list, and generative model columns, and remove the trials used for 3 shot
# prompting
if ('Paper_BaselineMeasures_Corrected' %in% colnames(new_trial_df1)){
trial_features<-new_trial_df1 %>%
dplyr::select(NCTId,Paper_BaselineMeasures_Corrected,candidate,model) %>%
dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
colnames(trial_features)<-c('trial_id','true_ref_features',
'true_can_features','model')
} else {
trial_features<-new_trial_df1 %>%
dplyr::select(NCTId,API_BaselineMeasures_Corrected,candidate,model) %>%
dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
colnames(trial_features)<-c('trial_id','true_ref_features',
'true_can_features','model')
}
# remove factors from matches df; it was giving me some issues when trying to
# get rid of NAs, so this fixed that
full_matches<-data.frame(matrix(unlist(full_matches),nrow=nrow(full_matches)),
stringsAsFactors=FALSE)
colnames(full_matches)<-c('trial_id','model','reference','candidate')
# extract the reference features for each trial according to the evaluator
eval_ref_features <- full_matches %>%
dplyr::select(trial_id,model,reference) %>%
dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
drop_na()
# surround these features in backticks and add a comma and space after each
eval_ref_features$reference<-paste0("`",eval_ref_features$reference,"`, ")
# roll up the evaluators reference feature list into a df with 1 row for each
# trial instance
eval_ref_features<-eval_ref_features %>%
dplyr::group_by(trial_id,model) %>%
dplyr::mutate(match_ref_features=paste0(reference,collapse="")) %>%
dplyr::select(trial_id,model,match_ref_features) %>%
dplyr::distinct()
# extract the candidate features for each trial according to the evaluator
eval_can_features <- full_matches %>%
dplyr::select(trial_id,model,candidate) %>%
dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
drop_na()
# surround these features in backticks and add a comma and space after each
eval_can_features$candidate<-paste0("`",eval_can_features$candidate,"`, ")
# roll up the evaluators candidate feature list into a df with 1 row for each
# trial instance
eval_can_features<-eval_can_features %>%
dplyr::group_by(trial_id,model) %>%
dplyr::mutate(match_can_features=paste0(candidate,collapse="")) %>%
dplyr::select(trial_id,model,match_can_features) %>%
dplyr::distinct()
# combine the dfs with the true features and evaluator-reported features
features<-merge(merge(trial_features,eval_ref_features),eval_can_features)
# loop through each row of this df to count each of the 3 types of
# hallucinations
for (i in 1:nrow(features)){
# calculate addition hallucinations by counting how many reference features
# the evaluator reported, counting how many of the reference features the
# evaluator reported are in the true reference feature list, then finding
# the difference between those two numbers, then doing the same thing for
# the candidate features, and summing those 2 final numbers
# features$num_pos_halls[i]<-(length(extract_elements_v2(
# features$match_ref_features[[i]]))-sum(extract_elements_v2(
# features$match_ref_features[[i]]) %in% extract_elements_v2(
# features$true_ref_features[[i]])))+(length(extract_elements_v2(
# features$match_can_features[[i]]))-sum(extract_elements_v2(
# features$match_can_features[[i]]) %in% extract_elements_v2(
# features$true_can_features[[i]])))
features$num_add_halls[i]<-sum(!(extract_elements_v2(
features$match_ref_features[[i]]) %in% extract_elements_v2(
features$true_ref_features[[i]])))+sum(!(extract_elements_v2(
features$match_can_features[[i]]) %in% extract_elements_v2(
features$true_can_features[[i]])))
# calculate removal hallucinations by counting how many true reference
# features there were, counting how many true reference features were
# reported by the evaluator, then finding the difference between those two
# numbers, then doing the same thing for the candidate features, and summing
# those 2 final numbers
# features$num_neg_halls[i]<-(length(extract_elements_v2(
# features$true_ref_features[[i]]))-sum(extract_elements_v2(
# features$true_ref_features[[i]]) %in% extract_elements_v2(
# features$match_ref_features[[i]])))+(length(extract_elements_v2(
# features$true_can_features[[i]]))-sum(extract_elements_v2(
# features$true_can_features[[i]]) %in% extract_elements_v2(
# features$match_can_features[[i]])))
#
features$num_rem_halls[i]<-sum(!(extract_elements_v2(
features$true_ref_features[[i]]) %in% extract_elements_v2(
features$match_ref_features[[i]])))+sum(!(extract_elements_v2(
features$true_can_features[[i]]) %in% extract_elements_v2(
features$match_can_features[[i]])))
# calculate the multi-match hallucinations
# create a table of counts for each true reference feature
true_ref_count=table(extract_elements_v2(features$true_ref_features[[i]]))
# create a table of counts for each reference feature according to the
# evaluator
match_ref_count=table(extract_elements_v2(features$match_ref_features[[i]]))
# initialize the reference multi-match hallucination counter
multi_halls_ref=c()
# loop through each true reference feature
for (feat1 in extract_elements_v2(features$true_ref_features[[i]])){
# calculate the multi-match hallucinations for that feature by counting
# how many times it appears in the true reference feature list, counting
# how many times it appears in the evaluators reference feature list, and
# finding the difference between those two numbers. If the difference is
# negative, that is a negative hallucination, not a multi-match, so set
# those to 0 to count correctly
multi_halls_ref[feat1]=max(sum(as.numeric(
match_ref_count[feat1])-as.numeric(true_ref_count[feat1])),0,na.rm=TRUE)
}
# create a table of counts for each true candidate feature
true_can_count=table(extract_elements_v2(features$true_can_features[[i]]))
# create a table of counts for each candidate feature according to the
# evaluator
match_can_count=table(extract_elements_v2(features$match_can_features[[i]]))
# initialize the reference multi-match hallucination counter
multi_halls_can=c()
# loop through each true reference feature
for (feat2 in extract_elements_v2(features$true_can_features[[i]])){
# calculate the multi-match hallucinations for that feature by counting
# how many times it appears in the true candidate feature list, counting
# how many times it appears in the evaluators candidate feature list, and
# finding the difference between those two numbers. If the difference is
# negative, that is a negative hallucination, not a multi-match, so set
# those to 0 to count correctly
multi_halls_can[feat2]=max(sum(as.numeric(
match_can_count[feat1])-as.numeric(true_can_count[feat2])),0,na.rm=TRUE)
}
# the number of multi-match hallucinations for the trial is the sum of the
# multi-match hallucinations for each of its features
features$num_multi_halls[[i]]=sum(multi_halls_ref,multi_halls_can)
features$total_halls[i]=sum(features$num_add_halls[i],
features$num_rem_halls[i],
as.numeric(features$num_multi_halls[i]))
features$num_ref[i]<-length(extract_elements_v2(
features$true_ref_features[i]))
features$num_can[i]<-length(extract_elements_v2(
features$true_can_features[i]))
}
# the above returned the multi-match hallucinations as a list, which is not
# ideal, so convert it to a number
features$num_multi_halls=as.numeric(features$num_multi_halls)
return(features)
}