functions.R


# extracting elements -----------------------------------------------------

extract_elements_v2 <- function(s) {
  # Extract elements enclosed within backticks using regex
  pattern <- "`(.*?)`"
  elements <- regmatches(s, gregexpr(pattern, s))[[1]]

  # Remove the backticks from the matched elements
  elements <- gsub("`", "", elements)

  return(elements)

}


# hallucination identification ------------------------------------------------
id_hallucinations<-function(trial_df,matches_df){
  ### Deprecated; use id_hallucinations_v2 ###

  # takes the raw input from Nafis and returns the counts of each type of
  # hallucination for each trial row

  # extracts the model from its corresponding column for each of the models
  gptzs_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-zs',
                            matches=matches_df$gpt4o_zs_gen_matches)

  gptts_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-ts',
                            matches=matches_df$gpt4o_ts_gen_matches)

  llamazs_matches<-data.frame(trial_id=matches_df$NCTId,
                              model='llama3-70b-in-zs',
                              matches=matches_df$llama3_70b_it_zs_gen_matches)

  llamats_matches<-data.frame(trial_id=matches_df$NCTId,
                              model='llama3-70b-in-ts',
                              matches=matches_df$llama3_70b_it_ts_gen_matches)

  # combine the above; essentially transferred from wide to long form
  matches<-rbind(gptzs_matches,gptts_matches,llamazs_matches,llamats_matches)

  # remove the trials used for three shot prompting, convert from Json, then
  # throw out the old matches column
  matches_parsed <- matches %>%
    filter(!trial_id %in% c("NCT00000620", "NCT01483560", "NCT04280783")) %>%
    mutate(new_matches = lapply(matches, fromJSON)) %>%
    select(trial_id, model, new_matches)

  # keep the original df in case this part messes everything up (mostly for
  # debugging, no longer needed as it works fine)
  matches_parsed_test<-matches_parsed

  # ok, bear with me here
  # loop through each row of the matches df
  #
  # The if conditions account for if any of the lists are empty; otherwise, it
  # returns NA values which mess with the later code chunks
  for (ind in 1:nrow(matches_parsed)){
    # extract the matched reference features into its own column
    matches_parsed_test$matched_reference_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$matched_features)>0){
      matches_parsed$new_matches[[ind]]$matched_features[,1]} else {list()}
    # extract the matched candidate features into its own column
    matches_parsed_test$matched_candidate_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$matched_features)>0){
        matches_parsed$new_matches[[ind]]$matched_features[,2]}else{list()}
    # extract the remaining reference features into its own column
    matches_parsed_test$remaining_reference_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$remaining_reference_features)>0){
        matches_parsed$new_matches[[ind]]$remaining_reference_features}else{
          list()}
    # extract the remaining candidate features into its own column
    matches_parsed_test$remaining_candidate_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$remaining_candidate_features)>0){
      matches_parsed$new_matches[[ind]]$remaining_candidate_features}else{list()
        }

    # concatenate a sequence of NAs to separate the remaining candidate features
    # from the remaining reference features (i.e. make it look more like what
    # the class had originally for the matches)
    matches_parsed_test$reference[[ind]]=as.list(c(
      matches_parsed_test$matched_reference_features[[ind]],
      matches_parsed_test$remaining_reference_features[[ind]],
      rep(NA,length(matches_parsed_test$remaining_candidate_features[[ind]]))))
    matches_parsed_test$candidate[[ind]]=as.list(c(
      matches_parsed_test$matched_candidate_features[[ind]],
      rep(NA,length(matches_parsed_test$remaining_reference_features[[ind]])),
      matches_parsed_test$remaining_candidate_features[[ind]]))
  }

  # just take the columns with the trial id, generative model, reference feature
  # list, and candidate feature list (the ones we just created with the NAs),
  # then expand it out and sort by trial id
  full_matches<-matches_parsed_test %>%
    select(trial_id,model,reference,candidate) %>%
    unnest(c(reference,candidate)) %>%
    arrange(trial_id)


  # going from wide to long form for the trial info dataframe
  # remove the trial group (as that was not in the original class data in this
  # table) and all generated columns
  trial_gptzs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
                                  llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
  # identify model as gpt 0 shot
  trial_gptzs$model='gpt4-omni-zs'
  # re-add the gpt 0 shot generated results
  trial_gptzs$candidate=trial_df$gpt4o_zs_gen

  # same as above but for gpt 3 shot
  trial_gptts<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
                                  llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
  trial_gptts$model='gpt4-omni-ts'
  trial_gptts$candidate=trial_df$gpt4o_ts_gen

  # same as above but for llama 0 shot
  trial_llamazs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
                                    llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
  trial_llamazs$model='llama3-70b-in-zs'
  trial_llamazs$candidate=trial_df$llama3_70b_it_zs_gen

  # same as above but for llama 3 shot
  trial_llamats<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
                                    llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
  trial_llamats$model='llama3-70b-in-ts'
  trial_llamats$candidate=trial_df$llama3_70b_it_ts_gen

  # combine the above; it is now long form :)
  new_trial_df<-rbind(trial_gptzs,trial_gptts,trial_llamazs,trial_llamats)

  # this is to differentiate between CT-Pub and CT-Repo; the true reference
  # features are stored in different column names between the two
  #
  # In both cases, take the trial id, reference feature list, candidate feature
  # list, and generative model columns, and remove the trials used for 3 shot
  # prompting
  if ('Paper_BaselineMeasures_Corrected' %in% colnames(new_trial_df)){
    trial_features<-new_trial_df %>%
      dplyr::select(NCTId,Paper_BaselineMeasures_Corrected,candidate,model) %>%
      dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
    colnames(trial_features)<-c('trial_id','true_ref_features',
                                'true_can_features','model')
  } else {
    trial_features<-new_trial_df %>%
      dplyr::select(NCTId,API_BaselineMeasures_Corrected,candidate,model) %>%
      dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
    colnames(trial_features)<-c('trial_id','true_ref_features',
                                'true_can_features','model')
  }

  # remove factors from matches df; it was giving me some issues when trying to
  # get rid of NAs, so this fixed that
  full_matches<-data.frame(matrix(unlist(full_matches),nrow=nrow(full_matches)),
                           stringsAsFactors=FALSE)
  colnames(full_matches)<-c('trial_id','model','reference','candidate')

  # extract the reference features for each trial according to the evaluator
  eval_ref_features <- full_matches %>%
    dplyr::select(trial_id,model,reference) %>%
    dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
    drop_na()

  # surround these features in backticks and add a comma and space after each
  eval_ref_features$reference<-paste0("`",eval_ref_features$reference,"`, ")

  # roll up the evaluators reference feature list into a df with 1 row for each
  # trial instance
  eval_ref_features<-eval_ref_features %>%
    dplyr::group_by(trial_id,model) %>%
    dplyr::mutate(match_ref_features=paste0(reference,collapse="")) %>%
    dplyr::select(trial_id,model,match_ref_features) %>%
    dplyr::distinct()

  # combine the dfs with the true reference features and evaluator-reported
  # reference features
  reference_features<-merge(trial_features,eval_ref_features)

  # loop through each row of this df to count each of the 3 types of
  # hallucinations
  for (i in 1:nrow(reference_features)){
    # calculate addition hallucinations by counting how many reference features
    # the evaluator reported, counting how many of the features the evaluator
    # reported are in the true feature list, then finding the difference between
    # those two numbers
    reference_features$num_pos_halls[i]<-length(as.list(extract_elements_v2(
      reference_features$match_ref_features[[i]]))[[1]])-sum(as.list(
        extract_elements_v2(reference_features$match_ref_features[[i]])[[1]])
        %in% as.list(extract_elements_v2(
          reference_features$true_ref_features[[i]]))[[1]])
    # calculate removal hallucinations by counting how many true reference
    # features there were, counting how many true reference features were
    # reported by the evaluator, then finding teh difference between those two
    # numbers
    reference_features$num_neg_halls[i]<-length(as.list(extract_elements_v2(
      reference_features$true_ref_features[[i]]))[[1]])-sum(as.list(
        extract_elements_v2(reference_features$true_ref_features[[i]])[[1]])
        %in% as.list(extract_elements_v2(
          reference_features$match_ref_features[[i]]))[[1]])

    # calculate the multi-match hallucinations
    # create a table of counts for each true reference feature
    true_count=table(extract_elements_v2(
      reference_features$true_ref_features[[i]]))
    # create a table of counts for each reference feature according to the
    # evaluator
    match_count=table(extract_elements_v2(
      reference_features$match_ref_features[[i]]))
    # initialize the multi-match hallucination counter
    multi_halls=c()
    # loop through each true reference feature
    for (feat in extract_elements_v2(
      reference_features$true_ref_features[[i]])){
      # calculate the multi-match hallucinations for that feature by counting
      # how many times it appears in the true reference feature list, counting
      # how many times it appears in the evaluators reference feature list, and
      # finding the difference between those two numbers. If the difference is
      # negative, that is a negative hallucination, not a multi-match, so set
      # those to 0 to count correctly
      multi_halls[feat]=max(sum(as.numeric(match_count[feat])-as.numeric(
        true_count[feat])),0,na.rm=TRUE)
    }
    # the number of multi-match hallucinations for the trial is the sum of the
    # multi-match hallucinations for each of its features
    reference_features$num_multi_halls[[i]]=sum(multi_halls)

  }
  # the above returned the multi-match hallucinations as a list, which is not
  # ideal, so convert it to a number
  reference_features$num_multi_halls=as.numeric(
    reference_features$num_multi_halls)

  return(reference_features)
}


id_hallucinations_v2<-function(trial_df,matches_df,models){

  # takes the raw input from Nafis and returns the counts of each type of
  # hallucination for each trial row
  #
  # Things to add:  models as input
  #                 extract hallucinated features themselves, not just counts
  #                 eval models


  # This assumes the column names of the generated features/matched features/etc
  # are the model name itself, but this is not exactly the case. In the matches
  # dataframe from Nafis, they were i.e. 'gpt4o_zs_gen_matches' for
  # 'gpt4-omni-zs'; similar for the trials dataframe
  #
  # Have not tested this!
  rows <- nrow(matches_df)*nrow(models)
  matches1<-data.frame(trial_id=character(N),
                       model=character(N),
                       matches=character(N))

  for (iter in nrow(models)){

    matches1$trial_id[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
      matches_df$NCTId
    matches1$model[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
      models[iter]
    matches1$matches[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
      matches_df[,models[iter]]
  }


  # extracts the model from its corresponding column for each of the models
  gptzs_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-zs',
                            matches=matches_df$gpt4o_zs_gen_matches)

  gptts_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-ts',
                            matches=matches_df$gpt4o_ts_gen_matches)

  llamazs_matches<-data.frame(trial_id=matches_df$NCTId,
                              model='llama3-70b-in-zs',
                              matches=matches_df$llama3_70b_it_zs_gen_matches)

  llamats_matches<-data.frame(trial_id=matches_df$NCTId,
                              model='llama3-70b-in-ts',
                              matches=matches_df$llama3_70b_it_ts_gen_matches)

  # combine the above; essentially transferred from wide to long form
  matches<-rbind(gptzs_matches,gptts_matches,llamazs_matches,llamats_matches)

  # remove the trials used for three shot prompting, convert from Json, then
  # throw out the old matches column
  matches_parsed <- matches %>%
    filter(!trial_id %in% c("NCT00000620", "NCT01483560", "NCT04280783")) %>%
    mutate(new_matches = lapply(matches, fromJSON)) %>%
    select(trial_id, model, new_matches)

  # keep the original df in case this part messes everything up (mostly for
  # debugging, no longer needed as it works fine)
  matches_parsed_test<-matches_parsed

  # ok, bear with me here
  # loop through each row of the matches df
  #
  # The if conditions account for if any of the lists are empty; otherwise, it
  # returns NA values which mess with the later code chunks
  for (ind in 1:nrow(matches_parsed)){
    # extract the matched reference features into its own column
    matches_parsed_test$matched_reference_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$matched_features)>0){
      matches_parsed$new_matches[[ind]]$matched_features[,1]} else {list()}
    # extract the matched candidate features into its own column
    matches_parsed_test$matched_candidate_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$matched_features)>0){
      matches_parsed$new_matches[[ind]]$matched_features[,2]}else{list()}
    # extract the remaining reference features into its own column
    matches_parsed_test$remaining_reference_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$remaining_reference_features)>0){
      matches_parsed$new_matches[[ind]]$remaining_reference_features}else{
        list()}
    # extract the remaining candidate features into its own column
    matches_parsed_test$remaining_candidate_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$remaining_candidate_features)>0){
      matches_parsed$new_matches[[ind]]$remaining_candidate_features}else{list()
      }

    # concatenate a sequence of NAs to separate the remaining candidate features
    # from the remaining reference features (i.e. make it look more like what
    # the class had originally for the matches)
    matches_parsed_test$reference[[ind]]=as.list(c(
      matches_parsed_test$matched_reference_features[[ind]],
      matches_parsed_test$remaining_reference_features[[ind]],
      rep(NA,length(matches_parsed_test$remaining_candidate_features[[ind]]))))
    matches_parsed_test$candidate[[ind]]=as.list(c(
      matches_parsed_test$matched_candidate_features[[ind]],
      rep(NA,length(matches_parsed_test$remaining_reference_features[[ind]])),
      matches_parsed_test$remaining_candidate_features[[ind]]))
  }

  # just take the columns with the trial id, generative model, reference feature
  # list, and candidate feature list (the ones we just created with the NAs),
  # then expand it out and sort by trial id
  full_matches<-matches_parsed_test %>%
    select(trial_id,model,reference,candidate) %>%
    unnest(c(reference,candidate)) %>%
    arrange(trial_id)


  # going from wide to long form for the trial info dataframe
  # remove the trial group (as that was not in the original class data in this
  # table) and all generated columns
  #
  #
  #
  #
  # Need to do here what I did for the matches df!!

  rows <- nrow(trial_df)*nrow(models)

  new_trial_df1<-select(trial_df,-any_of(models))
  new_trial_df1<-do.call("rbind",
                         replicate(nrow(models),trial_df,simplify=FALSE))

  for (iter in nrow(models)){

    new_trial_df1$model[((iter-1)*nrow(new_trial_df1)+1):(iter*nrow(
      new_trial_df1))]<-models[iter]
    new_trial_df1$candidate[((iter-1)*nrow(matches_df)+1):(iter*nrow(
      new_trial_df1))]<-trial_df[,models[iter]]

  }


  trial_gptzs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
                                  llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
  # identify model as gpt 0 shot
  trial_gptzs$model='gpt4-omni-zs'
  # re-add the gpt 0 shot generated results
  trial_gptzs$candidate=trial_df$gpt4o_zs_gen

  # same as above but for gpt 3 shot
  trial_gptts<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
                                  llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
  trial_gptts$model='gpt4-omni-ts'
  trial_gptts$candidate=trial_df$gpt4o_ts_gen

  # same as above but for llama 0 shot
  trial_llamazs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
                                    llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
  trial_llamazs$model='llama3-70b-in-zs'
  trial_llamazs$candidate=trial_df$llama3_70b_it_zs_gen

  # same as above but for llama 3 shot
  trial_llamats<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
                                    llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
  trial_llamats$model='llama3-70b-in-ts'
  trial_llamats$candidate=trial_df$llama3_70b_it_ts_gen

  # combine the above; it is now long form :)
  new_trial_df<-rbind(trial_gptzs,trial_gptts,trial_llamazs,trial_llamats)

  # this is to differentiate between CT-Pub and CT-Repo; the true reference
  # features are stored in different column names between the two
  #
  # In both cases, take the trial id, reference feature list, candidate feature
  # list, and generative model columns, and remove the trials used for 3 shot
  # prompting
  if ('Paper_BaselineMeasures_Corrected' %in% colnames(new_trial_df)){
    trial_features<-new_trial_df %>%
      dplyr::select(NCTId,Paper_BaselineMeasures_Corrected,candidate,model) %>%
      dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
    colnames(trial_features)<-c('trial_id','true_ref_features',
                                'true_can_features','model')
  } else {
    trial_features<-new_trial_df %>%
      dplyr::select(NCTId,API_BaselineMeasures_Corrected,candidate,model) %>%
      dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
    colnames(trial_features)<-c('trial_id','true_ref_features',
                                'true_can_features','model')
  }

  # remove factors from matches df; it was giving me some issues when trying to
  # get rid of NAs, so this fixed that
  full_matches<-data.frame(matrix(unlist(full_matches),nrow=nrow(full_matches)),
                           stringsAsFactors=FALSE)
  colnames(full_matches)<-c('trial_id','model','reference','candidate')

  # extract the reference features for each trial according to the evaluator
  eval_ref_features <- full_matches %>%
    dplyr::select(trial_id,model,reference) %>%
    dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
    drop_na()

  # surround these features in backticks and add a comma and space after each
  eval_ref_features$reference<-paste0("`",eval_ref_features$reference,"`, ")

  # roll up the evaluators reference feature list into a df with 1 row for each
  # trial instance
  eval_ref_features<-eval_ref_features %>%
    dplyr::group_by(trial_id,model) %>%
    dplyr::mutate(match_ref_features=paste0(reference,collapse="")) %>%
    dplyr::select(trial_id,model,match_ref_features) %>%
    dplyr::distinct()

  # extract the candidate features for each trial according to the evaluator
  eval_can_features <- full_matches %>%
    dplyr::select(trial_id,model,candidate) %>%
    dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
    drop_na()

  # surround these features in backticks and add a comma and space after each
  eval_can_features$candidate<-paste0("`",eval_can_features$candidate,"`, ")

  # roll up the evaluators candidate feature list into a df with 1 row for each
  # trial instance
  eval_can_features<-eval_can_features %>%
    dplyr::group_by(trial_id,model) %>%
    dplyr::mutate(match_can_features=paste0(candidate,collapse="")) %>%
    dplyr::select(trial_id,model,match_can_features) %>%
    dplyr::distinct()

  # combine the dfs with the true  features and evaluator-reported features
  features<-merge(merge(trial_features,eval_ref_features),eval_can_features)

  # loop through each row of this df to count each of the 3 types of
  # hallucinations
  for (i in 1:nrow(features)){
    # calculate addition hallucinations by counting how many reference features
    # the evaluator reported, counting how many of the reference features the
    # evaluator reported are in the true reference feature list, then finding
    # the difference between those two numbers, then doing the same thing for
    # the candidate features, and summing those 2 final numbers
    # features$num_pos_halls[i]<-(length(extract_elements_v2(
    #   features$match_ref_features[[i]]))-sum(extract_elements_v2(
    #     features$match_ref_features[[i]]) %in% extract_elements_v2(
    #       features$true_ref_features[[i]])))+(length(extract_elements_v2(
    #         features$match_can_features[[i]]))-sum(extract_elements_v2(
    #           features$match_can_features[[i]]) %in% extract_elements_v2(
    #             features$true_can_features[[i]])))
    features$num_add_halls[i]<-sum(!(extract_elements_v2(
      features$match_ref_features[[i]]) %in% extract_elements_v2(
        features$true_ref_features[[i]])))+sum(!(extract_elements_v2(
          features$match_can_features[[i]]) %in% extract_elements_v2(
            features$true_can_features[[i]])))
    # calculate removal hallucinations by counting how many true reference
    # features there were, counting how many true reference features were
    # reported by the evaluator, then finding the difference between those two
    # numbers, then doing the same thing for the candidate features, and summing
    # those 2 final numbers
    # features$num_neg_halls[i]<-(length(extract_elements_v2(
    #   features$true_ref_features[[i]]))-sum(extract_elements_v2(
    #     features$true_ref_features[[i]]) %in% extract_elements_v2(
    #       features$match_ref_features[[i]])))+(length(extract_elements_v2(
    #         features$true_can_features[[i]]))-sum(extract_elements_v2(
    #           features$true_can_features[[i]]) %in% extract_elements_v2(
    #             features$match_can_features[[i]])))
    #
    features$num_rem_halls[i]<-sum(!(extract_elements_v2(
      features$true_ref_features[[i]]) %in% extract_elements_v2(
        features$match_ref_features[[i]])))+sum(!(extract_elements_v2(
          features$true_can_features[[i]]) %in% extract_elements_v2(
            features$match_can_features[[i]])))

    # calculate the multi-match hallucinations
    # create a table of counts for each true reference feature
    true_ref_count=table(extract_elements_v2(features$true_ref_features[[i]]))
    # create a table of counts for each reference feature according to the
    # evaluator
    match_ref_count=table(extract_elements_v2(features$match_ref_features[[i]]))
    # initialize the reference multi-match hallucination counter
    multi_halls_ref=c()
    # loop through each true reference feature
    for (feat1 in extract_elements_v2(features$true_ref_features[[i]])){
      # calculate the multi-match hallucinations for that feature by counting
      # how many times it appears in the true reference feature list, counting
      # how many times it appears in the evaluators reference feature list, and
      # finding the difference between those two numbers. If the difference is
      # negative, that is a negative hallucination, not a multi-match, so set
      # those to 0 to count correctly
      multi_halls_ref[feat1]=max(sum(as.numeric(
        match_ref_count[feat1])-as.numeric(true_ref_count[feat1])),0,na.rm=TRUE)
    }

    # create a table of counts for each true candidate feature
    true_can_count=table(extract_elements_v2(features$true_can_features[[i]]))
    # create a table of counts for each candidate feature according to the
    # evaluator
    match_can_count=table(extract_elements_v2(features$match_can_features[[i]]))
    # initialize the reference multi-match hallucination counter
    multi_halls_can=c()
    # loop through each true reference feature
    for (feat2 in extract_elements_v2(features$true_can_features[[i]])){
      # calculate the multi-match hallucinations for that feature by counting
      # how many times it appears in the true candidate feature list, counting
      # how many times it appears in the evaluators candidate feature list, and
      # finding the difference between those two numbers. If the difference is
      # negative, that is a negative hallucination, not a multi-match, so set
      # those to 0 to count correctly
      multi_halls_can[feat2]=max(sum(as.numeric(
        match_can_count[feat1])-as.numeric(true_can_count[feat2])),0,na.rm=TRUE)
    }
    # the number of multi-match hallucinations for the trial is the sum of the
    # multi-match hallucinations for each of its features
    features$num_multi_halls[[i]]=sum(multi_halls_ref,multi_halls_can)

    features$total_halls[i]=sum(features$num_add_halls[i],
                                features$num_rem_halls[i],
                                as.numeric(features$num_multi_halls[i]))


    features$num_ref[i]<-length(extract_elements_v2(
      features$true_ref_features[i]))
    features$num_can[i]<-length(extract_elements_v2(
      features$true_can_features[i]))
  }
  # the above returned the multi-match hallucinations as a list, which is not
  # ideal, so convert it to a number
  features$num_multi_halls=as.numeric(features$num_multi_halls)
  return(features)
}


# NEW PLAN: make 2 functions: 1) restructure our input data into a more
#                                generic, useful form
#                             2) remove hallucinations from generic data
#                                structure

RemoveHallucinations_v1<-function(Matches,ReferenceList,CandidateList){
  # Matches should be a list containing the matches, with Matches[1] being from
  # the reference list and Matches[2] being from the candidate list
  # ReferenceList should be the true reference feature list
  # CandidateList should be the true candidate feature list
  #
  # Currently, this extracts all true (non-hallucinated) matches and all
  # addition match hallucinations (just the hallucinated feature, not the whole
  # match) and calculates the corrected metrics

  TrueMatches<-as.data.frame(matrix(nrow=nrow(Matches),ncol=2))
  colnames(TrueMatches)=colnames(Matches)
  Hallucinations<-rep(NA,2*nrow(Matches))
  for (iter in 1:nrow(Matches)){
    if (Matches[iter,1]%in%ReferenceList){
      if (Matches[iter,2]%in%CandidateList){
        TrueMatches[iter,]=Matches[iter,]
      } else {
        Hallucinations[iter]=Matches[iter,2]
      }
    } else {
      Hallucinations[nrow(Matches)+iter]=Matches[iter,1]
      if (!(Matches[iter,2]%in%CandidateList)){
        Hallucinations[iter]=Matches[iter,2]
      }
    }
  }
  TrueMatches<-TrueMatches %>%
    na.omit()
  Hallucinations<-Hallucinations[!is.na(Hallucinations)]

  precision<-nrow(TrueMatches)/length(CandidateList)
  recall<-nrow(TrueMatches)/length(ReferenceList)
  f1<-2*precision*recall/(precision+recall)


  result<-list(TrueMatches=TrueMatches,Hallucinations=Hallucinations,
               precision=precision,recall=recall,f1=f1)

  return(result)
}


RemoveHallucinations_v2<-function(Matches,ReferenceList,CandidateList){
  # Matches should be a list containing the matches, with Matches[1] being from
  # the reference list and Matches[2] being from the candidate list
  # ReferenceList should be the true reference feature list
  # CandidateList should be the true candidate feature list
  #
  # Currently, this extracts all true (non-hallucinated) matches, all addition
  # match hallucinations (just the hallucinated feature, not the whole match),
  # and all multi-match hallucinations (again, just the hallucinated feature),
  # and calculates the corrected metrics.

  # count the number of times each feature appears in each list; useful for
  # multi-match hallucination identification
  Rtab<-as.data.frame(table(ReferenceList))
  Ctab<-as.data.frame(table(CandidateList))
  MRtab<-as.data.frame(table(Matches[,1]))
  MCtab<-as.data.frame(table(Matches[,2]))

  # Extract the matches in which both the reference feature and candidate
  # feature are real original features
  TrueMatches<-Matches[(Matches[,1]%in%ReferenceList)&
                         (Matches[,2]%in%CandidateList),,drop=FALSE]
  # Extract the addition hallucinations i.e. all the matched features which were
  # not in the original lists
  AHallucinations<-c(Matches[!(Matches[,1]%in%ReferenceList),1],
                     Matches[!(Matches[,2]%in%CandidateList),2])

  # initialize empty vectors for the indices in which multi-match hallucinations
  # occur...
  Hindices<-c()
  # ...and for the hallucinations themselves
  MHallucinations<-c()
  # loop through the rows of the matches
  if (length(TrueMatches)>0){
    for (Riter in 1:nrow(TrueMatches)){
      feat<-TrueMatches[Riter,1]
      if (MRtab$Freq[MRtab$Var1==feat]>Rtab$Freq[Rtab$ReferenceList==feat]){
        MRtab$Freq[MRtab$Var1==feat]=MRtab$Freq[MRtab$Var1==feat]-1
        MHallucinations<-c(MHallucinations,feat)
        Hindices<-c(Hindices,Riter)
      }
    }
    for (Citer in 1:nrow(TrueMatches)){
      feat<-TrueMatches[Riter,2]
      if (MCtab$Freq[MCtab$Var1==feat]>Ctab$Freq[Ctab$CandidateList==feat]){
        MCtab$Freq[MCtab$Var1==feat]=MCtab$Freq[MCtab$Var1==feat]-1
        MHallucinations<-c(MHallucinations,feat)
        Hindices<-c(Hindices,Citer)
      }
    }
    if (length(Hindices)>0){
      TrueMatches<-TrueMatches[-Hindices,,drop=FALSE]
    }
  }

  Hallucinations<-c(AHallucinations,MHallucinations)

  precision<-max(nrow(TrueMatches),0,na.rm=TRUE)/length(CandidateList)
  recall<-max(nrow(TrueMatches),0,na.rm=TRUE)/length(ReferenceList)
  f1<-max(2*precision*recall/(precision+recall),0,na.rm=TRUE)

  UnmatchedReferenceFeature<-ReferenceList[!(ReferenceList%in%TrueMatches[,1])]
  UnmatchedCandidateFeature<-CandidateList[!(CandidateList%in%TrueMatches[,2])]

  result<-list(TrueMatches=TrueMatches,Hallucinations=Hallucinations,
               UnmatchedReferenceFeature=UnmatchedReferenceFeature,
               UnmatchedCandidateFeature=UnmatchedCandidateFeature,
               precision=precision,recall=recall,f1=f1)

  return(result)
}


id_hallucinations_v3<-function(trial_df,matches_df,gen_models){

  # takes the raw input from Nafis and returns the counts of each type of
  # hallucination for each trial row
  #
  # Things to add:  models as input
  #                 extract hallucinated features themselves, not just counts
  #                 eval models


  # This assumes the column names of the generated features/matched features/etc
  # are the model name itself, but this is not exactly the case. In the matches
  # dataframe from Nafis, they were i.e. 'gpt4o_zs_gen_matches' for
  # 'gpt4-omni-zs'; similar for the trials dataframe
  #
  # Have not tested this!
  rows<-nrow(matches_df)*length(gen_models)
  matches1<-data.frame(trial_id=character(rows),
                       model=character(rows),
                       matchings=character(rows))

  for (iter in nrow(gen_models)){

    matches1$trial_id[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
      matches_df$NCTId
    matches1$model[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
      gen_models[iter]
    matches1$matchings[((iter-1)*nrow(matches_df)+1):(iter*nrow(matches_df))]<-
      matches_df[,gen_models[iter]]
  }


            # # extracts the model from its corresponding column for each of the models
            # gptzs_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-zs',
            #                           matches=matches_df$gpt4o_zs_gen_matches)
            #
            # gptts_matches<-data.frame(trial_id=matches_df$NCTId,model='gpt4-omni-ts',
            #                           matches=matches_df$gpt4o_ts_gen_matches)
            #
            # llamazs_matches<-data.frame(trial_id=matches_df$NCTId,
            #                             model='llama3-70b-in-zs',
            #                             matches=matches_df$llama3_70b_it_zs_gen_matches)
            #
            # llamats_matches<-data.frame(trial_id=matches_df$NCTId,
            #                             model='llama3-70b-in-ts',
            #                             matches=matches_df$llama3_70b_it_ts_gen_matches)
            #
            # # combine the above; essentially transferred from wide to long form
            # matches<-rbind(gptzs_matches,gptts_matches,llamazs_matches,llamats_matches)

  # remove the trials used for three shot prompting, convert from Json, then
  # throw out the old matches column
  matches_parsed <- matches1 %>%
    filter(!trial_id %in% c("NCT00000620", "NCT01483560", "NCT04280783")) %>%
    mutate(new_matches = lapply(matchings, fromJSON)) %>%
    select(trial_id, model, new_matches)

  # keep the original df in case this part messes everything up (mostly for
  # debugging, no longer needed as it works fine)
  matches_parsed_test<-matches_parsed

  # ok, bear with me here
  # loop through each row of the matches df
  #
  # The if conditions account for if any of the lists are empty; otherwise, it
  # returns NA values which mess with the later code chunks
  for (ind in 1:nrow(matches_parsed)){
    # extract the matched reference features into its own column
    matches_parsed_test$matched_reference_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$matched_features)>0){
      matches_parsed$new_matches[[ind]]$matched_features[,1]} else {list()}
    # extract the matched candidate features into its own column
    matches_parsed_test$matched_candidate_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$matched_features)>0){
      matches_parsed$new_matches[[ind]]$matched_features[,2]}else{list()}
    # extract the remaining reference features into its own column
    matches_parsed_test$remaining_reference_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$remaining_reference_features)>0){
      matches_parsed$new_matches[[ind]]$remaining_reference_features}else{
        list()}
    # extract the remaining candidate features into its own column
    matches_parsed_test$remaining_candidate_features[[ind]]=if(length(
      matches_parsed$new_matches[[ind]]$remaining_candidate_features)>0){
      matches_parsed$new_matches[[ind]]$remaining_candidate_features}else{list()
      }

    # concatenate a sequence of NAs to separate the remaining candidate features
    # from the remaining reference features (i.e. make it look more like what
    # the class had originally for the matches)
    matches_parsed_test$reference[[ind]]=as.list(c(
      matches_parsed_test$matched_reference_features[[ind]],
      matches_parsed_test$remaining_reference_features[[ind]],
      rep(NA,length(matches_parsed_test$remaining_candidate_features[[ind]]))))
    matches_parsed_test$candidate[[ind]]=as.list(c(
      matches_parsed_test$matched_candidate_features[[ind]],
      rep(NA,length(matches_parsed_test$remaining_reference_features[[ind]])),
      matches_parsed_test$remaining_candidate_features[[ind]]))
  }

  # just take the columns with the trial id, generative model, reference feature
  # list, and candidate feature list (the ones we just created with the NAs),
  # then expand it out and sort by trial id
  full_matches<-matches_parsed_test %>%
    select(trial_id,model,reference,candidate) %>%
    unnest(c(reference,candidate)) %>%
    arrange(trial_id)


  # going from wide to long form for the trial info dataframe
  # remove the trial group (as that was not in the original class data in this
  # table) and all generated columns
  #
  #
  #
  #
  # (Need to do here what I did for the matches df) done!!

  new_trial_df1<-select(trial_df,-any_of(gen_models))
  new_trial_df1<-do.call("rbind",
                         replicate(length(gen_models),new_trial_df1,simplify=FALSE))

  for (iter in nrow(gen_models)){

    new_trial_df1$model[((iter-1)*nrow(new_trial_df1)+1):(iter*nrow(
      new_trial_df1))]<-gen_models[iter]
    new_trial_df1$candidate[((iter-1)*nrow(new_trial_df1)+1):(iter*nrow(
      new_trial_df1))]<-trial_df[,gen_models[iter]]

  }


            # trial_gptzs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
            #                                 llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
            # # identify model as gpt 0 shot
            # trial_gptzs$model='gpt4-omni-zs'
            # # re-add the gpt 0 shot generated results
            # trial_gptzs$candidate=trial_df$gpt4o_zs_gen
            #
            # # same as above but for gpt 3 shot
            # trial_gptts<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
            #                                 llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
            # trial_gptts$model='gpt4-omni-ts'
            # trial_gptts$candidate=trial_df$gpt4o_ts_gen
            #
            # # same as above but for llama 0 shot
            # trial_llamazs<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
            #                                   llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
            # trial_llamazs$model='llama3-70b-in-zs'
            # trial_llamazs$candidate=trial_df$llama3_70b_it_zs_gen
            #
            # # same as above but for llama 3 shot
            # trial_llamats<-select(trial_df,-c(TrialGroup,gpt4o_zs_gen,gpt4o_ts_gen,
            #                                   llama3_70b_it_zs_gen,llama3_70b_it_ts_gen))
            # trial_llamats$model='llama3-70b-in-ts'
            # trial_llamats$candidate=trial_df$llama3_70b_it_ts_gen
            #
            # # combine the above; it is now long form :)
            # new_trial_df<-rbind(trial_gptzs,trial_gptts,trial_llamazs,trial_llamats)

  # this is to differentiate between CT-Pub and CT-Repo; the true reference
  # features are stored in different column names between the two
  #
  # In both cases, take the trial id, reference feature list, candidate feature
  # list, and generative model columns, and remove the trials used for 3 shot
  # prompting
  if ('Paper_BaselineMeasures_Corrected' %in% colnames(new_trial_df1)){
    trial_features<-new_trial_df1 %>%
      dplyr::select(NCTId,Paper_BaselineMeasures_Corrected,candidate,model) %>%
      dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
    colnames(trial_features)<-c('trial_id','true_ref_features',
                                'true_can_features','model')
  } else {
    trial_features<-new_trial_df1 %>%
      dplyr::select(NCTId,API_BaselineMeasures_Corrected,candidate,model) %>%
      dplyr::filter(!NCTId %in% c("NCT00000620", "NCT01483560", "NCT04280783"))
    colnames(trial_features)<-c('trial_id','true_ref_features',
                                'true_can_features','model')
  }

  # remove factors from matches df; it was giving me some issues when trying to
  # get rid of NAs, so this fixed that
  full_matches<-data.frame(matrix(unlist(full_matches),nrow=nrow(full_matches)),
                           stringsAsFactors=FALSE)
  colnames(full_matches)<-c('trial_id','model','reference','candidate')

  # extract the reference features for each trial according to the evaluator
  eval_ref_features <- full_matches %>%
    dplyr::select(trial_id,model,reference) %>%
    dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
    drop_na()

  # surround these features in backticks and add a comma and space after each
  eval_ref_features$reference<-paste0("`",eval_ref_features$reference,"`, ")

  # roll up the evaluators reference feature list into a df with 1 row for each
  # trial instance
  eval_ref_features<-eval_ref_features %>%
    dplyr::group_by(trial_id,model) %>%
    dplyr::mutate(match_ref_features=paste0(reference,collapse="")) %>%
    dplyr::select(trial_id,model,match_ref_features) %>%
    dplyr::distinct()

  # extract the candidate features for each trial according to the evaluator
  eval_can_features <- full_matches %>%
    dplyr::select(trial_id,model,candidate) %>%
    dplyr::filter(!trial_id%in%c("NCT00000620","NCT01483560","NCT04280783"))%>%
    drop_na()

  # surround these features in backticks and add a comma and space after each
  eval_can_features$candidate<-paste0("`",eval_can_features$candidate,"`, ")

  # roll up the evaluators candidate feature list into a df with 1 row for each
  # trial instance
  eval_can_features<-eval_can_features %>%
    dplyr::group_by(trial_id,model) %>%
    dplyr::mutate(match_can_features=paste0(candidate,collapse="")) %>%
    dplyr::select(trial_id,model,match_can_features) %>%
    dplyr::distinct()

  # combine the dfs with the true  features and evaluator-reported features
  features<-merge(merge(trial_features,eval_ref_features),eval_can_features)

  # loop through each row of this df to count each of the 3 types of
  # hallucinations
  for (i in 1:nrow(features)){
    # calculate addition hallucinations by counting how many reference features
    # the evaluator reported, counting how many of the reference features the
    # evaluator reported are in the true reference feature list, then finding
    # the difference between those two numbers, then doing the same thing for
    # the candidate features, and summing those 2 final numbers
    # features$num_pos_halls[i]<-(length(extract_elements_v2(
    #   features$match_ref_features[[i]]))-sum(extract_elements_v2(
    #     features$match_ref_features[[i]]) %in% extract_elements_v2(
    #       features$true_ref_features[[i]])))+(length(extract_elements_v2(
    #         features$match_can_features[[i]]))-sum(extract_elements_v2(
    #           features$match_can_features[[i]]) %in% extract_elements_v2(
    #             features$true_can_features[[i]])))
    features$num_add_halls[i]<-sum(!(extract_elements_v2(
      features$match_ref_features[[i]]) %in% extract_elements_v2(
        features$true_ref_features[[i]])))+sum(!(extract_elements_v2(
          features$match_can_features[[i]]) %in% extract_elements_v2(
            features$true_can_features[[i]])))
    # calculate removal hallucinations by counting how many true reference
    # features there were, counting how many true reference features were
    # reported by the evaluator, then finding the difference between those two
    # numbers, then doing the same thing for the candidate features, and summing
    # those 2 final numbers
    # features$num_neg_halls[i]<-(length(extract_elements_v2(
    #   features$true_ref_features[[i]]))-sum(extract_elements_v2(
    #     features$true_ref_features[[i]]) %in% extract_elements_v2(
    #       features$match_ref_features[[i]])))+(length(extract_elements_v2(
    #         features$true_can_features[[i]]))-sum(extract_elements_v2(
    #           features$true_can_features[[i]]) %in% extract_elements_v2(
    #             features$match_can_features[[i]])))
    #
    features$num_rem_halls[i]<-sum(!(extract_elements_v2(
      features$true_ref_features[[i]]) %in% extract_elements_v2(
        features$match_ref_features[[i]])))+sum(!(extract_elements_v2(
          features$true_can_features[[i]]) %in% extract_elements_v2(
            features$match_can_features[[i]])))

    # calculate the multi-match hallucinations
    # create a table of counts for each true reference feature
    true_ref_count=table(extract_elements_v2(features$true_ref_features[[i]]))
    # create a table of counts for each reference feature according to the
    # evaluator
    match_ref_count=table(extract_elements_v2(features$match_ref_features[[i]]))
    # initialize the reference multi-match hallucination counter
    multi_halls_ref=c()
    # loop through each true reference feature
    for (feat1 in extract_elements_v2(features$true_ref_features[[i]])){
      # calculate the multi-match hallucinations for that feature by counting
      # how many times it appears in the true reference feature list, counting
      # how many times it appears in the evaluators reference feature list, and
      # finding the difference between those two numbers. If the difference is
      # negative, that is a negative hallucination, not a multi-match, so set
      # those to 0 to count correctly
      multi_halls_ref[feat1]=max(sum(as.numeric(
        match_ref_count[feat1])-as.numeric(true_ref_count[feat1])),0,na.rm=TRUE)
    }

    # create a table of counts for each true candidate feature
    true_can_count=table(extract_elements_v2(features$true_can_features[[i]]))
    # create a table of counts for each candidate feature according to the
    # evaluator
    match_can_count=table(extract_elements_v2(features$match_can_features[[i]]))
    # initialize the reference multi-match hallucination counter
    multi_halls_can=c()
    # loop through each true reference feature
    for (feat2 in extract_elements_v2(features$true_can_features[[i]])){
      # calculate the multi-match hallucinations for that feature by counting
      # how many times it appears in the true candidate feature list, counting
      # how many times it appears in the evaluators candidate feature list, and
      # finding the difference between those two numbers. If the difference is
      # negative, that is a negative hallucination, not a multi-match, so set
      # those to 0 to count correctly
      multi_halls_can[feat2]=max(sum(as.numeric(
        match_can_count[feat1])-as.numeric(true_can_count[feat2])),0,na.rm=TRUE)
    }
    # the number of multi-match hallucinations for the trial is the sum of the
    # multi-match hallucinations for each of its features
    features$num_multi_halls[[i]]=sum(multi_halls_ref,multi_halls_can)

    features$total_halls[i]=sum(features$num_add_halls[i],
                                features$num_rem_halls[i],
                                as.numeric(features$num_multi_halls[i]))


    features$num_ref[i]<-length(extract_elements_v2(
      features$true_ref_features[i]))
    features$num_can[i]<-length(extract_elements_v2(
      features$true_can_features[i]))
  }
  # the above returned the multi-match hallucinations as a list, which is not
  # ideal, so convert it to a number
  features$num_multi_halls=as.numeric(features$num_multi_halls)
  return(features)
}