diff --git a/StudentNotebooks/Assignment03/dar-f24-assignment3-template.Rmd b/StudentNotebooks/Assignment03/dar-f24-assignment3-template.Rmd index 760e7bd..928796f 100644 --- a/StudentNotebooks/Assignment03/dar-f24-assignment3-template.Rmd +++ b/StudentNotebooks/Assignment03/dar-f24-assignment3-template.Rmd @@ -197,7 +197,7 @@ These include the results of various LLM on each trial. NOTES: - * All of these datasets can be found in `/academics/MATP-4910-F24/DAR-CTEval-F24/Data` + * All of these datasets can be found in `~/DAR-CTEval-F24/Data` ## Load the CTBench Eval _Data_ @@ -246,12 +246,12 @@ First we load the data and see that `CT_Pub.df` contains 103 trials with 10 dime ```{r} # Load the CT_Pub data -CT_Pub.df<- readRDS("/academics/MATP-4910-F24/DAR-CTEval-F24/Data/CT_Pub_data.Rds") +CT_Pub.df<- readRDS("~/DAR-CTEval-F24/Data/CT_Pub_data.Rds") dim(CT_Pub.df) # Load the CT_Repo data -CT_Repo.df<- readRDS("/academics/MATP-4910-F24/DAR-CTEval-F24/Data/CT_Repo_data.Rds") +CT_Repo.df<- readRDS("~/DAR-CTEval-F24/Data/CT_Repo_data.Rds") dim(CT_Repo.df) @@ -312,7 +312,7 @@ If the table has an entry such as ```{r} # Load the trials.responses -CT_Pub.responses.df<- readRDS("/academics/MATP-4910-F24/DAR-CTEval-F24/Data/trials.responses.Rds") +CT_Pub.responses.df<- readRDS("~/DAR-CTEval-F24/Data/trials.responses.Rds") # convert model and type to factors CT_Pub.responses.df$trial_group <- as.factor(CT_Pub.responses.df$trial_group) @@ -322,7 +322,7 @@ CT_Pub.responses.df$model <- as.factor(CT_Pub.responses.df$model) dim(CT_Pub.responses.df) # Load the trials.matches -CT_Pub.matches.df<- readRDS("/academics/MATP-4910-F24/DAR-CTEval-F24/Data/trials.matches.Rds") +CT_Pub.matches.df<- readRDS("~/DAR-CTEval-F24/Data/trials.matches.Rds") # convert model and type to factors CT_Pub.matches.df$model <- as.factor(CT_Pub.matches.df$model) diff --git a/StudentNotebooks/Assignment03/dar-f24-assignment3-template.html b/StudentNotebooks/Assignment03/dar-f24-assignment3-template.html index c2b7a22..60227e2 100644 --- a/StudentNotebooks/Assignment03/dar-f24-assignment3-template.html +++ b/StudentNotebooks/Assignment03/dar-f24-assignment3-template.html @@ -11,7 +11,7 @@ - +
NOTES:
/academics/MATP-4910-F24/DAR-CTEval-F24/Data
~/DAR-CTEval-F24/Data
# Load the CT_Pub data
-CT_Pub.df<- readRDS("/academics/MATP-4910-F24/DAR-CTEval-F24/Data/CT_Pub_data.Rds")
+CT_Pub.df<- readRDS("~/DAR-CTEval-F24/Data/CT_Pub_data.Rds")
dim(CT_Pub.df)
## [1] 103 10
# Load the CT_Repo data
-CT_Repo.df<- readRDS("/academics/MATP-4910-F24/DAR-CTEval-F24/Data/CT_Repo_data.Rds")
+CT_Repo.df<- readRDS("~/DAR-CTEval-F24/Data/CT_Repo_data.Rds")
dim(CT_Repo.df)
## [1] 1693 9
@@ -2074,7 +2074,7 @@ candidate descriptor D
had no match in the
reference list.
# Load the trials.responses
-CT_Pub.responses.df<- readRDS("/academics/MATP-4910-F24/DAR-CTEval-F24/Data/trials.responses.Rds")
+CT_Pub.responses.df<- readRDS("~/DAR-CTEval-F24/Data/trials.responses.Rds")
# convert model and type to factors
CT_Pub.responses.df$trial_group <- as.factor(CT_Pub.responses.df$trial_group)
@@ -2084,7 +2084,7 @@ 4.2 Load the CTBench Eval
dim(CT_Pub.responses.df)
## [1] 400 11
# Load the trials.matches
-CT_Pub.matches.df<- readRDS("/academics/MATP-4910-F24/DAR-CTEval-F24/Data/trials.matches.Rds")
+CT_Pub.matches.df<- readRDS("~/DAR-CTEval-F24/Data/trials.matches.Rds")
# convert model and type to factors
CT_Pub.matches.df$model <- as.factor(CT_Pub.matches.df$model)
@@ -2179,14 +2179,16 @@ 5.1 How do results differ
Differences by Model on CT-Pub
-
-
-
+
+
+
+
+model
meanPrecision
sePrecision
meanRecall
@@ -2197,12 +2199,40 @@ 5.1 How do results differ
-0.4344882
-0.0084623
-0.5272768
-0.0104531
-0.4517983
-0.0072764
+gpt4-omni-ts
+0.4194773
+0.0165780
+0.5465613
+0.0206740
+0.4519953
+0.0145372
+
+
+gpt4-omni-zs
+0.4117923
+0.0191232
+0.4988831
+0.0196749
+0.4250843
+0.0150289
+
+
+llama3-70b-in-ts
+0.4372443
+0.0146389
+0.5267929
+0.0209162
+0.4550647
+0.0137912
+
+
+llama3-70b-in-zs
+0.4694388
+0.0167251
+0.5368701
+0.0222861
+0.4750490
+0.0146080
@@ -2227,21 +2257,26 @@ 5.2 How do results differ
meanRecall=mean(recall),
seRecall=std.error(recall),
meanF1=mean(f1),
- sef1=std.error(f1))
-
-kable(CT_Pub_MT_results.df, caption="Differences by Model and Subgroup on CT-pub")
+ sef1=std.error(f1))
+## `summarise()` has grouped output by 'model'. You can override using the
+## `.groups` argument.
+kable(CT_Pub_MT_results.df, caption="Differences by Model and Subgroup on CT-pub")
model | +trial_group | meanPrecision | sePrecision | meanRecall | @@ -2252,12 +2287,204 @@|||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0.4344882 | -0.0084623 | -0.5272768 | -0.0104531 | -0.4517983 | -0.0072764 | +gpt4-omni-ts | +cancer | +0.3376333 | +0.0399497 | +0.5430899 | +0.0392855 | +0.3970881 | +0.0344699 | +
gpt4-omni-ts | +chronic kidney disease | +0.4430021 | +0.0303595 | +0.5625353 | +0.0518243 | +0.4789217 | +0.0327797 | +||||||
gpt4-omni-ts | +diabetes | +0.4315031 | +0.0261183 | +0.5984520 | +0.0363788 | +0.4815179 | +0.0242987 | +||||||
gpt4-omni-ts | +hypertension | +0.4936892 | +0.0421646 | +0.5076353 | +0.0570618 | +0.4708821 | +0.0340076 | +||||||
gpt4-omni-ts | +obesity | +0.3882668 | +0.0495100 | +0.4659332 | +0.0487458 | +0.4034206 | +0.0390612 | +||||||
gpt4-omni-zs | +cancer | +0.3593896 | +0.0648699 | +0.5177248 | +0.0378228 | +0.3884822 | +0.0416444 | +||||||
gpt4-omni-zs | +chronic kidney disease | +0.4498255 | +0.0359125 | +0.4961775 | +0.0422165 | +0.4550535 | +0.0337143 | +||||||
gpt4-omni-zs | +diabetes | +0.4398874 | +0.0275178 | +0.5670120 | +0.0350287 | +0.4747453 | +0.0243584 | +||||||
gpt4-omni-zs | +hypertension | +0.4110457 | +0.0525773 | +0.4404236 | +0.0524825 | +0.3916399 | +0.0329269 | +||||||
gpt4-omni-zs | +obesity | +0.3678517 | +0.0488932 | +0.4016209 | +0.0472745 | +0.3598584 | +0.0359429 | +||||||
llama3-70b-in-ts | +cancer | +0.4093769 | +0.0321481 | +0.5666599 | +0.0483330 | +0.4519619 | +0.0284682 | +||||||
llama3-70b-in-ts | +chronic kidney disease | +0.4538399 | +0.0367768 | +0.5158242 | +0.0568843 | +0.4591601 | +0.0350896 | +||||||
llama3-70b-in-ts | +diabetes | +0.4571022 | +0.0248011 | +0.5708732 | +0.0343273 | +0.4862324 | +0.0235926 | +||||||
llama3-70b-in-ts | +hypertension | +0.4983549 | +0.0370043 | +0.4818363 | +0.0599463 | +0.4657995 | +0.0376744 | +||||||
llama3-70b-in-ts | +obesity | +0.3603799 | +0.0328818 | +0.4540276 | +0.0437946 | +0.3865056 | +0.0317837 | +||||||
llama3-70b-in-zs | +cancer | +0.4138544 | +0.0414752 | +0.6322974 | +0.0492822 | +0.4836421 | +0.0366250 | +||||||
llama3-70b-in-zs | +chronic kidney disease | +0.5265988 | +0.0432749 | +0.5701615 | +0.0637368 | +0.5070008 | +0.0362418 | +||||||
llama3-70b-in-zs | +diabetes | +0.4925006 | +0.0255036 | +0.5353139 | +0.0350723 | +0.4980289 | +0.0246477 | +||||||
llama3-70b-in-zs | +hypertension | +0.5075860 | +0.0488254 | +0.5109607 | +0.0630818 | +0.4757989 | +0.0373848 | +||||||
llama3-70b-in-zs | +obesity | +0.3884561 | +0.0340609 | +0.4418456 | +0.0460536 | +0.3914692 | +0.0307577 |