{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "langchain-experimental 0.0.64 requires langchain-community<0.3.0,>=0.2.10, but you have langchain-community 0.3.0 which is incompatible.\n",
      "langchain-experimental 0.0.64 requires langchain-core<0.3.0,>=0.2.27, but you have langchain-core 0.3.1 which is incompatible.\n",
      "langgraph 0.2.16 requires langchain-core<0.3,>=0.2.27, but you have langchain-core 0.3.1 which is incompatible.\n",
      "langchain-huggingface 0.0.3 requires langchain-core<0.3,>=0.1.52, but you have langchain-core 0.3.1 which is incompatible.\n",
      "langgraph-checkpoint 1.0.6 requires langchain-core<0.3,>=0.2.22, but you have langchain-core 0.3.1 which is incompatible.\u001b[0m\u001b[31m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "!pip install -qU langsmith langchain-core langchain-community langchain-openai langchain-qdrant"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -qU pymupdf ragas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import getpass\n",
    "\n",
    "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
    "os.environ[\"LANGCHAIN_API_KEY\"] = getpass.getpass(\"LangChain API Key:\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from uuid import uuid4\n",
    "\n",
    "os.environ[\"LANGCHAIN_PROJECT\"] = f\"AIM_Midterm - SDG - {uuid4().hex[0:8]}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RAG Chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import PyMuPDFLoader\n",
    "\n",
    "filepath_NIST = \"data/NIST.AI.600-1.pdf\"\n",
    "filepath_Blueprint = \"data/Blueprint-for-an-AI-Bill-of-Rights.pdf\"\n",
    "\n",
    "documents_NIST = PyMuPDFLoader(filepath_NIST).load()\n",
    "documents_Blueprint = PyMuPDFLoader(filepath_Blueprint).load()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = documents_NIST + documents_Blueprint\n",
    "# rag_documents = PyMuPDFLoader(documents).load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size = 500,\n",
    "    chunk_overlap = 50\n",
    ")\n",
    "\n",
    "rag_documents = text_splitter.split_documents(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain_community.vectorstores import Qdrant\n",
    "from langchain_qdrant import QdrantVectorStore\n",
    "from qdrant_client import QdrantClient\n",
    "from qdrant_client.http.models import Distance, VectorParams\n",
    "\n",
    "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
    "\n",
    "vectorstore = Qdrant.from_documents(\n",
    "    documents=rag_documents,\n",
    "    embedding=embeddings,\n",
    "    location=\":memory:\",\n",
    "    collection_name=\"Implications of AI\"\n",
    ")\n",
    "\n",
    "retriever = vectorstore.as_retriever()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.prompts import ChatPromptTemplate\n",
    "\n",
    "RAG_PROMPT = \"\"\"\\\n",
    "Given a provided context and question, you must answer the question based only on context.\n",
    "\n",
    "If you cannot answer the question based on the context - you must say \"I don't know\".\n",
    "\n",
    "Context: {context}\n",
    "Question: {question}\n",
    "\"\"\"\n",
    "\n",
    "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)"
   ]
  },
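  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The retriever and prompt above are never composed into a chain in this section, so the next cell is a minimal LCEL sketch of the full RAG chain. The `gpt-4o-mini` model choice is an assumption, not something taken from this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from operator import itemgetter\n",
    "\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "# Chat model that answers over the retrieved context (model name is an assumption)\n",
    "llm = ChatOpenAI(model=\"gpt-4o-mini\", temperature=0)\n",
    "\n",
    "# question -> retrieve context -> fill prompt -> generate -> plain string\n",
    "rag_chain = (\n",
    "    {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n",
    "    | rag_prompt\n",
    "    | llm\n",
    "    | StrOutputParser()\n",
    ")\n",
    "\n",
    "# Example: rag_chain.invoke({\"question\": \"What is the AI Bill of Rights?\"})"
   ]
  },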
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate synthetic data"
   ]
  },
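  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The output of the generation cell below shows ragas building a 20-question test set with reasoning and multi-context evolutions. For reference, here is a minimal ragas 0.1.x sketch of the kind of call that produces logs like these; the generator/critic model choices are assumptions, not taken from this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ragas.testset.generator import TestsetGenerator\n",
    "from ragas.testset.evolutions import simple, reasoning, multi_context\n",
    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
    "\n",
    "# Sketch only -- model choices are assumptions\n",
    "generator = TestsetGenerator.from_langchain(\n",
    "    generator_llm=ChatOpenAI(model=\"gpt-4o-mini\"),\n",
    "    critic_llm=ChatOpenAI(model=\"gpt-4o-mini\"),\n",
    "    embeddings=OpenAIEmbeddings(model=\"text-embedding-3-small\"),\n",
    ")\n",
    "\n",
    "# test_size=20 matches the \"Generating: 0/20\" progress bar in the output below\n",
    "testset = generator.generate_with_langchain_docs(\n",
    "    documents,\n",
    "    test_size=20,\n",
    "    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},\n",
    ")"
   ]
  },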
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "efacf74e912843b9942183b711af9f27",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "embedding nodes:   0%|          | 0/284 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Filename and doc_id are the same for all nodes.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ef0dc0773c374283aa74489586288487",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating:   0%|          | 0/20 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 2, 'relevance': 3, 'score': 2.0}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Unacceptable use', 'Harmful bias and homogenization', 'GAI risks', 'Information integrity', 'Transparent policies']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 2, 'score': 1.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['AI Risk Management Framework', 'Bias in Artificial Intelligence', 'Trustworthy AI', 'Language models', 'Synthetic media transparency']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated sentiment analyzer', 'Search engine results', 'Advertisement delivery systems', 'Body scanners at airport checkpoints', 'Algorithmic discrimination protections']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Biometric Information Privacy Act', 'Fair Credit Reporting Act', 'Equal Credit Opportunity Act', 'California law', 'Explainable AI systems']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Algorithmic discrimination', 'Automated systems', 'Protected classifications', 'Equitable design', 'Disparity testing']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated systems', 'Equitable outcomes', 'Timely consideration', 'Effective organizational structure', 'Training and assessment']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 3, 'score': 1.75}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Deepfake', 'AI ChatGPT', 'Large language models', 'Algorithmic systems', 'Generative AI']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Pre-deployment testing', 'GAI applications', 'Structured public feedback', 'Measurement gaps', 'AI Red-teaming']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 3, 'relevance': 3, 'score': 2.75}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Data privacy', 'Automated systems', 'Privacy by design', 'Data collection', 'Risk identification and mitigation']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 2, 'score': 1.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['AI Risk Management Framework', 'Bias in Artificial Intelligence', 'Trustworthy AI', 'Language models', 'Synthetic media transparency']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated sentiment analyzer', 'Search engine results', 'Advertisement delivery systems', 'Body scanners at airport checkpoints', 'Algorithmic discrimination protections']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 1, 'structure': 1, 'relevance': 1, 'score': 1.0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 0 times\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Algorithmic discrimination', 'Automated systems', 'Protected classifications', 'Equitable design', 'Disparity testing']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Reporting expectations', 'National Artificial Intelligence Initiative Office', 'Traffic calming measures', 'Ethical AI startups', 'AI Risk Management Framework']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated systems', 'Sensitive domains', 'Human oversight', 'Meaningful access', 'Reporting']\n",
      "[ragas.testset.evolutions.INFO] seed question generated: How do body scanners at airport checkpoints contribute to the discrimination faced by transgender travelers?\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What are some potential risks associated with large language models, as discussed in the provided context?\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How can designers, developers, and deployers of automated systems protect individuals and communities from algorithmic discrimination?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How do language models contribute to reducing content diversity in writing?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How does the Fair Credit Reporting Act ensure that consumers are notified about certain decisions made about them?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"What is the purpose of AI Red-teaming in testing AI systems?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How do language models contribute to the reduction of content diversity in writing?\"\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated sentiment analyzer', 'Search engine results', 'Advertisement delivery systems', 'Body scanners at airport checkpoints', 'Algorithmic discrimination protections']\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What is the importance of training and assessment in ensuring the proper use of automated systems?\n",
      "[ragas.testset.evolutions.INFO] seed question generated: How do body scanners at airport checkpoints contribute to the discrimination faced by transgender travelers?\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"What expectations should be met by automated systems used within sensitive domains?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How can designers, developers, and deployers of automated systems protect individuals and communities from algorithmic discrimination?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What actions should be taken to identify unacceptable use in accordance with activities in the AI RMF Map function?\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"What measures should be taken to limit data collection in automated systems according to the given context?\"\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 1, 'structure': 1, 'relevance': 1, 'score': 1.0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 0 times\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What benefits have been publicly described by the US Department of Transportation regarding \"traffic calming\" measures?\n",
      "[ragas.testset.evolutions.INFO] seed question generated: How do advertisement delivery systems reinforce racial and gender stereotypes?\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 3, 'depth': 3, 'structure': 3, 'relevance': 3, 'score': 3.0}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Abusive data practices', 'Privacy protections', 'Data collection', 'Consent requests', 'Surveillance technologies']\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question clearly asks about the purpose of AI Red-teaming in the context of testing AI systems. It is specific and independent, as it does not rely on external references or additional context to be understood. The intent is clear, seeking an explanation of the role and objectives of AI Red-teaming. Therefore, it meets the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear and specific, asking about the mechanisms by which the Fair Credit Reporting Act (FCRA) ensures consumer notification regarding certain decisions. It does not rely on external references or context, making it independent and self-contained. The intent is also clear, as it seeks to understand the notification process under the FCRA. Therefore, it meets the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the expectations for automated systems in sensitive domains, which is a clear and specific inquiry. It does not rely on external references and can be understood independently. However, it could benefit from further specificity regarding what is meant by 'sensitive domains' (e.g., healthcare, finance, law enforcement) and what types of expectations are being referred to (e.g., ethical standards, performance metrics, regulatory compliance). Adding these details would enhance clarity and allow for a more focused response.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: \"What expectations should be met by automated systems used within sensitive domains?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the impact of body scanners at airport checkpoints on the discrimination experienced by transgender travelers. It is specific in its focus on body scanners and a particular group (transgender travelers), which provides a clear intent for the type of information sought. However, the question could benefit from additional context regarding what aspects of discrimination are being referred to (e.g., privacy concerns, profiling, or treatment by security personnel). To enhance clarity and answerability, the question could specify the type of discrimination or the context in which it occurs, such as during the screening process or in the broader travel experience.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: How do body scanners at airport checkpoints contribute to the discrimination faced by transgender travelers?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the impact of body scanners at airport checkpoints on the discrimination experienced by transgender travelers. It is specific in its focus on body scanners and the demographic of transgender travelers, making the intent clear. However, the question could benefit from additional context regarding what aspects of discrimination are being referred to (e.g., privacy concerns, profiling, or treatment by security personnel). To improve clarity and answerability, the question could specify the type of discrimination or provide examples of how body scanners might contribute to it.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [ReasoningEvolution] simple question generated: How do body scanners at airport checkpoints contribute to the discrimination faced by transgender travelers?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the role of language models in reducing content diversity in writing. It is specific and has a clear intent, focusing on the impact of language models on writing diversity. However, it could benefit from additional context or clarification regarding what is meant by 'content diversity' (e.g., diversity in style, topics, perspectives) and how the contribution of language models is being evaluated (e.g., through specific examples or metrics). Providing this context would enhance the clarity and answerability of the question.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: \"How do language models contribute to reducing content diversity in writing?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about potential risks associated with large language models, referencing 'the provided context' without including or describing this context within the question. This reliance on unspecified external information makes the question unclear for those who do not have access to that context. To improve clarity and answerability, the question could either include a brief summary of the relevant context or be reframed to ask about general risks associated with large language models without depending on external references. Additionally, specifying the types of risks of interest (e.g., ethical, operational, security) could enhance the question's clarity.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: What are some potential risks associated with large language models, as discussed in the provided context?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about measures to limit data collection in automated systems, referencing 'the given context' without providing any specific details about that context. This reliance on unspecified external information makes the question unclear and potentially unanswerable for those who do not have access to the context. To improve clarity and answerability, the question should either include a brief description of the relevant context or be rephrased to focus on general measures that can be taken to limit data collection in automated systems, independent of any specific context.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: \"What measures should be taken to limit data collection in automated systems according to the given context?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the importance of training and assessment in the context of automated systems. It is clear in its intent, seeking to understand the significance of these processes. However, it could be considered somewhat broad, as it does not specify what type of automated systems are being referred to (e.g., industrial, software, AI) or the specific aspects of training and assessment that are of interest (e.g., effectiveness, safety, compliance). To improve clarity and answerability, the question could specify the type of automated systems and the particular focus of training and assessment being considered.', 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: What is the importance of training and assessment in ensuring the proper use of automated systems?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about actions to identify unacceptable use in relation to the AI RMF Map function. While it specifies a clear topic (AI RMF Map function) and seeks actionable information, it lacks sufficient context for those unfamiliar with the AI RMF (Risk Management Framework) or its specific Map function. To improve clarity and answerability, the question could provide a brief explanation of what the AI RMF Map function entails or what constitutes 'unacceptable use'. This would help ensure that the question is understandable and answerable without requiring external references.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: What actions should be taken to identify unacceptable use in accordance with activities in the AI RMF Map function?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the role of language models in reducing content diversity in writing, which is a specific and clear inquiry. It does not rely on external references or context, making it independent and understandable. The intent is clear, as it seeks to explore the impact of language models on writing diversity. However, to enhance clarity, the question could specify what is meant by 'content diversity' (e.g., thematic diversity, stylistic diversity) and perhaps provide examples of language models or contexts in which this reduction is observed. This would help in providing a more focused and detailed answer.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How can individuals be protected from abusive data practices according to the given context?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the benefits of 'traffic calming' measures as described by the US Department of Transportation. It is specific in its focus on the benefits and the source of information (US Department of Transportation), making the intent clear. However, the term 'traffic calming' could be further clarified for those unfamiliar with it, as it encompasses various strategies. To improve clarity and answerability, the question could specify which types of traffic calming measures are being referred to (e.g., speed bumps, road narrowing) or ask for examples of benefits that have been highlighted.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the impact of advertisement delivery systems on reinforcing racial and gender stereotypes. It is specific and has a clear intent, focusing on the relationship between advertisement systems and societal stereotypes. However, it could benefit from additional context or clarification regarding which advertisement delivery systems are being referred to (e.g., digital, traditional media) and what specific aspects of stereotypes are being considered (e.g., portrayal, targeting). Providing such details would enhance the clarity and answerability of the question.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question addresses the roles of designers, developers, and deployers of automated systems in protecting individuals and communities from algorithmic discrimination. It is clear in its intent, specifying the target audience (designers, developers, deployers) and the issue at hand (algorithmic discrimination). However, the question is somewhat broad and could benefit from more specificity regarding the types of automated systems or contexts in which this protection is sought. To improve clarity and answerability, the question could specify particular industries (e.g., healthcare, finance, social media) or types of algorithmic discrimination (e.g., bias in hiring algorithms, facial recognition technology) to narrow the focus and provide a more targeted inquiry.', 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: \"How can designers, developers, and deployers of automated systems protect individuals and communities from algorithmic discrimination?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question addresses the roles of designers, developers, and deployers of automated systems in protecting individuals and communities from algorithmic discrimination. It is clear in its intent, specifying the target audience (designers, developers, deployers) and the issue at hand (algorithmic discrimination). However, the question is somewhat broad and could benefit from more specificity regarding the types of automated systems or contexts in which this protection is sought. To improve clarity and answerability, the question could specify particular sectors (e.g., healthcare, finance, law enforcement) or types of algorithmic discrimination (e.g., bias in hiring algorithms, facial recognition technology) to narrow the focus and provide a more targeted inquiry.', 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: \"How can designers, developers, and deployers of automated systems protect individuals and communities from algorithmic discrimination?\"\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"What criteria should automated systems in sensitive domains meet, including considerations for human alternatives and fallback mechanisms?\"\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"How do LM impact content diversity in writing?\"\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"How do the gender settings on body scanners at airport checkpoints contribute to the discrimination faced by transgender travelers in terms of security and privacy?\"\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'AI Red-teaming is a structured testing exercise used to probe an AI system to find flaws and vulnerabilities, such as inaccurate, harmful, or discriminatory outputs. This exercise is often conducted in a controlled environment and in collaboration with system developers.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'The answer to given question is not present in context', 'verdict': -1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about measures to limit data collection in automated systems, referencing 'the given context' without providing any specific details about what that context entails. This reliance on unspecified external information makes the question unclear and potentially unanswerable for those who do not have access to that context. To improve clarity and answerability, the question should either include a brief description of the relevant context or be rephrased to focus on general measures that can be taken to limit data collection in automated systems, independent of any specific context.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about potential risks associated with large language models, referencing 'the provided context' without including or describing this context within the question itself. This reliance on unspecified external information makes the question unclear for those who do not have access to the context. To improve clarity and answerability, the question could either include a brief summary of the relevant context or be rephrased to focus on general risks associated with large language models without depending on external references. For example, it could ask, 'What are some commonly recognized risks associated with large language models?' which would make it more self-contained.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks how individuals can be protected from abusive data practices, referencing 'the given context' without providing any specific details about what that context entails. This reliance on unspecified external information makes the question unclear and unanswerable for those who do not have access to that context. To improve clarity and answerability, the question should either include a brief description of the relevant context or be rephrased to ask for general strategies or measures for protection against abusive data practices without depending on external references.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: \"How can individuals be protected from abusive data practices according to the given context?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the importance of training and assessment in the context of automated systems, which is a clear and specific inquiry. It conveys its intent well, seeking to understand the role these elements play in the effective use of such systems. However, the question could be improved by specifying what type of automated systems are being referred to (e.g., industrial, software, AI) and what aspects of training and assessment are of interest (e.g., effectiveness, safety, user competence). This additional detail would enhance clarity and allow for a more focused response.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': \"The benefits of 'traffic calming' measures have been publicly described by the US Department of Transportation, focusing on slowing vehicle speeds to improve health and safety.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 1, 'structure': 1, 'relevance': 1, 'score': 1.0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 0 times\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question addresses the impact of an operator's perception of a passenger's gender identity on the screening process for transgender travelers at airport checkpoints. It is specific in its focus on the relationship between perception and screening processes, making the intent clear. However, the question may require additional context regarding the specific screening processes or the nature of the operators' perceptions to be fully answerable. To improve clarity and answerability, the question could specify what aspects of the screening process are being referred to (e.g., procedures, biases) or provide context on how gender identity is typically perceived in airport security settings.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question addresses the roles of designers, developers, and deployers of automated systems in protecting individuals and communities from algorithmic discrimination. It is clear in its intent, specifying the target audience (designers, developers, deployers) and the issue at hand (algorithmic discrimination). However, the question is somewhat broad and could benefit from more specificity regarding the types of automated systems or contexts being referred to (e.g., AI in hiring, law enforcement, etc.). To improve clarity and answerability, the question could specify particular scenarios or examples of algorithmic discrimination, or ask for specific strategies or frameworks that could be employed to mitigate such discrimination.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question addresses the impact of gender settings on body scanners at airport checkpoints on the discrimination experienced by transgender travelers, focusing on security and privacy issues. It is specific and has a clear intent, as it seeks to explore a particular aspect of airport security technology and its implications for a marginalized group. The question is independent and can be understood without needing additional context or references. However, it could be improved by specifying what aspects of security and privacy are being referred to, such as specific incidents, policies, or studies that illustrate the discrimination faced. This would enhance clarity and provide a more focused direction for the answer.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'Advertisement delivery systems reinforce racial and gender stereotypes by predicting who is most likely to click on a job advertisement and delivering ads in ways that perpetuate biases. For example, these systems may overwhelmingly direct supermarket cashier ads to women and jobs with taxi companies to primarily Black people, thus reinforcing existing stereotypes.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the impact of language models (LM) on content diversity in writing. It is relatively clear in its intent, seeking to understand the relationship between LMs and the diversity of written content. However, the term 'content diversity' could be interpreted in various ways (e.g., thematic diversity, stylistic diversity, etc.), which introduces some ambiguity. To improve clarity and answerability, the question could specify what aspect of content diversity is being referred to or provide examples of the types of diversity being considered. Additionally, clarifying whether the question pertains to specific types of LMs or writing contexts would enhance its specificity.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated systems', 'Sensitive domains', 'Human oversight', 'Meaningful access', 'Reporting', 'Automated systems', 'Human alternatives', 'Timely human consideration', 'Fallback system', 'Opt-out mechanism']\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the criteria that automated systems in sensitive domains should meet, specifically mentioning considerations for human alternatives and fallback mechanisms. It is clear in its intent and specifies the topic of interest, making it understandable. However, the term 'sensitive domains' could be interpreted in various ways (e.g., healthcare, finance, security), which may lead to ambiguity in the answer. To improve clarity and answerability, the question could specify which sensitive domains are being referred to or provide examples. Additionally, it could clarify what is meant by 'criteria' (e.g., ethical, technical, operational) to guide the response more effectively.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question addresses the roles of designers, developers, and deployers of automated systems in protecting individuals and communities from algorithmic discrimination. It is clear in its intent, specifying the target audience (designers, developers, deployers) and the issue at hand (algorithmic discrimination). However, the question is somewhat broad and could benefit from more specificity regarding the types of automated systems or contexts in which this protection is sought. To improve clarity and answerability, the question could specify particular industries (e.g., healthcare, finance) or types of automated systems (e.g., AI algorithms, machine learning models) to narrow down the focus.', 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 2, 'score': 1.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Data broker exposes', 'Facial recognition technology', 'Surveillance technology in public housing', 'Enforcement actions by the FTC', 'Cheating-detection companies']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 3, 'score': 1.75}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Deepfake', 'AI ChatGPT', 'Large language models', 'Algorithmic systems', 'Generative AI']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['GAI system incidents', 'Organizational risk management authority', 'Remediation plan', 'Deactivation criteria', 'Third-party GAI resources']\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about actions to identify unacceptable use in relation to the AI RMF Map function. While it specifies the context (AI RMF Map function) and the type of information sought (actions to identify unacceptable use), it lacks clarity because it does not define what 'unacceptable use' entails or what the AI RMF Map function specifically refers to. This could lead to ambiguity in the response, as different interpretations of 'unacceptable use' may exist. To improve clarity and answerability, the question could provide a brief definition of 'unacceptable use' and describe the AI RMF Map function or its purpose, allowing for a more focused and relevant response.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"How do LM affect content diversity in writing?\"\n",
      "[ragas.testset.evolutions.DEBUG] [ReasoningEvolution] question compressed: How does the operator's perception of a passenger's gender identity affect the screening process for transgender travelers at airport checkpoints?\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"How do airport body scanner gender settings impact transgender traveler discrimination?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks how individuals can be protected from abusive data practices, referencing 'the given context' without providing any specific details about what that context entails. This reliance on unspecified external information makes the question unclear and unanswerable for those who do not have access to the context. To improve clarity and answerability, the question should either include a brief description of the relevant context or be rephrased to ask for general strategies or measures for protection against abusive data practices without depending on external references.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"What should be provided in terms of notice and instructions for an opt-out mechanism in automated systems?\"\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"What criteria should automated systems in sensitive domains meet, including human alternatives and fallbacks?\"\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Data privacy', 'Privacy Act of 1974', 'NIST Privacy Framework', 'Biometrics moratorium', 'Workplace surveillance']\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'The Fair Credit Reporting Act ensures that consumers are notified about certain decisions made about them by requiring that consumers who are denied credit receive \"adverse action\" notices. Anyone who relies on the information in a credit report to deny a consumer credit must provide an \"adverse action\" notice to the consumer, which includes \"notice of the reasons a creditor took adverse action on the application or on an existing credit account.\" Additionally, under the risk-based pricing rule, lenders must inform borrowers of their credit score or explain when they are receiving worse terms due to information in their credit report. The law gives every applicant the right to a specific explanation if their credit application is denied, even if a complex algorithm was used.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] seed question generated: How has facial recognition technology been used in public housing and what backlash has it prompted?\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 2, 'score': 1.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Digital surveillance', 'Reproductive health clinics', 'Private equity firm', 'Facial recognition in schools', 'Labor-Management Reporting and Disclosure Act']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 1, 'structure': 1, 'relevance': 1, 'score': 1.0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What are the sociotechnical harms associated with algorithmic systems, as discussed in the research by Shelby et al. (2023)?\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What procedures should be established and maintained for escalating GAI system incidents to the organizational risk management authority?\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 2, 'relevance': 3, 'score': 2.0}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Unacceptable use', 'Harmful bias and homogenization', 'GAI risks', 'Information integrity', 'Transparent policies']\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What are the key privacy protections provided by the Privacy Act of 1974 for personal information in federal records systems?\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions address the impact of airport body scanners on transgender travelers, focusing on discrimination and gender settings. They share the same constraints and depth of inquiry.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What data tracking issues have arisen regarding reproductive health clinics, as mentioned in the context?\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': \"Training and assessment are crucial in ensuring the proper use of automated systems. Individuals involved in administering, interacting with, or interpreting the outputs of automated systems should receive training on how to interpret system outputs, mitigate automation bias, and ensure the system is used appropriately. Regular training updates are necessary to keep up with system changes and ensure proper usage. Ongoing assessment is also vital to verify that human involvement does not compromise the system's safety, effectiveness, or lead to algorithmic discrimination.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions address the discrimination faced by transgender travelers in airport security settings, focusing on the impact of body scanners and perceptions of gender identity. They share similar constraints and depth of inquiry regarding the experiences of transgender individuals during screening.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] evolution_filter failed, retrying with 1\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['AI Bill of Rights', 'White House Office of Science and Technology Policy', 'Automated Systems', 'Civil Rights', 'Democratic Values']\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions inquire about the impact of language models on content diversity in writing, sharing the same constraints and depth of inquiry.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated systems', 'Algorithmic discrimination', 'Independent evaluation', 'Algorithmic impact assessment', 'Reporting']\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What is the importance of establishing transparent policies in managing GAI risks?\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions inquire about the standards or requirements for automated systems in sensitive domains. However, the second question introduces additional elements such as human alternatives and fallbacks, which expands its breadth and depth of inquiry.', 'verdict': 0}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the necessary notice and instructions for an opt-out mechanism in automated systems. It is specific and conveys a clear intent, focusing on the requirements for communication regarding opt-out options. However, it could benefit from further clarification regarding the context or type of automated systems being referred to (e.g., marketing, data collection, etc.), as different systems may have varying requirements. To improve clarity and answerability, the question could specify the type of automated system or the legal or ethical framework it is operating under.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Healthcare navigators', 'Automated customer service', 'Ballot curing laws', 'AI-driven call response systems', 'Fallback system']\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"What should be included in an algorithmic impact assessment for automated systems?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What is the purpose of the Blueprint for an AI Bill of Rights in relation to automated systems?\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'Designers, developers, and deployers of automated systems can protect individuals and communities from algorithmic discrimination by taking proactive and continuous measures. This includes conducting proactive equity assessments during system design, using representative data, avoiding proxies for demographic features, ensuring accessibility for people with disabilities, conducting pre-deployment and ongoing disparity testing and mitigation, and maintaining clear organizational oversight. Independent evaluation and plain language reporting, such as an algorithmic impact assessment that includes disparity testing results and mitigation information, should be performed and made public whenever possible to confirm these protections.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 3, 'depth': 3, 'structure': 3, 'relevance': 3, 'score': 3.0}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Data privacy', 'Automated systems', 'Privacy by design', 'Data collection', 'Risk identification and mitigation']\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the application of facial recognition technology in public housing and the resulting backlash. It is clear in its intent, specifying both the use case (public housing) and the aspect of concern (backlash). The question is independent and can be understood without needing additional context or references. However, to enhance clarity, it could specify the type of backlash (e.g., legal, social, ethical) or provide examples of specific instances where this technology has been implemented in public housing. This would allow for a more focused and detailed response.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [ReasoningEvolution] simple question generated: How has facial recognition technology been used in public housing and what backlash has it prompted?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear and specific, asking about the key privacy protections offered by the Privacy Act of 1974 regarding personal information in federal records systems. It does not rely on external references or additional context, making it independent and self-contained. The intent is also clear, as it seeks specific information about privacy protections. Therefore, it meets all the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear and specific, asking about the procedures for escalating GAI system incidents to the organizational risk management authority. It does not rely on external references and can be understood independently. However, it could be improved by specifying what types of incidents are being referred to (e.g., security breaches, operational failures) or the context in which these procedures are to be applied (e.g., a specific industry or organization). This additional detail would enhance clarity and ensure that the answer is more tailored to the specific needs of the inquiry.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: What procedures should be established and maintained for escalating GAI system incidents to the organizational risk management authority?\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['AI risks', 'Risk response options', 'Model release approach', 'Robustness of risk controls', 'GAI system outputs']\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How do ballot curing laws in at least 24 states require a fallback system for voters to correct their ballots?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about data tracking issues related to reproductive health clinics, referencing 'the context' without providing any specific details or information about what that context entails. This reliance on unspecified external information makes the question unclear and unanswerable for those who do not have access to the mentioned context. To improve clarity and answerability, the question could either include a brief summary of the relevant context or specify the types of data tracking issues being inquired about (e.g., privacy concerns, data accuracy, reporting standards).\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: What data tracking issues have arisen regarding reproductive health clinics, as mentioned in the context?\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"What does it mean for automated systems to have privacy by design and by default?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the sociotechnical harms associated with algorithmic systems as discussed in a specific research paper by Shelby et al. (2023). While it identifies a clear topic (sociotechnical harms and algorithmic systems) and seeks specific information, it relies on the reader's access to and familiarity with the mentioned research, which is not provided in the question. This reliance on an external reference makes the question less independent and potentially unclear for those who are not familiar with the work of Shelby et al. To improve clarity and answerability, the question could be reframed to ask for a general overview of sociotechnical harms associated with algorithmic systems without referencing a specific study, or it could summarize key points from the research to provide context.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: What are the sociotechnical harms associated with algorithmic systems, as discussed in the research by Shelby et al. (2023)?\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How can organizations compare GAI system outputs against predefined risk tolerance and guidelines?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the importance of establishing transparent policies in managing GAI (General Artificial Intelligence) risks. It is clear in its intent, focusing on the significance of transparency in policy-making related to GAI risks. The question is independent and can be understood without needing additional context or references. However, it could be improved by specifying what aspects of GAI risks are being referred to (e.g., ethical concerns, safety measures, regulatory compliance) to provide a more focused answer. Overall, the question is specific and clear, making it answerable based on the details provided.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: What is the importance of establishing transparent policies in managing GAI risks?\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"What measures and protocols should be implemented for both escalating GAI system incidents to the organizational risk management authority and safely decommissioning AI systems?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the components that should be included in an algorithmic impact assessment for automated systems. It is clear in its intent, specifying the topic of interest (algorithmic impact assessment) and the context (automated systems). The question is self-contained and does not rely on external references, making it understandable and answerable based on the details provided. Therefore, it meets the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: \"What should be included in an algorithmic impact assessment for automated systems?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the purpose of the Blueprint for an AI Bill of Rights specifically in relation to automated systems. It is clear and specific, indicating the topic of interest (Blueprint for an AI Bill of Rights) and its context (automated systems). The intent is straightforward, seeking to understand the role or objectives of the Blueprint concerning these systems. Therefore, the question is independent and can be answered without needing additional context or references. It meets the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"What components should be part of an algorithmic impact assessment for automated systems to ensure equity and accessibility?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the concepts of 'privacy by design' and 'privacy by default' in the context of automated systems. It is clear in its intent, seeking an explanation of these terms and their implications for automated systems. The question is independent and does not rely on external references, making it understandable and answerable based on the details provided. Therefore, it meets the criteria for clarity and answerability.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [ReasoningEvolution] simple question generated: \"What does it mean for automated systems to have privacy by design and by default?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about data tracking issues related to reproductive health clinics, referencing 'the context' without providing any specific details or information about what that context entails. This reliance on unspecified external information makes the question unclear and unanswerable for those who do not have access to the mentioned context. To improve clarity and answerability, the question could either include a brief summary of the relevant context or specify the types of data tracking issues being inquired about (e.g., privacy concerns, data accuracy, reporting standards).\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"Why is it crucial to have transparent policies in place for managing GAI risks, considering the characteristics of trustworthy AI and the needed level of risk management activities?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about ballot curing laws in at least 24 states and their requirement for a fallback system for voters to correct their ballots. It is specific in its focus on ballot curing laws and the aspect of a fallback system, which provides a clear intent. However, the phrase 'fallback system' could be interpreted in various ways, and the question does not specify what is meant by this term. To improve clarity and answerability, the question could define what is meant by 'fallback system' or provide examples of such systems. Additionally, it could specify whether it seeks a general overview or detailed examples from specific states.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: \"How do ballot curing laws in at least 24 states require a fallback system for voters to correct their ballots?\"\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'Those impacted by an automated system should be given a brief, clear notice that they are entitled to opt-out, along with clear instructions for how to opt-out. Instructions should be provided in an accessible form and should be easily findable by those impacted by the automated system. The brevity, clarity, and accessibility of the notice and instructions should be assessed (e.g., via user experience research).', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks how organizations can compare outputs from Generative AI (GAI) systems against predefined risk tolerance and guidelines. It is specific in its focus on organizations and the comparison process, making the intent clear. However, the question could benefit from additional context regarding what types of outputs are being referred to, what specific risk tolerances and guidelines are in place, and whether there are particular methodologies or frameworks that should be considered. Providing examples or specifying the domain (e.g., finance, healthcare) could enhance clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: \"How can organizations compare GAI system outputs against predefined risk tolerance and guidelines?\"\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': \"The Privacy Act of 1974 provides privacy protections for personal information in federal records systems by imposing limits on data retention, granting individuals the right to access and correct their data, and requiring that federal agencies only retain data that is relevant and necessary for their statutory purpose or to comply with an Executive Order. The law also allows individuals to access their information stored in federal systems of records, contest the contents of a record, and seek legal relief if the agency does not comply with the Act's requirements, including amending or correcting inaccurate information and awarding monetary damages for adverse determinations.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the negative response to the use of facial recognition technology in public housing, specifically referencing reports from the New York Times. While it is clear in its intent to understand the public's reaction and specifies the context (facial recognition technology in public housing), it assumes familiarity with the specific reports from the New York Times without providing any details or context about those reports. This reliance on external references makes the question less independent and potentially unclear for those who may not have access to or knowledge of the specific articles. To improve clarity and answerability, the question could include a brief summary of the reported issues or controversies surrounding the technology's use in public housing, or it could ask for general reasons for negative responses without tying it to specific articles.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: \"How has the use of facial recognition technology in public housing led to a negative response, as reported in the New York Times?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the sociotechnical harms associated with algorithmic systems as discussed in a specific research study by Shelby et al. (2023). While it identifies a clear topic (sociotechnical harms and algorithmic systems) and seeks specific information, it relies on access to the research by Shelby et al. without providing any details or context about the findings or the nature of the harms discussed. This makes the question less independent and potentially unclear for those who are not familiar with the study. To improve clarity and answerability, the question could either summarize key points from the research or specify the types of sociotechnical harms of interest, allowing for a more focused response without needing to reference the external study.', 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'The Blueprint for an AI Bill of Rights is intended to support the development of policies and practices that protect civil rights and promote democratic values in the building, deployment, and governance of automated systems.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear and specific, asking about the components necessary for an algorithmic impact assessment focused on equity and accessibility in automated systems. It does not rely on external references or context, making it independent and understandable. The intent is also clear, as it seeks a list or description of components relevant to the assessment. Therefore, it meets the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"How do ballot curing laws in at least 24 states ensure a human fallback system is available for voters to rectify ballot issues caused by automated signature matching systems?\"\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"How should organizations assess GAI system outputs in relation to predefined risk tolerance and guidelines, and what actions should be taken based on this comparison?\"\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Synthetic training data', 'Non-synthetic training data', 'Model collapse', 'Environmental impact', 'Sustainability']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['GAI technologies', 'Content Provenance', 'Provenance data tracking', 'Synthetic content detection', 'Digital transparency mechanisms']\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"What's needed for an algorithmic impact assessment for automated systems to ensure equity and accessibility?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is specific and seeks information on measures and protocols for two distinct processes: escalating GAI system incidents to the organizational risk management authority and safely decommissioning AI systems. It is clear in its intent and does not rely on external references, making it understandable and answerable based on the details provided. However, to enhance clarity, the question could specify the type of organization or context (e.g., industry, size) to tailor the measures and protocols more effectively. Additionally, breaking down the question into two separate parts could improve focus and clarity.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How can concerns of model collapse be mitigated by assessing the proportion of synthetic to non-synthetic training data?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear and specific, asking about the design of automated systems with a focus on privacy protection throughout the development life cycle. It does not rely on external references and can be understood independently. The intent is evident, seeking information on best practices or principles for privacy in system design. However, it could be improved by specifying the type of automated systems (e.g., software applications, AI systems) or the context in which these systems are being developed (e.g., healthcare, finance) to provide a more focused response.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the importance of transparent policies for managing GAI (General Artificial Intelligence) risks, linking it to the characteristics of trustworthy AI and the necessary level of risk management activities. While it is specific in its focus on GAI risks and trustworthy AI, the question is somewhat complex and may be difficult to answer without additional context regarding what constitutes 'trustworthy AI' and the specific 'risk management activities' being referred to. To improve clarity and answerability, the question could be reframed to define or provide examples of trustworthy AI characteristics and specify the types of risk management activities in question. This would help ensure that the intent is clear and that the question can be answered without ambiguity.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: \"Why is it crucial to have transparent policies in place for managing GAI risks, considering the characteristics of trustworthy AI and the needed level of risk management activities?\"\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'Automated systems used within sensitive domains should meet expectations such as narrowly scoped data and inferences, tailored to the situation, human consideration before high-risk decisions, meaningful access to examine the system, demonstrate access to human alternatives, consideration, and fallback. Reporting should include an assessment of timeliness and the extent of additional burden for human alternatives, aggregate statistics about who chooses the human alternative, along with the results of the assessment about brevity, clarity, and accessibility of notice and opt-out instructions. Reporting on the accessibility, timeliness, and effectiveness of human consideration and fallback should be made public at regular intervals for as long as the system is in use. This should include aggregated information about the number and type of requests for consideration, fallback employed, and any repeated requests; the timeliness of the handling of these requests, including mean wait times for different types of requests as well as maximum wait times; and information about the procedures used to address requests for consideration along with the results of the evaluation of their accessibility. For systems used in sensitive domains, reporting should include information about training and governance procedures for these technologies. Reporting should also include documentation of goals and assessment of meeting those goals, consideration of data included, and documentation of the governance of reasonable access to the technology. Reporting should be provided in a clear and machine-readable manner.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How can provenance data tracking and synthetic content detection mechanisms help in managing risks associated with GAI technologies?\"\n",
      "[ragas.testset.evolutions.DEBUG] [ReasoningEvolution] question compressed: \"How should automated systems be designed to ensure privacy is protected by default and throughout the development life cycle?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks how organizations should evaluate GAI system outputs against established risk tolerance and guidelines, as well as what actions should be taken based on this evaluation. It is specific in its intent, focusing on assessment methods and subsequent actions, which makes it clear. However, the question could benefit from more detail regarding what specific aspects of GAI system outputs should be assessed (e.g., accuracy, bias, compliance) and what types of risk tolerance and guidelines are being referred to. Providing examples or specifying the context of the guidelines would enhance clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"What measures are needed for handling GAI system incidents and decommissioning AI systems safely?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the negative response to the use of facial recognition technology in public housing, specifically referencing reports from the New York Times. While it is clear in its intent to understand the public's reaction and specifies the context (facial recognition technology in public housing), it assumes familiarity with the specific reports from the New York Times without providing any details or context about those reports. This reliance on external references makes the question less independent and potentially unclear for those who may not have access to or knowledge of the specific articles. To improve clarity and answerability, the question could include a brief summary of the reported issues or controversies surrounding the technology's use in public housing, or it could ask for general insights into the negative responses without tying it to a specific source.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question is specific and seeks to understand how ballot curing laws in at least 24 states provide a human fallback system for voters dealing with issues from automated signature matching systems. It clearly identifies the topic (ballot curing laws, automated signature matching) and the desired information (the mechanism of ensuring a human fallback). However, the phrase 'at least 24 states' could be seen as slightly vague, as it does not specify which states are being referred to or if there are particular examples that would enhance understanding. To improve clarity and answerability, the question could specify which states are included or provide a brief context about the nature of ballot curing laws in these states.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'The first question asks about the general components of an algorithmic impact assessment, while the second question specifically emphasizes equity and accessibility, indicating a narrower focus. Therefore, they do not have the same depth and breadth of inquiry.', 'verdict': 0}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"How to evaluate GAI system outputs against risk tolerance and guidelines, and what actions to take?\"\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Ongoing monitoring', 'Periodic review', 'Organizational roles and responsibilities', 'Content provenance', 'Incident monitoring']\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"How do ballot curing laws in 24+ states provide a backup for voters to fix signature matching issues?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question addresses the mitigation of model collapse by evaluating the ratio of synthetic to non-synthetic training data. It is specific in its focus on model collapse and the method of assessment, making the intent clear. However, the term 'model collapse' may require additional context for those unfamiliar with the concept, as it can refer to various issues in machine learning models. To enhance clarity and answerability, the question could briefly define 'model collapse' or specify the type of models being discussed. Overall, the question is mostly clear and independent, but a slight adjustment could improve its accessibility.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: \"How can concerns of model collapse be mitigated by assessing the proportion of synthetic to non-synthetic training data?\"\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'The first question focuses on procedures for escalating incidents to risk management, while the second question addresses measures for handling incidents and decommissioning AI systems. They differ in their specific focus and requirements, leading to different depths of inquiry.', 'verdict': 0}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question addresses the importance of transparent policies for managing GAI (General Artificial Intelligence) risks, linking it to the characteristics of trustworthy AI and the necessary level of risk management activities. It is specific in its focus and intent, making it clear what information is being sought. However, the question could be improved by providing a brief definition or context for 'trustworthy AI' and 'GAI risks' to ensure that all readers understand these terms. This would enhance clarity and answerability for those who may not be familiar with the specific concepts being referenced.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': \"The first question focuses on the concepts of 'privacy by design and by default', while the second question is about ensuring privacy in the design of automated systems. They differ in their specific inquiries and depth, leading to different requirements.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"Why are transparent policies important for managing GAI risks?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks how provenance data tracking and synthetic content detection mechanisms can assist in managing risks related to Generative AI (GAI) technologies. It is specific in its focus on two mechanisms (provenance data tracking and synthetic content detection) and their application to risk management in GAI. The intent is clear, as it seeks to understand the relationship between these mechanisms and risk management. However, the question could be improved by providing a brief context or examples of the types of risks associated with GAI technologies, which would help in formulating a more comprehensive answer. Overall, the question is understandable and answerable based on the details provided.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: \"How can provenance data tracking and synthetic content detection mechanisms help in managing risks associated with GAI technologies?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What is the importance of incident monitoring for GAI systems according to the given context?\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions focus on evaluating GAI system outputs in relation to risk tolerance and guidelines. However, the second question introduces an additional aspect of determining actions to take, which adds depth to the inquiry.', 'verdict': 0}\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'The answer to given question is not present in context', 'verdict': -1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"How can the concerns of model collapse be addressed by evaluating the ratio of synthetic to non-synthetic training data in the context of environmental impact and sustainability of AI model training and management activities?\"\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions inquire about ballot curing laws in states and their provisions for voters to correct ballots, focusing on similar requirements and depth of inquiry regarding voter assistance.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 2 times\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions inquire about the significance of transparent policies in the context of managing GAI risks, sharing the same constraints and depth of inquiry.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"How can provenance data tracking and synthetic content detection mechanisms contribute to mitigating risks associated with GAI technologies?\"\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 1, 'structure': 2, 'relevance': 2, 'score': 1.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['NIST Trustworthy and Responsible AI', 'AI 600-1', 'Artificial Intelligence Risk Management Framework', 'Generative Artificial Intelligence Profile', 'U.S. Department of Commerce']\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Biometric Information Privacy Act', 'Fair Credit Reporting Act', 'Equal Credit Opportunity Act', 'California law', 'Explainable AI systems']\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the importance of incident monitoring for GAI systems, referencing 'the given context' without providing any details about what that context entails. This reliance on unspecified external information makes the question unclear and unanswerable for those who do not have access to that context. To improve clarity and answerability, the question should either include a brief summary of the relevant context or be rephrased to ask about the importance of incident monitoring in general terms, without depending on external references.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: What is the importance of incident monitoring for GAI systems according to the given context?\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'The answer to given question is not present in context', 'verdict': -1}\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What is the focus of the NIST Trustworthy and Responsible AI publication?\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'Privacy in automated system design can be ensured by incorporating privacy protections by default, assessing privacy risks throughout the development life cycle, implementing appropriate mitigation measures, minimizing data collection, clearly communicating data usage, limiting data collection to specific goals, avoiding mission creep, establishing clear timelines for data retention, proactively identifying and managing risks, and following privacy-preserving security best practices.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question addresses the concerns of model collapse in relation to the ratio of synthetic to non-synthetic training data, specifically within the context of environmental impact and sustainability of AI model training and management. While it is specific in its focus on model collapse and the evaluation of data types, the question is somewhat complex and may be difficult to answer without additional context regarding what is meant by 'model collapse' and how it relates to environmental sustainability. To improve clarity and answerability, the question could be reframed to define 'model collapse' and provide a brief explanation of how the ratio of training data types is linked to sustainability. This would help ensure that the intent is clear and that the question is accessible to a broader audience.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: \"How can the concerns of model collapse be addressed by evaluating the ratio of synthetic to non-synthetic training data in the context of environmental impact and sustainability of AI model training and management activities?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"What requirements does California law impose on warehouse employers regarding notice and explanation of quotas for employees?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks how provenance data tracking and synthetic content detection mechanisms can help mitigate risks related to Generative AI (GAI) technologies. It is specific in its focus on two mechanisms (provenance data tracking and synthetic content detection) and their role in addressing risks associated with GAI. The intent is clear, as it seeks to understand the contributions of these mechanisms. However, the question could be improved by providing a brief context or examples of the specific risks associated with GAI technologies that are being referred to. This would enhance clarity and allow for a more targeted response.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the importance of incident monitoring for GAI systems, referencing 'the given context' without providing any details about what that context entails. This reliance on unspecified external information makes the question unclear and unanswerable for those who do not have access to that context. To improve clarity and answerability, the question should either include a brief description of the relevant context or be rephrased to focus on the general importance of incident monitoring for GAI systems without depending on external references.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 2 times\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"How can data tracking and content detection help reduce risks in GAI?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the focus of the NIST Trustworthy and Responsible AI publication. It is clear and specific, indicating that the answer should relate to the main themes or objectives of this publication. However, it assumes familiarity with the NIST publication without providing any context or details about its content. To improve clarity and answerability, the question could specify what aspects of the publication are of interest (e.g., guidelines, principles, frameworks) or provide a brief description of the publication's purpose.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: What is the focus of the NIST Trustworthy and Responsible AI publication?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is specific and clear, asking about the requirements imposed by California law on warehouse employers concerning notice and explanation of quotas for employees. It does not rely on external references and can be understood independently. The intent is clear, seeking legal information about employer obligations. Therefore, it meets the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: \"What requirements does California law impose on warehouse employers regarding notice and explanation of quotas for employees?\"\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: What is the primary focus of the NIST publication on Trustworthy and Responsible AI?\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 3, 'depth': 3, 'structure': 3, 'relevance': 3, 'score': 3.0}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Data privacy', 'Automated systems', 'Surveillance oversight', 'Algorithmic discrimination', 'Consent practices']\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"What details must California warehouse employers provide to employees regarding work quotas, and why is this information crucial?\"\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': \"The first question specifically mentions 'provenance data tracking' and 'synthetic content detection mechanisms', while the second question generalizes these concepts to 'data tracking' and 'content detection'. This difference in specificity leads to a variation in depth and breadth of inquiry.\", 'verdict': 0}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question addresses the concerns of model collapse in relation to the ratio of synthetic to non-synthetic training data, specifically within the context of environmental impact and sustainability of AI model training and management. While it is specific in its focus on model collapse and the evaluation of data types, the question is somewhat complex and may be difficult to answer without additional context regarding what is meant by 'model collapse' and how it relates to environmental sustainability. To improve clarity and answerability, the question could be reframed to define 'model collapse' and provide a brief explanation of how the ratio of training data types is linked to sustainability. This would help ensure that the intent is clear and that the question is accessible to a broader audience.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 1 times\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How can algorithmic discrimination be prevented in surveillance systems according to the expectations for automated systems?\"\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': \"Establish and maintain procedures for escalating GAI system incidents to the organizational risk management authority when specific criteria for deactivation or disengagement is met for a particular context of use or for the GAI system as a whole. Establish and maintain procedures for the remediation of issues which trigger incident response processes for the use of a GAI system, and provide stakeholders timelines associated with the remediation plan. Establish and regularly review specific criteria that warrants the deactivation of GAI systems in accordance with set risk tolerances and appetites. Processes and procedures are in place for decommissioning and phasing out AI systems safely and in a manner that does not increase risks or decrease the organization's trustworthiness. Protocols are put in place to ensure GAI systems are able to be deactivated when necessary. Consider the following factors when decommissioning GAI systems: Data retention requirements; Data security, e.g., containment, protocols, Data leakage after decommissioning; Dependencies between upstream, downstream, or other data, internet of things (IOT) or AI systems; Use of open-source data or models; Users' emotional entanglement with GAI functions.\", 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated systems', 'Explanatory mechanisms', 'Risk assessment', 'Explanation validity', 'Summary reporting']\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the primary focus of a specific NIST publication regarding Trustworthy and Responsible AI. It is clear and specific, indicating that the answer should summarize the main themes or objectives of the publication. However, it assumes familiarity with the NIST publication without providing any context or details about its content. To improve clarity and answerability, the question could specify the year of the publication or the particular aspects of Trustworthy and Responsible AI it addresses, which would help in providing a more focused response.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear and specific, asking for details that California warehouse employers must provide to employees about work quotas, as well as the importance of this information. It does not rely on external references and can be understood independently. The intent is straightforward, seeking both factual information and an explanation of its significance. Therefore, it meets the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How should the level of risk be assessed in automated systems according to the given context?\"\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"What's the main focus of the NIST publication on Trustworthy and Responsible AI?\"\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"What details must California warehouse employers provide to employees about work quotas, and why is it important?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question addresses the prevention of algorithmic discrimination in surveillance systems, specifically in relation to the expectations for automated systems. It is clear in its intent, focusing on a specific issue (algorithmic discrimination) and its context (surveillance systems and automated systems). However, the phrase 'according to the expectations for automated systems' could be interpreted in various ways, as it does not specify what these expectations are or who defines them. To enhance clarity and answerability, the question could specify the source of these expectations (e.g., legal standards, ethical guidelines) or provide examples of what is meant by 'expectations'.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [ReasoningEvolution] simple question generated: \"How can algorithmic discrimination be prevented in surveillance systems according to the expectations for automated systems?\"\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions inquire about the obligations of California warehouse employers concerning employee quotas, focusing on the requirements and the importance of providing information. However, the second question introduces an additional aspect of importance, which may lead to a broader inquiry.', 'verdict': 0}\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions inquire about the main focus of the same publication regarding Trustworthy and Responsible AI, sharing identical constraints and requirements.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 3 times\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'Data tracking and content detection can help reduce risks in GAI by providing information about the origin and history of content, enabling better knowledge of trustworthiness in AI systems. Provenance data tracking techniques can assess the authenticity, integrity, intellectual property rights, and potential manipulations in digital content. Techniques such as digital watermarking, metadata recording, and digital fingerprinting can be used to track the history and origin of data inputs, metadata, and synthetic content, allowing for the determination of authenticity. These approaches can assist AI actors in understanding the trade-offs and impacts of early-stage model decisions on downstream performance and synthetic outputs, ultimately enhancing content provenance and reducing risks in GAI.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about assessing the level of risk in automated systems, referencing 'the given context' without providing any specific details about what that context entails. This reliance on unspecified external information makes the question unclear and difficult to answer for someone who does not have access to that context. To improve clarity and answerability, the question could either include a brief description of the relevant context or specify the criteria or factors that should be considered when assessing risk in automated systems. Additionally, clarifying what type of automated systems are being referred to (e.g., industrial, software, autonomous vehicles) would enhance understanding.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: \"How should the level of risk be assessed in automated systems according to the given context?\"\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 3, 'depth': 3, 'structure': 3, 'relevance': 3, 'score': 3.0}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['AI Bill of Rights', 'Automated systems', 'Technical companion', 'Principles', 'Protecting rights']\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How does the technical companion for the Blueprint for an AI Bill of Rights provide guidance for protecting the rights of the American public in the age of artificial intelligence?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about assessing the level of risk in automated systems, referencing 'the given context' without providing any specific details about what that context entails. This reliance on unspecified external information makes the question unclear and difficult to answer for someone who does not have access to that context. To improve clarity and answerability, the question should either include a brief description of the relevant context or be rephrased to focus on general principles or methods for assessing risk in automated systems without needing additional information. For example, it could ask, 'What are the general methods for assessing risk in automated systems?' which would make it more self-contained.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 2 times\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question addresses the prevention of algorithmic discrimination in surveillance systems, focusing on ongoing assessments for potential harms and the necessity of surveillance. It is specific in its intent and clearly outlines the aspects it seeks to explore. However, the phrasing is somewhat complex and could benefit from simplification for better clarity. To improve, the question could be rephrased to break down the components more clearly, such as asking separately about the methods for ongoing assessment and the criteria for determining necessity in surveillance. This would make it easier to understand and answer.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'Warehousing employers in California that use quota systems are required to provide employees with a written description of each quota that applies to the employee. This description must include the quantified number of tasks or materials to be produced or handled within a defined time period, as well as any potential adverse employment action that could result from failing to meet the quota. Providing these details is important to ensure transparency, fairness, and accountability in the workplace, allowing employees to understand the expectations placed upon them and the consequences of not meeting those expectations.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [ReasoningEvolution] question compressed: \"How can algorithmic discrimination be prevented in surveillance systems by ensuring ongoing assessment for potential harms and avoiding surveillance unless strictly necessary, according to the expectations for automated systems?\"\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Ongoing monitoring', 'Periodic review', 'Organizational roles and responsibilities', 'Content provenance', 'Incident monitoring']\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the guidance provided by the technical companion for the Blueprint for an AI Bill of Rights regarding the protection of American public rights in the context of artificial intelligence. It is specific in its focus on the technical companion and its implications for rights protection, making the intent clear. However, the question assumes familiarity with the 'technical companion' and the 'Blueprint for an AI Bill of Rights' without providing any context or definitions, which may hinder understanding for those not already knowledgeable about these topics. To improve clarity and answerability, the question could include a brief description of what the technical companion entails or the key principles of the AI Bill of Rights it refers to.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: \"How does the technical companion for the Blueprint for an AI Bill of Rights provide guidance for protecting the rights of the American public in the age of artificial intelligence?\"\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What is the importance of ongoing monitoring and periodic review in the risk management process for GAI systems?\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': \"The first question includes a specific context regarding 'expectations for automated systems', which adds a layer of depth and requirement that is absent in the second question. Therefore, they are not equal.\", 'verdict': 0}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the significance of ongoing monitoring and periodic review in the risk management process specifically for GAI (General Artificial Intelligence) systems. It is clear in its intent, focusing on the importance of these practices within a defined context (risk management for GAI systems). The question is independent and does not rely on external references, making it understandable and answerable based on the details provided. Therefore, it meets the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: What is the importance of ongoing monitoring and periodic review in the risk management process for GAI systems?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the guidance provided by the technical companion for the Blueprint for an AI Bill of Rights regarding the protection of American public rights in the context of artificial intelligence. It is specific in its focus on the technical companion and its implications for rights protection, making the intent clear. However, the question assumes familiarity with the 'technical companion' and the 'Blueprint for an AI Bill of Rights' without providing any context or definitions, which may hinder understanding for those not already knowledgeable about these topics. To improve clarity and answerability, the question could include a brief description of what the technical companion entails or the key principles of the AI Bill of Rights it refers to.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 4 times\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"Why is ongoing monitoring and periodic review essential in the risk management process for GAI systems, and how does it relate to establishing organizational responsibilities and documenting risks?\"\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated systems', 'Algorithmic discrimination', 'Independent evaluation', 'Algorithmic impact assessment', 'Reporting']\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': \"Algorithmic discrimination in surveillance systems can be prevented by conducting a thorough assessment before deployment and continuously throughout the system's use. This assessment should specifically focus on ensuring that there is no algorithmic discrimination, especially based on community membership, when the system is deployed in a real-world context. Additionally, surveillance should be limited to what is strictly necessary for a legitimate purpose and should use the least invasive means of monitoring available. Clear and specific notice should be provided to individuals subject to monitoring, and information related to identity should be carefully limited to avoid discriminatory targeting or guidance of surveillance systems.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What expectations should be met by automated systems according to the given context?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear in its intent, asking for the importance of ongoing monitoring and periodic review in the risk management process for GAI systems, as well as its relationship to organizational responsibilities and risk documentation. It specifies the context (GAI systems) and the aspects to be discussed (monitoring, review, responsibilities, documentation), making it understandable. However, the complexity of the terms used may require some background knowledge in risk management and GAI systems for a complete answer. To enhance clarity and answerability, the question could be simplified or broken down into smaller parts, such as asking first about the role of monitoring and review, and then about their relationship to responsibilities and documentation.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the expectations for automated systems based on 'the given context', but it does not provide any details about what that context is. This reliance on unspecified external information makes the question unclear and unanswerable for someone who does not have access to that context. To improve clarity and answerability, the question should either include a brief description of the relevant context or specify the criteria or standards being referred to. Additionally, clarifying what type of expectations (e.g., performance, reliability, ethical considerations) is being inquired about would enhance the question's specificity.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: What expectations should be met by automated systems according to the given context?\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"Why is ongoing monitoring and review important in GAI risk management, and how does it relate to organizational responsibilities and risk documentation?\"\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': 'Both questions address the significance of ongoing monitoring and review in the context of GAI risk management, but the second question expands the inquiry by including organizational responsibilities and risk documentation, leading to a difference in depth and breadth.', 'verdict': 0}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks about the expectations for automated systems based on 'the given context', but it does not provide any details about what that context is. This reliance on unspecified external information makes the question unclear and unanswerable for someone who does not have access to that context. To improve clarity and answerability, the question should either include a brief description of the relevant context or specify the criteria or standards being referred to regarding automated systems. Additionally, clarifying what type of expectations (e.g., performance, reliability, ethical considerations) is being inquired about would enhance the question's specificity.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 5 times\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 1, 'structure': 1, 'relevance': 1, 'score': 1.0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 5 times\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 1, 'structure': 2, 'relevance': 2, 'score': 1.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Technical companion', 'AI Bill of Rights', 'Algorithmic discrimination protections', 'Data privacy', 'Human alternatives']\n",
      "[ragas.testset.evolutions.INFO] seed question generated: What is the purpose of the technical companion to the Blueprint for an AI Bill of Rights?\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'Ongoing monitoring and periodic review of the risk management process and its outcomes are crucial in GAI systems to ensure that organizational roles and responsibilities are clearly defined. This includes determining the frequency of periodic reviews to identify gaps in content provenance, incident monitoring, incident response, and incident disclosures. By maintaining a document retention policy for test, evaluation, validation, and verification (TEVV), organizations can ensure transparency and accountability in GAI systems. These mechanisms help in inventorying AI systems and resourcing them according to organizational risk priorities, while also considering data provenance, known issues, human oversight roles, intellectual property rights, and underlying foundation models. Overall, ongoing monitoring and review are essential for effective governance, oversight, and risk management in GAI systems.', 'verdict': 1}\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks for the purpose of a specific document, the technical companion to the Blueprint for an AI Bill of Rights. It is clear in its intent, specifying the document in question and what information is being sought (its purpose). However, the question assumes familiarity with both the 'technical companion' and the 'Blueprint for an AI Bill of Rights', which may not be universally known. To improve clarity and answerability, the question could provide a brief description of what the Blueprint for an AI Bill of Rights entails or the context in which the technical companion operates, making it more accessible to a broader audience.\", 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: What is the purpose of the technical companion to the Blueprint for an AI Bill of Rights?\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: What additional document provides practical guidance for implementing the principles outlined in the Blueprint for an AI Bill of Rights?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks for an additional document that provides practical guidance related to the principles in the 'Blueprint for an AI Bill of Rights'. While it specifies the topic and seeks a specific type of information (an additional document), it lacks clarity because it does not define what is meant by 'practical guidance' or what specific principles are being referred to. Additionally, the question assumes familiarity with the 'Blueprint for an AI Bill of Rights' without providing any context or details about it. To improve clarity and answerability, the question could specify which principles are of interest or what kind of practical guidance is being sought (e.g., implementation strategies, ethical considerations).\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] rewritten question: What additional document provides practical guidance for implementing the principles outlined in the Blueprint for an AI Bill of Rights?\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': \"The question asks for an additional document that provides practical guidance related to the principles in the 'Blueprint for an AI Bill of Rights'. While it specifies the topic and seeks a specific type of information (an additional document), it lacks clarity because it does not define what is meant by 'practical guidance' or what specific principles are being referred to. Additionally, the question assumes familiarity with the 'Blueprint for an AI Bill of Rights' without providing any context or details about it. To improve clarity and answerability, the question could specify which principles are of interest or what kind of practical guidance is being sought (e.g., implementation strategies, ethical considerations).\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.INFO] retrying evolution: 4 times\n",
      "[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}\n",
      "[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Algorithmic discrimination', 'Automated systems', 'Protected classifications', 'Equitable design', 'Disparity testing']\n",
      "[ragas.testset.evolutions.INFO] seed question generated: \"How can designers, developers, and deployers of automated systems ensure equitable design to prevent algorithmic discrimination?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear and specific, asking how various stakeholders (designers, developers, and deployers) of automated systems can ensure equitable design to prevent algorithmic discrimination. It does not rely on external references and conveys a clear intent to seek actionable strategies or principles. However, it could be improved by specifying what aspects of equitable design or algorithmic discrimination are of interest, as this could lead to more focused and relevant responses. For example, the question could mention specific areas such as data collection practices, model training, or user impact assessments.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] simple question generated: \"How can designers, developers, and deployers of automated systems ensure equitable design to prevent algorithmic discrimination?\"\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question generated: \"How can stakeholders ensure equitable design in automated systems to prevent algorithmic discrimination?\"\n",
      "[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear and specific, asking how stakeholders can ensure equitable design in automated systems to prevent algorithmic discrimination. It identifies the target audience (stakeholders) and the issue at hand (algorithmic discrimination), making the intent of the question understandable. The question is independent and does not rely on external references or context, allowing for a direct response based on general knowledge of equitable design principles and practices in technology. Therefore, it meets the criteria for clarity and answerability.', 'verdict': 1}\n",
      "[ragas.testset.evolutions.DEBUG] [MultiContextEvolution] multicontext question compressed: \"How can stakeholders prevent algorithmic discrimination in automated systems?\"\n",
      "[ragas.testset.filters.DEBUG] evolution filter: {'reason': \"The first question specifically addresses the roles of designers, developers, and deployers in ensuring equitable design, while the second question uses the broader term 'stakeholders' without specifying roles, leading to differences in depth and breadth of inquiry.\", 'verdict': 0}\n",
      "[ragas.testset.evolutions.DEBUG] answer generated: {'answer': 'Stakeholders can prevent algorithmic discrimination in automated systems by conducting proactive equity assessments during the design phase, ensuring representative and robust data is used, guarding against proxies for demographic features, and performing ongoing disparity testing and mitigation. Additionally, stakeholders should prioritize accessibility for people with disabilities, conduct pre-deployment and ongoing disparity testing, and provide clear organizational oversight. Independent evaluation and plain language reporting should be performed and made public whenever possible to confirm these protections.', 'verdict': 1}\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>contexts</th>\n",
       "      <th>ground_truth</th>\n",
       "      <th>evolution_type</th>\n",
       "      <th>metadata</th>\n",
       "      <th>episode_done</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>How do language models contribute to the reduc...</td>\n",
       "      <td>[ \\n57 \\nNational Institute of Standards and T...</td>\n",
       "      <td>The answer to given question is not present in...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>What should be provided in terms of notice and...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCONSI...</td>\n",
       "      <td>Those impacted by an automated system should b...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>How can designers, developers, and deployers o...</td>\n",
       "      <td>[ ­­­­­­­\\nALGORITHMIC DISCRIMINATION Protecti...</td>\n",
       "      <td>Designers, developers, and deployers of automa...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>What benefits have been publicly described by ...</td>\n",
       "      <td>[ \\nENDNOTES\\n12. Expectations about reporting...</td>\n",
       "      <td>The benefits of 'traffic calming' measures hav...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>What is the purpose of AI Red-teaming in testi...</td>\n",
       "      <td>[ \\n49 \\nearly lifecycle TEVV approaches are d...</td>\n",
       "      <td>AI Red-teaming is a structured testing exercis...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>What is the importance of training and assessm...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCO...</td>\n",
       "      <td>Training and assessment are crucial in ensurin...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>How do advertisement delivery systems reinforc...</td>\n",
       "      <td>[ \\n \\n  \\nWHY THIS PRINCIPLE IS IMPORTANT\\nTh...</td>\n",
       "      <td>Advertisement delivery systems reinforce racia...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>What is the purpose of the Blueprint for an AI...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nAbo...</td>\n",
       "      <td>The Blueprint for an AI Bill of Rights is inte...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>What are the key privacy protections provided ...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n...</td>\n",
       "      <td>The Privacy Act of 1974 provides privacy prote...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>How does the Fair Credit Reporting Act ensure ...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n...</td>\n",
       "      <td>The Fair Credit Reporting Act ensures that con...</td>\n",
       "      <td>simple</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Why is ongoing monitoring and review important...</td>\n",
       "      <td>[ \\n16 \\nGOVERN 1.5: Ongoing monitoring and pe...</td>\n",
       "      <td>Ongoing monitoring and periodic review of the ...</td>\n",
       "      <td>multi_context</td>\n",
       "      <td>[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>What criteria should automated systems in sens...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCO...</td>\n",
       "      <td>Automated systems used within sensitive domain...</td>\n",
       "      <td>multi_context</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>How can stakeholders prevent algorithmic discr...</td>\n",
       "      <td>[ ­­­­­­­\\nALGORITHMIC DISCRIMINATION Protecti...</td>\n",
       "      <td>Stakeholders can prevent algorithmic discrimin...</td>\n",
       "      <td>multi_context</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>How to evaluate GAI system outputs against ris...</td>\n",
       "      <td>[ \\n40 \\nMANAGE 1.3: Responses to the AI risks...</td>\n",
       "      <td>The answer to given question is not present in...</td>\n",
       "      <td>multi_context</td>\n",
       "      <td>[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>How can data tracking and content detection he...</td>\n",
       "      <td>[ \\n51 \\ngeneral public participants. For exam...</td>\n",
       "      <td>Data tracking and content detection can help r...</td>\n",
       "      <td>multi_context</td>\n",
       "      <td>[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>What's needed for an algorithmic impact assess...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\n \\nWHAT SHOULD BE EXPECTED ...</td>\n",
       "      <td>The answer to given question is not present in...</td>\n",
       "      <td>multi_context</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>What details must California warehouse employe...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n...</td>\n",
       "      <td>Warehousing employers in California that use q...</td>\n",
       "      <td>multi_context</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>What measures are needed for handling GAI syst...</td>\n",
       "      <td>[ \\n42 \\nMG-2.4-002 \\nEstablish and maintain p...</td>\n",
       "      <td>Establish and maintain procedures for escalati...</td>\n",
       "      <td>multi_context</td>\n",
       "      <td>[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>How can privacy be ensured in automated system...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\nDATA PRIVACY \\nWHAT SHOULD ...</td>\n",
       "      <td>Privacy in automated system design can be ensu...</td>\n",
       "      <td>reasoning</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>How can algorithmic discrimination be prevente...</td>\n",
       "      <td>[ \\n \\n \\n \\n \\n \\nDATA PRIVACY \\nWHAT SHOULD ...</td>\n",
       "      <td>Algorithmic discrimination in surveillance sys...</td>\n",
       "      <td>reasoning</td>\n",
       "      <td>[{'source': 'data/Blueprint-for-an-AI-Bill-of-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             question  \\\n",
       "0   How do language models contribute to the reduc...   \n",
       "1   What should be provided in terms of notice and...   \n",
       "2   How can designers, developers, and deployers o...   \n",
       "3   What benefits have been publicly described by ...   \n",
       "4   What is the purpose of AI Red-teaming in testi...   \n",
       "5   What is the importance of training and assessm...   \n",
       "6   How do advertisement delivery systems reinforc...   \n",
       "7   What is the purpose of the Blueprint for an AI...   \n",
       "8   What are the key privacy protections provided ...   \n",
       "9   How does the Fair Credit Reporting Act ensure ...   \n",
       "10  Why is ongoing monitoring and review important...   \n",
       "11  What criteria should automated systems in sens...   \n",
       "12  How can stakeholders prevent algorithmic discr...   \n",
       "13  How to evaluate GAI system outputs against ris...   \n",
       "14  How can data tracking and content detection he...   \n",
       "15  What's needed for an algorithmic impact assess...   \n",
       "16  What details must California warehouse employe...   \n",
       "17  What measures are needed for handling GAI syst...   \n",
       "18  How can privacy be ensured in automated system...   \n",
       "19  How can algorithmic discrimination be prevente...   \n",
       "\n",
       "                                             contexts  \\\n",
       "0   [ \\n57 \\nNational Institute of Standards and T...   \n",
       "1   [ \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCONSI...   \n",
       "2   [ ­­­­­­­\\nALGORITHMIC DISCRIMINATION Protecti...   \n",
       "3   [ \\nENDNOTES\\n12. Expectations about reporting...   \n",
       "4   [ \\n49 \\nearly lifecycle TEVV approaches are d...   \n",
       "5   [ \\n \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCO...   \n",
       "6   [ \\n \\n  \\nWHY THIS PRINCIPLE IS IMPORTANT\\nTh...   \n",
       "7   [ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nAbo...   \n",
       "8   [ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n...   \n",
       "9   [ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n...   \n",
       "10  [ \\n16 \\nGOVERN 1.5: Ongoing monitoring and pe...   \n",
       "11  [ \\n \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCO...   \n",
       "12  [ ­­­­­­­\\nALGORITHMIC DISCRIMINATION Protecti...   \n",
       "13  [ \\n40 \\nMANAGE 1.3: Responses to the AI risks...   \n",
       "14  [ \\n51 \\ngeneral public participants. For exam...   \n",
       "15  [ \\n \\n \\n \\n \\n \\n \\nWHAT SHOULD BE EXPECTED ...   \n",
       "16  [ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n...   \n",
       "17  [ \\n42 \\nMG-2.4-002 \\nEstablish and maintain p...   \n",
       "18  [ \\n \\n \\n \\n \\n \\nDATA PRIVACY \\nWHAT SHOULD ...   \n",
       "19  [ \\n \\n \\n \\n \\n \\nDATA PRIVACY \\nWHAT SHOULD ...   \n",
       "\n",
       "                                         ground_truth evolution_type  \\\n",
       "0   The answer to given question is not present in...         simple   \n",
       "1   Those impacted by an automated system should b...         simple   \n",
       "2   Designers, developers, and deployers of automa...         simple   \n",
       "3   The benefits of 'traffic calming' measures hav...         simple   \n",
       "4   AI Red-teaming is a structured testing exercis...         simple   \n",
       "5   Training and assessment are crucial in ensurin...         simple   \n",
       "6   Advertisement delivery systems reinforce racia...         simple   \n",
       "7   The Blueprint for an AI Bill of Rights is inte...         simple   \n",
       "8   The Privacy Act of 1974 provides privacy prote...         simple   \n",
       "9   The Fair Credit Reporting Act ensures that con...         simple   \n",
       "10  Ongoing monitoring and periodic review of the ...  multi_context   \n",
       "11  Automated systems used within sensitive domain...  multi_context   \n",
       "12  Stakeholders can prevent algorithmic discrimin...  multi_context   \n",
       "13  The answer to given question is not present in...  multi_context   \n",
       "14  Data tracking and content detection can help r...  multi_context   \n",
       "15  The answer to given question is not present in...  multi_context   \n",
       "16  Warehousing employers in California that use q...  multi_context   \n",
       "17  Establish and maintain procedures for escalati...  multi_context   \n",
       "18  Privacy in automated system design can be ensu...      reasoning   \n",
       "19  Algorithmic discrimination in surveillance sys...      reasoning   \n",
       "\n",
       "                                             metadata  episode_done  \n",
       "0   [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...          True  \n",
       "1   [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "2   [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "3   [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "4   [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...          True  \n",
       "5   [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "6   [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "7   [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "8   [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "9   [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "10  [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...          True  \n",
       "11  [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "12  [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "13  [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...          True  \n",
       "14  [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...          True  \n",
       "15  [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "16  [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "17  [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...          True  \n",
       "18  [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  \n",
       "19  [{'source': 'data/Blueprint-for-an-AI-Bill-of-...          True  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_community.document_loaders import PyMuPDFLoader\n",
    "from ragas.testset.generator import TestsetGenerator\n",
    "from ragas.testset.evolutions import simple, reasoning, multi_context\n",
    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
    "\n",
    "\n",
    "filepath_NIST = \"data/NIST.AI.600-1.pdf\"\n",
    "filepath_Blueprint = \"data/Blueprint-for-an-AI-Bill-of-Rights.pdf\"\n",
    "\n",
    "documents_NIST = PyMuPDFLoader(filepath_NIST).load()\n",
    "documents_Blueprint = PyMuPDFLoader(filepath_Blueprint).load()\n",
    "documents = documents_NIST + documents_Blueprint\n",
    "\n",
    "generator_llm = ChatOpenAI(model=\"gpt-3.5-turbo\")\n",
    "critic_llm = ChatOpenAI(model=\"gpt-4o-mini\", tags=[\"base_llm\"]) \n",
    "embeddings = OpenAIEmbeddings()\n",
    "\n",
    "generator = TestsetGenerator.from_langchain(\n",
    "    generator_llm,\n",
    "    critic_llm,\n",
    "    embeddings\n",
    ")\n",
    "\n",
    "distributions = {\n",
    "    simple: 0.5,\n",
    "    multi_context: 0.4,\n",
    "    reasoning: 0.1\n",
    "}\n",
    "\n",
    "testset = generator.generate_with_langchain_docs(documents, 20, distributions, with_debugging_logs=True)\n",
    "testset.to_pandas()"
   ]
  },
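   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Synthetic generation is slow and token-hungry, so it is worth persisting the testset once it exists rather than regenerating it on every run. A minimal sketch (the `synthetic_testset.csv` filename is an arbitrary choice, not part of the original run; note that list-valued columns like `contexts` get stringified by CSV):"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Cache the generated testset to disk so the expensive generation step\n",
     "# above does not need to be repeated. Filename is arbitrary.\n",
     "testset.to_pandas().to_csv(\"synthetic_testset.csv\", index=False)"
    ]
   },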
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "from langsmith import Client\n",
    "from datasets import Dataset\n",
    "\n",
    "\n",
    "client = Client()\n",
    "\n",
    "dataset_name = \"Implications of AI\"\n",
    "\n",
    "dataset = client.create_dataset(\n",
    "    dataset_name=dataset_name,\n",
    "    description=\"Questions about the implications of AI\"\n",
    ")"
   ]
  },
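   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "`create_dataset` typically errors if a dataset with the same name already exists, which makes re-running the notebook awkward. A hedged guard, assuming the `has_dataset`/`read_dataset` helpers available on the LangSmith client:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Reuse the existing dataset on re-runs instead of erroring out.\n",
     "if client.has_dataset(dataset_name=dataset_name):\n",
     "    dataset = client.read_dataset(dataset_name=dataset_name)\n",
     "else:\n",
     "    dataset = client.create_dataset(\n",
     "        dataset_name=dataset_name,\n",
     "        description=\"Questions about the implications of AI\"\n",
     "    )"
    ]
   },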
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "for test in testset.to_pandas().iterrows():\n",
    "  client.create_example(\n",
    "      inputs={\n",
    "          \"question\": test[1][\"question\"]\n",
    "      },\n",
    "      outputs={\n",
    "          \"answer\": test[1][\"ground_truth\"]\n",
    "      },\n",
    "      metadata={\n",
    "          \"context\": test[0]\n",
    "      },\n",
    "      dataset_id=dataset.id\n",
    "  )\n",
    "\n"
   ]
  },
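   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "The loop above makes one API call per example; LangSmith also exposes a bulk `create_examples` that uploads the whole testset in a single request. An equivalent sketch:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Bulk alternative to the per-row loop above: one request for all rows.\n",
     "test_df = testset.to_pandas()\n",
     "client.create_examples(\n",
     "    inputs=[{\"question\": q} for q in test_df[\"question\"]],\n",
     "    outputs=[{\"answer\": a} for a in test_df[\"ground_truth\"]],\n",
     "    dataset_id=dataset.id\n",
     ")"
    ]
   },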
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "How do language models contribute to the reduction of content diversity in writing?\n",
      "The answer to given question is not present in context\n"
     ]
    }
   ],
   "source": [
    "test_questions = testset.to_pandas()[\"question\"].values.tolist()\n",
    "test_groundtruths = testset.to_pandas()[\"ground_truth\"].values.tolist()\n",
    "\n",
    "print(test_questions[0])\n",
    "print(test_groundtruths[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'response': AIMessage(content=\"I don't know.\", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 1238, 'total_tokens': 1242, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_2d87079ca9', 'finish_reason': 'stop', 'logprobs': None}, id='run-6db82f54-ddff-4079-b8a4-dd0dbe43a358-0', usage_metadata={'input_tokens': 1238, 'output_tokens': 4, 'total_tokens': 1242}), 'context': [Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 6, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '09678965559446c18cbfacbdc1e2979f', '_collection_name': 'Implications of AI'}, page_content='3 \\nthe abuse, misuse, and unsafe repurposing by humans (adversarial or not), and others result \\nfrom interactions between a human and an AI system.  \\n• \\nTime scale: GAI risks may materialize abruptly or across extended periods. Examples include \\nimmediate (and/or prolonged) emotional harm and potential risks to physical safety due to the \\ndistribution of harmful deepfake images, or the long-term effect of disinformation on societal \\ntrust in public institutions.'), Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 6, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': 'ed31388e1ee34066941bc3339ce0309e', '_collection_name': 'Implications of AI'}, page_content='Scientific Report on the Safety of Advanced AI, could be: 1) Technical / Model risks (or risk from malfunction): \\nConfabulation; Dangerous or Violent Recommendations; Data Privacy; Value Chain and Component Integration; \\nHarmful Bias, and Homogenization; 2) Misuse by humans (or malicious use): CBRN Information or Capabilities; \\nData Privacy; Human-AI Configuration; Obscene, Degrading, and/or Abusive Content; Information Integrity;'), Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 58, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '771251b765e44f43b849dc70ed71ec1b', '_collection_name': 'Implications of AI'}, page_content='55 \\nDe Angelo, D. (2024) Short, Mid and Long-Term Impacts of AI in Cybersecurity. Palo Alto Networks. \\nhttps://www.paloaltonetworks.com/blog/2024/02/impacts-of-ai-in-cybersecurity/ \\nDe Freitas, J. et al. 
(2023) Chatbots and Mental Health: Insights into the Safety of Generative AI. Harvard \\nBusiness School. https://www.hbs.edu/ris/Publication%20Files/23-011_c1bdd417-f717-47b6-bccb-\\n5438c6e65c1a_f6fd9798-3c2d-4932-b222-056231fe69d7.pdf'), Document(metadata={'source': 'data/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'data/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 3, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': '4f4a06bdfdb44c28ae0299187309082f', '_collection_name': 'Implications of AI'}, page_content='discussions include that AI has transformative potential to improve Americans’ lives, and that preventing the \\nharms of these technologies is both necessary and achievable. The Appendix includes a full list of public engage-\\nments. \\n4')]}\n"
     ]
    }
   ],
   "source": [
    "from langchain_openai import ChatOpenAI\n",
    "from operator import itemgetter\n",
    "from langchain_core.runnables import RunnablePassthrough, RunnableParallel\n",
    "from langchain.schema import StrOutputParser\n",
    "from langchain_core.runnables import RunnablePassthrough\n",
    "\n",
    "llm = ChatOpenAI(model=\"gpt-4o-mini\", tags=[\"base_llm\"]) \n",
    "\n",
    "rag_chain = (\n",
    "    {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n",
    "    | rag_prompt | llm | StrOutputParser()\n",
    ")\n",
    "\n",
    "retrieval_augmented_qa_chain = (\n",
    "    # INVOKE CHAIN WITH: {\"question\" : \"<<SOME USER QUESTION>>\"}\n",
    "    # \"question\" : populated by getting the value of the \"question\" key\n",
    "    # \"context\"  : populated by getting the value of the \"question\" key and chaining it into the base_retriever\n",
    "    {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n",
    "    # \"context\"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)\n",
    "    #              by getting the value of the \"context\" key from the previous step\n",
    "    | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
    "    # \"response\" : the \"context\" and \"question\" values are used to format our prompt object and then piped\n",
    "    #              into the LLM and stored in a key called \"response\"\n",
    "    # \"context\"  : populated by getting the value of the \"context\" key from the previous step\n",
    "    | {\"response\": rag_prompt | llm, \"context\": itemgetter(\"context\")}\n",
    ")\n",
    "\n",
    "result = retrieval_augmented_qa_chain.invoke({\"question\" : \"Is AI a threat to humanity?\"})\n",
    "print(result)"
   ]
  },
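   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "For comparison, the simpler `rag_chain` returns a plain string rather than a `{response, context}` dict, which is handy for quick interactive checks but insufficient for the Ragas evaluation below:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# The string-only chain: no retrieved documents in the output.\n",
     "print(rag_chain.invoke({\"question\": \"Is AI a threat to humanity?\"}))"
    ]
   },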
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "answers = []\n",
    "contexts = []\n",
    "\n",
    "for question in test_questions:\n",
    "  response = retrieval_augmented_qa_chain.invoke({\"question\" : question})\n",
    "  answers.append(response[\"response\"].content)\n",
    "  contexts.append([context.page_content for context in response[\"context\"]])"
   ]
  },
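   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Twenty sequential chain invocations can intermittently hit rate limits or transient API errors. A minimal retry helper (plain stdlib; the function name and delays are my own, not part of the original run) that could wrap the `invoke` call above:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import time\n",
     "\n",
     "def invoke_with_retry(chain, payload, retries=3, base_delay=5):\n",
     "  # Retry transient failures with linear backoff; re-raise on the last attempt.\n",
     "  for attempt in range(retries):\n",
     "    try:\n",
     "      return chain.invoke(payload)\n",
     "    except Exception:\n",
     "      if attempt == retries - 1:\n",
     "        raise\n",
     "      time.sleep(base_delay * (attempt + 1))"
    ]
   },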
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'How do language models contribute to the reduction of content diversity in writing?',\n",
       " 'answer': 'Language models can contribute to the reduction of content diversity in writing by producing overly homogenized outputs, which can be incorrect or lead to unreliable decision-making and amplify harmful biases. This phenomenon can flow from foundation models to downstream models and systems, with the foundation models acting as “bottlenecks” or single points of failure. Overly homogenized content can also contribute to what is referred to as “model collapse.”',\n",
       " 'contexts': ['https://doi.org/10.1787/2448f04b-en \\nOECD (2024) \"Defining AI incidents and related terms\" OECD Artificial Intelligence Papers, No. 16, OECD \\nPublishing, Paris. https://doi.org/10.1787/d1a8d965-en \\nOpenAI (2023) GPT-4 System Card. https://cdn.openai.com/papers/gpt-4-system-card.pdf \\nOpenAI (2024) GPT-4 Technical Report. https://arxiv.org/pdf/2303.08774 \\nPadmakumar, V. et al. (2024) Does writing with language models reduce content diversity? ICLR. \\nhttps://arxiv.org/pdf/2309.05196',\n",
       "  '9 \\nand reduced content diversity). Overly homogenized outputs can themselves be incorrect, or they may \\nlead to unreliable decision-making or amplify harmful biases. These phenomena can flow from \\nfoundation models to downstream models and systems, with the foundation models acting as \\n“bottlenecks,” or single points of failure.  \\nOverly homogenized content can contribute to “model collapse.” Model collapse can occur when model',\n",
       "  'https://arxiv.org/pdf/2310.11986 \\nWeidinger, L. et al. (2022) Taxonomy of Risks posed by Language Models. FAccT ’22. \\nhttps://dl.acm.org/doi/pdf/10.1145/3531146.3533088 \\nWest, D. (2023) AI poses disproportionate risks to women. Brookings. \\nhttps://www.brookings.edu/articles/ai-poses-disproportionate-risks-to-women/ \\nWu, K. et al. (2024) How well do LLMs cite relevant medical references? An evaluation framework and \\nanalyses. arXiv. https://arxiv.org/pdf/2402.02008',\n",
       "  'Shumailov, I. et al. (2023) The curse of recursion: training on generated data makes models forget. arXiv. \\nhttps://arxiv.org/pdf/2305.17493v2 \\nSmith, A. et al. (2023) Hallucination or Confabulation? Neuroanatomy as metaphor in Large Language \\nModels. PLOS Digital Health. \\nhttps://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000388 \\nSoice, E. et al. (2023) Can large language models democratize access to dual-use biotechnology? arXiv. \\nhttps://arxiv.org/abs/2306.03809'],\n",
       " 'ground_truth': 'The answer to given question is not present in context'}"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "response_dataset = Dataset.from_dict({\n",
    "    \"question\" : test_questions,\n",
    "    \"answer\" : answers,\n",
    "    \"contexts\" : contexts,\n",
    "    \"ground_truth\" : test_groundtruths\n",
    "})\n",
    "response_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "\n",
    "# text_splitter = RecursiveCharacterTextSplitter(\n",
    "#     chunk_size = 500,\n",
    "#     chunk_overlap = 50\n",
    "# )\n",
    "\n",
    "# rag_documents = text_splitter.split_documents(rag_documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ragas import evaluate\n",
    "from ragas.metrics import (\n",
    "    faithfulness,\n",
    "    answer_relevancy,\n",
    "    answer_correctness,\n",
    "    context_recall,\n",
    "    context_precision,\n",
    ")\n",
    "\n",
    "metrics = [\n",
    "    faithfulness,\n",
    "    answer_relevancy,\n",
    "    context_recall,\n",
    "    context_precision,\n",
    "    answer_correctness,\n",
    "]"
   ]
  },
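   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Roughly, what each metric measures: `faithfulness` checks that claims in the answer are supported by the retrieved context; `answer_relevancy` checks that the answer actually addresses the question; `context_recall` checks how much of the ground truth is covered by the retrieved context; `context_precision` checks that the relevant chunks are ranked near the top of the retrieval; and `answer_correctness` compares the answer against the ground truth directly."
    ]
   },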
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1c87e1fd34114cc9802a54a9ab8e1ca1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "results = evaluate(response_dataset, metrics)"
   ]
  },
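   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Besides the per-question table below, a quick aggregate view averages each metric across all samples. A sketch using the metrics' `name` attribute:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Mean score per metric across the whole testset.\n",
     "results_df = results.to_pandas()\n",
     "print(results_df[[m.name for m in metrics]].mean())"
    ]
   },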
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>contexts</th>\n",
       "      <th>answer</th>\n",
       "      <th>ground_truth</th>\n",
       "      <th>faithfulness</th>\n",
       "      <th>answer_relevancy</th>\n",
       "      <th>context_recall</th>\n",
       "      <th>context_precision</th>\n",
       "      <th>answer_correctness</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>How do language models contribute to the reduc...</td>\n",
       "      <td>[https://doi.org/10.1787/2448f04b-en \\nOECD (2...</td>\n",
       "      <td>Language models can contribute to the reductio...</td>\n",
       "      <td>The answer to given question is not present in...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.967219</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.178897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>What should be provided in terms of notice and...</td>\n",
       "      <td>[alternative, where appropriate \\nBrief, clear...</td>\n",
       "      <td>There should be a brief, clear notice that ind...</td>\n",
       "      <td>Those impacted by an automated system should b...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.958077</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.952916</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>How can designers, developers, and deployers o...</td>\n",
       "      <td>[systems have the capacity to drive extraordin...</td>\n",
       "      <td>Designers, developers, and deployers of automa...</td>\n",
       "      <td>Designers, developers, and deployers of automa...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.945499</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.770302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>What benefits have been publicly described by ...</td>\n",
       "      <td>[15. See, e.g., Charles Pruitt. People Doing W...</td>\n",
       "      <td>I don't know.</td>\n",
       "      <td>The benefits of 'traffic calming' measures hav...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.181544</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>What is the purpose of AI Red-teaming in testi...</td>\n",
       "      <td>[sense of AI-generated information, and subseq...</td>\n",
       "      <td>The purpose of AI Red-teaming in testing AI sy...</td>\n",
       "      <td>AI Red-teaming is a structured testing exercis...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.916667</td>\n",
       "      <td>0.817249</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>What is the importance of training and assessm...</td>\n",
       "      <td>[Training and assessment. Anyone administering...</td>\n",
       "      <td>Training and assessment are important in ensur...</td>\n",
       "      <td>Training and assessment are crucial in ensurin...</td>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.980111</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.862464</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>How do advertisement delivery systems reinforc...</td>\n",
       "      <td>[ering ads in ways that reinforce racial and g...</td>\n",
       "      <td>Advertisement delivery systems reinforce racia...</td>\n",
       "      <td>Advertisement delivery systems reinforce racia...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.846395</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>What is the purpose of the Blueprint for an AI...</td>\n",
       "      <td>[Examples of automated systems for which the B...</td>\n",
       "      <td>The purpose of the Blueprint for an AI Bill of...</td>\n",
       "      <td>The Blueprint for an AI Bill of Rights is inte...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.974553</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.993355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>What are the key privacy protections provided ...</td>\n",
       "      <td>[records systems, including limits on data ret...</td>\n",
       "      <td>The key privacy protections provided by the Pr...</td>\n",
       "      <td>The Privacy Act of 1974 provides privacy prote...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.845967</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>How does the Fair Credit Reporting Act ensure ...</td>\n",
       "      <td>[beyond simple notice to include reporting ele...</td>\n",
       "      <td>The Fair Credit Reporting Act ensures that con...</td>\n",
       "      <td>The Fair Credit Reporting Act ensures that con...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.915813</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.620742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Why is ongoing monitoring and review important...</td>\n",
       "      <td>[16 \\nGOVERN 1.5: Ongoing monitoring and perio...</td>\n",
       "      <td>Ongoing monitoring and periodic review are imp...</td>\n",
       "      <td>Ongoing monitoring and periodic review of the ...</td>\n",
       "      <td>0.687500</td>\n",
       "      <td>0.953297</td>\n",
       "      <td>0.60</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.614902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>What criteria should automated systems in sens...</td>\n",
       "      <td>[HUMAN ALTERNATIVES, \\nCONSIDERATION, AND \\nFA...</td>\n",
       "      <td>Automated systems in sensitive domains should ...</td>\n",
       "      <td>Automated systems used within sensitive domain...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.929181</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.979904</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>How can stakeholders prevent algorithmic discr...</td>\n",
       "      <td>[systems have the capacity to drive extraordin...</td>\n",
       "      <td>Stakeholders can prevent algorithmic discrimin...</td>\n",
       "      <td>Stakeholders can prevent algorithmic discrimin...</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.831405</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>How to evaluate GAI system outputs against ris...</td>\n",
       "      <td>[GAI resources; Apply organizational risk tole...</td>\n",
       "      <td>To evaluate GAI system outputs against risk to...</td>\n",
       "      <td>The answer to given question is not present in...</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.967755</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.176636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>How can data tracking and content detection he...</td>\n",
       "      <td>[assessments, and alerting, dynamic risk asses...</td>\n",
       "      <td>I don't know.</td>\n",
       "      <td>Data tracking and content detection can help r...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.50</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.182611</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>What's needed for an algorithmic impact assess...</td>\n",
       "      <td>[consultation, design stage equity assessments...</td>\n",
       "      <td>An algorithmic impact assessment for automated...</td>\n",
       "      <td>The answer to given question is not present in...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.985743</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.173255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>What details must California warehouse employe...</td>\n",
       "      <td>[tion about quotas, potentially facilitated by...</td>\n",
       "      <td>California warehouse employers are required to...</td>\n",
       "      <td>Warehousing employers in California that use q...</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.945762</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.805556</td>\n",
       "      <td>0.541300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>What measures are needed for handling GAI syst...</td>\n",
       "      <td>[17 \\nGOVERN 1.7: Processes and procedures are...</td>\n",
       "      <td>For handling GAI system incidents and decommis...</td>\n",
       "      <td>Establish and maintain procedures for escalati...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.50</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.755853</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>How can privacy be ensured in automated system...</td>\n",
       "      <td>[Protect privacy by design and by default \\nPr...</td>\n",
       "      <td>Privacy can be ensured in automated system des...</td>\n",
       "      <td>Privacy in automated system design can be ensu...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.40</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.840599</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>How can algorithmic discrimination be prevente...</td>\n",
       "      <td>[systems have the capacity to drive extraordin...</td>\n",
       "      <td>Algorithmic discrimination can be prevented in...</td>\n",
       "      <td>Algorithmic discrimination in surveillance sys...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.700298</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             question  \\\n",
       "0   How do language models contribute to the reduc...   \n",
       "1   What should be provided in terms of notice and...   \n",
       "2   How can designers, developers, and deployers o...   \n",
       "3   What benefits have been publicly described by ...   \n",
       "4   What is the purpose of AI Red-teaming in testi...   \n",
       "5   What is the importance of training and assessm...   \n",
       "6   How do advertisement delivery systems reinforc...   \n",
       "7   What is the purpose of the Blueprint for an AI...   \n",
       "8   What are the key privacy protections provided ...   \n",
       "9   How does the Fair Credit Reporting Act ensure ...   \n",
       "10  Why is ongoing monitoring and review important...   \n",
       "11  What criteria should automated systems in sens...   \n",
       "12  How can stakeholders prevent algorithmic discr...   \n",
       "13  How to evaluate GAI system outputs against ris...   \n",
       "14  How can data tracking and content detection he...   \n",
       "15  What's needed for an algorithmic impact assess...   \n",
       "16  What details must California warehouse employe...   \n",
       "17  What measures are needed for handling GAI syst...   \n",
       "18  How can privacy be ensured in automated system...   \n",
       "19  How can algorithmic discrimination be prevente...   \n",
       "\n",
       "                                             contexts  \\\n",
       "0   [https://doi.org/10.1787/2448f04b-en \\nOECD (2...   \n",
       "1   [alternative, where appropriate \\nBrief, clear...   \n",
       "2   [systems have the capacity to drive extraordin...   \n",
       "3   [15. See, e.g., Charles Pruitt. People Doing W...   \n",
       "4   [sense of AI-generated information, and subseq...   \n",
       "5   [Training and assessment. Anyone administering...   \n",
       "6   [ering ads in ways that reinforce racial and g...   \n",
       "7   [Examples of automated systems for which the B...   \n",
       "8   [records systems, including limits on data ret...   \n",
       "9   [beyond simple notice to include reporting ele...   \n",
       "10  [16 \\nGOVERN 1.5: Ongoing monitoring and perio...   \n",
       "11  [HUMAN ALTERNATIVES, \\nCONSIDERATION, AND \\nFA...   \n",
       "12  [systems have the capacity to drive extraordin...   \n",
       "13  [GAI resources; Apply organizational risk tole...   \n",
       "14  [assessments, and alerting, dynamic risk asses...   \n",
       "15  [consultation, design stage equity assessments...   \n",
       "16  [tion about quotas, potentially facilitated by...   \n",
       "17  [17 \\nGOVERN 1.7: Processes and procedures are...   \n",
       "18  [Protect privacy by design and by default \\nPr...   \n",
       "19  [systems have the capacity to drive extraordin...   \n",
       "\n",
       "                                               answer  \\\n",
       "0   Language models can contribute to the reductio...   \n",
       "1   There should be a brief, clear notice that ind...   \n",
       "2   Designers, developers, and deployers of automa...   \n",
       "3                                       I don't know.   \n",
       "4   The purpose of AI Red-teaming in testing AI sy...   \n",
       "5   Training and assessment are important in ensur...   \n",
       "6   Advertisement delivery systems reinforce racia...   \n",
       "7   The purpose of the Blueprint for an AI Bill of...   \n",
       "8   The key privacy protections provided by the Pr...   \n",
       "9   The Fair Credit Reporting Act ensures that con...   \n",
       "10  Ongoing monitoring and periodic review are imp...   \n",
       "11  Automated systems in sensitive domains should ...   \n",
       "12  Stakeholders can prevent algorithmic discrimin...   \n",
       "13  To evaluate GAI system outputs against risk to...   \n",
       "14                                      I don't know.   \n",
       "15  An algorithmic impact assessment for automated...   \n",
       "16  California warehouse employers are required to...   \n",
       "17  For handling GAI system incidents and decommis...   \n",
       "18  Privacy can be ensured in automated system des...   \n",
       "19  Algorithmic discrimination can be prevented in...   \n",
       "\n",
       "                                         ground_truth  faithfulness  \\\n",
       "0   The answer to given question is not present in...      1.000000   \n",
       "1   Those impacted by an automated system should b...      1.000000   \n",
       "2   Designers, developers, and deployers of automa...      1.000000   \n",
       "3   The benefits of 'traffic calming' measures hav...      0.000000   \n",
       "4   AI Red-teaming is a structured testing exercis...      1.000000   \n",
       "5   Training and assessment are crucial in ensurin...      0.750000   \n",
       "6   Advertisement delivery systems reinforce racia...      1.000000   \n",
       "7   The Blueprint for an AI Bill of Rights is inte...      1.000000   \n",
       "8   The Privacy Act of 1974 provides privacy prote...      1.000000   \n",
       "9   The Fair Credit Reporting Act ensures that con...      1.000000   \n",
       "10  Ongoing monitoring and periodic review of the ...      0.687500   \n",
       "11  Automated systems used within sensitive domain...      1.000000   \n",
       "12  Stakeholders can prevent algorithmic discrimin...      0.857143   \n",
       "13  The answer to given question is not present in...      0.888889   \n",
       "14  Data tracking and content detection can help r...      0.000000   \n",
       "15  The answer to given question is not present in...      1.000000   \n",
       "16  Warehousing employers in California that use q...      0.200000   \n",
       "17  Establish and maintain procedures for escalati...      1.000000   \n",
       "18  Privacy in automated system design can be ensu...      1.000000   \n",
       "19  Algorithmic discrimination in surveillance sys...      1.000000   \n",
       "\n",
       "    answer_relevancy  context_recall  context_precision  answer_correctness  \n",
       "0           0.967219            1.00           0.000000            0.178897  \n",
       "1           0.958077            1.00           1.000000            0.952916  \n",
       "2           0.945499            1.00           1.000000            0.770302  \n",
       "3           0.000000            1.00           1.000000            0.181544  \n",
       "4           1.000000            1.00           0.916667            0.817249  \n",
       "5           0.980111            0.75           1.000000            0.862464  \n",
       "6           1.000000            1.00           1.000000            0.846395  \n",
       "7           0.974553            1.00           1.000000            0.993355  \n",
       "8           1.000000            1.00           1.000000            0.845967  \n",
       "9           0.915813            1.00           1.000000            0.620742  \n",
       "10          0.953297            0.60           1.000000            0.614902  \n",
       "11          0.929181            0.00           1.000000            0.979904  \n",
       "12          1.000000            1.00           1.000000            0.831405  \n",
       "13          0.967755            1.00           0.000000            0.176636  \n",
       "14          0.000000            0.50           1.000000            0.182611  \n",
       "15          0.985743            1.00           0.000000            0.173255  \n",
       "16          0.945762            0.50           0.805556            0.541300  \n",
       "17          1.000000            0.50           1.000000            0.755853  \n",
       "18          1.000000            0.40           1.000000            0.840599  \n",
       "19          1.000000            1.00           1.000000            0.700298  "
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results_df = results.to_pandas()\n",
    "results_df"
   ]
  },
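  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check on the evaluation: the next cell averages each RAGAS metric across the samples and surfaces the rows with the lowest `answer_correctness` for manual review. This is a minimal sketch assuming the metric columns shown in `results_df` above; the 0.5 threshold is an illustrative choice, not part of the original run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metric columns produced by the RAGAS evaluation above.\n",
    "metric_cols = [\n",
    "    \"faithfulness\",\n",
    "    \"answer_relevancy\",\n",
    "    \"context_recall\",\n",
    "    \"context_precision\",\n",
    "    \"answer_correctness\",\n",
    "]\n",
    "\n",
    "# Average each metric across the evaluated samples.\n",
    "print(results_df[metric_cols].mean())\n",
    "\n",
    "# Surface the weakest answers for manual inspection\n",
    "# (0.5 is an assumed threshold, not from the original run).\n",
    "results_df.loc[results_df[\"answer_correctness\"] < 0.5, [\"question\", \"answer\", \"ground_truth\"]]"
   ]
  },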
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# eval_llm = ChatOpenAI(model=\"gpt-4o-mini\", tags=[\"base_llm\"]) "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llmops-course",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}