josipabebic commited on
Commit
9e6d1e9
·
verified ·
1 Parent(s): 573e720

Delete CroSlo code.ipynb

Browse files
Files changed (1) hide show
  1. CroSlo code.ipynb +0 -825
CroSlo code.ipynb DELETED
@@ -1,825 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "36ee7edb",
7
- "metadata": {},
8
- "outputs": [
9
- {
10
- "name": "stdout",
11
- "output_type": "stream",
12
- "text": [
13
- "\n",
14
- "\n",
15
- "=== Treniranje i evaluacija za trening skup: train_combined ===\n",
16
- "\n",
17
- "--- Fine-tuning model: EMBEDDIA/crosloengual-bert ---\n"
18
- ]
19
- },
20
- {
21
- "name": "stderr",
22
- "output_type": "stream",
23
- "text": [
24
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/crosloengual-bert and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
25
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
26
- ]
27
- },
28
- {
29
- "data": {
30
- "application/vnd.jupyter.widget-view+json": {
31
- "model_id": "8c4ee5202c46457ab2c37d2f8e6a67ae",
32
- "version_major": 2,
33
- "version_minor": 0
34
- },
35
- "text/plain": [
36
- "Map: 0%| | 0/7577 [00:00<?, ? examples/s]"
37
- ]
38
- },
39
- "metadata": {},
40
- "output_type": "display_data"
41
- },
42
- {
43
- "name": "stderr",
44
- "output_type": "stream",
45
- "text": [
46
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
47
- " warnings.warn(warn_msg)\n"
48
- ]
49
- },
50
- {
51
- "data": {
52
- "text/html": [
53
- "\n",
54
- " <div>\n",
55
- " \n",
56
- " <progress value='1422' max='1422' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
57
- " [1422/1422 1:27:47, Epoch 3/3]\n",
58
- " </div>\n",
59
- " <table border=\"1\" class=\"dataframe\">\n",
60
- " <thead>\n",
61
- " <tr style=\"text-align: left;\">\n",
62
- " <th>Step</th>\n",
63
- " <th>Training Loss</th>\n",
64
- " </tr>\n",
65
- " </thead>\n",
66
- " <tbody>\n",
67
- " <tr>\n",
68
- " <td>50</td>\n",
69
- " <td>0.855500</td>\n",
70
- " </tr>\n",
71
- " <tr>\n",
72
- " <td>100</td>\n",
73
- " <td>0.748700</td>\n",
74
- " </tr>\n",
75
- " <tr>\n",
76
- " <td>150</td>\n",
77
- " <td>0.619600</td>\n",
78
- " </tr>\n",
79
- " <tr>\n",
80
- " <td>200</td>\n",
81
- " <td>0.618300</td>\n",
82
- " </tr>\n",
83
- " <tr>\n",
84
- " <td>250</td>\n",
85
- " <td>0.630800</td>\n",
86
- " </tr>\n",
87
- " <tr>\n",
88
- " <td>300</td>\n",
89
- " <td>0.639400</td>\n",
90
- " </tr>\n",
91
- " <tr>\n",
92
- " <td>350</td>\n",
93
- " <td>0.636500</td>\n",
94
- " </tr>\n",
95
- " <tr>\n",
96
- " <td>400</td>\n",
97
- " <td>0.595900</td>\n",
98
- " </tr>\n",
99
- " <tr>\n",
100
- " <td>450</td>\n",
101
- " <td>0.598500</td>\n",
102
- " </tr>\n",
103
- " <tr>\n",
104
- " <td>500</td>\n",
105
- " <td>0.464200</td>\n",
106
- " </tr>\n",
107
- " <tr>\n",
108
- " <td>550</td>\n",
109
- " <td>0.430400</td>\n",
110
- " </tr>\n",
111
- " <tr>\n",
112
- " <td>600</td>\n",
113
- " <td>0.456200</td>\n",
114
- " </tr>\n",
115
- " <tr>\n",
116
- " <td>650</td>\n",
117
- " <td>0.461900</td>\n",
118
- " </tr>\n",
119
- " <tr>\n",
120
- " <td>700</td>\n",
121
- " <td>0.459500</td>\n",
122
- " </tr>\n",
123
- " <tr>\n",
124
- " <td>750</td>\n",
125
- " <td>0.419300</td>\n",
126
- " </tr>\n",
127
- " <tr>\n",
128
- " <td>800</td>\n",
129
- " <td>0.469700</td>\n",
130
- " </tr>\n",
131
- " <tr>\n",
132
- " <td>850</td>\n",
133
- " <td>0.463700</td>\n",
134
- " </tr>\n",
135
- " <tr>\n",
136
- " <td>900</td>\n",
137
- " <td>0.411900</td>\n",
138
- " </tr>\n",
139
- " <tr>\n",
140
- " <td>950</td>\n",
141
- " <td>0.461800</td>\n",
142
- " </tr>\n",
143
- " <tr>\n",
144
- " <td>1000</td>\n",
145
- " <td>0.364100</td>\n",
146
- " </tr>\n",
147
- " <tr>\n",
148
- " <td>1050</td>\n",
149
- " <td>0.329400</td>\n",
150
- " </tr>\n",
151
- " <tr>\n",
152
- " <td>1100</td>\n",
153
- " <td>0.346800</td>\n",
154
- " </tr>\n",
155
- " <tr>\n",
156
- " <td>1150</td>\n",
157
- " <td>0.262100</td>\n",
158
- " </tr>\n",
159
- " <tr>\n",
160
- " <td>1200</td>\n",
161
- " <td>0.290200</td>\n",
162
- " </tr>\n",
163
- " <tr>\n",
164
- " <td>1250</td>\n",
165
- " <td>0.223900</td>\n",
166
- " </tr>\n",
167
- " <tr>\n",
168
- " <td>1300</td>\n",
169
- " <td>0.330000</td>\n",
170
- " </tr>\n",
171
- " <tr>\n",
172
- " <td>1350</td>\n",
173
- " <td>0.307000</td>\n",
174
- " </tr>\n",
175
- " <tr>\n",
176
- " <td>1400</td>\n",
177
- " <td>0.236200</td>\n",
178
- " </tr>\n",
179
- " </tbody>\n",
180
- "</table><p>"
181
- ],
182
- "text/plain": [
183
- "<IPython.core.display.HTML object>"
184
- ]
185
- },
186
- "metadata": {},
187
- "output_type": "display_data"
188
- },
189
- {
190
- "name": "stderr",
191
- "output_type": "stream",
192
- "text": [
193
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
194
- " warnings.warn(warn_msg)\n"
195
- ]
196
- },
197
- {
198
- "name": "stdout",
199
- "output_type": "stream",
200
- "text": [
201
- "\n",
202
- "Evaluacija na test skupu test-1\n"
203
- ]
204
- },
205
- {
206
- "data": {
207
- "application/vnd.jupyter.widget-view+json": {
208
- "model_id": "23e05a0258f045b4901a9fa9bfc7c151",
209
- "version_major": 2,
210
- "version_minor": 0
211
- },
212
- "text/plain": [
213
- "Map: 0%| | 0/653 [00:00<?, ? examples/s]"
214
- ]
215
- },
216
- "metadata": {},
217
- "output_type": "display_data"
218
- },
219
- {
220
- "name": "stderr",
221
- "output_type": "stream",
222
- "text": [
223
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
224
- " warnings.warn(warn_msg)\n"
225
- ]
226
- },
227
- {
228
- "data": {
229
- "text/html": [],
230
- "text/plain": [
231
- "<IPython.core.display.HTML object>"
232
- ]
233
- },
234
- "metadata": {},
235
- "output_type": "display_data"
236
- },
237
- {
238
- "name": "stdout",
239
- "output_type": "stream",
240
- "text": [
241
- "Confusion Matrix:\n",
242
- "[[111 47 7]\n",
243
- " [ 77 328 25]\n",
244
- " [ 3 28 27]]\n",
245
- "\n",
246
- "Classification Report:\n",
247
- " precision recall f1-score support\n",
248
- "\n",
249
- " negative 0.58 0.67 0.62 165\n",
250
- " neutral 0.81 0.76 0.79 430\n",
251
- " positive 0.46 0.47 0.46 58\n",
252
- "\n",
253
- " accuracy 0.71 653\n",
254
- " macro avg 0.62 0.63 0.62 653\n",
255
- "weighted avg 0.72 0.71 0.72 653\n",
256
- "\n",
257
- "Predikcije spremljene u results_train_combined_croslo/predictions_test_1.csv\n",
258
- "\n",
259
- "Evaluacija na test skupu test-2\n"
260
- ]
261
- },
262
- {
263
- "data": {
264
- "application/vnd.jupyter.widget-view+json": {
265
- "model_id": "e3c39ebf0f60449880c3d03a8c00e518",
266
- "version_major": 2,
267
- "version_minor": 0
268
- },
269
- "text/plain": [
270
- "Map: 0%| | 0/741 [00:00<?, ? examples/s]"
271
- ]
272
- },
273
- "metadata": {},
274
- "output_type": "display_data"
275
- },
276
- {
277
- "name": "stderr",
278
- "output_type": "stream",
279
- "text": [
280
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
281
- " warnings.warn(warn_msg)\n"
282
- ]
283
- },
284
- {
285
- "data": {
286
- "text/html": [],
287
- "text/plain": [
288
- "<IPython.core.display.HTML object>"
289
- ]
290
- },
291
- "metadata": {},
292
- "output_type": "display_data"
293
- },
294
- {
295
- "name": "stdout",
296
- "output_type": "stream",
297
- "text": [
298
- "Confusion Matrix:\n",
299
- "[[198 15 3]\n",
300
- " [ 16 411 4]\n",
301
- " [ 5 11 78]]\n",
302
- "\n",
303
- "Classification Report:\n",
304
- " precision recall f1-score support\n",
305
- "\n",
306
- " negative 0.90 0.92 0.91 216\n",
307
- " neutral 0.94 0.95 0.95 431\n",
308
- " positive 0.92 0.83 0.87 94\n",
309
- "\n",
310
- " accuracy 0.93 741\n",
311
- " macro avg 0.92 0.90 0.91 741\n",
312
- "weighted avg 0.93 0.93 0.93 741\n",
313
- "\n",
314
- "Predikcije spremljene u results_train_combined_croslo/predictions_test_2.csv\n",
315
- "\n",
316
- "Evaluacija na test skupu test-3\n"
317
- ]
318
- },
319
- {
320
- "data": {
321
- "application/vnd.jupyter.widget-view+json": {
322
- "model_id": "0bbd241f299b482b991116e930c1355a",
323
- "version_major": 2,
324
- "version_minor": 0
325
- },
326
- "text/plain": [
327
- "Map: 0%| | 0/793 [00:00<?, ? examples/s]"
328
- ]
329
- },
330
- "metadata": {},
331
- "output_type": "display_data"
332
- },
333
- {
334
- "name": "stderr",
335
- "output_type": "stream",
336
- "text": [
337
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
338
- " warnings.warn(warn_msg)\n"
339
- ]
340
- },
341
- {
342
- "data": {
343
- "text/html": [],
344
- "text/plain": [
345
- "<IPython.core.display.HTML object>"
346
- ]
347
- },
348
- "metadata": {},
349
- "output_type": "display_data"
350
- },
351
- {
352
- "name": "stdout",
353
- "output_type": "stream",
354
- "text": [
355
- "Confusion Matrix:\n",
356
- "[[204 56 7]\n",
357
- " [ 7 254 2]\n",
358
- " [ 9 116 138]]\n",
359
- "\n",
360
- "Classification Report:\n",
361
- " precision recall f1-score support\n",
362
- "\n",
363
- " negative 0.93 0.76 0.84 267\n",
364
- " neutral 0.60 0.97 0.74 263\n",
365
- " positive 0.94 0.52 0.67 263\n",
366
- "\n",
367
- " accuracy 0.75 793\n",
368
- " macro avg 0.82 0.75 0.75 793\n",
369
- "weighted avg 0.82 0.75 0.75 793\n",
370
- "\n",
371
- "Predikcije spremljene u results_train_combined_croslo/predictions_test_3.csv\n",
372
- "\n",
373
- "Sažetak metrika po test skupovima s prosjekom:\n",
374
- " Test Set Accuracy F1 Macro Precision Macro Recall Macro\n",
375
- "0 test-1 0.713629 0.624216 0.617558 0.633678\n",
376
- "1 test-2 0.927126 0.909619 0.920753 0.900017\n",
377
- "2 test-3 0.751576 0.749418 0.820764 0.751513\n",
378
- "Average NaN 0.797444 0.761084 0.786359 0.761736\n",
379
- "Sažetak metrika spremljen u results_train_combined_croslo/summary_metrics_with_average.csv\n",
380
- "\n",
381
- "\n",
382
- "=== Treniranje i evaluacija za trening skup: train_2 ===\n",
383
- "\n",
384
- "--- Fine-tuning model: EMBEDDIA/crosloengual-bert ---\n"
385
- ]
386
- },
387
- {
388
- "name": "stderr",
389
- "output_type": "stream",
390
- "text": [
391
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/crosloengual-bert and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
392
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
393
- ]
394
- },
395
- {
396
- "data": {
397
- "application/vnd.jupyter.widget-view+json": {
398
- "model_id": "d55bf912626244ecaf1dcf7ba9334726",
399
- "version_major": 2,
400
- "version_minor": 0
401
- },
402
- "text/plain": [
403
- "Map: 0%| | 0/2221 [00:00<?, ? examples/s]"
404
- ]
405
- },
406
- "metadata": {},
407
- "output_type": "display_data"
408
- },
409
- {
410
- "name": "stderr",
411
- "output_type": "stream",
412
- "text": [
413
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
414
- " warnings.warn(warn_msg)\n"
415
- ]
416
- },
417
- {
418
- "data": {
419
- "text/html": [
420
- "\n",
421
- " <div>\n",
422
- " \n",
423
- " <progress value='417' max='417' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
424
- " [417/417 22:04, Epoch 3/3]\n",
425
- " </div>\n",
426
- " <table border=\"1\" class=\"dataframe\">\n",
427
- " <thead>\n",
428
- " <tr style=\"text-align: left;\">\n",
429
- " <th>Step</th>\n",
430
- " <th>Training Loss</th>\n",
431
- " </tr>\n",
432
- " </thead>\n",
433
- " <tbody>\n",
434
- " <tr>\n",
435
- " <td>50</td>\n",
436
- " <td>0.848800</td>\n",
437
- " </tr>\n",
438
- " <tr>\n",
439
- " <td>100</td>\n",
440
- " <td>0.610900</td>\n",
441
- " </tr>\n",
442
- " <tr>\n",
443
- " <td>150</td>\n",
444
- " <td>0.549600</td>\n",
445
- " </tr>\n",
446
- " <tr>\n",
447
- " <td>200</td>\n",
448
- " <td>0.381800</td>\n",
449
- " </tr>\n",
450
- " <tr>\n",
451
- " <td>250</td>\n",
452
- " <td>0.401700</td>\n",
453
- " </tr>\n",
454
- " <tr>\n",
455
- " <td>300</td>\n",
456
- " <td>0.326100</td>\n",
457
- " </tr>\n",
458
- " <tr>\n",
459
- " <td>350</td>\n",
460
- " <td>0.233100</td>\n",
461
- " </tr>\n",
462
- " <tr>\n",
463
- " <td>400</td>\n",
464
- " <td>0.218200</td>\n",
465
- " </tr>\n",
466
- " </tbody>\n",
467
- "</table><p>"
468
- ],
469
- "text/plain": [
470
- "<IPython.core.display.HTML object>"
471
- ]
472
- },
473
- "metadata": {},
474
- "output_type": "display_data"
475
- },
476
- {
477
- "name": "stdout",
478
- "output_type": "stream",
479
- "text": [
480
- "\n",
481
- "Evaluacija na test skupu test-1\n"
482
- ]
483
- },
484
- {
485
- "data": {
486
- "application/vnd.jupyter.widget-view+json": {
487
- "model_id": "b0d528152cdd4bfcb2c1892d4a79faca",
488
- "version_major": 2,
489
- "version_minor": 0
490
- },
491
- "text/plain": [
492
- "Map: 0%| | 0/653 [00:00<?, ? examples/s]"
493
- ]
494
- },
495
- "metadata": {},
496
- "output_type": "display_data"
497
- },
498
- {
499
- "name": "stderr",
500
- "output_type": "stream",
501
- "text": [
502
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
503
- " warnings.warn(warn_msg)\n"
504
- ]
505
- },
506
- {
507
- "data": {
508
- "text/html": [],
509
- "text/plain": [
510
- "<IPython.core.display.HTML object>"
511
- ]
512
- },
513
- "metadata": {},
514
- "output_type": "display_data"
515
- },
516
- {
517
- "name": "stdout",
518
- "output_type": "stream",
519
- "text": [
520
- "Confusion Matrix:\n",
521
- "[[114 36 15]\n",
522
- " [ 85 302 43]\n",
523
- " [ 7 22 29]]\n",
524
- "\n",
525
- "Classification Report:\n",
526
- " precision recall f1-score support\n",
527
- "\n",
528
- " negative 0.55 0.69 0.61 165\n",
529
- " neutral 0.84 0.70 0.76 430\n",
530
- " positive 0.33 0.50 0.40 58\n",
531
- "\n",
532
- " accuracy 0.68 653\n",
533
- " macro avg 0.58 0.63 0.59 653\n",
534
- "weighted avg 0.72 0.68 0.69 653\n",
535
- "\n",
536
- "Predikcije spremljene u results_train_2_croslo/predictions_test_1.csv\n",
537
- "\n",
538
- "Evaluacija na test skupu test-2\n"
539
- ]
540
- },
541
- {
542
- "data": {
543
- "application/vnd.jupyter.widget-view+json": {
544
- "model_id": "06d0e128fb81415da0396d033248ac89",
545
- "version_major": 2,
546
- "version_minor": 0
547
- },
548
- "text/plain": [
549
- "Map: 0%| | 0/741 [00:00<?, ? examples/s]"
550
- ]
551
- },
552
- "metadata": {},
553
- "output_type": "display_data"
554
- },
555
- {
556
- "name": "stderr",
557
- "output_type": "stream",
558
- "text": [
559
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
560
- " warnings.warn(warn_msg)\n"
561
- ]
562
- },
563
- {
564
- "data": {
565
- "text/html": [],
566
- "text/plain": [
567
- "<IPython.core.display.HTML object>"
568
- ]
569
- },
570
- "metadata": {},
571
- "output_type": "display_data"
572
- },
573
- {
574
- "name": "stdout",
575
- "output_type": "stream",
576
- "text": [
577
- "Confusion Matrix:\n",
578
- "[[170 36 10]\n",
579
- " [ 45 366 20]\n",
580
- " [ 15 24 55]]\n",
581
- "\n",
582
- "Classification Report:\n",
583
- " precision recall f1-score support\n",
584
- "\n",
585
- " negative 0.74 0.79 0.76 216\n",
586
- " neutral 0.86 0.85 0.85 431\n",
587
- " positive 0.65 0.59 0.61 94\n",
588
- "\n",
589
- " accuracy 0.80 741\n",
590
- " macro avg 0.75 0.74 0.74 741\n",
591
- "weighted avg 0.80 0.80 0.80 741\n",
592
- "\n",
593
- "Predikcije spremljene u results_train_2_croslo/predictions_test_2.csv\n",
594
- "\n",
595
- "Evaluacija na test skupu test-3\n"
596
- ]
597
- },
598
- {
599
- "data": {
600
- "application/vnd.jupyter.widget-view+json": {
601
- "model_id": "8d10a2ad3b5c4cad9e15f9a863c14653",
602
- "version_major": 2,
603
- "version_minor": 0
604
- },
605
- "text/plain": [
606
- "Map: 0%| | 0/793 [00:00<?, ? examples/s]"
607
- ]
608
- },
609
- "metadata": {},
610
- "output_type": "display_data"
611
- },
612
- {
613
- "name": "stderr",
614
- "output_type": "stream",
615
- "text": [
616
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
617
- " warnings.warn(warn_msg)\n"
618
- ]
619
- },
620
- {
621
- "data": {
622
- "text/html": [],
623
- "text/plain": [
624
- "<IPython.core.display.HTML object>"
625
- ]
626
- },
627
- "metadata": {},
628
- "output_type": "display_data"
629
- },
630
- {
631
- "name": "stdout",
632
- "output_type": "stream",
633
- "text": [
634
- "Confusion Matrix:\n",
635
- "[[193 59 15]\n",
636
- " [ 20 234 9]\n",
637
- " [ 19 116 128]]\n",
638
- "\n",
639
- "Classification Report:\n",
640
- " precision recall f1-score support\n",
641
- "\n",
642
- " negative 0.83 0.72 0.77 267\n",
643
- " neutral 0.57 0.89 0.70 263\n",
644
- " positive 0.84 0.49 0.62 263\n",
645
- "\n",
646
- " accuracy 0.70 793\n",
647
- " macro avg 0.75 0.70 0.70 793\n",
648
- "weighted avg 0.75 0.70 0.70 793\n",
649
- "\n",
650
- "Predikcije spremljene u results_train_2_croslo/predictions_test_3.csv\n",
651
- "\n",
652
- "Sažetak metrika po test skupovima s prosjekom:\n",
653
- " Test Set Accuracy F1 Macro Precision Macro Recall Macro\n",
654
- "0 test-1 0.681470 0.593037 0.575207 0.631078\n",
655
- "1 test-2 0.797571 0.743666 0.748448 0.740444\n",
656
- "2 test-3 0.699874 0.695614 0.748710 0.699757\n",
657
- "Average NaN 0.726305 0.677439 0.690788 0.690426\n",
658
- "Sažetak metrika spremljen u results_train_2_croslo/summary_metrics_with_average.csv\n"
659
- ]
660
- }
661
- ],
662
- "source": [
663
- "import pandas as pd\n",
664
- "import torch\n",
665
- "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
666
- "from datasets import Dataset\n",
667
- "from sklearn.metrics import classification_report, confusion_matrix\n",
668
- "\n",
669
- "def load_and_prepare_data(train_path):\n",
670
- " df = pd.read_csv(train_path)\n",
671
- " df = df.rename(columns={\"Label\": \"label\"})\n",
672
- " return Dataset.from_pandas(df)\n",
673
- "\n",
674
- "def load_and_prepare_test_data(test_path):\n",
675
- " df = pd.read_csv(test_path)\n",
676
- " df = df.rename(columns={\"Label\": \"label\"})\n",
677
- " return Dataset.from_pandas(df), df\n",
678
- "\n",
679
- "def tokenize_dataset(dataset, tokenizer):\n",
680
- " def tokenize_function(examples):\n",
681
- " return tokenizer(examples['Sentence'], padding='max_length', truncation=True, max_length=128)\n",
682
- " tokenized = dataset.map(tokenize_function, batched=True)\n",
683
- " tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])\n",
684
- " return tokenized\n",
685
- "\n",
686
- "def compute_metrics(eval_pred):\n",
687
- " logits, labels = eval_pred\n",
688
- " preds = torch.argmax(torch.tensor(logits), axis=1).numpy()\n",
689
- " report = classification_report(labels, preds, output_dict=True)\n",
690
- " acc = report['accuracy']\n",
691
- " f1 = report['macro avg']['f1-score']\n",
692
- " precision = report['macro avg']['precision']\n",
693
- " recall = report['macro avg']['recall']\n",
694
- " return {\n",
695
- " 'accuracy': acc,\n",
696
- " 'f1_macro': f1,\n",
697
- " 'precision_macro': precision,\n",
698
- " 'recall_macro': recall\n",
699
- " }\n",
700
- "\n",
701
- "def train_and_evaluate(model_name, train_dataset, test_datasets, raw_test_dfs, output_base_dir):\n",
702
- " print(f\"\\n--- Fine-tuning model: {model_name} ---\")\n",
703
- "\n",
704
- " tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
705
- " model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)\n",
706
- "\n",
707
- " tokenized_train = tokenize_dataset(train_dataset, tokenizer)\n",
708
- "\n",
709
- " training_args = TrainingArguments(\n",
710
- " output_dir=f\"{output_base_dir}/model\",\n",
711
- " learning_rate=2e-5,\n",
712
- " per_device_train_batch_size=16,\n",
713
- " per_device_eval_batch_size=32,\n",
714
- " num_train_epochs=3,\n",
715
- " weight_decay=0.01,\n",
716
- " load_best_model_at_end=False,\n",
717
- " logging_dir=f\"{output_base_dir}/logs\",\n",
718
- " logging_steps=50,\n",
719
- " save_total_limit=2,\n",
720
- " seed=42,\n",
721
- " )\n",
722
- "\n",
723
- " trainer = Trainer(\n",
724
- " model=model,\n",
725
- " args=training_args,\n",
726
- " train_dataset=tokenized_train,\n",
727
- " compute_metrics=compute_metrics,\n",
728
- " )\n",
729
- "\n",
730
- " trainer.train()\n",
731
- " trainer.save_model()\n",
732
- "\n",
733
- " results_list = []\n",
734
- "\n",
735
- " for i, (test_dataset, raw_test_df) in enumerate(zip(test_datasets, raw_test_dfs), start=1):\n",
736
- " print(f\"\\nEvaluacija na test skupu test-{i}\")\n",
737
- " tokenized_test = tokenize_dataset(test_dataset, tokenizer)\n",
738
- " predictions_output = trainer.predict(tokenized_test)\n",
739
- "\n",
740
- " preds = torch.argmax(torch.tensor(predictions_output.predictions), axis=1).numpy()\n",
741
- " labels = predictions_output.label_ids\n",
742
- "\n",
743
- " report = classification_report(labels, preds, target_names=['negative', 'neutral', 'positive'], output_dict=True)\n",
744
- "\n",
745
- " accuracy = report['accuracy']\n",
746
- " f1_macro = report['macro avg']['f1-score']\n",
747
- " precision_macro = report['macro avg']['precision']\n",
748
- " recall_macro = report['macro avg']['recall']\n",
749
- "\n",
750
- " results_list.append({\n",
751
- " 'Test Set': f'test-{i}',\n",
752
- " 'Accuracy': accuracy,\n",
753
- " 'F1 Macro': f1_macro,\n",
754
- " 'Precision Macro': precision_macro,\n",
755
- " 'Recall Macro': recall_macro\n",
756
- " })\n",
757
- "\n",
758
- " print(\"Confusion Matrix:\")\n",
759
- " print(confusion_matrix(labels, preds))\n",
760
- " print(\"\\nClassification Report:\")\n",
761
- " print(classification_report(labels, preds, target_names=['negative', 'neutral', 'positive']))\n",
762
- "\n",
763
- " output_df = raw_test_df.copy()\n",
764
- " output_df['predicted_label'] = preds\n",
765
- " output_df['correct'] = output_df['label'] == output_df['predicted_label']\n",
766
- " output_csv = f\"{output_base_dir}/predictions_test_{i}.csv\"\n",
767
- " output_df.to_csv(output_csv, index=False)\n",
768
- " print(f\"Predikcije spremljene u {output_csv}\")\n",
769
- "\n",
770
- " # Izračun prosjeka za sve metrike\n",
771
- " df_results = pd.DataFrame(results_list)\n",
772
- " df_results.loc['Average'] = df_results.mean(numeric_only=True)\n",
773
- "\n",
774
- " print(\"\\nSažetak metrika po test skupovima s prosjekom:\")\n",
775
- " print(df_results)\n",
776
- "\n",
777
- " df_results.to_csv(f\"{output_base_dir}/summary_metrics_with_average.csv\", index=True)\n",
778
- " print(f\"Sažetak metrika spremljen u {output_base_dir}/summary_metrics_with_average.csv\")\n",
779
- "\n",
780
- "if __name__ == \"__main__\":\n",
781
- " train_files = {\n",
782
- " \"train_combined\": \"TRAIN.csv\",\n",
783
- " \"train_2\": \"train-2.csv\"\n",
784
- " }\n",
785
- "\n",
786
- " test_files = [\"test-1.csv\", \"test-2.csv\", \"test-3.csv\"]\n",
787
- " test_datasets = []\n",
788
- " raw_test_dfs = []\n",
789
- " for f in test_files:\n",
790
- " ds, df = load_and_prepare_test_data(f)\n",
791
- " test_datasets.append(ds)\n",
792
- " raw_test_dfs.append(df)\n",
793
- "\n",
794
- " model_name = \"EMBEDDIA/crosloengual-bert\"\n",
795
- "\n",
796
- " for train_name, train_path in train_files.items():\n",
797
- " print(f\"\\n\\n=== Treniranje i evaluacija za trening skup: {train_name} ===\")\n",
798
- " train_dataset = load_and_prepare_data(train_path)\n",
799
- " output_dir = f\"results_{train_name}_croslo\"\n",
800
- " train_and_evaluate(model_name, train_dataset, test_datasets, raw_test_dfs, output_dir)\n"
801
- ]
802
- }
803
- ],
804
- "metadata": {
805
- "kernelspec": {
806
- "display_name": "Python 3",
807
- "language": "python",
808
- "name": "python3"
809
- },
810
- "language_info": {
811
- "codemirror_mode": {
812
- "name": "ipython",
813
- "version": 3
814
- },
815
- "file_extension": ".py",
816
- "mimetype": "text/x-python",
817
- "name": "python",
818
- "nbconvert_exporter": "python",
819
- "pygments_lexer": "ipython3",
820
- "version": "3.13.3"
821
- }
822
- },
823
- "nbformat": 4,
824
- "nbformat_minor": 5
825
- }