mratsim committed on
Commit
ba0c130
·
verified ·
1 Parent(s): c27b664

Update calibrate_software_engineer.yaml with new multilingual calibration

Browse files
Files changed (1) hide show
  1. calibrate_software_engineer.yaml +24 -18
calibrate_software_engineer.yaml CHANGED
@@ -1,29 +1,29 @@
1
  calibration_set:
2
  _templates:
3
  programming_languages: &programming_languages "Solve the following problem using {{ ['Zephyr', 'Prolog', 'Cobol', 'Apex', 'Crystal', 'Fortran', 'Nim', 'Delphi', 'Ada', 'Objective-C', 'VBA', 'Perl', 'Groovy', 'MATLAB', 'Solidity', 'Visual Basic', 'OCaml', 'Erlang', 'Julia', 'Lisp', 'F#', 'Clojure', 'GDScript', 'Scala', 'R', 'Haskell', 'Ruby', 'Elixir', 'Lua', 'Zig', 'Dart', 'Swift', 'Metal', 'PowerShell', 'PHP', 'Kotlin', 'C', 'Java', 'C++', 'C#', 'Bash/Shell', 'Go', 'Rust', 'TypeScript', 'HTML/CSS', 'SQL', 'JavaScript', 'Python', 'Lean', 'Coq', 'Pony', 'D', 'Racket', 'Haxe', 'x86-64 ASM', 'ARM-64 ASM', 'LLVM IR', 'GLSL', 'CUDA', 'Vulkan'][hash(row|string) % 60] }}\n***\n"
4
- spoken_languages: &spoken_languages "Answer in {{ ['Arabic', 'Chinese', 'French', 'German', 'Hebrew', 'Hindi', 'Japanese', 'Korean', 'Portuguese', 'Russian', 'Spanish', 'Turkish'][hash(row|string) % 12] }}\n***\n"
5
  max_seq_length: 8192
6
  shuffle: true
7
  seed: 42
8
  datasets:
9
 
10
- # Category Summary (Total: 590 samples)
11
  # =====================================================
12
- # General chat (24 samples - 4.07%)
13
- # Instruction and Reasoning tuning (14 samples - 2.37%)
14
- # Multilingual (36 samples - 6.10%)
15
- # Tool use (100 samples - 16.95%)
16
- # Code / Programming / Software Engineering / Devops (328 samples - 55.59%)
17
- # Math (12 samples - 2.03%)
18
- # Sciences (16 samples - 2.71%)
19
- # Medical (8 samples - 1.36%)
20
- # Finance (8 samples - 1.36%)
21
- # Business (16 samples - 2.71%)
22
- # Humanities and Philosophy (8 samples - 1.36%)
23
- # Creative Writing, Adventure, Roleplay (13 samples - 2.20%)
24
- # General Knowledge and Pop Culture (2 samples - 0.34%)
25
- # Behavioral skills (4 samples - 0.68%)
26
- # Misc (1 sample - 0.17%)
27
  # =====================================================
28
 
29
  # Research
@@ -90,7 +90,7 @@ calibration_set:
90
  formatter: sharegpt
91
  num_samples: 4
92
 
93
- # Multilingual (36 samples)
94
  # ---------------------------------------------------------------------------
95
  - dataset: HuggingFaceH4/Multilingual-Thinking
96
  split: train
@@ -108,6 +108,12 @@ calibration_set:
108
  num_samples: 4
109
  streaming: true
110
 
 
 
 
 
 
 
111
  # Tool use (includes commented-out ToolAce) (100 samples)
112
  # ---------------------------------------------------------------------------
113
 
 
1
  calibration_set:
2
  _templates:
3
  programming_languages: &programming_languages "Solve the following problem using {{ ['Zephyr', 'Prolog', 'Cobol', 'Apex', 'Crystal', 'Fortran', 'Nim', 'Delphi', 'Ada', 'Objective-C', 'VBA', 'Perl', 'Groovy', 'MATLAB', 'Solidity', 'Visual Basic', 'OCaml', 'Erlang', 'Julia', 'Lisp', 'F#', 'Clojure', 'GDScript', 'Scala', 'R', 'Haskell', 'Ruby', 'Elixir', 'Lua', 'Zig', 'Dart', 'Swift', 'Metal', 'PowerShell', 'PHP', 'Kotlin', 'C', 'Java', 'C++', 'C#', 'Bash/Shell', 'Go', 'Rust', 'TypeScript', 'HTML/CSS', 'SQL', 'JavaScript', 'Python', 'Lean', 'Coq', 'Pony', 'D', 'Racket', 'Haxe', 'x86-64 ASM', 'ARM-64 ASM', 'LLVM IR', 'GLSL', 'CUDA', 'Vulkan'][hash(row|string) % 60] }}\n***\n"
4
+ spoken_languages: &spoken_languages "Answer in {{ ['Arabic', 'Chinese', 'French', 'German', 'Greek', 'Hebrew', 'Hindi', 'Japanese', 'Korean', 'Portuguese', 'Russian', 'Spanish', 'Turkish'][hash(row|string) % 13] }}\n***\n"
5
  max_seq_length: 8192
6
  shuffle: true
7
  seed: 42
8
  datasets:
9
 
10
+ # Category Summary (Total: 624 samples)
11
  # =====================================================
12
+ # General chat (24 samples - 3.85%)
13
+ # Instruction and Reasoning tuning (14 samples - 2.24%)
14
+ # Multilingual (70 samples - 11.22%)
15
+ # Tool use (100 samples - 16.03%)
16
+ # Code / Programming / Software Engineering / Devops (328 samples - 52.56%)
17
+ # Math (12 samples - 1.92%)
18
+ # Sciences (16 samples - 2.56%)
19
+ # Medical (8 samples - 1.28%)
20
+ # Finance (8 samples - 1.28%)
21
+ # Business (16 samples - 2.56%)
22
+ # Humanities and Philosophy (8 samples - 1.28%)
23
+ # Creative Writing, Adventure, Roleplay (13 samples - 2.08%)
24
+ # General Knowledge and Pop Culture (2 samples - 0.32%)
25
+ # Behavioral skills (4 samples - 0.64%)
26
+ # Misc (1 sample - 0.16%)
27
  # =====================================================
28
 
29
  # Research
 
90
  formatter: sharegpt
91
  num_samples: 4
92
 
93
+ # Multilingual (70 samples)
94
  # ---------------------------------------------------------------------------
95
  - dataset: HuggingFaceH4/Multilingual-Thinking
96
  split: train
 
108
  num_samples: 4
109
  streaming: true
110
 
111
+ - dataset: droussis/euroblocks_sft_1sample_per_lang
112
+ split: train
113
+ columns: [conversations]
114
+ formatter: chat_completion
115
+ num_samples: 34
116
+
117
  # Tool use (includes commented-out ToolAce) (100 samples)
118
  # ---------------------------------------------------------------------------
119