Spaces:
Running
Running
Commit
·
0aa8adc
1
Parent(s):
4a63305
Fix SRT timestamp alignment with ground truth
Browse files
Adjusted all transcription SRT files to start at 00:00:00,000 to match the ground truth timeline. This resolves the synchronization issue where transcripts were not displaying in sync with the audio playback.
Changes:
- AssemblyAI: removed 80ms offset
- Nova3: removed 80ms offset
- Speechmatics: removed 120ms offset
- Gladia: already aligned, no changes needed
Added adjust_srt_timing.py script to automate timing corrections for future updates.
- adjust_srt_timing.py +123 -0
- srt-out/assembly.srt +470 -470
- srt-out/nova3.srt +576 -576
- srt-out/speechmatics.srt +414 -414
adjust_srt_timing.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Adjust SRT file timestamps to align with ground truth.
|
| 4 |
+
This script removes timing offsets to ensure all transcripts start at 00:00:00,000
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def parse_timestamp(timestamp_str):
    """Convert an SRT timestamp string (HH:MM:SS,mmm) to total milliseconds."""
    clock, millis = timestamp_str.split(',')
    hours, minutes, seconds = (int(part) for part in clock.split(':'))
    total_seconds = hours * 3600 + minutes * 60 + seconds
    return total_seconds * 1000 + int(millis)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def format_timestamp(ms):
    """Convert a millisecond count to the SRT timestamp format HH:MM:SS,mmm."""
    total_seconds, milliseconds = divmod(ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def adjust_srt_timing(input_path, output_path, offset_ms):
    """
    Shift every timestamp in an SRT file earlier by subtracting offset_ms.

    Args:
        input_path: Path to input SRT file
        output_path: Path to output SRT file
        offset_ms: Offset in milliseconds to subtract from all timestamps
    """
    with open(input_path, 'r', encoding='utf-8') as srt_file:
        text = srt_file.read()

    # Strip a UTF-8 BOM if the file carries one.
    text = text.lstrip('\ufeff')

    # Matches cue timing lines of the form "HH:MM:SS,mmm --> HH:MM:SS,mmm".
    cue_pattern = re.compile(
        r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})'
    )

    def shift(match):
        # Clamp at zero so no cue ends up with a negative timestamp.
        start_ms = max(0, parse_timestamp(match.group(1)) - offset_ms)
        end_ms = max(0, parse_timestamp(match.group(2)) - offset_ms)
        return f"{format_timestamp(start_ms)} --> {format_timestamp(end_ms)}"

    shifted_text = cue_pattern.sub(shift, text)

    with open(output_path, 'w', encoding='utf-8') as srt_file:
        srt_file.write(shifted_text)

    print(f"✓ Adjusted {input_path.name}: offset={offset_ms}ms → {output_path.name}")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def find_first_timestamp(srt_path):
    """Return the first cue start time in an SRT file, in milliseconds (0 if none found)."""
    with open(srt_path, 'r', encoding='utf-8') as srt_file:
        text = srt_file.read()

    # Only the start side of the first "HH:MM:SS,mmm -->" cue line matters here.
    first_cue = re.search(r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->', text)
    return parse_timestamp(first_cue.group(1)) if first_cue else 0
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def main():
    """Align every known SRT file in ./srt-out so its first cue starts at 00:00:00,000.

    For each file: locate the first cue's start time, treat it as the global
    offset, and rewrite the file in place with that offset subtracted from
    every timestamp. Files that already start at zero are left untouched.
    """
    srt_dir = Path(__file__).parent / "srt-out"

    # Files to adjust
    srt_files = [
        "assembly.srt",
        "gladia.srt",
        "nova3.srt",
        "speechmatics.srt"
    ]

    print("Analyzing SRT files for timing offset...\n")

    for filename in srt_files:
        input_path = srt_dir / filename
        if not input_path.exists():
            # Bug fix: the f-string had lost its placeholder, so the message
            # never said which file was missing.
            print(f"⚠ Skipping {filename} (not found)")
            continue

        # Find first timestamp
        first_ts_ms = find_first_timestamp(input_path)

        if first_ts_ms == 0:
            # Bug fix: same lost placeholder as above — name the file.
            print(f"✓ {filename} already starts at 00:00:00,000 (no adjustment needed)")
            continue

        # The first cue's start time is the global offset for the whole track.
        offset_ms = first_ts_ms

        # Adjust the file in place
        adjust_srt_timing(input_path, input_path, offset_ms)

    print("\n✅ All SRT files have been adjusted to start at 00:00:00,000")


if __name__ == "__main__":
    main()
|
srt-out/assembly.srt
CHANGED
|
@@ -1,1880 +1,1880 @@
|
|
| 1 |
1
|
| 2 |
-
00:00:00,
|
| 3 |
Hello and welcome to a audio data set consisting
|
| 4 |
|
| 5 |
2
|
| 6 |
-
00:00:05,
|
| 7 |
of one single episode of a non-existent podcast. Or I
|
| 8 |
|
| 9 |
3
|
| 10 |
-
00:00:10,
|
| 11 |
may append this to a podcast that I set up
|
| 12 |
|
| 13 |
4
|
| 14 |
-
00:00:13,
|
| 15 |
recently regarding my with my thoughts on speech
|
| 16 |
|
| 17 |
5
|
| 18 |
-
00:00:19,
|
| 19 |
tech and AI in particular, more AI in generative AI,
|
| 20 |
|
| 21 |
6
|
| 22 |
-
00:00:24,
|
| 23 |
I would say. But in any event, the purpose of
|
| 24 |
|
| 25 |
7
|
| 26 |
-
00:00:28,
|
| 27 |
this Voice recording is actually to create a lengthy
|
| 28 |
|
| 29 |
8
|
| 30 |
-
00:00:33,
|
| 31 |
voice sample for a quick evaluation, a back of the
|
| 32 |
|
| 33 |
9
|
| 34 |
-
00:00:37,
|
| 35 |
envelope evaluation, as they might say, for different speech attack
|
| 36 |
|
| 37 |
10
|
| 38 |
-
00:00:40,
|
| 39 |
models. And I'm doing this because I thought I had
|
| 40 |
|
| 41 |
11
|
| 42 |
-
00:00:43,
|
| 43 |
made a great breakthrough in my journey with speech tech,
|
| 44 |
|
| 45 |
12
|
| 46 |
-
00:00:47,
|
| 47 |
and that was succeeding in the elusive task of fine-tuning
|
| 48 |
|
| 49 |
13
|
| 50 |
-
00:00:50,
|
| 51 |
Whisper. Whisper is, and I'm going to just talk, I'm
|
| 52 |
|
| 53 |
14
|
| 54 |
-
00:00:54,
|
| 55 |
trying to mix up, I'm going to try a few
|
| 56 |
|
| 57 |
15
|
| 58 |
-
00:00:58,
|
| 59 |
different styles of speaking. I might whisper something at some
|
| 60 |
|
| 61 |
16
|
| 62 |
-
00:01:01,
|
| 63 |
point. As well. And I'll go back to speaking loud
|
| 64 |
|
| 65 |
17
|
| 66 |
-
00:01:04,
|
| 67 |
in, in different parts. I'm going to sound really like
|
| 68 |
|
| 69 |
18
|
| 70 |
-
00:01:08,
|
| 71 |
a crazy person because I'm also going to try to
|
| 72 |
|
| 73 |
19
|
| 74 |
-
00:01:11,
|
| 75 |
speak at different pitches and cadences in order to really
|
| 76 |
|
| 77 |
20
|
| 78 |
-
00:01:16,
|
| 79 |
try to put a speech attacks model through its paces,
|
| 80 |
|
| 81 |
21
|
| 82 |
-
00:01:20,
|
| 83 |
which is trying to make sense of is this guy
|
| 84 |
|
| 85 |
22
|
| 86 |
-
00:01:23,
|
| 87 |
just rambling on incoherently in one long sentence or are
|
| 88 |
|
| 89 |
23
|
| 90 |
-
00:01:28,
|
| 91 |
these just actually a series of step, standalone,
|
| 92 |
|
| 93 |
24
|
| 94 |
-
00:01:34,
|
| 95 |
step alone, standalone sentences? And how is it gonna handle
|
| 96 |
|
| 97 |
25
|
| 98 |
-
00:01:37,
|
| 99 |
step alone? That's not a word. What happens when you
|
| 100 |
|
| 101 |
26
|
| 102 |
-
00:01:40,
|
| 103 |
use speech to text and you use a fake word?
|
| 104 |
|
| 105 |
27
|
| 106 |
-
00:01:43,
|
| 107 |
And then you're like, wait, that's not actually, that word
|
| 108 |
|
| 109 |
28
|
| 110 |
-
00:01:45,
|
| 111 |
doesn't exist. How does AI handle that? And these and
|
| 112 |
|
| 113 |
29
|
| 114 |
-
00:01:50,
|
| 115 |
more are all the questions that I'm seeking to answer
|
| 116 |
|
| 117 |
30
|
| 118 |
-
00:01:54,
|
| 119 |
in this training data. Now, why was it trying to
|
| 120 |
|
| 121 |
31
|
| 122 |
-
00:01:57,
|
| 123 |
fine tune Whisper? And what is Whisper? As I said,
|
| 124 |
|
| 125 |
32
|
| 126 |
-
00:02:00,
|
| 127 |
I'm going to try to record this at a couple
|
| 128 |
|
| 129 |
33
|
| 130 |
-
00:02:03,
|
| 131 |
of different levels of technicality for folks who are, you
|
| 132 |
|
| 133 |
34
|
| 134 |
-
00:02:07,
|
| 135 |
know, in the normal world and not totally stuck down
|
| 136 |
|
| 137 |
35
|
| 138 |
-
00:02:11,
|
| 139 |
the rabbit hole of AI, which I have to say
|
| 140 |
|
| 141 |
36
|
| 142 |
-
00:02:13,
|
| 143 |
is a really wonderful rabbit hole to be down. It's
|
| 144 |
|
| 145 |
37
|
| 146 |
-
00:02:18,
|
| 147 |
a really interesting area and speech and voice tech is
|
| 148 |
|
| 149 |
38
|
| 150 |
-
00:02:21,
|
| 151 |
the aspect of it that I find actually the most,
|
| 152 |
|
| 153 |
39
|
| 154 |
-
00:02:
|
| 155 |
I'm not sure I would say the most interesting because
|
| 156 |
|
| 157 |
40
|
| 158 |
-
00:02:27,
|
| 159 |
there's just so much that is fascinating in AI. But
|
| 160 |
|
| 161 |
41
|
| 162 |
-
00:02:31,
|
| 163 |
the most that I find the most personally transformative in
|
| 164 |
|
| 165 |
42
|
| 166 |
-
00:02:34,
|
| 167 |
terms of the impact that it's had on my daily
|
| 168 |
|
| 169 |
43
|
| 170 |
-
00:02:
|
| 171 |
work life and productivity and how I sort of work.
|
| 172 |
|
| 173 |
44
|
| 174 |
-
00:02:42,
|
| 175 |
And I'm persevering hard with the task of trying
|
| 176 |
|
| 177 |
45
|
| 178 |
-
00:02:47,
|
| 179 |
to get a good solution working for Linux, which if
|
| 180 |
|
| 181 |
46
|
| 182 |
-
00:02:50,
|
| 183 |
anyone actually does listen to this, not just for the
|
| 184 |
|
| 185 |
47
|
| 186 |
-
00:02:52,
|
| 187 |
training data and for the actual content, this is sparked
|
| 188 |
|
| 189 |
48
|
| 190 |
-
00:02:56,
|
| 191 |
I had, besides the fine tune not working, well, that
|
| 192 |
|
| 193 |
49
|
| 194 |
-
00:03:00,
|
| 195 |
was the failure. Um, I used Claude code because one
|
| 196 |
|
| 197 |
50
|
| 198 |
-
00:03:05,
|
| 199 |
thinks these days that there is nothing short of solving,
|
| 200 |
|
| 201 |
51
|
| 202 |
-
00:03:
|
| 203 |
you know, the, the reason of life or something, that
|
| 204 |
|
| 205 |
52
|
| 206 |
-
00:03:15,
|
| 207 |
Claude and agentic AI can't do, which is not really
|
| 208 |
|
| 209 |
53
|
| 210 |
-
00:03:19,
|
| 211 |
the case. Uh, it does seem that way sometimes, but
|
| 212 |
|
| 213 |
54
|
| 214 |
-
00:03:22,
|
| 215 |
it fails a lot as well. And this is one
|
| 216 |
|
| 217 |
55
|
| 218 |
-
00:03:24,
|
| 219 |
of those, instances where last week I put together an
|
| 220 |
|
| 221 |
56
|
| 222 |
-
00:03:27,
|
| 223 |
hour of voice training data, basically speaking, just random things
|
| 224 |
|
| 225 |
57
|
| 226 |
-
00:03:32,
|
| 227 |
for 3 minutes. And it was actually kind of tedious
|
| 228 |
|
| 229 |
58
|
| 230 |
-
00:03:37,
|
| 231 |
because the texts were really weird. Some of them were
|
| 232 |
|
| 233 |
59
|
| 234 |
-
00:03:39,
|
| 235 |
it was like it was AI generated. I tried before
|
| 236 |
|
| 237 |
60
|
| 238 |
-
00:03:43,
|
| 239 |
to read Sherlock Holmes for an hour and I just
|
| 240 |
|
| 241 |
61
|
| 242 |
-
00:03:45,
|
| 243 |
couldn't. I was so bored after 10 minutes that I
|
| 244 |
|
| 245 |
62
|
| 246 |
-
00:03:48,
|
| 247 |
was like, okay, no, I'm just going to have to
|
| 248 |
|
| 249 |
63
|
| 250 |
-
00:03:50,
|
| 251 |
find something else to read. So I used a created
|
| 252 |
|
| 253 |
64
|
| 254 |
-
00:03:55,
|
| 255 |
with AI studio vibe coded a synthetic text generator. Which
|
| 256 |
|
| 257 |
65
|
| 258 |
-
00:04:01,
|
| 259 |
actually I thought was probably a better way of doing
|
| 260 |
|
| 261 |
66
|
| 262 |
-
00:04:
|
| 263 |
it because it would give me more short samples with
|
| 264 |
|
| 265 |
67
|
| 266 |
-
00:04:07,
|
| 267 |
more varied content. So I was like, okay, give me
|
| 268 |
|
| 269 |
68
|
| 270 |
-
00:04:10,
|
| 271 |
a voice note, like I'm recording an email, give me
|
| 272 |
|
| 273 |
69
|
| 274 |
-
00:04:14,
|
| 275 |
a short story to read, give me prose to read.
|
| 276 |
|
| 277 |
70
|
| 278 |
-
00:04:18,
|
| 279 |
So I came up with all these different things and
|
| 280 |
|
| 281 |
71
|
| 282 |
-
00:04:20,
|
| 283 |
they added a little timer to it so I could
|
| 284 |
|
| 285 |
72
|
| 286 |
-
00:04:22,
|
| 287 |
see how close I was to one hour. And I
|
| 288 |
|
| 289 |
73
|
| 290 |
-
00:04:26,
|
| 291 |
spent like an hour one afternoon or probably two hours
|
| 292 |
|
| 293 |
74
|
| 294 |
-
00:04:29,
|
| 295 |
by the time you you do retakes. And whatever, because
|
| 296 |
|
| 297 |
75
|
| 298 |
-
00:04:33,
|
| 299 |
you want to, it gave me a source of truth,
|
| 300 |
|
| 301 |
76
|
| 302 |
-
00:04:37,
|
| 303 |
which I'm not sure if that's the scientific way to
|
| 304 |
|
| 305 |
77
|
| 306 |
-
00:04:40,
|
| 307 |
approach this topic of gathering, training data, but I thought
|
| 308 |
|
| 309 |
78
|
| 310 |
-
00:04:44,
|
| 311 |
made sense. Um, I have a lot of audio data
|
| 312 |
|
| 313 |
79
|
| 314 |
-
00:04:48,
|
| 315 |
from recording voice notes, which I've also kind of used,
|
| 316 |
|
| 317 |
80
|
| 318 |
-
00:04:52,
|
| 319 |
been experimenting with using for a different purpose, slightly different
|
| 320 |
|
| 321 |
81
|
| 322 |
-
00:04:56,
|
| 323 |
annotating task types. It's more a text classification experiment
|
| 324 |
|
| 325 |
82
|
| 326 |
-
00:05:01,
|
| 327 |
or, Well, it's more than that actually. I'm working on
|
| 328 |
|
| 329 |
83
|
| 330 |
-
00:05:04,
|
| 331 |
a voice app. So it's a prototype, I guess, is
|
| 332 |
|
| 333 |
84
|
| 334 |
-
00:05:08,
|
| 335 |
really more accurate. But you can do that and you
|
| 336 |
|
| 337 |
85
|
| 338 |
-
00:05:12,
|
| 339 |
can work backwards. You're like, you listen back to a
|
| 340 |
|
| 341 |
86
|
| 342 |
-
00:05:15,
|
| 343 |
voice note and you painfully go through one of those
|
| 344 |
|
| 345 |
87
|
| 346 |
-
00:05:19,
|
| 347 |
transcribing, you know, where you start and stop and scrub
|
| 348 |
|
| 349 |
88
|
| 350 |
-
00:05:22,
|
| 351 |
around it and you fix the errors, but it's really,
|
| 352 |
|
| 353 |
89
|
| 354 |
-
00:05:24,
|
| 355 |
really boring to do that. So I thought it would
|
| 356 |
|
| 357 |
90
|
| 358 |
-
00:05:26,
|
| 359 |
be less tedious in the long term if I just
|
| 360 |
|
| 361 |
91
|
| 362 |
-
00:05:30,
|
| 363 |
recorded the source of truth. So it gave me these
|
| 364 |
|
| 365 |
92
|
| 366 |
-
00:05:33,
|
| 367 |
three minute snippets. I recorded them. It saved an MP3
|
| 368 |
|
| 369 |
93
|
| 370 |
-
00:05:36,
|
| 371 |
and a TXT in the same folder, and I created
|
| 372 |
|
| 373 |
94
|
| 374 |
-
00:05:39,
|
| 375 |
an error with that data. So I was very hopeful,
|
| 376 |
|
| 377 |
95
|
| 378 |
-
00:05:43,
|
| 379 |
quietly, a little bit hopeful that I could actually fine
|
| 380 |
|
| 381 |
96
|
| 382 |
-
00:05:
|
| 383 |
tune Whisper. I want to fine tune Whisper because when
|
| 384 |
|
| 385 |
97
|
| 386 |
-
00:05:50,
|
| 387 |
I got into Voicetech last November, my wife was in
|
| 388 |
|
| 389 |
98
|
| 390 |
-
00:05:54,
|
| 391 |
the US and I was alone at home. And when
|
| 392 |
|
| 393 |
99
|
| 394 |
-
00:05:58,
|
| 395 |
crazy people like me do really wild things like use
|
| 396 |
|
| 397 |
100
|
| 398 |
-
00:06:01,
|
| 399 |
voice to tech technology. That was basically when I started
|
| 400 |
|
| 401 |
101
|
| 402 |
-
00:06:06,
|
| 403 |
doing it, I didn't feel like a crazy person speaking
|
| 404 |
|
| 405 |
102
|
| 406 |
-
00:06:08,
|
| 407 |
to myself. And my expectations weren't that high. I used
|
| 408 |
|
| 409 |
103
|
| 410 |
-
00:06:14,
|
| 411 |
speech tech now and again, tried it out. It was
|
| 412 |
|
| 413 |
104
|
| 414 |
-
00:06:17,
|
| 415 |
like, it'd be really cool if you could just, like,
|
| 416 |
|
| 417 |
105
|
| 418 |
-
00:06:19,
|
| 419 |
speak into your computer. And whatever I tried out that
|
| 420 |
|
| 421 |
106
|
| 422 |
-
00:06:23,
|
| 423 |
had Linux support was just. It was not good, basically.
|
| 424 |
|
| 425 |
107
|
| 426 |
-
00:06:27,
|
| 427 |
And this blew me away from the first go. I
|
| 428 |
|
| 429 |
108
|
| 430 |
-
00:06:29,
|
| 431 |
mean, it wasn't 100% accurate out of the box and
|
| 432 |
|
| 433 |
109
|
| 434 |
-
00:06:32,
|
| 435 |
it took work, but it was good enough that there
|
| 436 |
|
| 437 |
110
|
| 438 |
-
00:06:
|
| 439 |
was a solid foundation and it kind of passed that
|
| 440 |
|
| 441 |
111
|
| 442 |
-
00:06:38,
|
| 443 |
pivot point that it's actually worth doing this. You know,
|
| 444 |
|
| 445 |
112
|
| 446 |
-
00:06:42,
|
| 447 |
there's a point where it's so like the transcript is
|
| 448 |
|
| 449 |
113
|
| 450 |
-
00:06:44,
|
| 451 |
you don't have to get 100% accuracy for it to
|
| 452 |
|
| 453 |
114
|
| 454 |
-
00:06:47,
|
| 455 |
be worth your time for speech attacks to be a
|
| 456 |
|
| 457 |
115
|
| 458 |
-
00:06:50,
|
| 459 |
worthwhile addition to your productivity, but you do need to
|
| 460 |
|
| 461 |
116
|
| 462 |
-
00:06:52,
|
| 463 |
get above, let's say, I don't know, 85%. If it's
|
| 464 |
|
| 465 |
117
|
| 466 |
-
00:06:56,
|
| 467 |
60% or 50%, you inevitably say, screw it, I'll just
|
| 468 |
|
| 469 |
118
|
| 470 |
-
00:06:59,
|
| 471 |
type it because you end up missing errors in the
|
| 472 |
|
| 473 |
119
|
| 474 |
-
00:07:02,
|
| 475 |
transcript and it becomes actually worse. You end up in
|
| 476 |
|
| 477 |
120
|
| 478 |
-
00:07:05,
|
| 479 |
a worse position than you started with. That's been my
|
| 480 |
|
| 481 |
121
|
| 482 |
-
00:07:07,
|
| 483 |
experience. So I was like, oh, this is actually really,
|
| 484 |
|
| 485 |
122
|
| 486 |
-
00:07:12,
|
| 487 |
really good now. How did that happen? And the answer
|
| 488 |
|
| 489 |
123
|
| 490 |
-
00:07:14,
|
| 491 |
is ASR whisper being open source and the transformer
|
| 492 |
|
| 493 |
124
|
| 494 |
-
00:07:19,
|
| 495 |
architecture. If you want to go back to the to
|
| 496 |
|
| 497 |
125
|
| 498 |
-
00:07:23,
|
| 499 |
the underpinnings, which really blows my mind and it's on
|
| 500 |
|
| 501 |
126
|
| 502 |
-
00:07:26,
|
| 503 |
my list. To read through that paper. All you need
|
| 504 |
|
| 505 |
127
|
| 506 |
-
00:07:30,
|
| 507 |
is attention as attentively as can be done
|
| 508 |
|
| 509 |
128
|
| 510 |
-
00:07:36,
|
| 511 |
with my limited brain because it's super, super high level
|
| 512 |
|
| 513 |
129
|
| 514 |
-
00:07:39,
|
| 515 |
stuff, super advanced stuff, I mean. But that, I think
|
| 516 |
|
| 517 |
130
|
| 518 |
-
00:07:44,
|
| 519 |
of all the things that are fascinating about the sudden
|
| 520 |
|
| 521 |
131
|
| 522 |
-
00:07:49,
|
| 523 |
rise in AI and the dramatic capabilities. I find it
|
| 524 |
|
| 525 |
132
|
| 526 |
-
00:07:53,
|
| 527 |
fascinating that a few people are like, hang on, you've
|
| 528 |
|
| 529 |
133
|
| 530 |
-
00:07:56,
|
| 531 |
got this thing that can speak to you, like a
|
| 532 |
|
| 533 |
134
|
| 534 |
-
00:07:58,
|
| 535 |
chatbot, an LLM, and then you've got image generation. Okay,
|
| 536 |
|
| 537 |
135
|
| 538 |
-
00:08:03,
|
| 539 |
so firstly, those two things on the surface have nothing
|
| 540 |
|
| 541 |
136
|
| 542 |
-
00:08:06,
|
| 543 |
in common. So like, how are they, how did that
|
| 544 |
|
| 545 |
137
|
| 546 |
-
00:08:10,
|
| 547 |
just happen all at the same time? And then when
|
| 548 |
|
| 549 |
138
|
| 550 |
-
00:08:12,
|
| 551 |
you extend that further, you're like, Suno, right? You can
|
| 552 |
|
| 553 |
139
|
| 554 |
-
00:08:17,
|
| 555 |
sing a song and AI will come up with and
|
| 556 |
|
| 557 |
140
|
| 558 |
-
00:08:20,
|
| 559 |
instrumental. And then you've got Whisper and you're like, wait
|
| 560 |
|
| 561 |
141
|
| 562 |
-
00:08:23,
|
| 563 |
a second, how did all this stuff, like, if it's
|
| 564 |
|
| 565 |
142
|
| 566 |
-
00:08:25,
|
| 567 |
all AI, what's like, there has to be some commonality.
|
| 568 |
|
| 569 |
143
|
| 570 |
-
00:08:29,
|
| 571 |
Otherwise, these are totally different technologies on the surface of
|
| 572 |
|
| 573 |
144
|
| 574 |
-
00:08:34,
|
| 575 |
it. And the Transformer architecture is, as far as I
|
| 576 |
|
| 577 |
145
|
| 578 |
-
00:08:38,
|
| 579 |
know, the answer. And I can't even say, can't even
|
| 580 |
|
| 581 |
146
|
| 582 |
-
00:08:41,
|
| 583 |
pretend that I really understand what the Transformer architecture means.
|
| 584 |
|
| 585 |
147
|
| 586 |
-
00:08:46,
|
| 587 |
In depth, but I have scanned it and as I
|
| 588 |
|
| 589 |
148
|
| 590 |
-
00:08:49,
|
| 591 |
said, I want to print it and really kind of
|
| 592 |
|
| 593 |
149
|
| 594 |
-
00:08:52,
|
| 595 |
think over it at some point. And I'll probably feel
|
| 596 |
|
| 597 |
150
|
| 598 |
-
00:08:56,
|
| 599 |
bad about myself, I think, because weren't those guys in
|
| 600 |
|
| 601 |
151
|
| 602 |
-
00:08:59,
|
| 603 |
their 20s? Like, that's crazy. I think I asked ChatGPT
|
| 604 |
|
| 605 |
152
|
| 606 |
-
00:09:03,
|
| 607 |
once who wrote that paper and how old were they
|
| 608 |
|
| 609 |
153
|
| 610 |
-
00:09:08,
|
| 611 |
when it was published in Arciv? And I was expecting,
|
| 612 |
|
| 613 |
154
|
| 614 |
-
00:09:11,
|
| 615 |
like, I don't know, What do you imagine? I personally
|
| 616 |
|
| 617 |
155
|
| 618 |
-
00:09:
|
| 619 |
imagine kind of like, you know, you have these breakthroughs
|
| 620 |
|
| 621 |
156
|
| 622 |
-
00:09:16,
|
| 623 |
during COVID and things like that where like these kind
|
| 624 |
|
| 625 |
157
|
| 626 |
-
00:09:19,
|
| 627 |
of really obscure scientists are like in their 50s and
|
| 628 |
|
| 629 |
158
|
| 630 |
-
00:09:22,
|
| 631 |
they've just kind of been laboring in labs and wearily
|
| 632 |
|
| 633 |
159
|
| 634 |
-
00:09:27,
|
| 635 |
in writing and publishing in kind of obscure academic publications.
|
| 636 |
|
| 637 |
160
|
| 638 |
-
00:09:30,
|
| 639 |
And they finally like hit a big or win a
|
| 640 |
|
| 641 |
161
|
| 642 |
-
00:09:33,
|
| 643 |
Nobel Prize and then their household names. So that was
|
| 644 |
|
| 645 |
162
|
| 646 |
-
00:09:37,
|
| 647 |
kind of what I had in mind. That was the
|
| 648 |
|
| 649 |
163
|
| 650 |
-
00:09:
|
| 651 |
mental image I'd formed of the birth of Arcsight. Like
|
| 652 |
|
| 653 |
164
|
| 654 |
-
00:09:
|
| 655 |
I wasn't expecting 20-somethings in San Francisco, though. I thought
|
| 656 |
|
| 657 |
165
|
| 658 |
-
00:09:46,
|
| 659 |
that was both very, very funny, very cool, and actually
|
| 660 |
|
| 661 |
166
|
| 662 |
-
00:09:
|
| 663 |
kind of inspiring. It's nice to think that people who,
|
| 664 |
|
| 665 |
167
|
| 666 |
-
00:09:53,
|
| 667 |
you know, just you might put them in the kind
|
| 668 |
|
| 669 |
168
|
| 670 |
-
00:09:56,
|
| 671 |
of milieu or bubble or world that you are in
|
| 672 |
|
| 673 |
169
|
| 674 |
-
00:09:59,
|
| 675 |
are credibly in through, you know, the series of connections
|
| 676 |
|
| 677 |
170
|
| 678 |
-
00:10:03,
|
| 679 |
that are coming up with such literally world changing innovations.
|
| 680 |
|
| 681 |
171
|
| 682 |
-
00:10:07,
|
| 683 |
So that was, I thought, anyway. That's that was cool.
|
| 684 |
|
| 685 |
172
|
| 686 |
-
00:10:11,
|
| 687 |
Okay, voice training data. How are we doing? We're about
|
| 688 |
|
| 689 |
173
|
| 690 |
-
00:10:14,
|
| 691 |
10 minutes and I'm still talking about voice technology. So
|
| 692 |
|
| 693 |
174
|
| 694 |
-
00:10:18,
|
| 695 |
Whisper was brilliant and I was so excited that I
|
| 696 |
|
| 697 |
175
|
| 698 |
-
00:10:22,
|
| 699 |
was my first instinct was to like guess like, oh
|
| 700 |
|
| 701 |
176
|
| 702 |
-
00:10:25,
|
| 703 |
my gosh, I have to get like a really good
|
| 704 |
|
| 705 |
177
|
| 706 |
-
00:10:26,
|
| 707 |
microphone for this. So I didn't go on a spending
|
| 708 |
|
| 709 |
178
|
| 710 |
-
00:10:30,
|
| 711 |
spree because I said, I'm gonna have to just wait
|
| 712 |
|
| 713 |
179
|
| 714 |
-
00:10:32,
|
| 715 |
a month and see if I still use this. And
|
| 716 |
|
| 717 |
180
|
| 718 |
-
00:10:36,
|
| 719 |
It just kind of became, it's become really part of
|
| 720 |
|
| 721 |
181
|
| 722 |
-
00:10:39,
|
| 723 |
my daily routine. Like if I'm writing an email, I'll
|
| 724 |
|
| 725 |
182
|
| 726 |
-
00:10:43,
|
| 727 |
record a voice note. And then I've developed and it's
|
| 728 |
|
| 729 |
183
|
| 730 |
-
00:10:
|
| 731 |
nice to see that everyone is like developing the same
|
| 732 |
|
| 733 |
184
|
| 734 |
-
00:10:49,
|
| 735 |
things in parallel. Like that's my kind of a weird
|
| 736 |
|
| 737 |
185
|
| 738 |
-
00:10:
|
| 739 |
thing to say, but when I look, I kind of
|
| 740 |
|
| 741 |
186
|
| 742 |
-
00:10:54,
|
| 743 |
came, when I started working on this, these prototypes on
|
| 744 |
|
| 745 |
187
|
| 746 |
-
00:10:59,
|
| 747 |
GitHub, which is where I just kind of share very
|
| 748 |
|
| 749 |
188
|
| 750 |
-
00:11:01,
|
| 751 |
freely and loosely, ideas and first iterations on concepts.
|
| 752 |
|
| 753 |
189
|
| 754 |
-
00:11:08,
|
| 755 |
And for want of a better word, I called it
|
| 756 |
|
| 757 |
190
|
| 758 |
-
00:11:10,
|
| 759 |
like LLM post-processing or cleanup or basically a system prompt
|
| 760 |
|
| 761 |
191
|
| 762 |
-
00:11:15,
|
| 763 |
that after you get back the raw text from Whisper,
|
| 764 |
|
| 765 |
192
|
| 766 |
-
00:11:19,
|
| 767 |
you run it through a model and say, okay, this
|
| 768 |
|
| 769 |
193
|
| 770 |
-
00:11:22,
|
| 771 |
is crappy text, like add sentence structure and fix it
|
| 772 |
|
| 773 |
194
|
| 774 |
-
00:11:27,
|
| 775 |
up. And now when I'm exploring the different tools that
|
| 776 |
|
| 777 |
195
|
| 778 |
-
00:11:32,
|
| 779 |
are out there that people have built, I see quite
|
| 780 |
|
| 781 |
196
|
| 782 |
-
00:11:35,
|
| 783 |
a number of projects have basically done the same thing,
|
| 784 |
|
| 785 |
197
|
| 786 |
-
00:11:40,
|
| 787 |
lest that be misconstrued. I'm not saying for a millisecond
|
| 788 |
|
| 789 |
198
|
| 790 |
-
00:11:43,
|
| 791 |
that I inspired them. I'm sure this has been a
|
| 792 |
|
| 793 |
199
|
| 794 |
-
00:11:46,
|
| 795 |
thing that's been integrated into tools for a while, but
|
| 796 |
|
| 797 |
200
|
| 798 |
-
00:11:50,
|
| 799 |
it's the kind of thing that when you start using
|
| 800 |
|
| 801 |
201
|
| 802 |
-
00:11:52,
|
| 803 |
these tools every day, the need for it is almost
|
| 804 |
|
| 805 |
202
|
| 806 |
-
00:11:
|
| 807 |
instantly apparent because text that doesn't have any punctuation or
|
| 808 |
|
| 809 |
203
|
| 810 |
-
00:11:59,
|
| 811 |
Paragraph spacing takes a long time to, you know, it
|
| 812 |
|
| 813 |
204
|
| 814 |
-
00:12:03,
|
| 815 |
takes so long to get it into a presentable email
|
| 816 |
|
| 817 |
205
|
| 818 |
-
00:12:05,
|
| 819 |
that again, it's, it's, it, it moves speech tech into
|
| 820 |
|
| 821 |
206
|
| 822 |
-
00:12:
|
| 823 |
that before that inflection point where you're like, no, it's
|
| 824 |
|
| 825 |
207
|
| 826 |
-
00:12:13,
|
| 827 |
just not worth it. It's like, it's, it'll just be
|
| 828 |
|
| 829 |
208
|
| 830 |
-
00:12:16,
|
| 831 |
quicker to type this. So it's a big, it's a
|
| 832 |
|
| 833 |
209
|
| 834 |
-
00:12:18,
|
| 835 |
little touch that actually is a big deal. Uh, so
|
| 836 |
|
| 837 |
210
|
| 838 |
-
00:12:21,
|
| 839 |
I was on Whisper and I've been using Whisper and
|
| 840 |
|
| 841 |
211
|
| 842 |
-
00:12:25,
|
| 843 |
I kind of, early on found a couple of tools.
|
| 844 |
|
| 845 |
212
|
| 846 |
-
00:12:28,
|
| 847 |
I couldn't find what I was looking for on Linux,
|
| 848 |
|
| 849 |
213
|
| 850 |
-
00:12:30,
|
| 851 |
which is basically just something that'll run in the background.
|
| 852 |
|
| 853 |
214
|
| 854 |
-
00:12:35,
|
| 855 |
It'll give it an API key and it will just
|
| 856 |
|
| 857 |
215
|
| 858 |
-
00:12:38,
|
| 859 |
like transcribe with like a little key to start and
|
| 860 |
|
| 861 |
216
|
| 862 |
-
00:12:
|
| 863 |
stop the dictation. And the issues were I discovered that
|
| 864 |
|
| 865 |
217
|
| 866 |
-
00:12:47,
|
| 867 |
like most people involved in creating these projects were very
|
| 868 |
|
| 869 |
218
|
| 870 |
-
00:12:51,
|
| 871 |
much focused on local models, running Whisper locally because you
|
| 872 |
|
| 873 |
219
|
| 874 |
-
00:12:55,
|
| 875 |
can. And I tried that a bunch of times and
|
| 876 |
|
| 877 |
220
|
| 878 |
-
00:12:58,
|
| 879 |
just never got results that were as good as the
|
| 880 |
|
| 881 |
221
|
| 882 |
-
00:13:00,
|
| 883 |
cloud. And when I began looking at the cost of
|
| 884 |
|
| 885 |
222
|
| 886 |
-
00:13:03,
|
| 887 |
the speech to text APIs and what I was spending,
|
| 888 |
|
| 889 |
223
|
| 890 |
-
00:13:06,
|
| 891 |
I just thought there is, it's actually, in my opinion,
|
| 892 |
|
| 893 |
224
|
| 894 |
-
00:13:09,
|
| 895 |
just one of the better deals in API spending and
|
| 896 |
|
| 897 |
225
|
| 898 |
-
00:13:12,
|
| 899 |
in cloud. Like it's just not that expensive for very,
|
| 900 |
|
| 901 |
226
|
| 902 |
-
00:13:15,
|
| 903 |
very good models that are much more, you know, you're
|
| 904 |
|
| 905 |
227
|
| 906 |
-
00:13:19,
|
| 907 |
gonna be able to run the full model. The latest
|
| 908 |
|
| 909 |
228
|
| 910 |
-
00:13:21,
|
| 911 |
model versus whatever you can run on your average GPU,
|
| 912 |
|
| 913 |
229
|
| 914 |
-
00:13:26,
|
| 915 |
unless you want to buy a crazy GPU. It doesn't
|
| 916 |
|
| 917 |
230
|
| 918 |
-
00:13:29,
|
| 919 |
really make sense to me. Now, privacy is another concern
|
| 920 |
|
| 921 |
231
|
| 922 |
-
00:13:32,
|
| 923 |
that I know is kind of like a very much
|
| 924 |
|
| 925 |
232
|
| 926 |
-
00:13:
|
| 927 |
a separate thing that people just don't want their voice
|
| 928 |
|
| 929 |
233
|
| 930 |
-
00:13:37,
|
| 931 |
data and their voice leaving their local environment, maybe for
|
| 932 |
|
| 933 |
234
|
| 934 |
-
00:13:40,
|
| 935 |
regulatory reasons as well. But I'm not in that. I
|
| 936 |
|
| 937 |
235
|
| 938 |
-
00:13:44,
|
| 939 |
neither really care about people listening to my grocery list
|
| 940 |
|
| 941 |
236
|
| 942 |
-
00:13:49,
|
| 943 |
consisting of reminding myself that I need to buy more
|
| 944 |
|
| 945 |
237
|
| 946 |
-
00:13:51,
|
| 947 |
beer, Cheetos, and hummus, which is kind of the three
|
| 948 |
|
| 949 |
238
|
| 950 |
-
00:13:55,
|
| 951 |
staples of my diet during periods of poorer nutrition. But
|
| 952 |
|
| 953 |
239
|
| 954 |
-
00:
|
| 955 |
the kind of stuff that I transcribe, it's just not,
|
| 956 |
|
| 957 |
240
|
| 958 |
-
00:14:
|
| 959 |
it's not a privacy thing I'm that sort of sensitive
|
| 960 |
|
| 961 |
241
|
| 962 |
-
00:14:07,
|
| 963 |
about and I don't do anything so sensitive or secure
|
| 964 |
|
| 965 |
242
|
| 966 |
-
00:14:13,
|
| 967 |
that requires air gapping. So I looked at the pricing
|
| 968 |
|
| 969 |
243
|
| 970 |
-
00:14:16,
|
| 971 |
and especially the kind of older model mini Some of
|
| 972 |
|
| 973 |
244
|
| 974 |
-
00:14:19,
|
| 975 |
them are very, very affordable. And I did a back
|
| 976 |
|
| 977 |
245
|
| 978 |
-
00:14:22,
|
| 979 |
of the, I did a calculation once with ChatGPT and
|
| 980 |
|
| 981 |
246
|
| 982 |
-
00:14:25,
|
| 983 |
I was like, okay, this is the API price for
|
| 984 |
|
| 985 |
247
|
| 986 |
-
00:14:29,
|
| 987 |
I can't remember whatever the model was. Let's say I
|
| 988 |
|
| 989 |
248
|
| 990 |
-
00:14:32,
|
| 991 |
just go at it like nonstop, which it rarely happens.
|
| 992 |
|
| 993 |
249
|
| 994 |
-
00:14:35,
|
| 995 |
Probably, I would say on average, I might dictate 30
|
| 996 |
|
| 997 |
250
|
| 998 |
-
00:14:38,
|
| 999 |
to 60 minutes per day if I was probably summing
|
| 1000 |
|
| 1001 |
251
|
| 1002 |
-
00:14:41,
|
| 1003 |
up the emails, documents, outlines, which
|
| 1004 |
|
| 1005 |
252
|
| 1006 |
-
00:14:47,
|
| 1007 |
is a lot, but it's still a fairly modest amount.
|
| 1008 |
|
| 1009 |
253
|
| 1010 |
-
00:14:50,
|
| 1011 |
And I was like, Some days I do go on
|
| 1012 |
|
| 1013 |
254
|
| 1014 |
-
00:14:52,
|
| 1015 |
like one or two days where I've been usually when
|
| 1016 |
|
| 1017 |
255
|
| 1018 |
-
00:14:54,
|
| 1019 |
I'm like kind of out of the house and just
|
| 1020 |
|
| 1021 |
256
|
| 1022 |
-
00:14:57,
|
| 1023 |
have something like I have nothing else to do. Like
|
| 1024 |
|
| 1025 |
257
|
| 1026 |
-
00:15:00,
|
| 1027 |
if I'm at a hospital, we have a newborn and
|
| 1028 |
|
| 1029 |
258
|
| 1030 |
-
00:15:04,
|
| 1031 |
you're waiting for like eight hours and hours for an
|
| 1032 |
|
| 1033 |
259
|
| 1034 |
-
00:15:07,
|
| 1035 |
appointment. And I would probably have listened to podcasts before
|
| 1036 |
|
| 1037 |
260
|
| 1038 |
-
00:15:11,
|
| 1039 |
becoming a speech fanatic. And I'm like, oh, wait, let
|
| 1040 |
|
| 1041 |
261
|
| 1042 |
-
00:15:14,
|
| 1043 |
me just get down. Let me just get these ideas
|
| 1044 |
|
| 1045 |
262
|
| 1046 |
-
00:15:16,
|
| 1047 |
out of my head. And that's when I'll go on
|
| 1048 |
|
| 1049 |
263
|
| 1050 |
-
00:15:19,
|
| 1051 |
my speech binges. But those are like once every few
|
| 1052 |
|
| 1053 |
264
|
| 1054 |
-
00:15:21,
|
| 1055 |
months, like not frequently. But I said, okay, let's just
|
| 1056 |
|
| 1057 |
265
|
| 1058 |
-
00:15:25,
|
| 1059 |
say if I'm gonna price out Cloud SCT, if I
|
| 1060 |
|
| 1061 |
266
|
| 1062 |
-
00:15:29,
|
| 1063 |
was like dedicated every second of every waking hour to
|
| 1064 |
|
| 1065 |
267
|
| 1066 |
-
00:15:34,
|
| 1067 |
transcribing for some odd reason, I mean, I'd have to
|
| 1068 |
|
| 1069 |
268
|
| 1070 |
-
00:15:
|
| 1071 |
like eat and use the toilet. Like, you know, there's
|
| 1072 |
|
| 1073 |
269
|
| 1074 |
-
00:15:40,
|
| 1075 |
only so many hours I'm awake for. So like, let's
|
| 1076 |
|
| 1077 |
270
|
| 1078 |
-
00:15:43,
|
| 1079 |
just say a maximum of like 40 hour, 45 minutes.
|
| 1080 |
|
| 1081 |
271
|
| 1082 |
-
00:15:47,
|
| 1083 |
In the hour. Then I said, all right, let's just
|
| 1084 |
|
| 1085 |
272
|
| 1086 |
-
00:15:49,
|
| 1087 |
say 50. Who knows? You're dictating on the toilet. We
|
| 1088 |
|
| 1089 |
273
|
| 1090 |
-
00:15:53,
|
| 1091 |
do it. So it could be. You could just do
|
| 1092 |
|
| 1093 |
274
|
| 1094 |
-
00:15:55,
|
| 1095 |
60. But whatever I did. And every day, like, you're
|
| 1096 |
|
| 1097 |
275
|
| 1098 |
-
00:15:59,
|
| 1099 |
going flat out seven days a week dictating non-stop I
|
| 1100 |
|
| 1101 |
276
|
| 1102 |
-
00:16:02,
|
| 1103 |
was like, what's my monthly API bill gonna be at
|
| 1104 |
|
| 1105 |
277
|
| 1106 |
-
00:16:
|
| 1107 |
this price? And it came out to, like, 70 or
|
| 1108 |
|
| 1109 |
278
|
| 1110 |
-
00:16:08,
|
| 1111 |
80 bucks. And I was like, well, that would be
|
| 1112 |
|
| 1113 |
279
|
| 1114 |
-
00:16:11,
|
| 1115 |
an extraordinary. Amount of dictation. And I would hope that
|
| 1116 |
|
| 1117 |
280
|
| 1118 |
-
00:16:16,
|
| 1119 |
there was some compelling reason more worth more than $70
|
| 1120 |
|
| 1121 |
281
|
| 1122 |
-
00:16:20,
|
| 1123 |
that I embarked upon that project. So given that that's
|
| 1124 |
|
| 1125 |
282
|
| 1126 |
-
00:16:23,
|
| 1127 |
kind of the max point for me, I said that's
|
| 1128 |
|
| 1129 |
283
|
| 1130 |
-
00:16:25,
|
| 1131 |
actually very, very affordable. Now you're gonna, if you want
|
| 1132 |
|
| 1133 |
284
|
| 1134 |
-
00:16:29,
|
| 1135 |
to spec out the costs and you want to do
|
| 1136 |
|
| 1137 |
285
|
| 1138 |
-
00:16:31,
|
| 1139 |
the post-processing that I really do feel is valuable, that's
|
| 1140 |
|
| 1141 |
286
|
| 1142 |
-
00:16:36,
|
| 1143 |
gonna cost some more as well, unless you're using Gemini,
|
| 1144 |
|
| 1145 |
287
|
| 1146 |
-
00:16:41,
|
| 1147 |
which needless to say is a random person sitting in
|
| 1148 |
|
| 1149 |
288
|
| 1150 |
-
00:16:44,
|
| 1151 |
Jerusalem. I have no affiliation, nor with Google, nor anthropic,
|
| 1152 |
|
| 1153 |
289
|
| 1154 |
-
00:16:49,
|
| 1155 |
nor Gemini, nor any major tech vendor for that matter.
|
| 1156 |
|
| 1157 |
290
|
| 1158 |
-
00:16:53,
|
| 1159 |
I like Gemini not so much as a everyday model.
|
| 1160 |
|
| 1161 |
291
|
| 1162 |
-
00:16:57,
|
| 1163 |
It's kind of underwhelmed in that respect, I would say.
|
| 1164 |
|
| 1165 |
292
|
| 1166 |
-
00:17:00,
|
| 1167 |
But for multimodal, I think it's got a lot to
|
| 1168 |
|
| 1169 |
293
|
| 1170 |
-
00:17:02,
|
| 1171 |
offer. And I think that the transcribing functionality whereby it
|
| 1172 |
|
| 1173 |
294
|
| 1174 |
-
00:17:06,
|
| 1175 |
can process audio with a system prompt and both give
|
| 1176 |
|
| 1177 |
295
|
| 1178 |
-
00:17:12,
|
| 1179 |
you transcription that's cleaned up that reduces two steps to
|
| 1180 |
|
| 1181 |
296
|
| 1182 |
-
00:17:15,
|
| 1183 |
one. And that for me is a very, very big
|
| 1184 |
|
| 1185 |
297
|
| 1186 |
-
00:17:18,
|
| 1187 |
deal. And I feel like even Google has haven't really
|
| 1188 |
|
| 1189 |
298
|
| 1190 |
-
00:17:21,
|
| 1191 |
sort of thought through how useful the that modality is
|
| 1192 |
|
| 1193 |
299
|
| 1194 |
-
00:17:26,
|
| 1195 |
and what kind of use cases you can achieve with
|
| 1196 |
|
| 1197 |
300
|
| 1198 |
-
00:17:29,
|
| 1199 |
it. Because I found in the course of this year,
|
| 1200 |
|
| 1201 |
301
|
| 1202 |
-
00:17:31,
|
| 1203 |
just an endless list of really kind of system prompt
|
| 1204 |
|
| 1205 |
302
|
| 1206 |
-
00:17:36,
|
| 1207 |
system prompt stuff that I can say, okay, I've used
|
| 1208 |
|
| 1209 |
303
|
| 1210 |
-
00:17:40,
|
| 1211 |
it to capture context data for AI, which is literally
|
| 1212 |
|
| 1213 |
304
|
| 1214 |
-
00:17:43,
|
| 1215 |
I might speak for if I wanted to have a
|
| 1216 |
|
| 1217 |
305
|
| 1218 |
-
00:17:45,
|
| 1219 |
good bank of context data about who knows my childhood
|
| 1220 |
|
| 1221 |
306
|
| 1222 |
-
00:17:50,
|
| 1223 |
more realistically, maybe my career goals, something that would just
|
| 1224 |
|
| 1225 |
307
|
| 1226 |
-
00:17:54,
|
| 1227 |
be like really boring to type out. So I'll just
|
| 1228 |
|
| 1229 |
308
|
| 1230 |
-
00:17:56,
|
| 1231 |
like sit in my car and record it for 10
|
| 1232 |
|
| 1233 |
309
|
| 1234 |
-
00:18:00,
|
| 1235 |
minutes. And that 10 minutes you get a lot of
|
| 1236 |
|
| 1237 |
310
|
| 1238 |
-
00:18:03,
|
| 1239 |
information in. Um, emails, which is short text, just
|
| 1240 |
|
| 1241 |
311
|
| 1242 |
-
00:18:09,
|
| 1243 |
there is a whole bunch and all these workflows kind
|
| 1244 |
|
| 1245 |
312
|
| 1246 |
-
00:18:12,
|
| 1247 |
of require a little bit of treatment afterwards and different
|
| 1248 |
|
| 1249 |
313
|
| 1250 |
-
00:18:14,
|
| 1251 |
treatment. My context pipeline is kind of like just extract
|
| 1252 |
|
| 1253 |
314
|
| 1254 |
-
00:18:18,
|
| 1255 |
the bare essentials. So you end up with me talking
|
| 1256 |
|
| 1257 |
315
|
| 1258 |
-
00:18:21,
|
| 1259 |
very loosely about sort of what I've done in my
|
| 1260 |
|
| 1261 |
316
|
| 1262 |
-
00:18:23,
|
| 1263 |
career, where I've worked, where I might like to work.
|
| 1264 |
|
| 1265 |
317
|
| 1266 |
-
00:18:25,
|
| 1267 |
And it goes, it condenses that down to very robotic
|
| 1268 |
|
| 1269 |
318
|
| 1270 |
-
00:18:29,
|
| 1271 |
language that is easy to chunk parse and maybe put
|
| 1272 |
|
| 1273 |
319
|
| 1274 |
-
00:18:32,
|
| 1275 |
into a vector database. Daniel has worked in technology. Daniel
|
| 1276 |
|
| 1277 |
320
|
| 1278 |
-
00:18:37,
|
| 1279 |
has been working in, you know, stuff like that. That's
|
| 1280 |
|
| 1281 |
321
|
| 1282 |
-
00:18:40,
|
| 1283 |
not how you would speak, but I figure it's probably
|
| 1284 |
|
| 1285 |
322
|
| 1286 |
-
00:18:43,
|
| 1287 |
easier to parse for, after all, robots. So we've almost
|
| 1288 |
|
| 1289 |
323
|
| 1290 |
-
00:18:47,
|
| 1291 |
got to 20 minutes and this is actually a success
|
| 1292 |
|
| 1293 |
324
|
| 1294 |
-
00:18:49,
|
| 1295 |
because I wasted 20 minutes of the evening speaking
|
| 1296 |
|
| 1297 |
325
|
| 1298 |
-
00:18:55,
|
| 1299 |
into a microphone and the levels were shot and it
|
| 1300 |
|
| 1301 |
326
|
| 1302 |
-
00:18:59,
|
| 1303 |
was clipping and I said, I can't really do an
|
| 1304 |
|
| 1305 |
327
|
| 1306 |
-
00:19:01,
|
| 1307 |
evaluation. I have to be fair. I have to give
|
| 1308 |
|
| 1309 |
328
|
| 1310 |
-
00:19:04,
|
| 1311 |
the models a chance to do their thing. What am
|
| 1312 |
|
| 1313 |
329
|
| 1314 |
-
00:19:
|
| 1315 |
I hoping to achieve in this? Okay, my fine tune
|
| 1316 |
|
| 1317 |
330
|
| 1318 |
-
00:19:10,
|
| 1319 |
was a dud as mentioned. DeepChrom ST, I'm really, really
|
| 1320 |
|
| 1321 |
331
|
| 1322 |
-
00:19:13,
|
| 1323 |
hopeful that this prototype will work and it's a build
|
| 1324 |
|
| 1325 |
332
|
| 1326 |
-
00:19:16,
|
| 1327 |
in public open source, so anyone is welcome to use
|
| 1328 |
|
| 1329 |
333
|
| 1330 |
-
00:19:19,
|
| 1331 |
it if I make anything good. But that was really
|
| 1332 |
|
| 1333 |
334
|
| 1334 |
-
00:19:22,
|
| 1335 |
exciting for me last night when after hours of trying
|
| 1336 |
|
| 1337 |
335
|
| 1338 |
-
00:19:26,
|
| 1339 |
my own prototype, seeing someone just made something that works
|
| 1340 |
|
| 1341 |
336
|
| 1342 |
-
00:19:30,
|
| 1343 |
like that, you know, you're not gonna have to build
|
| 1344 |
|
| 1345 |
337
|
| 1346 |
-
00:19:32,
|
| 1347 |
a custom conda environment and image. I have AMD GPU,
|
| 1348 |
|
| 1349 |
338
|
| 1350 |
-
00:19:37,
|
| 1351 |
which makes things much more complicated. I didn't find it.
|
| 1352 |
|
| 1353 |
339
|
| 1354 |
-
00:19:41,
|
| 1355 |
And I was about to give up and I said,
|
| 1356 |
|
| 1357 |
340
|
| 1358 |
-
00:19:43,
|
| 1359 |
all right, let me just give Deep Grams Linux thing
|
| 1360 |
|
| 1361 |
341
|
| 1362 |
-
00:19:
|
| 1363 |
a shot. And if this doesn't work, I'm just going
|
| 1364 |
|
| 1365 |
342
|
| 1366 |
-
00:19:49,
|
| 1367 |
to go back to trying to Vibe code something myself.
|
| 1368 |
|
| 1369 |
343
|
| 1370 |
-
00:19:51,
|
| 1371 |
And when I ran the script, I was using Claude
|
| 1372 |
|
| 1373 |
344
|
| 1374 |
-
00:19:55,
|
| 1375 |
code to do the installation process. It ran the script
|
| 1376 |
|
| 1377 |
345
|
| 1378 |
-
00:19:59,
|
| 1379 |
and oh my gosh, it works just like that. The
|
| 1380 |
|
| 1381 |
346
|
| 1382 |
-
00:20:02,
|
| 1383 |
tricky thing For all those who want to know all
|
| 1384 |
|
| 1385 |
347
|
| 1386 |
-
00:20:
|
| 1387 |
the nitty gritty details, was that I
|
| 1388 |
|
| 1389 |
348
|
| 1390 |
-
00:20:11,
|
| 1391 |
don't think it was actually struggling with transcription, but pasting
|
| 1392 |
|
| 1393 |
349
|
| 1394 |
-
00:20:14,
|
| 1395 |
Wayland makes life very hard. And I think there was
|
| 1396 |
|
| 1397 |
350
|
| 1398 |
-
00:20:18,
|
| 1399 |
something not running the right time. Anyway, Deepgram, I looked
|
| 1400 |
|
| 1401 |
351
|
| 1402 |
-
00:20:21,
|
| 1403 |
at how they actually handled that because it worked out
|
| 1404 |
|
| 1405 |
352
|
| 1406 |
-
00:20:23,
|
| 1407 |
of the box when other stuff didn't. And it was
|
| 1408 |
|
| 1409 |
353
|
| 1410 |
-
00:20:27,
|
| 1411 |
quite a clever little mechanism. And but more so than
|
| 1412 |
|
| 1413 |
354
|
| 1414 |
-
00:20:30,
|
| 1415 |
that, the accuracy was brilliant. Now, what am I doing
|
| 1416 |
|
| 1417 |
355
|
| 1418 |
-
00:20:33,
|
| 1419 |
here? This is going to be a 20 minute audio
|
| 1420 |
|
| 1421 |
356
|
| 1422 |
-
00:20:36,
|
| 1423 |
sample. And I think I've done one or two
|
| 1424 |
|
| 1425 |
357
|
| 1426 |
-
00:20:42,
|
| 1427 |
of these before, but I did it with short snappy
|
| 1428 |
|
| 1429 |
358
|
| 1430 |
-
00:20:46,
|
| 1431 |
voice notes. This is kind of long form. This actually
|
| 1432 |
|
| 1433 |
359
|
| 1434 |
-
00:20:50,
|
| 1435 |
might be a better approximation for what's useful to me
|
| 1436 |
|
| 1437 |
360
|
| 1438 |
-
00:20:52,
|
| 1439 |
than voice memos. Like, I need to buy three Bread,
|
| 1440 |
|
| 1441 |
361
|
| 1442 |
-
00:20:
|
| 1443 |
eaters of milk tomorrow and Peter bread, which is probably
|
| 1444 |
|
| 1445 |
362
|
| 1446 |
-
00:20:58,
|
| 1447 |
how like half my voice notes sound. Like if anyone
|
| 1448 |
|
| 1449 |
363
|
| 1450 |
-
00:21:01,
|
| 1451 |
were to, I don't know, like find my phone, they'd
|
| 1452 |
|
| 1453 |
364
|
| 1454 |
-
00:21:04,
|
| 1455 |
be like, this is the most boring person in the
|
| 1456 |
|
| 1457 |
365
|
| 1458 |
-
00:21:05,
|
| 1459 |
world. Although actually, there are some like kind of journaling
|
| 1460 |
|
| 1461 |
366
|
| 1462 |
-
00:21:09,
|
| 1463 |
thoughts as well, but it's a lot of content like
|
| 1464 |
|
| 1465 |
367
|
| 1466 |
-
00:21:11,
|
| 1467 |
that. And the probably for the evaluation, the most useful
|
| 1468 |
|
| 1469 |
368
|
| 1470 |
-
00:21:14,
|
| 1471 |
thing is slightly obscure tech, GitHub, NeocleNo, hugging
|
| 1472 |
|
| 1473 |
369
|
| 1474 |
-
00:21:20,
|
| 1475 |
face, Not so obscure that it's not going to have
|
| 1476 |
|
| 1477 |
370
|
| 1478 |
-
00:21:23,
|
| 1479 |
a chance of knowing it, but hopefully sufficiently well known
|
| 1480 |
|
| 1481 |
371
|
| 1482 |
-
00:21:26,
|
| 1483 |
that the model should get it. I tried to do
|
| 1484 |
|
| 1485 |
372
|
| 1486 |
-
00:21:28,
|
| 1487 |
a little bit of speaking really fast and speaking very
|
| 1488 |
|
| 1489 |
373
|
| 1490 |
-
00:21:31,
|
| 1491 |
slowly. I would say in general, I've spoken, delivered this
|
| 1492 |
|
| 1493 |
374
|
| 1494 |
-
00:21:35,
|
| 1495 |
at a faster pace than I usually would owing to
|
| 1496 |
|
| 1497 |
375
|
| 1498 |
-
00:21:
|
| 1499 |
strong coffee flowing through my bloodstream. And the thing that
|
| 1500 |
|
| 1501 |
376
|
| 1502 |
-
00:21:42,
|
| 1503 |
I'm not going to get in this benchmark is background
|
| 1504 |
|
| 1505 |
377
|
| 1506 |
-
00:21:44,
|
| 1507 |
noise, which in my first take that I had to
|
| 1508 |
|
| 1509 |
378
|
| 1510 |
-
00:21:46,
|
| 1511 |
get rid of, My wife came in with my son
|
| 1512 |
|
| 1513 |
379
|
| 1514 |
-
00:21:50,
|
| 1515 |
and for a goodnight kiss. And that actually would have
|
| 1516 |
|
| 1517 |
380
|
| 1518 |
-
00:21:52,
|
| 1519 |
been super helpful to get in because it was non
|
| 1520 |
|
| 1521 |
381
|
| 1522 |
-
00:21:56,
|
| 1523 |
diarized or if we had diarization, a female, I could
|
| 1524 |
|
| 1525 |
382
|
| 1526 |
-
00:22:00,
|
| 1527 |
say, I want the male voice and that wasn't intended
|
| 1528 |
|
| 1529 |
383
|
| 1530 |
-
00:22:02,
|
| 1531 |
for transcription. And we're not going to get background noise
|
| 1532 |
|
| 1533 |
384
|
| 1534 |
-
00:22:
|
| 1535 |
like people honking their horns, which is something I've done
|
| 1536 |
|
| 1537 |
385
|
| 1538 |
-
00:22:08,
|
| 1539 |
in my main data set where I am trying to
|
| 1540 |
|
| 1541 |
386
|
| 1542 |
-
00:22:11,
|
| 1543 |
go back to some of my voice notes. Annotate them
|
| 1544 |
|
| 1545 |
387
|
| 1546 |
-
00:22:14,
|
| 1547 |
and run a benchmark. But this is going to be
|
| 1548 |
|
| 1549 |
388
|
| 1550 |
-
00:22:16,
|
| 1551 |
just a pure quick test. And as someone,
|
| 1552 |
|
| 1553 |
389
|
| 1554 |
-
00:22:22,
|
| 1555 |
I'm working on a voice note idea. That's my sort
|
| 1556 |
|
| 1557 |
390
|
| 1558 |
-
00:22:24,
|
| 1559 |
of end motivation. Besides thinking it's an ask to the
|
| 1560 |
|
| 1561 |
391
|
| 1562 |
-
00:22:28,
|
| 1563 |
outstanding technology that's coming to viability. And really, I know
|
| 1564 |
|
| 1565 |
392
|
| 1566 |
-
00:22:32,
|
| 1567 |
this sounds cheesy, can actually have a very transformative effect.
|
| 1568 |
|
| 1569 |
393
|
| 1570 |
-
00:22:
|
| 1571 |
It's, you know, voice technology has been life changing for
|
| 1572 |
|
| 1573 |
394
|
| 1574 |
-
00:22:
|
| 1575 |
folks living with disabilities. And I think
|
| 1576 |
|
| 1577 |
395
|
| 1578 |
-
00:22:47,
|
| 1579 |
there's something really nice about the fact that it can
|
| 1580 |
|
| 1581 |
396
|
| 1582 |
-
00:22:49,
|
| 1583 |
also benefit, you know, folks who are able bodied and
|
| 1584 |
|
| 1585 |
397
|
| 1586 |
-
00:22:52,
|
| 1587 |
like we can all in different ways make this tech
|
| 1588 |
|
| 1589 |
398
|
| 1590 |
-
00:22:57,
|
| 1591 |
as useful as possible, regardless of the exact way that
|
| 1592 |
|
| 1593 |
399
|
| 1594 |
-
00:23:00,
|
| 1595 |
we're using it. And I think there's something very powerful
|
| 1596 |
|
| 1597 |
400
|
| 1598 |
-
00:23:03,
|
| 1599 |
in that and it can be very cool. I see
|
| 1600 |
|
| 1601 |
401
|
| 1602 |
-
00:23:06,
|
| 1603 |
huge potential. What excites me about Voicetech? A lot of
|
| 1604 |
|
| 1605 |
402
|
| 1606 |
-
00:23:10,
|
| 1607 |
things actually. Firstly, the fact that it's cheap and accurate,
|
| 1608 |
|
| 1609 |
403
|
| 1610 |
-
00:23:14,
|
| 1611 |
as I mentioned at the very start of this. And
|
| 1612 |
|
| 1613 |
404
|
| 1614 |
-
00:23:17,
|
| 1615 |
it's getting better and better with stuff like accent handling.
|
| 1616 |
|
| 1617 |
405
|
| 1618 |
-
00:23:20,
|
| 1619 |
I'm not sure my fine-tune will actually ever come to
|
| 1620 |
|
| 1621 |
406
|
| 1622 |
-
00:23:23,
|
| 1623 |
fruition in the sense that I'll use it day to
|
| 1624 |
|
| 1625 |
407
|
| 1626 |
-
00:23:25,
|
| 1627 |
day as I imagine. I get like superb flawless words
|
| 1628 |
|
| 1629 |
408
|
| 1630 |
-
00:23:
|
| 1631 |
error rates because I'm just kind of skeptical about Local
|
| 1632 |
|
| 1633 |
409
|
| 1634 |
-
00:23:33,
|
| 1635 |
speech to text, as I mentioned, and I think the
|
| 1636 |
|
| 1637 |
410
|
| 1638 |
-
00:23:37,
|
| 1639 |
pace of innovation and improvement in the models, the main
|
| 1640 |
|
| 1641 |
411
|
| 1642 |
-
00:23:40,
|
| 1643 |
reasons for fine tuning from what I've seen have been
|
| 1644 |
|
| 1645 |
412
|
| 1646 |
-
00:23:44,
|
| 1647 |
people who are something that really blows my mind about
|
| 1648 |
|
| 1649 |
413
|
| 1650 |
-
00:23:
|
| 1651 |
ASR is the idea that it's inherently a lingual or
|
| 1652 |
|
| 1653 |
414
|
| 1654 |
-
00:23:53,
|
| 1655 |
multilingual phonetic based. So as folks who use speak
|
| 1656 |
|
| 1657 |
415
|
| 1658 |
-
00:23:58,
|
| 1659 |
very obscure languages, that there might be a paucity of
|
| 1660 |
|
| 1661 |
416
|
| 1662 |
-
00:24:02,
|
| 1663 |
training data or almost none at all, and therefore the
|
| 1664 |
|
| 1665 |
417
|
| 1666 |
-
00:24:04,
|
| 1667 |
accuracy is significantly reduced. Or folks in very critical
|
| 1668 |
|
| 1669 |
418
|
| 1670 |
-
00:24:10,
|
| 1671 |
environments, I know this is used extensively in medical transcription
|
| 1672 |
|
| 1673 |
419
|
| 1674 |
-
00:24:14,
|
| 1675 |
and dispatcher work, the call centers who send out ambulances,
|
| 1676 |
|
| 1677 |
420
|
| 1678 |
-
00:24:19,
|
| 1679 |
et cetera, where accuracy is absolutely paramount. And in the
|
| 1680 |
|
| 1681 |
421
|
| 1682 |
-
00:24:23,
|
| 1683 |
case of doctors, radiologist, they might be using very specialized
|
| 1684 |
|
| 1685 |
422
|
| 1686 |
-
00:24:26,
|
| 1687 |
vocab all the time. So those are kind of the
|
| 1688 |
|
| 1689 |
423
|
| 1690 |
-
00:24:29,
|
| 1691 |
main two things that I'm not sure that really just
|
| 1692 |
|
| 1693 |
424
|
| 1694 |
-
00:24:31,
|
| 1695 |
for trying to make it better on a few random
|
| 1696 |
|
| 1697 |
425
|
| 1698 |
-
00:24:
|
| 1699 |
tech words with my slightly, I mean, I have an
|
| 1700 |
|
| 1701 |
426
|
| 1702 |
-
00:24:
|
| 1703 |
accent, but like not, you know, an accent that a
|
| 1704 |
|
| 1705 |
427
|
| 1706 |
-
00:24:41,
|
| 1707 |
few other million people have ish. I'm not sure that
|
| 1708 |
|
| 1709 |
428
|
| 1710 |
-
00:24:46,
|
| 1711 |
my little fine tune is gonna actually like the bump
|
| 1712 |
|
| 1713 |
429
|
| 1714 |
-
00:24:50,
|
| 1715 |
in word error reduction, if I ever actually figure out
|
| 1716 |
|
| 1717 |
430
|
| 1718 |
-
00:24:53,
|
| 1719 |
how to do it and get it up to the
|
| 1720 |
|
| 1721 |
431
|
| 1722 |
-
00:24:54,
|
| 1723 |
cloud. By the time we've done that, I suspect that
|
| 1724 |
|
| 1725 |
432
|
| 1726 |
-
00:24:58,
|
| 1727 |
the next generation of ASR will just be so good
|
| 1728 |
|
| 1729 |
433
|
| 1730 |
-
00:25:00,
|
| 1731 |
that it will kind of be, well, that would have
|
| 1732 |
|
| 1733 |
434
|
| 1734 |
-
00:25:
|
| 1735 |
been cool if it worked out, but I'll just use
|
| 1736 |
|
| 1737 |
435
|
| 1738 |
-
00:25:04,
|
| 1739 |
this instead. So that's going to be it for today's
|
| 1740 |
|
| 1741 |
436
|
| 1742 |
-
00:25:08,
|
| 1743 |
episode of voice training data. Single long shot evaluation.
|
| 1744 |
|
| 1745 |
437
|
| 1746 |
-
00:25:14,
|
| 1747 |
Who am I going to compare? Whisper is always good
|
| 1748 |
|
| 1749 |
438
|
| 1750 |
-
00:25:17,
|
| 1751 |
as a benchmark, but I'm more interested in seeing Whisper
|
| 1752 |
|
| 1753 |
439
|
| 1754 |
-
00:25:20,
|
| 1755 |
head to head with two things, really. One is Whisper
|
| 1756 |
|
| 1757 |
440
|
| 1758 |
-
00:25:24,
|
| 1759 |
variants. So you've got these projects like faster Distill Whisper,
|
| 1760 |
|
| 1761 |
441
|
| 1762 |
-
00:25:29,
|
| 1763 |
it's a bit confusing, there's a whole bunch of them.
|
| 1764 |
|
| 1765 |
442
|
| 1766 |
-
00:25:32,
|
| 1767 |
And the emerging ASRs, which are also a thing. My
|
| 1768 |
|
| 1769 |
443
|
| 1770 |
-
00:25:35,
|
| 1771 |
intention for this is I'm not sure I'm going to
|
| 1772 |
|
| 1773 |
444
|
| 1774 |
-
00:25:37,
|
| 1775 |
have the time in any point in the foreseeable future
|
| 1776 |
|
| 1777 |
445
|
| 1778 |
-
00:25:40,
|
| 1779 |
to go back through this whole episode and create a
|
| 1780 |
|
| 1781 |
446
|
| 1782 |
-
00:25:44,
|
| 1783 |
proper source truth, where I fix everything. Might do
|
| 1784 |
|
| 1785 |
447
|
| 1786 |
-
00:25:49,
|
| 1787 |
it if I can get one transcriptions that sufficiently close
|
| 1788 |
|
| 1789 |
448
|
| 1790 |
-
00:25:
|
| 1791 |
to perfection. But what I would actually love to do
|
| 1792 |
|
| 1793 |
449
|
| 1794 |
-
00:25:57,
|
| 1795 |
on Hugging Face, I think would be a great probably
|
| 1796 |
|
| 1797 |
450
|
| 1798 |
-
00:26:00,
|
| 1799 |
how I might visualize this is having the audio waveform
|
| 1800 |
|
| 1801 |
451
|
| 1802 |
-
00:26:03,
|
| 1803 |
play and then have the transcript for each model below
|
| 1804 |
|
| 1805 |
452
|
| 1806 |
-
00:26:08,
|
| 1807 |
it and maybe even a like, you know, to scale
|
| 1808 |
|
| 1809 |
453
|
| 1810 |
-
00:26:13,
|
| 1811 |
and maybe even a local one as well, like local
|
| 1812 |
|
| 1813 |
454
|
| 1814 |
-
00:26:15,
|
| 1815 |
whisper versus OpenAI API, et cetera. And, I
|
| 1816 |
|
| 1817 |
455
|
| 1818 |
-
00:26:21,
|
| 1819 |
can then actually listen back to segments or anyone who
|
| 1820 |
|
| 1821 |
456
|
| 1822 |
-
00:26:23,
|
| 1823 |
wants to can listen back to segments of this recording
|
| 1824 |
|
| 1825 |
457
|
| 1826 |
-
00:26:26,
|
| 1827 |
and see where a particular model struggled and others didn't,
|
| 1828 |
|
| 1829 |
458
|
| 1830 |
-
00:26:31,
|
| 1831 |
as well as the sort of headline finding of which
|
| 1832 |
|
| 1833 |
459
|
| 1834 |
-
00:26:33,
|
| 1835 |
had the best WER, but that would require the source
|
| 1836 |
|
| 1837 |
460
|
| 1838 |
-
00:26:36,
|
| 1839 |
of truth. Okay, that's it. I hope this was, I
|
| 1840 |
|
| 1841 |
461
|
| 1842 |
-
00:26:39,
|
| 1843 |
don't know, maybe useful for other folks interested in STT.
|
| 1844 |
|
| 1845 |
462
|
| 1846 |
-
00:26:42,
|
| 1847 |
You want to see that I always feel think I've
|
| 1848 |
|
| 1849 |
463
|
| 1850 |
-
00:26:45,
|
| 1851 |
just said as something I didn't intend to. STT, I
|
| 1852 |
|
| 1853 |
464
|
| 1854 |
-
00:26:48,
|
| 1855 |
said for those. Listen carefully, including hopefully the models themselves.
|
| 1856 |
|
| 1857 |
465
|
| 1858 |
-
00:26:53,
|
| 1859 |
This has been myself, Daniel Rosell. For more jumbled repositories
|
| 1860 |
|
| 1861 |
466
|
| 1862 |
-
00:26:57,
|
| 1863 |
about my roving interests in AI, but particularly agentic, MCP
|
| 1864 |
|
| 1865 |
467
|
| 1866 |
-
00:27:
|
| 1867 |
and Voicetech, you can find me on GitHub, huggingface.com,
|
| 1868 |
|
| 1869 |
468
|
| 1870 |
-
00:27:10,
|
| 1871 |
which is my personal website, as well as this podcast,
|
| 1872 |
|
| 1873 |
469
|
| 1874 |
-
00:27:13,
|
| 1875 |
whose name I sadly cannot remember. Until next time, thanks
|
| 1876 |
|
| 1877 |
470
|
| 1878 |
-
00:27:
|
| 1879 |
for listening.
|
| 1880 |
|
|
|
|
| 1 |
1
|
| 2 |
+
00:00:00,000 --> 00:00:05,600
|
| 3 |
Hello and welcome to a audio data set consisting
|
| 4 |
|
| 5 |
2
|
| 6 |
+
00:00:05,600 --> 00:00:10,560
|
| 7 |
of one single episode of a non-existent podcast. Or I
|
| 8 |
|
| 9 |
3
|
| 10 |
+
00:00:10,640 --> 00:00:13,280
|
| 11 |
may append this to a podcast that I set up
|
| 12 |
|
| 13 |
4
|
| 14 |
+
00:00:13,520 --> 00:00:19,120
|
| 15 |
recently regarding my with my thoughts on speech
|
| 16 |
|
| 17 |
5
|
| 18 |
+
00:00:19,200 --> 00:00:23,920
|
| 19 |
tech and AI in particular, more AI in generative AI,
|
| 20 |
|
| 21 |
6
|
| 22 |
+
00:00:24,160 --> 00:00:28,560
|
| 23 |
I would say. But in any event, the purpose of
|
| 24 |
|
| 25 |
7
|
| 26 |
+
00:00:28,640 --> 00:00:33,770
|
| 27 |
this Voice recording is actually to create a lengthy
|
| 28 |
|
| 29 |
8
|
| 30 |
+
00:00:33,850 --> 00:00:37,050
|
| 31 |
voice sample for a quick evaluation, a back of the
|
| 32 |
|
| 33 |
9
|
| 34 |
+
00:00:37,050 --> 00:00:40,570
|
| 35 |
envelope evaluation, as they might say, for different speech attack
|
| 36 |
|
| 37 |
10
|
| 38 |
+
00:00:40,810 --> 00:00:43,370
|
| 39 |
models. And I'm doing this because I thought I had
|
| 40 |
|
| 41 |
11
|
| 42 |
+
00:00:43,370 --> 00:00:46,730
|
| 43 |
made a great breakthrough in my journey with speech tech,
|
| 44 |
|
| 45 |
12
|
| 46 |
+
00:00:47,050 --> 00:00:50,650
|
| 47 |
and that was succeeding in the elusive task of fine-tuning
|
| 48 |
|
| 49 |
13
|
| 50 |
+
00:00:50,650 --> 00:00:54,730
|
| 51 |
Whisper. Whisper is, and I'm going to just talk, I'm
|
| 52 |
|
| 53 |
14
|
| 54 |
+
00:00:54,810 --> 00:00:58,170
|
| 55 |
trying to mix up, I'm going to try a few
|
| 56 |
|
| 57 |
15
|
| 58 |
+
00:00:58,330 --> 00:01:01,450
|
| 59 |
different styles of speaking. I might whisper something at some
|
| 60 |
|
| 61 |
16
|
| 62 |
+
00:01:01,530 --> 00:01:04,800
|
| 63 |
point. As well. And I'll go back to speaking loud
|
| 64 |
|
| 65 |
17
|
| 66 |
+
00:01:04,880 --> 00:01:08,000
|
| 67 |
in, in different parts. I'm going to sound really like
|
| 68 |
|
| 69 |
18
|
| 70 |
+
00:01:08,080 --> 00:01:11,040
|
| 71 |
a crazy person because I'm also going to try to
|
| 72 |
|
| 73 |
19
|
| 74 |
+
00:01:11,200 --> 00:01:16,160
|
| 75 |
speak at different pitches and cadences in order to really
|
| 76 |
|
| 77 |
20
|
| 78 |
+
00:01:16,480 --> 00:01:20,480
|
| 79 |
try to put a speech attacks model through its paces,
|
| 80 |
|
| 81 |
21
|
| 82 |
+
00:01:20,640 --> 00:01:22,960
|
| 83 |
which is trying to make sense of is this guy
|
| 84 |
|
| 85 |
22
|
| 86 |
+
00:01:23,120 --> 00:01:27,980
|
| 87 |
just rambling on incoherently in one long sentence or are
|
| 88 |
|
| 89 |
23
|
| 90 |
+
00:01:28,380 --> 00:01:34,140
|
| 91 |
these just actually a series of step, standalone,
|
| 92 |
|
| 93 |
24
|
| 94 |
+
00:01:34,300 --> 00:01:37,340
|
| 95 |
step alone, standalone sentences? And how is it gonna handle
|
| 96 |
|
| 97 |
25
|
| 98 |
+
00:01:37,420 --> 00:01:40,380
|
| 99 |
step alone? That's not a word. What happens when you
|
| 100 |
|
| 101 |
26
|
| 102 |
+
00:01:40,460 --> 00:01:42,940
|
| 103 |
use speech to text and you use a fake word?
|
| 104 |
|
| 105 |
27
|
| 106 |
+
00:01:43,100 --> 00:01:45,500
|
| 107 |
And then you're like, wait, that's not actually, that word
|
| 108 |
|
| 109 |
28
|
| 110 |
+
00:01:45,660 --> 00:01:50,140
|
| 111 |
doesn't exist. How does AI handle that? And these and
|
| 112 |
|
| 113 |
29
|
| 114 |
+
00:01:50,380 --> 00:01:54,220
|
| 115 |
more are all the questions that I'm seeking to answer
|
| 116 |
|
| 117 |
30
|
| 118 |
+
00:01:54,380 --> 00:01:57,420
|
| 119 |
in this training data. Now, why was it trying to
|
| 120 |
|
| 121 |
31
|
| 122 |
+
00:01:57,420 --> 00:02:00,210
|
| 123 |
fine tune Whisper? And what is Whisper? As I said,
|
| 124 |
|
| 125 |
32
|
| 126 |
+
00:02:00,290 --> 00:02:02,930
|
| 127 |
I'm going to try to record this at a couple
|
| 128 |
|
| 129 |
33
|
| 130 |
+
00:02:03,090 --> 00:02:07,410
|
| 131 |
of different levels of technicality for folks who are, you
|
| 132 |
|
| 133 |
34
|
| 134 |
+
00:02:07,410 --> 00:02:11,650
|
| 135 |
know, in the normal world and not totally stuck down
|
| 136 |
|
| 137 |
35
|
| 138 |
+
00:02:11,730 --> 00:02:13,730
|
| 139 |
the rabbit hole of AI, which I have to say
|
| 140 |
|
| 141 |
36
|
| 142 |
+
00:02:13,890 --> 00:02:18,050
|
| 143 |
is a really wonderful rabbit hole to be down. It's
|
| 144 |
|
| 145 |
37
|
| 146 |
+
00:02:18,130 --> 00:02:21,490
|
| 147 |
a really interesting area and speech and voice tech is
|
| 148 |
|
| 149 |
38
|
| 150 |
+
00:02:21,890 --> 00:02:24,530
|
| 151 |
the aspect of it that I find actually the most,
|
| 152 |
|
| 153 |
39
|
| 154 |
+
00:02:24,930 --> 00:02:27,330
|
| 155 |
I'm not sure I would say the most interesting because
|
| 156 |
|
| 157 |
40
|
| 158 |
+
00:02:27,570 --> 00:02:31,290
|
| 159 |
there's just so much that is fascinating in AI. But
|
| 160 |
|
| 161 |
41
|
| 162 |
+
00:02:31,450 --> 00:02:34,250
|
| 163 |
the most that I find the most personally transformative in
|
| 164 |
|
| 165 |
42
|
| 166 |
+
00:02:34,330 --> 00:02:38,890
|
| 167 |
terms of the impact that it's had on my daily
|
| 168 |
|
| 169 |
43
|
| 170 |
+
00:02:38,970 --> 00:02:41,450
|
| 171 |
work life and productivity and how I sort of work.
|
| 172 |
|
| 173 |
44
|
| 174 |
+
00:02:42,090 --> 00:02:47,210
|
| 175 |
And I'm persevering hard with the task of trying
|
| 176 |
|
| 177 |
45
|
| 178 |
+
00:02:47,210 --> 00:02:50,250
|
| 179 |
to get a good solution working for Linux, which if
|
| 180 |
|
| 181 |
46
|
| 182 |
+
00:02:50,250 --> 00:02:52,250
|
| 183 |
anyone actually does listen to this, not just for the
|
| 184 |
|
| 185 |
47
|
| 186 |
+
00:02:52,250 --> 00:02:56,410
|
| 187 |
training data and for the actual content, this is sparked
|
| 188 |
|
| 189 |
48
|
| 190 |
+
00:02:56,750 --> 00:02:59,950
|
| 191 |
I had, besides the fine tune not working, well, that
|
| 192 |
|
| 193 |
49
|
| 194 |
+
00:03:00,030 --> 00:03:05,230
|
| 195 |
was the failure. Um, I used Claude code because one
|
| 196 |
|
| 197 |
50
|
| 198 |
+
00:03:05,470 --> 00:03:09,950
|
| 199 |
thinks these days that there is nothing short of solving,
|
| 200 |
|
| 201 |
51
|
| 202 |
+
00:03:10,990 --> 00:03:15,390
|
| 203 |
you know, the, the reason of life or something, that
|
| 204 |
|
| 205 |
52
|
| 206 |
+
00:03:15,790 --> 00:03:18,990
|
| 207 |
Claude and agentic AI can't do, which is not really
|
| 208 |
|
| 209 |
53
|
| 210 |
+
00:03:19,070 --> 00:03:22,190
|
| 211 |
the case. Uh, it does seem that way sometimes, but
|
| 212 |
|
| 213 |
54
|
| 214 |
+
00:03:22,350 --> 00:03:24,190
|
| 215 |
it fails a lot as well. And this is one
|
| 216 |
|
| 217 |
55
|
| 218 |
+
00:03:24,190 --> 00:03:27,630
|
| 219 |
of those, instances where last week I put together an
|
| 220 |
|
| 221 |
56
|
| 222 |
+
00:03:27,710 --> 00:03:32,010
|
| 223 |
hour of voice training data, basically speaking, just random things
|
| 224 |
|
| 225 |
57
|
| 226 |
+
00:03:32,250 --> 00:03:37,050
|
| 227 |
for 3 minutes. And it was actually kind of tedious
|
| 228 |
|
| 229 |
58
|
| 230 |
+
00:03:37,130 --> 00:03:39,210
|
| 231 |
because the texts were really weird. Some of them were
|
| 232 |
|
| 233 |
59
|
| 234 |
+
00:03:39,450 --> 00:03:43,050
|
| 235 |
it was like it was AI generated. I tried before
|
| 236 |
|
| 237 |
60
|
| 238 |
+
00:03:43,210 --> 00:03:45,130
|
| 239 |
to read Sherlock Holmes for an hour and I just
|
| 240 |
|
| 241 |
61
|
| 242 |
+
00:03:45,130 --> 00:03:48,330
|
| 243 |
couldn't. I was so bored after 10 minutes that I
|
| 244 |
|
| 245 |
62
|
| 246 |
+
00:03:48,330 --> 00:03:50,730
|
| 247 |
was like, okay, no, I'm just going to have to
|
| 248 |
|
| 249 |
63
|
| 250 |
+
00:03:50,730 --> 00:03:55,290
|
| 251 |
find something else to read. So I used a created
|
| 252 |
|
| 253 |
64
|
| 254 |
+
00:03:55,690 --> 00:04:01,280
|
| 255 |
with AI studio vibe coded a synthetic text generator. Which
|
| 256 |
|
| 257 |
65
|
| 258 |
+
00:04:01,600 --> 00:04:03,840
|
| 259 |
actually I thought was probably a better way of doing
|
| 260 |
|
| 261 |
66
|
| 262 |
+
00:04:03,920 --> 00:04:07,440
|
| 263 |
it because it would give me more short samples with
|
| 264 |
|
| 265 |
67
|
| 266 |
+
00:04:07,680 --> 00:04:10,480
|
| 267 |
more varied content. So I was like, okay, give me
|
| 268 |
|
| 269 |
68
|
| 270 |
+
00:04:10,880 --> 00:04:13,760
|
| 271 |
a voice note, like I'm recording an email, give me
|
| 272 |
|
| 273 |
69
|
| 274 |
+
00:04:14,000 --> 00:04:17,680
|
| 275 |
a short story to read, give me prose to read.
|
| 276 |
|
| 277 |
70
|
| 278 |
+
00:04:18,000 --> 00:04:20,400
|
| 279 |
So I came up with all these different things and
|
| 280 |
|
| 281 |
71
|
| 282 |
+
00:04:20,560 --> 00:04:22,560
|
| 283 |
they added a little timer to it so I could
|
| 284 |
|
| 285 |
72
|
| 286 |
+
00:04:22,720 --> 00:04:26,400
|
| 287 |
see how close I was to one hour. And I
|
| 288 |
|
| 289 |
73
|
| 290 |
+
00:04:26,560 --> 00:04:29,600
|
| 291 |
spent like an hour one afternoon or probably two hours
|
| 292 |
|
| 293 |
74
|
| 294 |
+
00:04:29,760 --> 00:04:33,330
|
| 295 |
by the time you you do retakes. And whatever, because
|
| 296 |
|
| 297 |
75
|
| 298 |
+
00:04:33,410 --> 00:04:36,610
|
| 299 |
you want to, it gave me a source of truth,
|
| 300 |
|
| 301 |
76
|
| 302 |
+
00:04:37,330 --> 00:04:40,050
|
| 303 |
which I'm not sure if that's the scientific way to
|
| 304 |
|
| 305 |
77
|
| 306 |
+
00:04:40,210 --> 00:04:44,210
|
| 307 |
approach this topic of gathering, training data, but I thought
|
| 308 |
|
| 309 |
78
|
| 310 |
+
00:04:44,450 --> 00:04:48,130
|
| 311 |
made sense. Um, I have a lot of audio data
|
| 312 |
|
| 313 |
79
|
| 314 |
+
00:04:48,210 --> 00:04:50,770
|
| 315 |
from recording voice notes, which I've also kind of used,
|
| 316 |
|
| 317 |
80
|
| 318 |
+
00:04:52,050 --> 00:04:55,810
|
| 319 |
been experimenting with using for a different purpose, slightly different
|
| 320 |
|
| 321 |
81
|
| 322 |
+
00:04:56,210 --> 00:05:01,410
|
| 323 |
annotating task types. It's more a text classification experiment
|
| 324 |
|
| 325 |
82
|
| 326 |
+
00:05:01,730 --> 00:05:04,160
|
| 327 |
or, Well, it's more than that actually. I'm working on
|
| 328 |
|
| 329 |
83
|
| 330 |
+
00:05:04,160 --> 00:05:08,080
|
| 331 |
a voice app. So it's a prototype, I guess, is
|
| 332 |
|
| 333 |
84
|
| 334 |
+
00:05:08,240 --> 00:05:12,720
|
| 335 |
really more accurate. But you can do that and you
|
| 336 |
|
| 337 |
85
|
| 338 |
+
00:05:12,720 --> 00:05:15,200
|
| 339 |
can work backwards. You're like, you listen back to a
|
| 340 |
|
| 341 |
86
|
| 342 |
+
00:05:15,200 --> 00:05:18,720
|
| 343 |
voice note and you painfully go through one of those
|
| 344 |
|
| 345 |
87
|
| 346 |
+
00:05:19,040 --> 00:05:21,840
|
| 347 |
transcribing, you know, where you start and stop and scrub
|
| 348 |
|
| 349 |
88
|
| 350 |
+
00:05:22,000 --> 00:05:23,920
|
| 351 |
around it and you fix the errors, but it's really,
|
| 352 |
|
| 353 |
89
|
| 354 |
+
00:05:24,080 --> 00:05:26,720
|
| 355 |
really boring to do that. So I thought it would
|
| 356 |
|
| 357 |
90
|
| 358 |
+
00:05:26,800 --> 00:05:29,040
|
| 359 |
be less tedious in the long term if I just
|
| 360 |
|
| 361 |
91
|
| 362 |
+
00:05:30,059 --> 00:05:32,940
|
| 363 |
recorded the source of truth. So it gave me these
|
| 364 |
|
| 365 |
92
|
| 366 |
+
00:05:33,020 --> 00:05:36,140
|
| 367 |
three minute snippets. I recorded them. It saved an MP3
|
| 368 |
|
| 369 |
93
|
| 370 |
+
00:05:36,380 --> 00:05:39,500
|
| 371 |
and a TXT in the same folder, and I created
|
| 372 |
|
| 373 |
94
|
| 374 |
+
00:05:39,580 --> 00:05:42,860
|
| 375 |
an error with that data. So I was very hopeful,
|
| 376 |
|
| 377 |
95
|
| 378 |
+
00:05:43,260 --> 00:05:46,860
|
| 379 |
quietly, a little bit hopeful that I could actually fine
|
| 380 |
|
| 381 |
96
|
| 382 |
+
00:05:46,940 --> 00:05:50,460
|
| 383 |
tune Whisper. I want to fine tune Whisper because when
|
| 384 |
|
| 385 |
97
|
| 386 |
+
00:05:50,540 --> 00:05:54,780
|
| 387 |
I got into Voicetech last November, my wife was in
|
| 388 |
|
| 389 |
98
|
| 390 |
+
00:05:54,780 --> 00:05:58,140
|
| 391 |
the US and I was alone at home. And when
|
| 392 |
|
| 393 |
99
|
| 394 |
+
00:05:58,600 --> 00:06:01,400
|
| 395 |
crazy people like me do really wild things like use
|
| 396 |
|
| 397 |
100
|
| 398 |
+
00:06:01,640 --> 00:06:06,120
|
| 399 |
voice to tech technology. That was basically when I started
|
| 400 |
|
| 401 |
101
|
| 402 |
+
00:06:06,200 --> 00:06:08,760
|
| 403 |
doing it, I didn't feel like a crazy person speaking
|
| 404 |
|
| 405 |
102
|
| 406 |
+
00:06:08,840 --> 00:06:13,720
|
| 407 |
to myself. And my expectations weren't that high. I used
|
| 408 |
|
| 409 |
103
|
| 410 |
+
00:06:14,280 --> 00:06:17,640
|
| 411 |
speech tech now and again, tried it out. It was
|
| 412 |
|
| 413 |
104
|
| 414 |
+
00:06:17,640 --> 00:06:19,160
|
| 415 |
like, it'd be really cool if you could just, like,
|
| 416 |
|
| 417 |
105
|
| 418 |
+
00:06:19,320 --> 00:06:22,760
|
| 419 |
speak into your computer. And whatever I tried out that
|
| 420 |
|
| 421 |
106
|
| 422 |
+
00:06:23,000 --> 00:06:26,590
|
| 423 |
had Linux support was just. It was not good, basically.
|
| 424 |
|
| 425 |
107
|
| 426 |
+
00:06:27,230 --> 00:06:29,470
|
| 427 |
And this blew me away from the first go. I
|
| 428 |
|
| 429 |
108
|
| 430 |
+
00:06:29,470 --> 00:06:32,750
|
| 431 |
mean, it wasn't 100% accurate out of the box and
|
| 432 |
|
| 433 |
109
|
| 434 |
+
00:06:32,830 --> 00:06:34,910
|
| 435 |
it took work, but it was good enough that there
|
| 436 |
|
| 437 |
110
|
| 438 |
+
00:06:34,990 --> 00:06:37,470
|
| 439 |
was a solid foundation and it kind of passed that
|
| 440 |
|
| 441 |
111
|
| 442 |
+
00:06:38,670 --> 00:06:41,870
|
| 443 |
pivot point that it's actually worth doing this. You know,
|
| 444 |
|
| 445 |
112
|
| 446 |
+
00:06:42,030 --> 00:06:44,670
|
| 447 |
there's a point where it's so like the transcript is
|
| 448 |
|
| 449 |
113
|
| 450 |
+
00:06:44,910 --> 00:06:47,310
|
| 451 |
you don't have to get 100% accuracy for it to
|
| 452 |
|
| 453 |
114
|
| 454 |
+
00:06:47,310 --> 00:06:50,030
|
| 455 |
be worth your time for speech attacks to be a
|
| 456 |
|
| 457 |
115
|
| 458 |
+
00:06:50,030 --> 00:06:52,430
|
| 459 |
worthwhile addition to your productivity, but you do need to
|
| 460 |
|
| 461 |
116
|
| 462 |
+
00:06:52,430 --> 00:06:55,970
|
| 463 |
get above, let's say, I don't know, 85%. If it's
|
| 464 |
|
| 465 |
117
|
| 466 |
+
00:06:56,130 --> 00:06:59,810
|
| 467 |
60% or 50%, you inevitably say, screw it, I'll just
|
| 468 |
|
| 469 |
118
|
| 470 |
+
00:06:59,810 --> 00:07:02,770
|
| 471 |
type it because you end up missing errors in the
|
| 472 |
|
| 473 |
119
|
| 474 |
+
00:07:02,770 --> 00:07:05,490
|
| 475 |
transcript and it becomes actually worse. You end up in
|
| 476 |
|
| 477 |
120
|
| 478 |
+
00:07:05,490 --> 00:07:07,570
|
| 479 |
a worse position than you started with. That's been my
|
| 480 |
|
| 481 |
121
|
| 482 |
+
00:07:07,650 --> 00:07:11,970
|
| 483 |
experience. So I was like, oh, this is actually really,
|
| 484 |
|
| 485 |
122
|
| 486 |
+
00:07:12,130 --> 00:07:13,970
|
| 487 |
really good now. How did that happen? And the answer
|
| 488 |
|
| 489 |
123
|
| 490 |
+
00:07:14,130 --> 00:07:19,410
|
| 491 |
is ASR whisper being open source and the transformer
|
| 492 |
|
| 493 |
124
|
| 494 |
+
00:07:19,410 --> 00:07:23,170
|
| 495 |
architecture. If you want to go back to the to
|
| 496 |
|
| 497 |
125
|
| 498 |
+
00:07:23,250 --> 00:07:26,370
|
| 499 |
the underpinnings, which really blows my mind and it's on
|
| 500 |
|
| 501 |
126
|
| 502 |
+
00:07:26,450 --> 00:07:30,680
|
| 503 |
my list. To read through that paper. All you need
|
| 504 |
|
| 505 |
127
|
| 506 |
+
00:07:30,760 --> 00:07:35,960
|
| 507 |
is attention as attentively as can be done
|
| 508 |
|
| 509 |
128
|
| 510 |
+
00:07:36,200 --> 00:07:39,320
|
| 511 |
with my limited brain because it's super, super high level
|
| 512 |
|
| 513 |
129
|
| 514 |
+
00:07:39,640 --> 00:07:44,520
|
| 515 |
stuff, super advanced stuff, I mean. But that, I think
|
| 516 |
|
| 517 |
130
|
| 518 |
+
00:07:44,680 --> 00:07:49,320
|
| 519 |
of all the things that are fascinating about the sudden
|
| 520 |
|
| 521 |
131
|
| 522 |
+
00:07:49,640 --> 00:07:53,700
|
| 523 |
rise in AI and the dramatic capabilities. I find it
|
| 524 |
|
| 525 |
132
|
| 526 |
+
00:07:53,700 --> 00:07:56,100
|
| 527 |
fascinating that a few people are like, hang on, you've
|
| 528 |
|
| 529 |
133
|
| 530 |
+
00:07:56,100 --> 00:07:58,420
|
| 531 |
got this thing that can speak to you, like a
|
| 532 |
|
| 533 |
134
|
| 534 |
+
00:07:58,420 --> 00:08:02,980
|
| 535 |
chatbot, an LLM, and then you've got image generation. Okay,
|
| 536 |
|
| 537 |
135
|
| 538 |
+
00:08:03,060 --> 00:08:06,580
|
| 539 |
so firstly, those two things on the surface have nothing
|
| 540 |
|
| 541 |
136
|
| 542 |
+
00:08:06,900 --> 00:08:10,740
|
| 543 |
in common. So like, how are they, how did that
|
| 544 |
|
| 545 |
137
|
| 546 |
+
00:08:10,900 --> 00:08:12,500
|
| 547 |
just happen all at the same time? And then when
|
| 548 |
|
| 549 |
138
|
| 550 |
+
00:08:12,500 --> 00:08:16,580
|
| 551 |
you extend that further, you're like, Suno, right? You can
|
| 552 |
|
| 553 |
139
|
| 554 |
+
00:08:17,060 --> 00:08:20,030
|
| 555 |
sing a song and AI will come up with and
|
| 556 |
|
| 557 |
140
|
| 558 |
+
00:08:20,190 --> 00:08:23,390
|
| 559 |
instrumental. And then you've got Whisper and you're like, wait
|
| 560 |
|
| 561 |
141
|
| 562 |
+
00:08:23,390 --> 00:08:25,870
|
| 563 |
a second, how did all this stuff, like, if it's
|
| 564 |
|
| 565 |
142
|
| 566 |
+
00:08:25,870 --> 00:08:29,230
|
| 567 |
all AI, what's like, there has to be some commonality.
|
| 568 |
|
| 569 |
143
|
| 570 |
+
00:08:29,470 --> 00:08:34,590
|
| 571 |
Otherwise, these are totally different technologies on the surface of
|
| 572 |
|
| 573 |
144
|
| 574 |
+
00:08:34,590 --> 00:08:38,830
|
| 575 |
it. And the Transformer architecture is, as far as I
|
| 576 |
|
| 577 |
145
|
| 578 |
+
00:08:38,910 --> 00:08:41,550
|
| 579 |
know, the answer. And I can't even say, can't even
|
| 580 |
|
| 581 |
146
|
| 582 |
+
00:08:41,630 --> 00:08:46,270
|
| 583 |
pretend that I really understand what the Transformer architecture means.
|
| 584 |
|
| 585 |
147
|
| 586 |
+
00:08:46,770 --> 00:08:49,250
|
| 587 |
In depth, but I have scanned it and as I
|
| 588 |
|
| 589 |
148
|
| 590 |
+
00:08:49,410 --> 00:08:51,810
|
| 591 |
said, I want to print it and really kind of
|
| 592 |
|
| 593 |
149
|
| 594 |
+
00:08:52,210 --> 00:08:56,050
|
| 595 |
think over it at some point. And I'll probably feel
|
| 596 |
|
| 597 |
150
|
| 598 |
+
00:08:56,290 --> 00:08:59,250
|
| 599 |
bad about myself, I think, because weren't those guys in
|
| 600 |
|
| 601 |
151
|
| 602 |
+
00:08:59,330 --> 00:09:03,410
|
| 603 |
their 20s? Like, that's crazy. I think I asked ChatGPT
|
| 604 |
|
| 605 |
152
|
| 606 |
+
00:09:03,490 --> 00:09:07,890
|
| 607 |
once who wrote that paper and how old were they
|
| 608 |
|
| 609 |
153
|
| 610 |
+
00:09:08,050 --> 00:09:10,770
|
| 611 |
when it was published in Arciv? And I was expecting,
|
| 612 |
|
| 613 |
154
|
| 614 |
+
00:09:11,010 --> 00:09:13,890
|
| 615 |
like, I don't know, What do you imagine? I personally
|
| 616 |
|
| 617 |
155
|
| 618 |
+
00:09:13,970 --> 00:09:16,210
|
| 619 |
imagine kind of like, you know, you have these breakthroughs
|
| 620 |
|
| 621 |
156
|
| 622 |
+
00:09:16,370 --> 00:09:19,810
|
| 623 |
during COVID and things like that where like these kind
|
| 624 |
|
| 625 |
157
|
| 626 |
+
00:09:19,890 --> 00:09:22,770
|
| 627 |
of really obscure scientists are like in their 50s and
|
| 628 |
|
| 629 |
158
|
| 630 |
+
00:09:22,770 --> 00:09:27,170
|
| 631 |
they've just kind of been laboring in labs and wearily
|
| 632 |
|
| 633 |
159
|
| 634 |
+
00:09:27,170 --> 00:09:30,450
|
| 635 |
in writing and publishing in kind of obscure academic publications.
|
| 636 |
|
| 637 |
160
|
| 638 |
+
00:09:30,770 --> 00:09:33,170
|
| 639 |
And they finally like hit a big or win a
|
| 640 |
|
| 641 |
161
|
| 642 |
+
00:09:33,170 --> 00:09:37,250
|
| 643 |
Nobel Prize and then their household names. So that was
|
| 644 |
|
| 645 |
162
|
| 646 |
+
00:09:37,330 --> 00:09:38,990
|
| 647 |
kind of what I had in mind. That was the
|
| 648 |
|
| 649 |
163
|
| 650 |
+
00:09:38,990 --> 00:09:42,990
|
| 651 |
mental image I'd formed of the birth of Arcsight. Like
|
| 652 |
|
| 653 |
164
|
| 654 |
+
00:09:42,990 --> 00:09:46,270
|
| 655 |
I wasn't expecting 20-somethings in San Francisco, though. I thought
|
| 656 |
|
| 657 |
165
|
| 658 |
+
00:09:46,350 --> 00:09:48,830
|
| 659 |
that was both very, very funny, very cool, and actually
|
| 660 |
|
| 661 |
166
|
| 662 |
+
00:09:48,990 --> 00:09:52,510
|
| 663 |
kind of inspiring. It's nice to think that people who,
|
| 664 |
|
| 665 |
167
|
| 666 |
+
00:09:53,310 --> 00:09:56,110
|
| 667 |
you know, just you might put them in the kind
|
| 668 |
|
| 669 |
168
|
| 670 |
+
00:09:56,190 --> 00:09:59,550
|
| 671 |
of milieu or bubble or world that you are in
|
| 672 |
|
| 673 |
169
|
| 674 |
+
00:09:59,630 --> 00:10:03,230
|
| 675 |
are credibly in through, you know, the series of connections
|
| 676 |
|
| 677 |
170
|
| 678 |
+
00:10:03,310 --> 00:10:07,390
|
| 679 |
that are coming up with such literally world changing innovations.
|
| 680 |
|
| 681 |
171
|
| 682 |
+
00:10:07,870 --> 00:10:11,460
|
| 683 |
So that was, I thought, anyway. That's that was cool.
|
| 684 |
|
| 685 |
172
|
| 686 |
+
00:10:11,860 --> 00:10:14,500
|
| 687 |
Okay, voice training data. How are we doing? We're about
|
| 688 |
|
| 689 |
173
|
| 690 |
+
00:10:14,500 --> 00:10:18,580
|
| 691 |
10 minutes and I'm still talking about voice technology. So
|
| 692 |
|
| 693 |
174
|
| 694 |
+
00:10:18,660 --> 00:10:22,100
|
| 695 |
Whisper was brilliant and I was so excited that I
|
| 696 |
|
| 697 |
175
|
| 698 |
+
00:10:22,180 --> 00:10:25,380
|
| 699 |
was my first instinct was to like guess like, oh
|
| 700 |
|
| 701 |
176
|
| 702 |
+
00:10:25,380 --> 00:10:26,820
|
| 703 |
my gosh, I have to get like a really good
|
| 704 |
|
| 705 |
177
|
| 706 |
+
00:10:26,820 --> 00:10:30,580
|
| 707 |
microphone for this. So I didn't go on a spending
|
| 708 |
|
| 709 |
178
|
| 710 |
+
00:10:30,580 --> 00:10:32,740
|
| 711 |
spree because I said, I'm gonna have to just wait
|
| 712 |
|
| 713 |
179
|
| 714 |
+
00:10:32,740 --> 00:10:35,140
|
| 715 |
a month and see if I still use this. And
|
| 716 |
|
| 717 |
180
|
| 718 |
+
00:10:36,430 --> 00:10:38,910
|
| 719 |
It just kind of became, it's become really part of
|
| 720 |
|
| 721 |
181
|
| 722 |
+
00:10:39,070 --> 00:10:43,390
|
| 723 |
my daily routine. Like if I'm writing an email, I'll
|
| 724 |
|
| 725 |
182
|
| 726 |
+
00:10:43,470 --> 00:10:46,990
|
| 727 |
record a voice note. And then I've developed and it's
|
| 728 |
|
| 729 |
183
|
| 730 |
+
00:10:46,990 --> 00:10:49,070
|
| 731 |
nice to see that everyone is like developing the same
|
| 732 |
|
| 733 |
184
|
| 734 |
+
00:10:49,550 --> 00:10:51,950
|
| 735 |
things in parallel. Like that's my kind of a weird
|
| 736 |
|
| 737 |
185
|
| 738 |
+
00:10:51,950 --> 00:10:54,510
|
| 739 |
thing to say, but when I look, I kind of
|
| 740 |
|
| 741 |
186
|
| 742 |
+
00:10:54,670 --> 00:10:58,990
|
| 743 |
came, when I started working on this, these prototypes on
|
| 744 |
|
| 745 |
187
|
| 746 |
+
00:10:59,070 --> 00:11:01,470
|
| 747 |
GitHub, which is where I just kind of share very
|
| 748 |
|
| 749 |
188
|
| 750 |
+
00:11:01,710 --> 00:11:06,730
|
| 751 |
freely and loosely, ideas and first iterations on concepts.
|
| 752 |
|
| 753 |
189
|
| 754 |
+
00:11:08,490 --> 00:11:10,650
|
| 755 |
And for want of a better word, I called it
|
| 756 |
|
| 757 |
190
|
| 758 |
+
00:11:10,730 --> 00:11:15,450
|
| 759 |
like LLM post-processing or cleanup or basically a system prompt
|
| 760 |
|
| 761 |
191
|
| 762 |
+
00:11:15,530 --> 00:11:18,890
|
| 763 |
that after you get back the raw text from Whisper,
|
| 764 |
|
| 765 |
192
|
| 766 |
+
00:11:19,050 --> 00:11:22,010
|
| 767 |
you run it through a model and say, okay, this
|
| 768 |
|
| 769 |
193
|
| 770 |
+
00:11:22,090 --> 00:11:26,970
|
| 771 |
is crappy text, like add sentence structure and fix it
|
| 772 |
|
| 773 |
194
|
| 774 |
+
00:11:27,050 --> 00:11:32,250
|
| 775 |
up. And now when I'm exploring the different tools that
|
| 776 |
|
| 777 |
195
|
| 778 |
+
00:11:32,330 --> 00:11:35,180
|
| 779 |
are out there that people have built, I see quite
|
| 780 |
|
| 781 |
196
|
| 782 |
+
00:11:35,420 --> 00:11:39,100
|
| 783 |
a number of projects have basically done the same thing,
|
| 784 |
|
| 785 |
197
|
| 786 |
+
00:11:40,460 --> 00:11:43,180
|
| 787 |
lest that be misconstrued. I'm not saying for a millisecond
|
| 788 |
|
| 789 |
198
|
| 790 |
+
00:11:43,260 --> 00:11:46,220
|
| 791 |
that I inspired them. I'm sure this has been a
|
| 792 |
|
| 793 |
199
|
| 794 |
+
00:11:46,300 --> 00:11:49,500
|
| 795 |
thing that's been integrated into tools for a while, but
|
| 796 |
|
| 797 |
200
|
| 798 |
+
00:11:50,380 --> 00:11:52,300
|
| 799 |
it's the kind of thing that when you start using
|
| 800 |
|
| 801 |
201
|
| 802 |
+
00:11:52,300 --> 00:11:54,780
|
| 803 |
these tools every day, the need for it is almost
|
| 804 |
|
| 805 |
202
|
| 806 |
+
00:11:54,940 --> 00:11:59,420
|
| 807 |
instantly apparent because text that doesn't have any punctuation or
|
| 808 |
|
| 809 |
203
|
| 810 |
+
00:11:59,800 --> 00:12:03,000
|
| 811 |
Paragraph spacing takes a long time to, you know, it
|
| 812 |
|
| 813 |
204
|
| 814 |
+
00:12:03,160 --> 00:12:05,400
|
| 815 |
takes so long to get it into a presentable email
|
| 816 |
|
| 817 |
205
|
| 818 |
+
00:12:05,560 --> 00:12:09,720
|
| 819 |
that again, it's, it's, it, it moves speech tech into
|
| 820 |
|
| 821 |
206
|
| 822 |
+
00:12:09,960 --> 00:12:13,480
|
| 823 |
that before that inflection point where you're like, no, it's
|
| 824 |
|
| 825 |
207
|
| 826 |
+
00:12:13,480 --> 00:12:15,960
|
| 827 |
just not worth it. It's like, it's, it'll just be
|
| 828 |
|
| 829 |
208
|
| 830 |
+
00:12:16,040 --> 00:12:18,520
|
| 831 |
quicker to type this. So it's a big, it's a
|
| 832 |
|
| 833 |
209
|
| 834 |
+
00:12:18,520 --> 00:12:21,560
|
| 835 |
little touch that actually is a big deal. Uh, so
|
| 836 |
|
| 837 |
210
|
| 838 |
+
00:12:21,720 --> 00:12:25,640
|
| 839 |
I was on Whisper and I've been using Whisper and
|
| 840 |
|
| 841 |
211
|
| 842 |
+
00:12:25,640 --> 00:12:28,110
|
| 843 |
I kind of, early on found a couple of tools.
|
| 844 |
|
| 845 |
212
|
| 846 |
+
00:12:28,270 --> 00:12:30,510
|
| 847 |
I couldn't find what I was looking for on Linux,
|
| 848 |
|
| 849 |
213
|
| 850 |
+
00:12:30,670 --> 00:12:35,470
|
| 851 |
which is basically just something that'll run in the background.
|
| 852 |
|
| 853 |
214
|
| 854 |
+
00:12:35,710 --> 00:12:38,030
|
| 855 |
It'll give it an API key and it will just
|
| 856 |
|
| 857 |
215
|
| 858 |
+
00:12:38,190 --> 00:12:42,910
|
| 859 |
like transcribe with like a little key to start and
|
| 860 |
|
| 861 |
216
|
| 862 |
+
00:12:42,990 --> 00:12:47,310
|
| 863 |
stop the dictation. And the issues were I discovered that
|
| 864 |
|
| 865 |
217
|
| 866 |
+
00:12:47,470 --> 00:12:51,070
|
| 867 |
like most people involved in creating these projects were very
|
| 868 |
|
| 869 |
218
|
| 870 |
+
00:12:51,230 --> 00:12:55,070
|
| 871 |
much focused on local models, running Whisper locally because you
|
| 872 |
|
| 873 |
219
|
| 874 |
+
00:12:55,150 --> 00:12:57,940
|
| 875 |
can. And I tried that a bunch of times and
|
| 876 |
|
| 877 |
220
|
| 878 |
+
00:12:58,020 --> 00:13:00,340
|
| 879 |
just never got results that were as good as the
|
| 880 |
|
| 881 |
221
|
| 882 |
+
00:13:00,340 --> 00:13:03,140
|
| 883 |
cloud. And when I began looking at the cost of
|
| 884 |
|
| 885 |
222
|
| 886 |
+
00:13:03,220 --> 00:13:05,700
|
| 887 |
the speech to text APIs and what I was spending,
|
| 888 |
|
| 889 |
223
|
| 890 |
+
00:13:06,260 --> 00:13:09,460
|
| 891 |
I just thought there is, it's actually, in my opinion,
|
| 892 |
|
| 893 |
224
|
| 894 |
+
00:13:09,620 --> 00:13:12,820
|
| 895 |
just one of the better deals in API spending and
|
| 896 |
|
| 897 |
225
|
| 898 |
+
00:13:12,820 --> 00:13:15,140
|
| 899 |
in cloud. Like it's just not that expensive for very,
|
| 900 |
|
| 901 |
226
|
| 902 |
+
00:13:15,300 --> 00:13:19,300
|
| 903 |
very good models that are much more, you know, you're
|
| 904 |
|
| 905 |
227
|
| 906 |
+
00:13:19,300 --> 00:13:21,880
|
| 907 |
gonna be able to run the full model. The latest
|
| 908 |
|
| 909 |
228
|
| 910 |
+
00:13:21,880 --> 00:13:25,880
|
| 911 |
model versus whatever you can run on your average GPU,
|
| 912 |
|
| 913 |
229
|
| 914 |
+
00:13:26,120 --> 00:13:29,160
|
| 915 |
unless you want to buy a crazy GPU. It doesn't
|
| 916 |
|
| 917 |
230
|
| 918 |
+
00:13:29,160 --> 00:13:31,080
|
| 919 |
really make sense to me. Now, privacy is another concern
|
| 920 |
|
| 921 |
231
|
| 922 |
+
00:13:32,120 --> 00:13:33,880
|
| 923 |
that I know is kind of like a very much
|
| 924 |
|
| 925 |
232
|
| 926 |
+
00:13:33,960 --> 00:13:36,760
|
| 927 |
a separate thing that people just don't want their voice
|
| 928 |
|
| 929 |
233
|
| 930 |
+
00:13:37,000 --> 00:13:40,680
|
| 931 |
data and their voice leaving their local environment, maybe for
|
| 932 |
|
| 933 |
234
|
| 934 |
+
00:13:40,680 --> 00:13:44,200
|
| 935 |
regulatory reasons as well. But I'm not in that. I
|
| 936 |
|
| 937 |
235
|
| 938 |
+
00:13:44,600 --> 00:13:48,840
|
| 939 |
neither really care about people listening to my grocery list
|
| 940 |
|
| 941 |
236
|
| 942 |
+
00:13:49,080 --> 00:13:51,720
|
| 943 |
consisting of reminding myself that I need to buy more
|
| 944 |
|
| 945 |
237
|
| 946 |
+
00:13:51,800 --> 00:13:55,150
|
| 947 |
beer, Cheetos, and hummus, which is kind of the three
|
| 948 |
|
| 949 |
238
|
| 950 |
+
00:13:55,310 --> 00:13:59,870
|
| 951 |
staples of my diet during periods of poorer nutrition. But
|
| 952 |
|
| 953 |
239
|
| 954 |
+
00:13:59,950 --> 00:14:02,430
|
| 955 |
the kind of stuff that I transcribe, it's just not,
|
| 956 |
|
| 957 |
240
|
| 958 |
+
00:14:03,950 --> 00:14:07,710
|
| 959 |
it's not a privacy thing I'm that sort of sensitive
|
| 960 |
|
| 961 |
241
|
| 962 |
+
00:14:07,790 --> 00:14:13,150
|
| 963 |
about and I don't do anything so sensitive or secure
|
| 964 |
|
| 965 |
242
|
| 966 |
+
00:14:13,230 --> 00:14:16,430
|
| 967 |
that requires air gapping. So I looked at the pricing
|
| 968 |
|
| 969 |
243
|
| 970 |
+
00:14:16,510 --> 00:14:19,790
|
| 971 |
and especially the kind of older model mini Some of
|
| 972 |
|
| 973 |
244
|
| 974 |
+
00:14:19,870 --> 00:14:21,950
|
| 975 |
them are very, very affordable. And I did a back
|
| 976 |
|
| 977 |
245
|
| 978 |
+
00:14:22,190 --> 00:14:25,870
|
| 979 |
of the, I did a calculation once with ChatGPT and
|
| 980 |
|
| 981 |
246
|
| 982 |
+
00:14:25,870 --> 00:14:29,230
|
| 983 |
I was like, okay, this is the API price for
|
| 984 |
|
| 985 |
247
|
| 986 |
+
00:14:29,390 --> 00:14:32,270
|
| 987 |
I can't remember whatever the model was. Let's say I
|
| 988 |
|
| 989 |
248
|
| 990 |
+
00:14:32,350 --> 00:14:35,230
|
| 991 |
just go at it like nonstop, which it rarely happens.
|
| 992 |
|
| 993 |
249
|
| 994 |
+
00:14:35,470 --> 00:14:38,830
|
| 995 |
Probably, I would say on average, I might dictate 30
|
| 996 |
|
| 997 |
250
|
| 998 |
+
00:14:38,910 --> 00:14:41,790
|
| 999 |
to 60 minutes per day if I was probably summing
|
| 1000 |
|
| 1001 |
251
|
| 1002 |
+
00:14:41,790 --> 00:14:46,990
|
| 1003 |
up the emails, documents, outlines, which
|
| 1004 |
|
| 1005 |
252
|
| 1006 |
+
00:14:47,230 --> 00:14:49,870
|
| 1007 |
is a lot, but it's still a fairly modest amount.
|
| 1008 |
|
| 1009 |
253
|
| 1010 |
+
00:14:50,030 --> 00:14:51,940
|
| 1011 |
And I was like, Some days I do go on
|
| 1012 |
|
| 1013 |
254
|
| 1014 |
+
00:14:52,100 --> 00:14:54,900
|
| 1015 |
like one or two days where I've been usually when
|
| 1016 |
|
| 1017 |
255
|
| 1018 |
+
00:14:54,900 --> 00:14:56,980
|
| 1019 |
I'm like kind of out of the house and just
|
| 1020 |
|
| 1021 |
256
|
| 1022 |
+
00:14:57,220 --> 00:15:00,500
|
| 1023 |
have something like I have nothing else to do. Like
|
| 1024 |
|
| 1025 |
257
|
| 1026 |
+
00:15:00,660 --> 00:15:04,020
|
| 1027 |
if I'm at a hospital, we have a newborn and
|
| 1028 |
|
| 1029 |
258
|
| 1030 |
+
00:15:04,180 --> 00:15:07,300
|
| 1031 |
you're waiting for like eight hours and hours for an
|
| 1032 |
|
| 1033 |
259
|
| 1034 |
+
00:15:07,380 --> 00:15:10,820
|
| 1035 |
appointment. And I would probably have listened to podcasts before
|
| 1036 |
|
| 1037 |
260
|
| 1038 |
+
00:15:11,380 --> 00:15:14,180
|
| 1039 |
becoming a speech fanatic. And I'm like, oh, wait, let
|
| 1040 |
|
| 1041 |
261
|
| 1042 |
+
00:15:14,340 --> 00:15:16,259
|
| 1043 |
me just get down. Let me just get these ideas
|
| 1044 |
|
| 1045 |
262
|
| 1046 |
+
00:15:16,420 --> 00:15:18,540
|
| 1047 |
out of my head. And that's when I'll go on
|
| 1048 |
|
| 1049 |
263
|
| 1050 |
+
00:15:19,260 --> 00:15:21,820
|
| 1051 |
my speech binges. But those are like once every few
|
| 1052 |
|
| 1053 |
264
|
| 1054 |
+
00:15:21,820 --> 00:15:24,940
|
| 1055 |
months, like not frequently. But I said, okay, let's just
|
| 1056 |
|
| 1057 |
265
|
| 1058 |
+
00:15:25,020 --> 00:15:29,100
|
| 1059 |
say if I'm gonna price out Cloud SCT, if I
|
| 1060 |
|
| 1061 |
266
|
| 1062 |
+
00:15:29,180 --> 00:15:33,900
|
| 1063 |
was like dedicated every second of every waking hour to
|
| 1064 |
|
| 1065 |
267
|
| 1066 |
+
00:15:34,060 --> 00:15:37,900
|
| 1067 |
transcribing for some odd reason, I mean, I'd have to
|
| 1068 |
|
| 1069 |
268
|
| 1070 |
+
00:15:37,980 --> 00:15:40,780
|
| 1071 |
like eat and use the toilet. Like, you know, there's
|
| 1072 |
|
| 1073 |
269
|
| 1074 |
+
00:15:40,860 --> 00:15:43,420
|
| 1075 |
only so many hours I'm awake for. So like, let's
|
| 1076 |
|
| 1077 |
270
|
| 1078 |
+
00:15:43,420 --> 00:15:46,620
|
| 1079 |
just say a maximum of like 40 hour, 45 minutes.
|
| 1080 |
|
| 1081 |
271
|
| 1082 |
+
00:15:47,210 --> 00:15:49,290
|
| 1083 |
In the hour. Then I said, all right, let's just
|
| 1084 |
|
| 1085 |
272
|
| 1086 |
+
00:15:49,290 --> 00:15:52,890
|
| 1087 |
say 50. Who knows? You're dictating on the toilet. We
|
| 1088 |
|
| 1089 |
273
|
| 1090 |
+
00:15:53,050 --> 00:15:55,050
|
| 1091 |
do it. So it could be. You could just do
|
| 1092 |
|
| 1093 |
274
|
| 1094 |
+
00:15:55,130 --> 00:15:59,290
|
| 1095 |
60. But whatever I did. And every day, like, you're
|
| 1096 |
|
| 1097 |
275
|
| 1098 |
+
00:15:59,370 --> 00:16:02,730
|
| 1099 |
going flat out seven days a week dictating non-stop I
|
| 1100 |
|
| 1101 |
276
|
| 1102 |
+
00:16:02,730 --> 00:16:05,850
|
| 1103 |
was like, what's my monthly API bill gonna be at
|
| 1104 |
|
| 1105 |
277
|
| 1106 |
+
00:16:05,930 --> 00:16:08,570
|
| 1107 |
this price? And it came out to, like, 70 or
|
| 1108 |
|
| 1109 |
278
|
| 1110 |
+
00:16:08,570 --> 00:16:10,730
|
| 1111 |
80 bucks. And I was like, well, that would be
|
| 1112 |
|
| 1113 |
279
|
| 1114 |
+
00:16:11,130 --> 00:16:15,700
|
| 1115 |
an extraordinary. Amount of dictation. And I would hope that
|
| 1116 |
|
| 1117 |
280
|
| 1118 |
+
00:16:16,180 --> 00:16:19,940
|
| 1119 |
there was some compelling reason more worth more than $70
|
| 1120 |
|
| 1121 |
281
|
| 1122 |
+
00:16:20,260 --> 00:16:23,460
|
| 1123 |
that I embarked upon that project. So given that that's
|
| 1124 |
|
| 1125 |
282
|
| 1126 |
+
00:16:23,460 --> 00:16:25,460
|
| 1127 |
kind of the max point for me, I said that's
|
| 1128 |
|
| 1129 |
283
|
| 1130 |
+
00:16:25,540 --> 00:16:29,140
|
| 1131 |
actually very, very affordable. Now you're gonna, if you want
|
| 1132 |
|
| 1133 |
284
|
| 1134 |
+
00:16:29,220 --> 00:16:31,700
|
| 1135 |
to spec out the costs and you want to do
|
| 1136 |
|
| 1137 |
285
|
| 1138 |
+
00:16:31,700 --> 00:16:36,260
|
| 1139 |
the post-processing that I really do feel is valuable, that's
|
| 1140 |
|
| 1141 |
286
|
| 1142 |
+
00:16:36,340 --> 00:16:40,820
|
| 1143 |
gonna cost some more as well, unless you're using Gemini,
|
| 1144 |
|
| 1145 |
287
|
| 1146 |
+
00:16:41,300 --> 00:16:44,420
|
| 1147 |
which needless to say is a random person sitting in
|
| 1148 |
|
| 1149 |
288
|
| 1150 |
+
00:16:44,500 --> 00:16:49,060
|
| 1151 |
Jerusalem. I have no affiliation, nor with Google, nor anthropic,
|
| 1152 |
|
| 1153 |
289
|
| 1154 |
+
00:16:49,140 --> 00:16:52,020
|
| 1155 |
nor Gemini, nor any major tech vendor for that matter.
|
| 1156 |
|
| 1157 |
290
|
| 1158 |
+
00:16:53,620 --> 00:16:56,820
|
| 1159 |
I like Gemini not so much as a everyday model.
|
| 1160 |
|
| 1161 |
291
|
| 1162 |
+
00:16:57,300 --> 00:16:59,860
|
| 1163 |
It's kind of underwhelmed in that respect, I would say.
|
| 1164 |
|
| 1165 |
292
|
| 1166 |
+
00:17:00,260 --> 00:17:02,740
|
| 1167 |
But for multimodal, I think it's got a lot to
|
| 1168 |
|
| 1169 |
293
|
| 1170 |
+
00:17:02,740 --> 00:17:06,500
|
| 1171 |
offer. And I think that the transcribing functionality whereby it
|
| 1172 |
|
| 1173 |
294
|
| 1174 |
+
00:17:06,580 --> 00:17:11,900
|
| 1175 |
can process audio with a system prompt and both give
|
| 1176 |
|
| 1177 |
295
|
| 1178 |
+
00:17:12,060 --> 00:17:15,100
|
| 1179 |
you transcription that's cleaned up that reduces two steps to
|
| 1180 |
|
| 1181 |
296
|
| 1182 |
+
00:17:15,260 --> 00:17:18,220
|
| 1183 |
one. And that for me is a very, very big
|
| 1184 |
|
| 1185 |
297
|
| 1186 |
+
00:17:18,380 --> 00:17:21,580
|
| 1187 |
deal. And I feel like even Google has haven't really
|
| 1188 |
|
| 1189 |
298
|
| 1190 |
+
00:17:21,820 --> 00:17:26,700
|
| 1191 |
sort of thought through how useful the that modality is
|
| 1192 |
|
| 1193 |
299
|
| 1194 |
+
00:17:26,780 --> 00:17:29,260
|
| 1195 |
and what kind of use cases you can achieve with
|
| 1196 |
|
| 1197 |
300
|
| 1198 |
+
00:17:29,340 --> 00:17:31,260
|
| 1199 |
it. Because I found in the course of this year,
|
| 1200 |
|
| 1201 |
301
|
| 1202 |
+
00:17:31,900 --> 00:17:36,540
|
| 1203 |
just an endless list of really kind of system prompt
|
| 1204 |
|
| 1205 |
302
|
| 1206 |
+
00:17:36,860 --> 00:17:40,220
|
| 1207 |
system prompt stuff that I can say, okay, I've used
|
| 1208 |
|
| 1209 |
303
|
| 1210 |
+
00:17:40,220 --> 00:17:43,420
|
| 1211 |
it to capture context data for AI, which is literally
|
| 1212 |
|
| 1213 |
304
|
| 1214 |
+
00:17:43,500 --> 00:17:45,660
|
| 1215 |
I might speak for if I wanted to have a
|
| 1216 |
|
| 1217 |
305
|
| 1218 |
+
00:17:45,660 --> 00:17:49,740
|
| 1219 |
good bank of context data about who knows my childhood
|
| 1220 |
|
| 1221 |
306
|
| 1222 |
+
00:17:50,300 --> 00:17:54,220
|
| 1223 |
more realistically, maybe my career goals, something that would just
|
| 1224 |
|
| 1225 |
307
|
| 1226 |
+
00:17:54,300 --> 00:17:56,700
|
| 1227 |
be like really boring to type out. So I'll just
|
| 1228 |
|
| 1229 |
308
|
| 1230 |
+
00:17:56,780 --> 00:18:00,780
|
| 1231 |
like sit in my car and record it for 10
|
| 1232 |
|
| 1233 |
309
|
| 1234 |
+
00:18:00,860 --> 00:18:03,100
|
| 1235 |
minutes. And that 10 minutes you get a lot of
|
| 1236 |
|
| 1237 |
310
|
| 1238 |
+
00:18:03,260 --> 00:18:08,650
|
| 1239 |
information in. Um, emails, which is short text, just
|
| 1240 |
|
| 1241 |
311
|
| 1242 |
+
00:18:09,050 --> 00:18:12,250
|
| 1243 |
there is a whole bunch and all these workflows kind
|
| 1244 |
|
| 1245 |
312
|
| 1246 |
+
00:18:12,410 --> 00:18:14,410
|
| 1247 |
of require a little bit of treatment afterwards and different
|
| 1248 |
|
| 1249 |
313
|
| 1250 |
+
00:18:14,650 --> 00:18:18,090
|
| 1251 |
treatment. My context pipeline is kind of like just extract
|
| 1252 |
|
| 1253 |
314
|
| 1254 |
+
00:18:18,170 --> 00:18:20,970
|
| 1255 |
the bare essentials. So you end up with me talking
|
| 1256 |
|
| 1257 |
315
|
| 1258 |
+
00:18:21,050 --> 00:18:22,970
|
| 1259 |
very loosely about sort of what I've done in my
|
| 1260 |
|
| 1261 |
316
|
| 1262 |
+
00:18:23,050 --> 00:18:25,370
|
| 1263 |
career, where I've worked, where I might like to work.
|
| 1264 |
|
| 1265 |
317
|
| 1266 |
+
00:18:25,850 --> 00:18:28,970
|
| 1267 |
And it goes, it condenses that down to very robotic
|
| 1268 |
|
| 1269 |
318
|
| 1270 |
+
00:18:29,210 --> 00:18:32,490
|
| 1271 |
language that is easy to chunk parse and maybe put
|
| 1272 |
|
| 1273 |
319
|
| 1274 |
+
00:18:32,570 --> 00:18:36,550
|
| 1275 |
into a vector database. Daniel has worked in technology. Daniel
|
| 1276 |
|
| 1277 |
320
|
| 1278 |
+
00:18:37,430 --> 00:18:40,150
|
| 1279 |
has been working in, you know, stuff like that. That's
|
| 1280 |
|
| 1281 |
321
|
| 1282 |
+
00:18:40,150 --> 00:18:43,110
|
| 1283 |
not how you would speak, but I figure it's probably
|
| 1284 |
|
| 1285 |
322
|
| 1286 |
+
00:18:43,350 --> 00:18:47,350
|
| 1287 |
easier to parse for, after all, robots. So we've almost
|
| 1288 |
|
| 1289 |
323
|
| 1290 |
+
00:18:47,430 --> 00:18:49,270
|
| 1291 |
got to 20 minutes and this is actually a success
|
| 1292 |
|
| 1293 |
324
|
| 1294 |
+
00:18:49,750 --> 00:18:55,110
|
| 1295 |
because I wasted 20 minutes of the evening speaking
|
| 1296 |
|
| 1297 |
325
|
| 1298 |
+
00:18:55,190 --> 00:18:59,910
|
| 1299 |
into a microphone and the levels were shot and it
|
| 1300 |
|
| 1301 |
326
|
| 1302 |
+
00:18:59,910 --> 00:19:01,590
|
| 1303 |
was clipping and I said, I can't really do an
|
| 1304 |
|
| 1305 |
327
|
| 1306 |
+
00:19:01,670 --> 00:19:03,990
|
| 1307 |
evaluation. I have to be fair. I have to give
|
| 1308 |
|
| 1309 |
328
|
| 1310 |
+
00:19:04,560 --> 00:19:07,920
|
| 1311 |
the models a chance to do their thing. What am
|
| 1312 |
|
| 1313 |
329
|
| 1314 |
+
00:19:07,920 --> 00:19:10,320
|
| 1315 |
I hoping to achieve in this? Okay, my fine tune
|
| 1316 |
|
| 1317 |
330
|
| 1318 |
+
00:19:10,320 --> 00:19:13,360
|
| 1319 |
was a dud as mentioned. DeepChrom ST, I'm really, really
|
| 1320 |
|
| 1321 |
331
|
| 1322 |
+
00:19:13,440 --> 00:19:16,480
|
| 1323 |
hopeful that this prototype will work and it's a build
|
| 1324 |
|
| 1325 |
332
|
| 1326 |
+
00:19:16,720 --> 00:19:19,280
|
| 1327 |
in public open source, so anyone is welcome to use
|
| 1328 |
|
| 1329 |
333
|
| 1330 |
+
00:19:19,360 --> 00:19:22,320
|
| 1331 |
it if I make anything good. But that was really
|
| 1332 |
|
| 1333 |
334
|
| 1334 |
+
00:19:22,480 --> 00:19:26,480
|
| 1335 |
exciting for me last night when after hours of trying
|
| 1336 |
|
| 1337 |
335
|
| 1338 |
+
00:19:26,560 --> 00:19:30,480
|
| 1339 |
my own prototype, seeing someone just made something that works
|
| 1340 |
|
| 1341 |
336
|
| 1342 |
+
00:19:30,640 --> 00:19:32,400
|
| 1343 |
like that, you know, you're not gonna have to build
|
| 1344 |
|
| 1345 |
337
|
| 1346 |
+
00:19:32,640 --> 00:19:37,460
|
| 1347 |
a custom conda environment and image. I have AMD GPU,
|
| 1348 |
|
| 1349 |
338
|
| 1350 |
+
00:19:37,620 --> 00:19:40,980
|
| 1351 |
which makes things much more complicated. I didn't find it.
|
| 1352 |
|
| 1353 |
339
|
| 1354 |
+
00:19:41,540 --> 00:19:42,980
|
| 1355 |
And I was about to give up and I said,
|
| 1356 |
|
| 1357 |
340
|
| 1358 |
+
00:19:43,060 --> 00:19:45,460
|
| 1359 |
all right, let me just give Deep Grams Linux thing
|
| 1360 |
|
| 1361 |
341
|
| 1362 |
+
00:19:45,940 --> 00:19:49,220
|
| 1363 |
a shot. And if this doesn't work, I'm just going
|
| 1364 |
|
| 1365 |
342
|
| 1366 |
+
00:19:49,220 --> 00:19:50,980
|
| 1367 |
to go back to trying to Vibe code something myself.
|
| 1368 |
|
| 1369 |
343
|
| 1370 |
+
00:19:51,620 --> 00:19:55,460
|
| 1371 |
And when I ran the script, I was using Claude
|
| 1372 |
|
| 1373 |
344
|
| 1374 |
+
00:19:55,540 --> 00:19:59,060
|
| 1375 |
code to do the installation process. It ran the script
|
| 1376 |
|
| 1377 |
345
|
| 1378 |
+
00:19:59,140 --> 00:20:02,020
|
| 1379 |
and oh my gosh, it works just like that. The
|
| 1380 |
|
| 1381 |
346
|
| 1382 |
+
00:20:02,100 --> 00:20:05,980
|
| 1383 |
tricky thing For all those who want to know all
|
| 1384 |
|
| 1385 |
347
|
| 1386 |
+
00:20:05,980 --> 00:20:11,260
|
| 1387 |
the nitty gritty details, was that I
|
| 1388 |
|
| 1389 |
348
|
| 1390 |
+
00:20:11,260 --> 00:20:14,380
|
| 1391 |
don't think it was actually struggling with transcription, but pasting
|
| 1392 |
|
| 1393 |
349
|
| 1394 |
+
00:20:14,700 --> 00:20:18,140
|
| 1395 |
Wayland makes life very hard. And I think there was
|
| 1396 |
|
| 1397 |
350
|
| 1398 |
+
00:20:18,220 --> 00:20:21,500
|
| 1399 |
something not running the right time. Anyway, Deepgram, I looked
|
| 1400 |
|
| 1401 |
351
|
| 1402 |
+
00:20:21,500 --> 00:20:23,820
|
| 1403 |
at how they actually handled that because it worked out
|
| 1404 |
|
| 1405 |
352
|
| 1406 |
+
00:20:23,900 --> 00:20:26,540
|
| 1407 |
of the box when other stuff didn't. And it was
|
| 1408 |
|
| 1409 |
353
|
| 1410 |
+
00:20:27,100 --> 00:20:30,570
|
| 1411 |
quite a clever little mechanism. And but more so than
|
| 1412 |
|
| 1413 |
354
|
| 1414 |
+
00:20:30,650 --> 00:20:33,290
|
| 1415 |
that, the accuracy was brilliant. Now, what am I doing
|
| 1416 |
|
| 1417 |
355
|
| 1418 |
+
00:20:33,290 --> 00:20:35,930
|
| 1419 |
here? This is going to be a 20 minute audio
|
| 1420 |
|
| 1421 |
356
|
| 1422 |
+
00:20:36,490 --> 00:20:42,010
|
| 1423 |
sample. And I think I've done one or two
|
| 1424 |
|
| 1425 |
357
|
| 1426 |
+
00:20:42,170 --> 00:20:46,570
|
| 1427 |
of these before, but I did it with short snappy
|
| 1428 |
|
| 1429 |
358
|
| 1430 |
+
00:20:46,730 --> 00:20:49,770
|
| 1431 |
voice notes. This is kind of long form. This actually
|
| 1432 |
|
| 1433 |
359
|
| 1434 |
+
00:20:50,010 --> 00:20:52,170
|
| 1435 |
might be a better approximation for what's useful to me
|
| 1436 |
|
| 1437 |
360
|
| 1438 |
+
00:20:52,330 --> 00:20:55,890
|
| 1439 |
than voice memos. Like, I need to buy three Bread,
|
| 1440 |
|
| 1441 |
361
|
| 1442 |
+
00:20:55,970 --> 00:20:58,610
|
| 1443 |
eaters of milk tomorrow and Peter bread, which is probably
|
| 1444 |
|
| 1445 |
362
|
| 1446 |
+
00:20:58,770 --> 00:21:01,330
|
| 1447 |
how like half my voice notes sound. Like if anyone
|
| 1448 |
|
| 1449 |
363
|
| 1450 |
+
00:21:01,810 --> 00:21:04,050
|
| 1451 |
were to, I don't know, like find my phone, they'd
|
| 1452 |
|
| 1453 |
364
|
| 1454 |
+
00:21:04,050 --> 00:21:05,570
|
| 1455 |
be like, this is the most boring person in the
|
| 1456 |
|
| 1457 |
365
|
| 1458 |
+
00:21:05,570 --> 00:21:09,330
|
| 1459 |
world. Although actually, there are some like kind of journaling
|
| 1460 |
|
| 1461 |
366
|
| 1462 |
+
00:21:09,330 --> 00:21:11,490
|
| 1463 |
thoughts as well, but it's a lot of content like
|
| 1464 |
|
| 1465 |
367
|
| 1466 |
+
00:21:11,490 --> 00:21:14,450
|
| 1467 |
that. And the probably for the evaluation, the most useful
|
| 1468 |
|
| 1469 |
368
|
| 1470 |
+
00:21:14,530 --> 00:21:20,210
|
| 1471 |
thing is slightly obscure tech, GitHub, NeocleNo, hugging
|
| 1472 |
|
| 1473 |
369
|
| 1474 |
+
00:21:20,290 --> 00:21:22,940
|
| 1475 |
face, Not so obscure that it's not going to have
|
| 1476 |
|
| 1477 |
370
|
| 1478 |
+
00:21:23,020 --> 00:21:26,460
|
| 1479 |
a chance of knowing it, but hopefully sufficiently well known
|
| 1480 |
|
| 1481 |
371
|
| 1482 |
+
00:21:26,460 --> 00:21:28,700
|
| 1483 |
that the model should get it. I tried to do
|
| 1484 |
|
| 1485 |
372
|
| 1486 |
+
00:21:28,780 --> 00:21:31,580
|
| 1487 |
a little bit of speaking really fast and speaking very
|
| 1488 |
|
| 1489 |
373
|
| 1490 |
+
00:21:31,740 --> 00:21:35,020
|
| 1491 |
slowly. I would say in general, I've spoken, delivered this
|
| 1492 |
|
| 1493 |
374
|
| 1494 |
+
00:21:35,180 --> 00:21:37,500
|
| 1495 |
at a faster pace than I usually would owing to
|
| 1496 |
|
| 1497 |
375
|
| 1498 |
+
00:21:37,980 --> 00:21:42,460
|
| 1499 |
strong coffee flowing through my bloodstream. And the thing that
|
| 1500 |
|
| 1501 |
376
|
| 1502 |
+
00:21:42,460 --> 00:21:44,700
|
| 1503 |
I'm not going to get in this benchmark is background
|
| 1504 |
|
| 1505 |
377
|
| 1506 |
+
00:21:44,780 --> 00:21:46,460
|
| 1507 |
noise, which in my first take that I had to
|
| 1508 |
|
| 1509 |
378
|
| 1510 |
+
00:21:46,460 --> 00:21:49,710
|
| 1511 |
get rid of, My wife came in with my son
|
| 1512 |
|
| 1513 |
379
|
| 1514 |
+
00:21:50,030 --> 00:21:52,350
|
| 1515 |
and for a goodnight kiss. And that actually would have
|
| 1516 |
|
| 1517 |
380
|
| 1518 |
+
00:21:52,350 --> 00:21:56,510
|
| 1519 |
been super helpful to get in because it was non
|
| 1520 |
|
| 1521 |
381
|
| 1522 |
+
00:21:56,590 --> 00:22:00,190
|
| 1523 |
diarized or if we had diarization, a female, I could
|
| 1524 |
|
| 1525 |
382
|
| 1526 |
+
00:22:00,190 --> 00:22:02,430
|
| 1527 |
say, I want the male voice and that wasn't intended
|
| 1528 |
|
| 1529 |
383
|
| 1530 |
+
00:22:02,430 --> 00:22:05,870
|
| 1531 |
for transcription. And we're not going to get background noise
|
| 1532 |
|
| 1533 |
384
|
| 1534 |
+
00:22:05,950 --> 00:22:08,270
|
| 1535 |
like people honking their horns, which is something I've done
|
| 1536 |
|
| 1537 |
385
|
| 1538 |
+
00:22:08,430 --> 00:22:11,150
|
| 1539 |
in my main data set where I am trying to
|
| 1540 |
|
| 1541 |
386
|
| 1542 |
+
00:22:11,390 --> 00:22:14,340
|
| 1543 |
go back to some of my voice notes. Annotate them
|
| 1544 |
|
| 1545 |
387
|
| 1546 |
+
00:22:14,580 --> 00:22:16,420
|
| 1547 |
and run a benchmark. But this is going to be
|
| 1548 |
|
| 1549 |
388
|
| 1550 |
+
00:22:16,420 --> 00:22:21,700
|
| 1551 |
just a pure quick test. And as someone,
|
| 1552 |
|
| 1553 |
389
|
| 1554 |
+
00:22:22,260 --> 00:22:24,660
|
| 1555 |
I'm working on a voice note idea. That's my sort
|
| 1556 |
|
| 1557 |
390
|
| 1558 |
+
00:22:24,660 --> 00:22:28,660
|
| 1559 |
of end motivation. Besides thinking it's an ask to the
|
| 1560 |
|
| 1561 |
391
|
| 1562 |
+
00:22:28,660 --> 00:22:32,340
|
| 1563 |
outstanding technology that's coming to viability. And really, I know
|
| 1564 |
|
| 1565 |
392
|
| 1566 |
+
00:22:32,420 --> 00:22:35,940
|
| 1567 |
this sounds cheesy, can actually have a very transformative effect.
|
| 1568 |
|
| 1569 |
393
|
| 1570 |
+
00:22:36,980 --> 00:22:41,130
|
| 1571 |
It's, you know, voice technology has been life changing for
|
| 1572 |
|
| 1573 |
394
|
| 1574 |
+
00:22:41,930 --> 00:22:46,970
|
| 1575 |
folks living with disabilities. And I think
|
| 1576 |
|
| 1577 |
395
|
| 1578 |
+
00:22:47,130 --> 00:22:48,970
|
| 1579 |
there's something really nice about the fact that it can
|
| 1580 |
|
| 1581 |
396
|
| 1582 |
+
00:22:49,130 --> 00:22:52,490
|
| 1583 |
also benefit, you know, folks who are able bodied and
|
| 1584 |
|
| 1585 |
397
|
| 1586 |
+
00:22:52,650 --> 00:22:57,690
|
| 1587 |
like we can all in different ways make this tech
|
| 1588 |
|
| 1589 |
398
|
| 1590 |
+
00:22:57,770 --> 00:23:00,410
|
| 1591 |
as useful as possible, regardless of the exact way that
|
| 1592 |
|
| 1593 |
399
|
| 1594 |
+
00:23:00,410 --> 00:23:03,770
|
| 1595 |
we're using it. And I think there's something very powerful
|
| 1596 |
|
| 1597 |
400
|
| 1598 |
+
00:23:03,850 --> 00:23:06,440
|
| 1599 |
in that and it can be very cool. I see
|
| 1600 |
|
| 1601 |
401
|
| 1602 |
+
00:23:06,600 --> 00:23:10,200
|
| 1603 |
huge potential. What excites me about Voicetech? A lot of
|
| 1604 |
|
| 1605 |
402
|
| 1606 |
+
00:23:10,280 --> 00:23:14,360
|
| 1607 |
things actually. Firstly, the fact that it's cheap and accurate,
|
| 1608 |
|
| 1609 |
403
|
| 1610 |
+
00:23:14,440 --> 00:23:17,080
|
| 1611 |
as I mentioned at the very start of this. And
|
| 1612 |
|
| 1613 |
404
|
| 1614 |
+
00:23:17,240 --> 00:23:19,880
|
| 1615 |
it's getting better and better with stuff like accent handling.
|
| 1616 |
|
| 1617 |
405
|
| 1618 |
+
00:23:20,680 --> 00:23:23,400
|
| 1619 |
I'm not sure my fine-tune will actually ever come to
|
| 1620 |
|
| 1621 |
406
|
| 1622 |
+
00:23:23,480 --> 00:23:25,320
|
| 1623 |
fruition in the sense that I'll use it day to
|
| 1624 |
|
| 1625 |
407
|
| 1626 |
+
00:23:25,400 --> 00:23:28,840
|
| 1627 |
day as I imagine. I get like superb flawless words
|
| 1628 |
|
| 1629 |
408
|
| 1630 |
+
00:23:28,920 --> 00:23:33,340
|
| 1631 |
error rates because I'm just kind of skeptical about Local
|
| 1632 |
|
| 1633 |
409
|
| 1634 |
+
00:23:33,500 --> 00:23:37,100
|
| 1635 |
speech to text, as I mentioned, and I think the
|
| 1636 |
|
| 1637 |
410
|
| 1638 |
+
00:23:37,180 --> 00:23:40,700
|
| 1639 |
pace of innovation and improvement in the models, the main
|
| 1640 |
|
| 1641 |
411
|
| 1642 |
+
00:23:40,860 --> 00:23:44,620
|
| 1643 |
reasons for fine tuning from what I've seen have been
|
| 1644 |
|
| 1645 |
412
|
| 1646 |
+
00:23:44,780 --> 00:23:47,420
|
| 1647 |
people who are something that really blows my mind about
|
| 1648 |
|
| 1649 |
413
|
| 1650 |
+
00:23:47,980 --> 00:23:53,100
|
| 1651 |
ASR is the idea that it's inherently a lingual or
|
| 1652 |
|
| 1653 |
414
|
| 1654 |
+
00:23:53,260 --> 00:23:58,570
|
| 1655 |
multilingual phonetic based. So as folks who use speak
|
| 1656 |
|
| 1657 |
415
|
| 1658 |
+
00:23:58,890 --> 00:24:02,250
|
| 1659 |
very obscure languages, that there might be a paucity of
|
| 1660 |
|
| 1661 |
416
|
| 1662 |
+
00:24:02,250 --> 00:24:04,890
|
| 1663 |
training data or almost none at all, and therefore the
|
| 1664 |
|
| 1665 |
417
|
| 1666 |
+
00:24:04,890 --> 00:24:10,090
|
| 1667 |
accuracy is significantly reduced. Or folks in very critical
|
| 1668 |
|
| 1669 |
418
|
| 1670 |
+
00:24:10,330 --> 00:24:14,250
|
| 1671 |
environments, I know this is used extensively in medical transcription
|
| 1672 |
|
| 1673 |
419
|
| 1674 |
+
00:24:14,330 --> 00:24:19,130
|
| 1675 |
and dispatcher work, the call centers who send out ambulances,
|
| 1676 |
|
| 1677 |
420
|
| 1678 |
+
00:24:19,210 --> 00:24:23,130
|
| 1679 |
et cetera, where accuracy is absolutely paramount. And in the
|
| 1680 |
|
| 1681 |
421
|
| 1682 |
+
00:24:23,130 --> 00:24:26,860
|
| 1683 |
case of doctors, radiologist, they might be using very specialized
|
| 1684 |
|
| 1685 |
422
|
| 1686 |
+
00:24:26,860 --> 00:24:29,420
|
| 1687 |
vocab all the time. So those are kind of the
|
| 1688 |
|
| 1689 |
423
|
| 1690 |
+
00:24:29,500 --> 00:24:31,420
|
| 1691 |
main two things that I'm not sure that really just
|
| 1692 |
|
| 1693 |
424
|
| 1694 |
+
00:24:31,500 --> 00:24:34,940
|
| 1695 |
for trying to make it better on a few random
|
| 1696 |
|
| 1697 |
425
|
| 1698 |
+
00:24:34,940 --> 00:24:37,900
|
| 1699 |
tech words with my slightly, I mean, I have an
|
| 1700 |
|
| 1701 |
426
|
| 1702 |
+
00:24:37,980 --> 00:24:41,020
|
| 1703 |
accent, but like not, you know, an accent that a
|
| 1704 |
|
| 1705 |
427
|
| 1706 |
+
00:24:41,100 --> 00:24:45,900
|
| 1707 |
few other million people have ish. I'm not sure that
|
| 1708 |
|
| 1709 |
428
|
| 1710 |
+
00:24:46,380 --> 00:24:50,300
|
| 1711 |
my little fine tune is gonna actually like the bump
|
| 1712 |
|
| 1713 |
429
|
| 1714 |
+
00:24:50,460 --> 00:24:53,500
|
| 1715 |
in word error reduction, if I ever actually figure out
|
| 1716 |
|
| 1717 |
430
|
| 1718 |
+
00:24:53,500 --> 00:24:54,620
|
| 1719 |
how to do it and get it up to the
|
| 1720 |
|
| 1721 |
431
|
| 1722 |
+
00:24:54,700 --> 00:24:57,870
|
| 1723 |
cloud. By the time we've done that, I suspect that
|
| 1724 |
|
| 1725 |
432
|
| 1726 |
+
00:24:58,190 --> 00:25:00,430
|
| 1727 |
the next generation of ASR will just be so good
|
| 1728 |
|
| 1729 |
433
|
| 1730 |
+
00:25:00,510 --> 00:25:02,990
|
| 1731 |
that it will kind of be, well, that would have
|
| 1732 |
|
| 1733 |
434
|
| 1734 |
+
00:25:02,990 --> 00:25:04,670
|
| 1735 |
been cool if it worked out, but I'll just use
|
| 1736 |
|
| 1737 |
435
|
| 1738 |
+
00:25:04,750 --> 00:25:08,510
|
| 1739 |
this instead. So that's going to be it for today's
|
| 1740 |
|
| 1741 |
436
|
| 1742 |
+
00:25:08,830 --> 00:25:14,030
|
| 1743 |
episode of voice training data. Single long shot evaluation.
|
| 1744 |
|
| 1745 |
437
|
| 1746 |
+
00:25:14,350 --> 00:25:17,150
|
| 1747 |
Who am I going to compare? Whisper is always good
|
| 1748 |
|
| 1749 |
438
|
| 1750 |
+
00:25:17,150 --> 00:25:20,510
|
| 1751 |
as a benchmark, but I'm more interested in seeing Whisper
|
| 1752 |
|
| 1753 |
439
|
| 1754 |
+
00:25:20,590 --> 00:25:24,510
|
| 1755 |
head to head with two things, really. One is Whisper
|
| 1756 |
|
| 1757 |
440
|
| 1758 |
+
00:25:24,590 --> 00:25:29,700
|
| 1759 |
variants. So you've got these projects like faster Distill Whisper,
|
| 1760 |
|
| 1761 |
441
|
| 1762 |
+
00:25:29,780 --> 00:25:31,700
|
| 1763 |
it's a bit confusing, there's a whole bunch of them.
|
| 1764 |
|
| 1765 |
442
|
| 1766 |
+
00:25:32,020 --> 00:25:35,300
|
| 1767 |
And the emerging ASRs, which are also a thing. My
|
| 1768 |
|
| 1769 |
443
|
| 1770 |
+
00:25:35,380 --> 00:25:37,220
|
| 1771 |
intention for this is I'm not sure I'm going to
|
| 1772 |
|
| 1773 |
444
|
| 1774 |
+
00:25:37,220 --> 00:25:39,860
|
| 1775 |
have the time in any point in the foreseeable future
|
| 1776 |
|
| 1777 |
445
|
| 1778 |
+
00:25:40,180 --> 00:25:44,580
|
| 1779 |
to go back through this whole episode and create a
|
| 1780 |
|
| 1781 |
446
|
| 1782 |
+
00:25:44,660 --> 00:25:49,700
|
| 1783 |
proper source truth, where I fix everything. Might do
|
| 1784 |
|
| 1785 |
447
|
| 1786 |
+
00:25:49,780 --> 00:25:52,740
|
| 1787 |
it if I can get one transcriptions that sufficiently close
|
| 1788 |
|
| 1789 |
448
|
| 1790 |
+
00:25:52,980 --> 00:25:57,040
|
| 1791 |
to perfection. But what I would actually love to do
|
| 1792 |
|
| 1793 |
449
|
| 1794 |
+
00:25:57,200 --> 00:25:59,920
|
| 1795 |
on Hugging Face, I think would be a great probably
|
| 1796 |
|
| 1797 |
450
|
| 1798 |
+
00:26:00,240 --> 00:26:02,880
|
| 1799 |
how I might visualize this is having the audio waveform
|
| 1800 |
|
| 1801 |
451
|
| 1802 |
+
00:26:03,200 --> 00:26:08,160
|
| 1803 |
play and then have the transcript for each model below
|
| 1804 |
|
| 1805 |
452
|
| 1806 |
+
00:26:08,160 --> 00:26:12,560
|
| 1807 |
it and maybe even a like, you know, to scale
|
| 1808 |
|
| 1809 |
453
|
| 1810 |
+
00:26:13,120 --> 00:26:15,600
|
| 1811 |
and maybe even a local one as well, like local
|
| 1812 |
|
| 1813 |
454
|
| 1814 |
+
00:26:15,760 --> 00:26:21,100
|
| 1815 |
whisper versus OpenAI API, et cetera. And, I
|
| 1816 |
|
| 1817 |
455
|
| 1818 |
+
00:26:21,180 --> 00:26:23,500
|
| 1819 |
can then actually listen back to segments or anyone who
|
| 1820 |
|
| 1821 |
456
|
| 1822 |
+
00:26:23,500 --> 00:26:25,820
|
| 1823 |
wants to can listen back to segments of this recording
|
| 1824 |
|
| 1825 |
457
|
| 1826 |
+
00:26:26,140 --> 00:26:30,940
|
| 1827 |
and see where a particular model struggled and others didn't,
|
| 1828 |
|
| 1829 |
458
|
| 1830 |
+
00:26:31,420 --> 00:26:33,340
|
| 1831 |
as well as the sort of headline finding of which
|
| 1832 |
|
| 1833 |
459
|
| 1834 |
+
00:26:33,500 --> 00:26:36,860
|
| 1835 |
had the best WER, but that would require the source
|
| 1836 |
|
| 1837 |
460
|
| 1838 |
+
00:26:36,860 --> 00:26:39,580
|
| 1839 |
of truth. Okay, that's it. I hope this was, I
|
| 1840 |
|
| 1841 |
461
|
| 1842 |
+
00:26:39,580 --> 00:26:42,540
|
| 1843 |
don't know, maybe useful for other folks interested in STT.
|
| 1844 |
|
| 1845 |
462
|
| 1846 |
+
00:26:42,860 --> 00:26:45,660
|
| 1847 |
You want to see that I always feel think I've
|
| 1848 |
|
| 1849 |
463
|
| 1850 |
+
00:26:45,660 --> 00:26:48,870
|
| 1851 |
just said as something I didn't intend to. STT, I
|
| 1852 |
|
| 1853 |
464
|
| 1854 |
+
00:26:48,870 --> 00:26:52,470
|
| 1855 |
said for those. Listen carefully, including hopefully the models themselves.
|
| 1856 |
|
| 1857 |
465
|
| 1858 |
+
00:26:53,190 --> 00:26:57,270
|
| 1859 |
This has been myself, Daniel Rosell. For more jumbled repositories
|
| 1860 |
|
| 1861 |
466
|
| 1862 |
+
00:26:57,350 --> 00:27:01,750
|
| 1863 |
about my roving interests in AI, but particularly agentic, MCP
|
| 1864 |
|
| 1865 |
467
|
| 1866 |
+
00:27:01,990 --> 00:27:07,029
|
| 1867 |
and Voicetech, you can find me on GitHub, huggingface.com,
|
| 1868 |
|
| 1869 |
468
|
| 1870 |
+
00:27:10,230 --> 00:27:13,270
|
| 1871 |
which is my personal website, as well as this podcast,
|
| 1872 |
|
| 1873 |
469
|
| 1874 |
+
00:27:13,510 --> 00:27:16,950
|
| 1875 |
whose name I sadly cannot remember. Until next time, thanks
|
| 1876 |
|
| 1877 |
470
|
| 1878 |
+
00:27:16,950 --> 00:27:17,510
|
| 1879 |
for listening.
|
| 1880 |
|
srt-out/nova3.srt
CHANGED
|
@@ -1,2304 +1,2304 @@
|
|
| 1 |
1
|
| 2 |
-
00:00:00,
|
| 3 |
Hello and welcome to a audio dataset consisting of one
|
| 4 |
|
| 5 |
2
|
| 6 |
-
00:00:06,
|
| 7 |
single episode of a nonexistent podcast.
|
| 8 |
|
| 9 |
3
|
| 10 |
-
00:00:08,
|
| 11 |
Or it I may append this to a podcast that
|
| 12 |
|
| 13 |
4
|
| 14 |
-
00:00:12,
|
| 15 |
I set up recently regarding my with my thoughts on
|
| 16 |
|
| 17 |
5
|
| 18 |
-
00:00:18,
|
| 19 |
speech tech and A.
|
| 20 |
|
| 21 |
6
|
| 22 |
-
00:00:20,
|
| 23 |
I.
|
| 24 |
|
| 25 |
7
|
| 26 |
-
00:00:21,
|
| 27 |
In particular, more A.
|
| 28 |
|
| 29 |
8
|
| 30 |
-
00:00:22,
|
| 31 |
I.
|
| 32 |
|
| 33 |
9
|
| 34 |
-
00:00:
|
| 35 |
And generative A.
|
| 36 |
|
| 37 |
10
|
| 38 |
-
00:00:23,
|
| 39 |
I.
|
| 40 |
|
| 41 |
11
|
| 42 |
-
00:00:24,
|
| 43 |
I would I would say.
|
| 44 |
|
| 45 |
12
|
| 46 |
-
00:00:26,
|
| 47 |
But in any event, the purpose of this voice recording
|
| 48 |
|
| 49 |
13
|
| 50 |
-
00:00:30,
|
| 51 |
is actually to create a lengthy voice sample for a
|
| 52 |
|
| 53 |
14
|
| 54 |
-
00:00:35,
|
| 55 |
quick evaluation, a back of the envelope evaluation, they might
|
| 56 |
|
| 57 |
15
|
| 58 |
-
00:00:38,
|
| 59 |
say, for different speech attacks models.
|
| 60 |
|
| 61 |
16
|
| 62 |
-
00:00:41,
|
| 63 |
I'm doing this because I thought I'd made a great
|
| 64 |
|
| 65 |
17
|
| 66 |
-
00:00:43,
|
| 67 |
breakthrough in my journey with speech tech and that was
|
| 68 |
|
| 69 |
18
|
| 70 |
-
00:00:47,
|
| 71 |
succeeding in the elusive task of fine tuning whisper.
|
| 72 |
|
| 73 |
19
|
| 74 |
-
00:00:51,
|
| 75 |
Whisper is, and I'm to just talk, I'm trying to
|
| 76 |
|
| 77 |
20
|
| 78 |
-
00:00:55,
|
| 79 |
mix up.
|
| 80 |
|
| 81 |
21
|
| 82 |
-
00:00:56,
|
| 83 |
I'm going to try a few different styles of speaking
|
| 84 |
|
| 85 |
22
|
| 86 |
-
00:01:00,
|
| 87 |
whisper something at some points as well.
|
| 88 |
|
| 89 |
23
|
| 90 |
-
00:01:03,
|
| 91 |
And I'll go back to speaking loud in in different
|
| 92 |
|
| 93 |
24
|
| 94 |
-
00:01:06,
|
| 95 |
parts are going to sound really like a crazy person
|
| 96 |
|
| 97 |
25
|
| 98 |
-
00:01:
|
| 99 |
because I'm also going to try to speak at different
|
| 100 |
|
| 101 |
26
|
| 102 |
-
00:01:12,
|
| 103 |
pitches and cadences in order to really try to push
|
| 104 |
|
| 105 |
27
|
| 106 |
-
00:01:18,
|
| 107 |
a speech to text model through its paces, which is
|
| 108 |
|
| 109 |
28
|
| 110 |
-
00:01:21,
|
| 111 |
trying to make sense of is this guy just rambling
|
| 112 |
|
| 113 |
29
|
| 114 |
-
00:01:24,
|
| 115 |
on incoherently in one long sentence or are these just
|
| 116 |
|
| 117 |
30
|
| 118 |
-
00:01:
|
| 119 |
actually a series of step standalone, standalone, standalone sentences?
|
| 120 |
|
| 121 |
31
|
| 122 |
-
00:01:36,
|
| 123 |
And how is it going to handle step alone?
|
| 124 |
|
| 125 |
32
|
| 126 |
-
00:01:38,
|
| 127 |
That's not a word.
|
| 128 |
|
| 129 |
33
|
| 130 |
-
00:01:39,
|
| 131 |
What happens when you use speech to text and you
|
| 132 |
|
| 133 |
34
|
| 134 |
-
00:01:
|
| 135 |
use a fake word?
|
| 136 |
|
| 137 |
35
|
| 138 |
-
00:01:43,
|
| 139 |
And then you're like, wait, that's not actually that word
|
| 140 |
|
| 141 |
36
|
| 142 |
-
00:01:45,
|
| 143 |
doesn't exist.
|
| 144 |
|
| 145 |
37
|
| 146 |
-
00:01:46,
|
| 147 |
How does AI handle that?
|
| 148 |
|
| 149 |
38
|
| 150 |
-
00:01:48,
|
| 151 |
And these and more are all the questions that I'm
|
| 152 |
|
| 153 |
39
|
| 154 |
-
00:01:53,
|
| 155 |
seeking to answer in this training data.
|
| 156 |
|
| 157 |
40
|
| 158 |
-
00:01:55,
|
| 159 |
Now, why was I trying to fine tune Whisper?
|
| 160 |
|
| 161 |
41
|
| 162 |
-
00:01:58,
|
| 163 |
And what is Whisper?
|
| 164 |
|
| 165 |
42
|
| 166 |
-
00:01:59,
|
| 167 |
As I said, I'm going to try to record this
|
| 168 |
|
| 169 |
43
|
| 170 |
-
00:02:02,
|
| 171 |
at a couple of different levels of technicality for folks
|
| 172 |
|
| 173 |
44
|
| 174 |
-
00:02:06,
|
| 175 |
who are in the normal world and not totally stuck
|
| 176 |
|
| 177 |
45
|
| 178 |
-
00:02:11,
|
| 179 |
down the rabbit hole of AI, which you have to
|
| 180 |
|
| 181 |
46
|
| 182 |
-
00:02:13,
|
| 183 |
say is a really wonderful rabbit hole to be done.
|
| 184 |
|
| 185 |
47
|
| 186 |
-
00:02:17,
|
| 187 |
It's a really interesting area and speech and voice tech
|
| 188 |
|
| 189 |
48
|
| 190 |
-
00:02:20,
|
| 191 |
is is the aspect of it that I find actually
|
| 192 |
|
| 193 |
49
|
| 194 |
-
00:02:24,
|
| 195 |
most I'm not sure I would say the most interesting
|
| 196 |
|
| 197 |
50
|
| 198 |
-
00:02:27,
|
| 199 |
because there's just so much that is fascinating in AI.
|
| 200 |
|
| 201 |
51
|
| 202 |
-
00:02:31,
|
| 203 |
But the most that I find the most personally transformative
|
| 204 |
|
| 205 |
52
|
| 206 |
-
00:02:34,
|
| 207 |
in terms of the impact that it's had on my
|
| 208 |
|
| 209 |
53
|
| 210 |
-
00:02:38,
|
| 211 |
daily work life and productivity and how I sort of
|
| 212 |
|
| 213 |
54
|
| 214 |
-
00:02:41,
|
| 215 |
work.
|
| 216 |
|
| 217 |
55
|
| 218 |
-
00:02:42,
|
| 219 |
I'm persevering hard with the task of trying to get
|
| 220 |
|
| 221 |
56
|
| 222 |
-
00:02:47,
|
| 223 |
a good solution working for Linux, which if anyone actually
|
| 224 |
|
| 225 |
57
|
| 226 |
-
00:02:50,
|
| 227 |
does listen to this, not just for the training data
|
| 228 |
|
| 229 |
58
|
| 230 |
-
00:02:52,
|
| 231 |
and for the actual content, is sparked.
|
| 232 |
|
| 233 |
59
|
| 234 |
-
00:02:56,
|
| 235 |
I had, besides the fine tune not working, well that
|
| 236 |
|
| 237 |
60
|
| 238 |
-
00:02:59,
|
| 239 |
was the failure.
|
| 240 |
|
| 241 |
61
|
| 242 |
-
00:03:02,
|
| 243 |
I used Claude code because one thinks these days that
|
| 244 |
|
| 245 |
62
|
| 246 |
-
00:03:06,
|
| 247 |
there is nothing short of solving, you know, the the
|
| 248 |
|
| 249 |
63
|
| 250 |
-
00:03:13,
|
| 251 |
reason of life or something that clause and agentic AI
|
| 252 |
|
| 253 |
64
|
| 254 |
-
00:03:17,
|
| 255 |
can't do, which is not really the case.
|
| 256 |
|
| 257 |
65
|
| 258 |
-
00:03:19,
|
| 259 |
It does seem that way sometimes, but it fails a
|
| 260 |
|
| 261 |
66
|
| 262 |
-
00:03:23,
|
| 263 |
lot as well.
|
| 264 |
|
| 265 |
67
|
| 266 |
-
00:03:23,
|
| 267 |
And this is one of those instances where last week
|
| 268 |
|
| 269 |
68
|
| 270 |
-
00:03:26,
|
| 271 |
I put together an hour of voice training data, basically
|
| 272 |
|
| 273 |
69
|
| 274 |
-
00:03:30,
|
| 275 |
speaking just random things for three minutes.
|
| 276 |
|
| 277 |
70
|
| 278 |
-
00:03:35,
|
| 279 |
It was actually kind of tedious because the texts were
|
| 280 |
|
| 281 |
71
|
| 282 |
-
00:03:38,
|
| 283 |
really weird.
|
| 284 |
|
| 285 |
72
|
| 286 |
-
00:03:38,
|
| 287 |
Some of them were, it was like it was AI
|
| 288 |
|
| 289 |
73
|
| 290 |
-
00:03:41,
|
| 291 |
generated.
|
| 292 |
|
| 293 |
74
|
| 294 |
-
00:03:42,
|
| 295 |
I tried before to read Sherlock Holmes for an hour
|
| 296 |
|
| 297 |
75
|
| 298 |
-
00:03:44,
|
| 299 |
and I just couldn't, I was so bored after ten
|
| 300 |
|
| 301 |
76
|
| 302 |
-
00:03:47,
|
| 303 |
minutes that I was like, okay, no, I'm just gonna
|
| 304 |
|
| 305 |
77
|
| 306 |
-
00:03:50,
|
| 307 |
have to find something else to read.
|
| 308 |
|
| 309 |
78
|
| 310 |
-
00:03:51,
|
| 311 |
So I used a created with AI Studio, VibeCoded, a
|
| 312 |
|
| 313 |
79
|
| 314 |
-
00:03:58,
|
| 315 |
synthetic text generator which actually I thought was probably a
|
| 316 |
|
| 317 |
80
|
| 318 |
-
00:04:03,
|
| 319 |
better way of doing it because it would give me
|
| 320 |
|
| 321 |
81
|
| 322 |
-
00:04:05,
|
| 323 |
more short samples with more varied content.
|
| 324 |
|
| 325 |
82
|
| 326 |
-
00:04:
|
| 327 |
So I was like, okay, give me a voice note
|
| 328 |
|
| 329 |
83
|
| 330 |
-
00:04:11,
|
| 331 |
like I'm recording an email, give me a short story
|
| 332 |
|
| 333 |
84
|
| 334 |
-
00:04:14,
|
| 335 |
to read, give me prose to read.
|
| 336 |
|
| 337 |
85
|
| 338 |
-
00:04:18,
|
| 339 |
So I came up with all these different things and
|
| 340 |
|
| 341 |
86
|
| 342 |
-
00:04:20,
|
| 343 |
they added a little timer to it so I could
|
| 344 |
|
| 345 |
87
|
| 346 |
-
00:04:22,
|
| 347 |
see how close I was to one hour.
|
| 348 |
|
| 349 |
88
|
| 350 |
-
00:04:25,
|
| 351 |
And I spent like an hour one afternoon or probably
|
| 352 |
|
| 353 |
89
|
| 354 |
-
00:04:29,
|
| 355 |
two hours by the time you do retakes and whatever
|
| 356 |
|
| 357 |
90
|
| 358 |
-
00:04:33,
|
| 359 |
because you want to it gave me a source of
|
| 360 |
|
| 361 |
91
|
| 362 |
-
00:04:36,
|
| 363 |
truth which I'm not sure if that's the scientific way
|
| 364 |
|
| 365 |
92
|
| 366 |
-
00:04:
|
| 367 |
to approach this topic of gathering training data but I
|
| 368 |
|
| 369 |
93
|
| 370 |
-
00:04:44,
|
| 371 |
thought made sense.
|
| 372 |
|
| 373 |
94
|
| 374 |
-
00:04:46,
|
| 375 |
I have a lot of audio data from recording voice
|
| 376 |
|
| 377 |
95
|
| 378 |
-
00:04:49,
|
| 379 |
notes which I've also kind of used, been experimenting with
|
| 380 |
|
| 381 |
96
|
| 382 |
-
00:04:53,
|
| 383 |
using for a different purpose.
|
| 384 |
|
| 385 |
97
|
| 386 |
-
00:04:55,
|
| 387 |
Slightly different annotating task types.
|
| 388 |
|
| 389 |
98
|
| 390 |
-
00:04:58,
|
| 391 |
It's more a text classification experiment or Well, it's more
|
| 392 |
|
| 393 |
99
|
| 394 |
-
00:05:03,
|
| 395 |
than that actually.
|
| 396 |
|
| 397 |
100
|
| 398 |
-
00:05:03,
|
| 399 |
I'm working on a voice app.
|
| 400 |
|
| 401 |
101
|
| 402 |
-
00:05:
|
| 403 |
So it's a prototype, I guess, is really more accurate.
|
| 404 |
|
| 405 |
102
|
| 406 |
-
00:05:11,
|
| 407 |
But you can do that and you can work backwards.
|
| 408 |
|
| 409 |
103
|
| 410 |
-
00:05:13,
|
| 411 |
Listen back to a voice note and you painfully go
|
| 412 |
|
| 413 |
104
|
| 414 |
-
00:05:18,
|
| 415 |
through one of those transcribing, where you start and stop
|
| 416 |
|
| 417 |
105
|
| 418 |
-
00:05:21,
|
| 419 |
and scrub around it and you fix the errors, but
|
| 420 |
|
| 421 |
106
|
| 422 |
-
00:05:23,
|
| 423 |
it's really, really pouring to do that.
|
| 424 |
|
| 425 |
107
|
| 426 |
-
00:05:26,
|
| 427 |
So I thought it would be less tedious in the
|
| 428 |
|
| 429 |
108
|
| 430 |
-
00:05:
|
| 431 |
long term if I just recorded the source of truth.
|
| 432 |
|
| 433 |
109
|
| 434 |
-
00:05:
|
| 435 |
So it gave me these three minutes snippets.
|
| 436 |
|
| 437 |
110
|
| 438 |
-
00:05:34,
|
| 439 |
I recorded them and saved an MP3 and a TXT
|
| 440 |
|
| 441 |
111
|
| 442 |
-
00:05:37,
|
| 443 |
in the same folder and I created an error that
|
| 444 |
|
| 445 |
112
|
| 446 |
-
00:05:40,
|
| 447 |
data.
|
| 448 |
|
| 449 |
113
|
| 450 |
-
00:05:41,
|
| 451 |
So I was very hopeful, quietly, a little bit hopeful
|
| 452 |
|
| 453 |
114
|
| 454 |
-
00:05:44,
|
| 455 |
that I would be able, that I could actually fine
|
| 456 |
|
| 457 |
115
|
| 458 |
-
00:05:
|
| 459 |
tune Whisper.
|
| 460 |
|
| 461 |
116
|
| 462 |
-
00:05:48,
|
| 463 |
I want to fine tune Whisper because when I got
|
| 464 |
|
| 465 |
117
|
| 466 |
-
00:05:51,
|
| 467 |
into voice tech last November, my wife was in the
|
| 468 |
|
| 469 |
118
|
| 470 |
-
00:05:
|
| 471 |
US and I was alone at home.
|
| 472 |
|
| 473 |
119
|
| 474 |
-
00:05:57,
|
| 475 |
And when crazy people like me do really wild things
|
| 476 |
|
| 477 |
120
|
| 478 |
-
00:06:
|
| 479 |
like use voice to tech technology.
|
| 480 |
|
| 481 |
121
|
| 482 |
-
00:06:03,
|
| 483 |
That was basically when I started doing it, I didn't
|
| 484 |
|
| 485 |
122
|
| 486 |
-
00:06:06,
|
| 487 |
feel like a crazy person speaking to myself.
|
| 488 |
|
| 489 |
123
|
| 490 |
-
00:06:09,
|
| 491 |
And my expectations weren't that high.
|
| 492 |
|
| 493 |
124
|
| 494 |
-
00:06:13,
|
| 495 |
I'd used speech tech now and again, tried it out.
|
| 496 |
|
| 497 |
125
|
| 498 |
-
00:06:17,
|
| 499 |
I was like, it'd be really cool if you could
|
| 500 |
|
| 501 |
126
|
| 502 |
-
00:06:18,
|
| 503 |
just like speak into your computer and whatever I tried
|
| 504 |
|
| 505 |
127
|
| 506 |
-
00:06:22,
|
| 507 |
out that had Linux support was just, it was not
|
| 508 |
|
| 509 |
128
|
| 510 |
-
00:06:25,
|
| 511 |
good basically.
|
| 512 |
|
| 513 |
129
|
| 514 |
-
00:06:27,
|
| 515 |
And this blew me away from the first go.
|
| 516 |
|
| 517 |
130
|
| 518 |
-
00:06:29,
|
| 519 |
I mean, it wasn't one hundred percent accurate out of
|
| 520 |
|
| 521 |
131
|
| 522 |
-
00:06:32,
|
| 523 |
the box and it took work, but it was good
|
| 524 |
|
| 525 |
132
|
| 526 |
-
00:06:34,
|
| 527 |
enough that there was a solid foundation and it kind
|
| 528 |
|
| 529 |
133
|
| 530 |
-
00:06:36,
|
| 531 |
of passed that pivot point that it's actually worth doing
|
| 532 |
|
| 533 |
134
|
| 534 |
-
00:06:41,
|
| 535 |
this.
|
| 536 |
|
| 537 |
135
|
| 538 |
-
00:06:41,
|
| 539 |
You know, there's a point where it's so like, the
|
| 540 |
|
| 541 |
136
|
| 542 |
-
00:06:43,
|
| 543 |
transcript is you don't have to get one hundred percent
|
| 544 |
|
| 545 |
137
|
| 546 |
-
00:06:46,
|
| 547 |
accuracy for it to be worth your time for speech
|
| 548 |
|
| 549 |
138
|
| 550 |
-
00:06:49,
|
| 551 |
to text to be a worthwhile addition to your productivity.
|
| 552 |
|
| 553 |
139
|
| 554 |
-
00:06:51,
|
| 555 |
But you do need to get above, let's say, I
|
| 556 |
|
| 557 |
140
|
| 558 |
-
00:06:53,
|
| 559 |
don't know, eighty five percent.
|
| 560 |
|
| 561 |
141
|
| 562 |
-
00:06:55,
|
| 563 |
If it's sixty percent or fifty percent, you inevitably say,
|
| 564 |
|
| 565 |
142
|
| 566 |
-
00:06:
|
| 567 |
Screw it, I'll just type it.
|
| 568 |
|
| 569 |
143
|
| 570 |
-
00:07:00,
|
| 571 |
Because you end up missing errors in the transcript and
|
| 572 |
|
| 573 |
144
|
| 574 |
-
00:07:03,
|
| 575 |
it becomes actually worse.
|
| 576 |
|
| 577 |
145
|
| 578 |
-
00:07:
|
| 579 |
You end up in a worse position than you started
|
| 580 |
|
| 581 |
146
|
| 582 |
-
00:07:06,
|
| 583 |
with it.
|
| 584 |
|
| 585 |
147
|
| 586 |
-
00:07:
|
| 587 |
That's been my experience.
|
| 588 |
|
| 589 |
148
|
| 590 |
-
00:07:08,
|
| 591 |
So I was like, Oh, this is actually really, really
|
| 592 |
|
| 593 |
149
|
| 594 |
-
00:07:12,
|
| 595 |
good now.
|
| 596 |
|
| 597 |
150
|
| 598 |
-
00:07:12,
|
| 599 |
How did that happen?
|
| 600 |
|
| 601 |
151
|
| 602 |
-
00:07:13,
|
| 603 |
And the answer is ASR, Whisper being open sourced and
|
| 604 |
|
| 605 |
152
|
| 606 |
-
00:07:18,
|
| 607 |
the transformer architecture, if you want to go back to
|
| 608 |
|
| 609 |
153
|
| 610 |
-
00:07:21,
|
| 611 |
the underpinnings, which really blows my mind and it's on
|
| 612 |
|
| 613 |
154
|
| 614 |
-
00:07:26,
|
| 615 |
my list to read through that paper.
|
| 616 |
|
| 617 |
155
|
| 618 |
-
00:07:30,
|
| 619 |
All you need is attention as attentively as can be
|
| 620 |
|
| 621 |
156
|
| 622 |
-
00:07:35,
|
| 623 |
done with my limited brain because it's super super high
|
| 624 |
|
| 625 |
157
|
| 626 |
-
00:07:39,
|
| 627 |
level stuff, super advanced stuff, mean.
|
| 628 |
|
| 629 |
158
|
| 630 |
-
00:07:43,
|
| 631 |
That I think of all the things that are fascinating
|
| 632 |
|
| 633 |
159
|
| 634 |
-
00:07:48,
|
| 635 |
about the sudden rise in AI and the dramatic capabilities,
|
| 636 |
|
| 637 |
160
|
| 638 |
-
00:07:53,
|
| 639 |
I find it fascinating that few people are like, hang
|
| 640 |
|
| 641 |
161
|
| 642 |
-
00:07:55,
|
| 643 |
on, you've got this thing that can speak to you
|
| 644 |
|
| 645 |
162
|
| 646 |
-
00:07:58,
|
| 647 |
like a chatbot, an LLM.
|
| 648 |
|
| 649 |
163
|
| 650 |
-
00:08:00,
|
| 651 |
And then you've got image generation.
|
| 652 |
|
| 653 |
164
|
| 654 |
-
00:08:02,
|
| 655 |
Okay.
|
| 656 |
|
| 657 |
165
|
| 658 |
-
00:08:03,
|
| 659 |
So firstly, two things on the surface have nothing in
|
| 660 |
|
| 661 |
166
|
| 662 |
-
00:08:07,
|
| 663 |
common.
|
| 664 |
|
| 665 |
167
|
| 666 |
-
00:08:08,
|
| 667 |
So how did that just happen all at the same
|
| 668 |
|
| 669 |
168
|
| 670 |
-
00:08:
|
| 671 |
time?
|
| 672 |
|
| 673 |
169
|
| 674 |
-
00:08:12,
|
| 675 |
And then when you extend that further, you're like, Suno.
|
| 676 |
|
| 677 |
170
|
| 678 |
-
00:08:15,
|
| 679 |
You can sing a song and AI will come up
|
| 680 |
|
| 681 |
171
|
| 682 |
-
00:08:19,
|
| 683 |
with an instrumental.
|
| 684 |
|
| 685 |
172
|
| 686 |
-
00:08:21,
|
| 687 |
And then you've got Whisper and you're like, Wait a
|
| 688 |
|
| 689 |
173
|
| 690 |
-
00:08:23,
|
| 691 |
second.
|
| 692 |
|
| 693 |
174
|
| 694 |
-
00:08:24,
|
| 695 |
How did all this stuff If it's all AI, there
|
| 696 |
|
| 697 |
175
|
| 698 |
-
00:08:28,
|
| 699 |
has to be some commonality.
|
| 700 |
|
| 701 |
176
|
| 702 |
-
00:08:29,
|
| 703 |
Otherwise, are totally different technologies on the surface of it.
|
| 704 |
|
| 705 |
177
|
| 706 |
-
00:08:35,
|
| 707 |
And the transformer architecture is, as far as I know,
|
| 708 |
|
| 709 |
178
|
| 710 |
-
00:08:39,
|
| 711 |
the answer.
|
| 712 |
|
| 713 |
179
|
| 714 |
-
00:08:40,
|
| 715 |
And I can't even say, can't even pretend that I
|
| 716 |
|
| 717 |
180
|
| 718 |
-
00:08:42,
|
| 719 |
really understand what the transformer architecture means in-depth.
|
| 720 |
|
| 721 |
181
|
| 722 |
-
00:08:47,
|
| 723 |
But I have scanned this and as I said, I
|
| 724 |
|
| 725 |
182
|
| 726 |
-
00:08:49,
|
| 727 |
want to print it and really kind of think over
|
| 728 |
|
| 729 |
183
|
| 730 |
-
00:08:52,
|
| 731 |
it at some point.
|
| 732 |
|
| 733 |
184
|
| 734 |
-
00:08:54,
|
| 735 |
And I'll probably feel bad about myself, I think, because
|
| 736 |
|
| 737 |
185
|
| 738 |
-
00:08:58,
|
| 739 |
weren't those guys in twenties?
|
| 740 |
|
| 741 |
186
|
| 742 |
-
00:09:00,
|
| 743 |
Like, that's crazy.
|
| 744 |
|
| 745 |
187
|
| 746 |
-
00:09:02,
|
| 747 |
I think I asked ChatGPT once who wrote that paper
|
| 748 |
|
| 749 |
188
|
| 750 |
-
00:09:06,
|
| 751 |
and how old were they when it was published in
|
| 752 |
|
| 753 |
189
|
| 754 |
-
00:09:09,
|
| 755 |
ArcSiv?
|
| 756 |
|
| 757 |
190
|
| 758 |
-
00:09:09,
|
| 759 |
And I was expecting like, I don't know, what do
|
| 760 |
|
| 761 |
191
|
| 762 |
-
00:09:13,
|
| 763 |
you imagine?
|
| 764 |
|
| 765 |
192
|
| 766 |
-
00:09:13,
|
| 767 |
I personally imagine kind of like, you you have these
|
| 768 |
|
| 769 |
193
|
| 770 |
-
00:09:15,
|
| 771 |
breakthroughs during COVID and things like that, where like these
|
| 772 |
|
| 773 |
194
|
| 774 |
-
00:09:19,
|
| 775 |
kind of really obscure scientists who are in their 50s
|
| 776 |
|
| 777 |
195
|
| 778 |
-
00:09:22,
|
| 779 |
and they've just kind of been laboring in labs and
|
| 780 |
|
| 781 |
196
|
| 782 |
-
00:09:26,
|
| 783 |
wearily in writing and publishing in kind of obscure academic
|
| 784 |
|
| 785 |
197
|
| 786 |
-
00:09:29,
|
| 787 |
publications.
|
| 788 |
|
| 789 |
198
|
| 790 |
-
00:09:30,
|
| 791 |
And they finally hit a big or win a Nobel
|
| 792 |
|
| 793 |
199
|
| 794 |
-
00:09:33,
|
| 795 |
Prize and then their household names.
|
| 796 |
|
| 797 |
200
|
| 798 |
-
00:09:36,
|
| 799 |
So that was kind of what I had in mind.
|
| 800 |
|
| 801 |
201
|
| 802 |
-
00:09:38,
|
| 803 |
That was the mental image I'd formed of the birth
|
| 804 |
|
| 805 |
202
|
| 806 |
-
00:09:42,
|
| 807 |
of ArcSim.
|
| 808 |
|
| 809 |
203
|
| 810 |
-
00:09:42,
|
| 811 |
Like I wasn't expecting twenty somethings in San Francisco.
|
| 812 |
|
| 813 |
204
|
| 814 |
-
00:09:45,
|
| 815 |
I thought that was both very funny, very cool, and
|
| 816 |
|
| 817 |
205
|
| 818 |
-
00:09:48,
|
| 819 |
actually kind of inspiring.
|
| 820 |
|
| 821 |
206
|
| 822 |
-
00:09:50,
|
| 823 |
It's nice to think that people who just you might
|
| 824 |
|
| 825 |
207
|
| 826 |
-
00:09:55,
|
| 827 |
put them in the kind of milieu or bubble or
|
| 828 |
|
| 829 |
208
|
| 830 |
-
00:09:58,
|
| 831 |
world that you are in incredibly in through a series
|
| 832 |
|
| 833 |
209
|
| 834 |
-
00:10:02,
|
| 835 |
of connections that are coming up with such literally world
|
| 836 |
|
| 837 |
210
|
| 838 |
-
00:10:05,
|
| 839 |
changing innovations.
|
| 840 |
|
| 841 |
211
|
| 842 |
-
00:10:07,
|
| 843 |
So that was I thought anyway, that's that that was
|
| 844 |
|
| 845 |
212
|
| 846 |
-
00:10:11,
|
| 847 |
cool.
|
| 848 |
|
| 849 |
213
|
| 850 |
-
00:10:12,
|
| 851 |
Okay.
|
| 852 |
|
| 853 |
214
|
| 854 |
-
00:10:12,
|
| 855 |
Voice training data.
|
| 856 |
|
| 857 |
215
|
| 858 |
-
00:10:13,
|
| 859 |
How are we doing?
|
| 860 |
|
| 861 |
216
|
| 862 |
-
00:10:14,
|
| 863 |
We're about ten minutes, and I'm still talking about voice
|
| 864 |
|
| 865 |
217
|
| 866 |
-
00:10:17,
|
| 867 |
technology.
|
| 868 |
|
| 869 |
218
|
| 870 |
-
00:10:18,
|
| 871 |
So Whisper was brilliant, and I was so excited that
|
| 872 |
|
| 873 |
219
|
| 874 |
-
00:10:22,
|
| 875 |
my first instinct was to guess, like, Oh my gosh,
|
| 876 |
|
| 877 |
220
|
| 878 |
-
00:10:25,
|
| 879 |
I have to get a really good microphone for this.
|
| 880 |
|
| 881 |
221
|
| 882 |
-
00:10:28,
|
| 883 |
So I didn't go on a spending spree because I
|
| 884 |
|
| 885 |
222
|
| 886 |
-
00:10:31,
|
| 887 |
said, I'm gonna have to just wait a month and
|
| 888 |
|
| 889 |
223
|
| 890 |
-
00:10:33,
|
| 891 |
see if I still use this.
|
| 892 |
|
| 893 |
224
|
| 894 |
-
00:10:35,
|
| 895 |
And it just kind of became it's become really part
|
| 896 |
|
| 897 |
225
|
| 898 |
-
00:10:38,
|
| 899 |
of my daily routine.
|
| 900 |
|
| 901 |
226
|
| 902 |
-
00:10:41,
|
| 903 |
Like if I'm writing an email, I'll record a voice
|
| 904 |
|
| 905 |
227
|
| 906 |
-
00:10:44,
|
| 907 |
note and then I've developed and it's nice to see
|
| 908 |
|
| 909 |
228
|
| 910 |
-
00:10:47,
|
| 911 |
that everyone is like developing the same things in parallel.
|
| 912 |
|
| 913 |
229
|
| 914 |
-
00:10:50,
|
| 915 |
That's kind of a weird thing to say, when I
|
| 916 |
|
| 917 |
230
|
| 918 |
-
00:10:53,
|
| 919 |
started working on these prototypes on GitHub, which is where
|
| 920 |
|
| 921 |
231
|
| 922 |
-
00:11:00,
|
| 923 |
I just kind of share very freely and loosely ideas
|
| 924 |
|
| 925 |
232
|
| 926 |
-
00:11:
|
| 927 |
and first iterations on concepts.
|
| 928 |
|
| 929 |
233
|
| 930 |
-
00:11:
|
| 931 |
And for want of a better word, I called it
|
| 932 |
|
| 933 |
234
|
| 934 |
-
00:11:10,
|
| 935 |
like LLM post processing or clean up or basically a
|
| 936 |
|
| 937 |
235
|
| 938 |
-
00:11:14,
|
| 939 |
system prompt that after you get back the raw text
|
| 940 |
|
| 941 |
236
|
| 942 |
-
00:11:17,
|
| 943 |
from Whisper, you run it through a model and say,
|
| 944 |
|
| 945 |
237
|
| 946 |
-
00:11:21,
|
| 947 |
okay, this is crappy text like add sentence structure and,
|
| 948 |
|
| 949 |
238
|
| 950 |
-
00:11:26,
|
| 951 |
you know, fix it up.
|
| 952 |
|
| 953 |
239
|
| 954 |
-
00:11:27,
|
| 955 |
And now when I'm exploring the different tools that are
|
| 956 |
|
| 957 |
240
|
| 958 |
-
00:11:32,
|
| 959 |
out there that people have built, I see quite a
|
| 960 |
|
| 961 |
241
|
| 962 |
-
00:11:35,
|
| 963 |
number of projects have basically done the same thing.
|
| 964 |
|
| 965 |
242
|
| 966 |
-
00:11:40,
|
| 967 |
Lest that be misconstrued, I'm not saying for a millisecond
|
| 968 |
|
| 969 |
243
|
| 970 |
-
00:11:43,
|
| 971 |
that I inspired them.
|
| 972 |
|
| 973 |
244
|
| 974 |
-
00:11:44,
|
| 975 |
I'm sure this has been a thing that's been integrated
|
| 976 |
|
| 977 |
245
|
| 978 |
-
00:11:
|
| 979 |
into tools for a while, but it's the kind of
|
| 980 |
|
| 981 |
246
|
| 982 |
-
00:11:51,
|
| 983 |
thing that when you start using these tools every day,
|
| 984 |
|
| 985 |
247
|
| 986 |
-
00:11:53,
|
| 987 |
the need for it is almost instantly apparent because text
|
| 988 |
|
| 989 |
248
|
| 990 |
-
00:11:57,
|
| 991 |
that doesn't have any punctuation or paragraph spacing takes a
|
| 992 |
|
| 993 |
249
|
| 994 |
-
00:12:01,
|
| 995 |
long time to, you know, it takes so long to
|
| 996 |
|
| 997 |
250
|
| 998 |
-
00:12:03,
|
| 999 |
get it into a presentable email that again, moves speech
|
| 1000 |
|
| 1001 |
251
|
| 1002 |
-
00:12:
|
| 1003 |
tech into that before that inflection point where you're like,
|
| 1004 |
|
| 1005 |
252
|
| 1006 |
-
00:12:13,
|
| 1007 |
nah, it's just not worth it.
|
| 1008 |
|
| 1009 |
253
|
| 1010 |
-
00:12:13,
|
| 1011 |
It's like, it'll just be quicker to type this.
|
| 1012 |
|
| 1013 |
254
|
| 1014 |
-
00:12:17,
|
| 1015 |
So it's a big, it's a little touch that actually
|
| 1016 |
|
| 1017 |
255
|
| 1018 |
-
00:12:20,
|
| 1019 |
is a big deal.
|
| 1020 |
|
| 1021 |
256
|
| 1022 |
-
00:12:21,
|
| 1023 |
So I was on Whisper and I've been using Whisper
|
| 1024 |
|
| 1025 |
257
|
| 1026 |
-
00:12:25,
|
| 1027 |
and I kind of early on found a couple of
|
| 1028 |
|
| 1029 |
258
|
| 1030 |
-
00:12:27,
|
| 1031 |
tools.
|
| 1032 |
|
| 1033 |
259
|
| 1034 |
-
00:12:28,
|
| 1035 |
I couldn't find what I was looking for on Linux,
|
| 1036 |
|
| 1037 |
260
|
| 1038 |
-
00:12:30,
|
| 1039 |
which is basically just something that'll run-in the background.
|
| 1040 |
|
| 1041 |
261
|
| 1042 |
-
00:12:35,
|
| 1043 |
You'll give it an API key and it will just
|
| 1044 |
|
| 1045 |
262
|
| 1046 |
-
00:12:38,
|
| 1047 |
like transcribe with like a little key to start and
|
| 1048 |
|
| 1049 |
263
|
| 1050 |
-
00:12:
|
| 1051 |
stop the dictation.
|
| 1052 |
|
| 1053 |
264
|
| 1054 |
-
00:12:45,
|
| 1055 |
And the issues where I discovered that like most people
|
| 1056 |
|
| 1057 |
265
|
| 1058 |
-
00:12:48,
|
| 1059 |
involved in creating these projects were very much focused on
|
| 1060 |
|
| 1061 |
266
|
| 1062 |
-
00:12:
|
| 1063 |
local models, running Whisper locally because you can.
|
| 1064 |
|
| 1065 |
267
|
| 1066 |
-
00:12:56,
|
| 1067 |
And I tried that a bunch of times and just
|
| 1068 |
|
| 1069 |
268
|
| 1070 |
-
00:12:58,
|
| 1071 |
never got results that were as good as the cloud.
|
| 1072 |
|
| 1073 |
269
|
| 1074 |
-
00:13:01,
|
| 1075 |
And when I began looking at the cost of the
|
| 1076 |
|
| 1077 |
270
|
| 1078 |
-
00:13:03,
|
| 1079 |
speech to text APIs and what I was spending, I
|
| 1080 |
|
| 1081 |
271
|
| 1082 |
-
00:13:06,
|
| 1083 |
just thought there is it's actually, in my opinion, just
|
| 1084 |
|
| 1085 |
272
|
| 1086 |
-
00:13:09,
|
| 1087 |
one of the better deals in API spending in the
|
| 1088 |
|
| 1089 |
273
|
| 1090 |
-
00:13:13,
|
| 1091 |
cloud.
|
| 1092 |
|
| 1093 |
274
|
| 1094 |
-
00:13:13,
|
| 1095 |
Like, it's just not that expensive for very, very good
|
| 1096 |
|
| 1097 |
275
|
| 1098 |
-
00:13:15,
|
| 1099 |
models that are much more, you know, you're gonna be
|
| 1100 |
|
| 1101 |
276
|
| 1102 |
-
00:13:19,
|
| 1103 |
able to run the full model, the latest model versus
|
| 1104 |
|
| 1105 |
277
|
| 1106 |
-
00:13:22,
|
| 1107 |
whatever you can run on your average GPU unless you
|
| 1108 |
|
| 1109 |
278
|
| 1110 |
-
00:13:26,
|
| 1111 |
want to buy a crazy GPU.
|
| 1112 |
|
| 1113 |
279
|
| 1114 |
-
00:13:28,
|
| 1115 |
It doesn't really make sense to me.
|
| 1116 |
|
| 1117 |
280
|
| 1118 |
-
00:13:
|
| 1119 |
Privacy is another concern that I know is kind of
|
| 1120 |
|
| 1121 |
281
|
| 1122 |
-
00:13:33,
|
| 1123 |
like a very much a separate thing that people just
|
| 1124 |
|
| 1125 |
282
|
| 1126 |
-
00:13:35,
|
| 1127 |
don't want their voice data and their voice leaving their
|
| 1128 |
|
| 1129 |
283
|
| 1130 |
-
00:13:38,
|
| 1131 |
local environment maybe for regulatory reasons as well.
|
| 1132 |
|
| 1133 |
284
|
| 1134 |
-
00:13:42,
|
| 1135 |
But I'm not in that.
|
| 1136 |
|
| 1137 |
285
|
| 1138 |
-
00:13:44,
|
| 1139 |
I neither really care about people listening to my, grocery
|
| 1140 |
|
| 1141 |
286
|
| 1142 |
-
00:13:48,
|
| 1143 |
list, consisting of, reminding myself that I need to buy
|
| 1144 |
|
| 1145 |
287
|
| 1146 |
-
00:13:51,
|
| 1147 |
more beer, Cheetos, and hummus, which is kind of the
|
| 1148 |
|
| 1149 |
288
|
| 1150 |
-
00:13:55,
|
| 1151 |
three staples of my diet during periods of poor nutrition.
|
| 1152 |
|
| 1153 |
289
|
| 1154 |
-
00:13:59,
|
| 1155 |
But the kind of stuff that I transcribe, it's just
|
| 1156 |
|
| 1157 |
290
|
| 1158 |
-
00:14:02,
|
| 1159 |
not.
|
| 1160 |
|
| 1161 |
291
|
| 1162 |
-
00:14:02,
|
| 1163 |
It's not a privacy thing I'm that sort of sensitive
|
| 1164 |
|
| 1165 |
292
|
| 1166 |
-
00:14:07,
|
| 1167 |
about and I don't do anything so sensitive or secure
|
| 1168 |
|
| 1169 |
293
|
| 1170 |
-
00:14:13,
|
| 1171 |
that requires air capping.
|
| 1172 |
|
| 1173 |
294
|
| 1174 |
-
00:14:15,
|
| 1175 |
I looked at the pricing and especially the kind of
|
| 1176 |
|
| 1177 |
295
|
| 1178 |
-
00:14:17,
|
| 1179 |
older model mini.
|
| 1180 |
|
| 1181 |
296
|
| 1182 |
-
00:14:19,
|
| 1183 |
Some of them are very, very affordable and I did
|
| 1184 |
|
| 1185 |
297
|
| 1186 |
-
00:14:21,
|
| 1187 |
a calculation once with ChatGPT and I was like, okay,
|
| 1188 |
|
| 1189 |
298
|
| 1190 |
-
00:14:26,
|
| 1191 |
this is the API price for I can't remember whatever
|
| 1192 |
|
| 1193 |
299
|
| 1194 |
-
00:14:30,
|
| 1195 |
the model was.
|
| 1196 |
|
| 1197 |
300
|
| 1198 |
-
00:14:31,
|
| 1199 |
Let's say I just go at it like nonstop, which
|
| 1200 |
|
| 1201 |
301
|
| 1202 |
-
00:14:34,
|
| 1203 |
rarely happens.
|
| 1204 |
|
| 1205 |
302
|
| 1206 |
-
00:14:35,
|
| 1207 |
Probably, I would say on average I might dictate thirty
|
| 1208 |
|
| 1209 |
303
|
| 1210 |
-
00:14:38,
|
| 1211 |
to sixty minutes per day if I was probably summing
|
| 1212 |
|
| 1213 |
304
|
| 1214 |
-
00:14:41,
|
| 1215 |
up the emails, documents, outlines, which is a lot, but
|
| 1216 |
|
| 1217 |
305
|
| 1218 |
-
00:14:
|
| 1219 |
it's it's still a fairly modest amount.
|
| 1220 |
|
| 1221 |
306
|
| 1222 |
-
00:14:50,
|
| 1223 |
And I was like, well, some days I do go
|
| 1224 |
|
| 1225 |
307
|
| 1226 |
-
00:14:51,
|
| 1227 |
on like one or two days where I've been usually
|
| 1228 |
|
| 1229 |
308
|
| 1230 |
-
00:14:54,
|
| 1231 |
when I'm like kind of out of the house and
|
| 1232 |
|
| 1233 |
309
|
| 1234 |
-
00:14:56,
|
| 1235 |
just have something like I have nothing else to do.
|
| 1236 |
|
| 1237 |
310
|
| 1238 |
-
00:15:00,
|
| 1239 |
Like if I'm at a hospital, we have a newborn
|
| 1240 |
|
| 1241 |
311
|
| 1242 |
-
00:15:03,
|
| 1243 |
and you're waiting for like eight hours and hours for
|
| 1244 |
|
| 1245 |
312
|
| 1246 |
-
00:15:07,
|
| 1247 |
an appointment.
|
| 1248 |
|
| 1249 |
313
|
| 1250 |
-
00:15:08,
|
| 1251 |
And I would probably have listened to podcasts before becoming
|
| 1252 |
|
| 1253 |
314
|
| 1254 |
-
00:15:
|
| 1255 |
a speech fanatic.
|
| 1256 |
|
| 1257 |
315
|
| 1258 |
-
00:15:12,
|
| 1259 |
And I'm like, Oh, wait, let me just get down.
|
| 1260 |
|
| 1261 |
316
|
| 1262 |
-
00:15:15,
|
| 1263 |
Let me just get these ideas out of my head.
|
| 1264 |
|
| 1265 |
317
|
| 1266 |
-
00:15:17,
|
| 1267 |
And that's when I'll go on my speech binges.
|
| 1268 |
|
| 1269 |
318
|
| 1270 |
-
00:15:20,
|
| 1271 |
But those are like once every few months, like not
|
| 1272 |
|
| 1273 |
319
|
| 1274 |
-
00:15:22,
|
| 1275 |
frequently.
|
| 1276 |
|
| 1277 |
320
|
| 1278 |
-
00:15:23,
|
| 1279 |
But I said, okay, let's just say if I'm going
|
| 1280 |
|
| 1281 |
321
|
| 1282 |
-
00:15:25,
|
| 1283 |
to price out cloud STT.
|
| 1284 |
|
| 1285 |
322
|
| 1286 |
-
00:15:28,
|
| 1287 |
If I was like dedicated every second of every waking
|
| 1288 |
|
| 1289 |
323
|
| 1290 |
-
00:15:33,
|
| 1291 |
hour to transcribing for some odd reason, I mean I'd
|
| 1292 |
|
| 1293 |
324
|
| 1294 |
-
00:15:37,
|
| 1295 |
have to eat and use the toilet.
|
| 1296 |
|
| 1297 |
325
|
| 1298 |
-
00:15:40,
|
| 1299 |
There's only so many hours I'm awake for.
|
| 1300 |
|
| 1301 |
326
|
| 1302 |
-
00:15:42,
|
| 1303 |
So let's just say a maximum of forty five minutes
|
| 1304 |
|
| 1305 |
327
|
| 1306 |
-
00:15:47,
|
| 1307 |
in the hour, then I said, All right, let's just
|
| 1308 |
|
| 1309 |
328
|
| 1310 |
-
00:15:49,
|
| 1311 |
say fifty.
|
| 1312 |
|
| 1313 |
329
|
| 1314 |
-
00:15:50,
|
| 1315 |
Who knows?
|
| 1316 |
|
| 1317 |
330
|
| 1318 |
-
00:15:51,
|
| 1319 |
You're dictating on the toilet.
|
| 1320 |
|
| 1321 |
331
|
| 1322 |
-
00:15:52,
|
| 1323 |
We do it.
|
| 1324 |
|
| 1325 |
332
|
| 1326 |
-
00:15:53,
|
| 1327 |
So you could just do sixty, but whatever I did
|
| 1328 |
|
| 1329 |
333
|
| 1330 |
-
00:15:57,
|
| 1331 |
and every day, like you're going flat out seven days
|
| 1332 |
|
| 1333 |
334
|
| 1334 |
-
00:16:01,
|
| 1335 |
a week dictating nonstop.
|
| 1336 |
|
| 1337 |
335
|
| 1338 |
-
00:16:02,
|
| 1339 |
I was like, What's my monthly API bill going to
|
| 1340 |
|
| 1341 |
336
|
| 1342 |
-
00:16:05,
|
| 1343 |
be at this price?
|
| 1344 |
|
| 1345 |
337
|
| 1346 |
-
00:16:06,
|
| 1347 |
And it came out to like seventy or eighty bucks.
|
| 1348 |
|
| 1349 |
338
|
| 1350 |
-
00:16:09,
|
| 1351 |
And I was like, Well, that would be an extraordinary
|
| 1352 |
|
| 1353 |
339
|
| 1354 |
-
00:16:12,
|
| 1355 |
amount of dictation.
|
| 1356 |
|
| 1357 |
340
|
| 1358 |
-
00:16:14,
|
| 1359 |
And I would hope that there was some compelling reason
|
| 1360 |
|
| 1361 |
341
|
| 1362 |
-
00:16:18,
|
| 1363 |
worth more than seventy dollars that I embarked upon that
|
| 1364 |
|
| 1365 |
342
|
| 1366 |
-
00:16:21,
|
| 1367 |
project.
|
| 1368 |
|
| 1369 |
343
|
| 1370 |
-
00:16:22,
|
| 1371 |
So given that that's kind of the max point for
|
| 1372 |
|
| 1373 |
344
|
| 1374 |
-
00:16:24,
|
| 1375 |
me I said that's actually very very affordable.
|
| 1376 |
|
| 1377 |
345
|
| 1378 |
-
00:16:
|
| 1379 |
Now you're gonna if you want to spec out the
|
| 1380 |
|
| 1381 |
346
|
| 1382 |
-
00:16:30,
|
| 1383 |
costs and you want to do the post processing that
|
| 1384 |
|
| 1385 |
347
|
| 1386 |
-
00:16:33,
|
| 1387 |
I really do feel is valuable, that's going to cost
|
| 1388 |
|
| 1389 |
348
|
| 1390 |
-
00:16:36,
|
| 1391 |
some more as well.
|
| 1392 |
|
| 1393 |
349
|
| 1394 |
-
00:16:
|
| 1395 |
Unless you're using Gemini, which needless to say is a
|
| 1396 |
|
| 1397 |
350
|
| 1398 |
-
00:16:43,
|
| 1399 |
random person sitting in Jerusalem.
|
| 1400 |
|
| 1401 |
351
|
| 1402 |
-
00:16:45,
|
| 1403 |
I have no affiliation nor with Google nor Anthropic nor
|
| 1404 |
|
| 1405 |
352
|
| 1406 |
-
00:16:49,
|
| 1407 |
Gemini nor any major tech vendor for that matter.
|
| 1408 |
|
| 1409 |
353
|
| 1410 |
-
00:16:53,
|
| 1411 |
I like Gemini not so much as a everyday model.
|
| 1412 |
|
| 1413 |
354
|
| 1414 |
-
00:16:57,
|
| 1415 |
It's kind of underwhelmed in that respect, I would say.
|
| 1416 |
|
| 1417 |
355
|
| 1418 |
-
00:17:00,
|
| 1419 |
But for multimodal, I think it's got a lot to
|
| 1420 |
|
| 1421 |
356
|
| 1422 |
-
00:17:02,
|
| 1423 |
offer.
|
| 1424 |
|
| 1425 |
357
|
| 1426 |
-
00:17:03,
|
| 1427 |
And I think that the transcribing functionality whereby it can,
|
| 1428 |
|
| 1429 |
358
|
| 1430 |
-
00:17:
|
| 1431 |
process audio with a system prompt and both give you
|
| 1432 |
|
| 1433 |
359
|
| 1434 |
-
00:17:12,
|
| 1435 |
transcription that's cleaned up.
|
| 1436 |
|
| 1437 |
360
|
| 1438 |
-
00:17:13,
|
| 1439 |
That reduces two steps to one.
|
| 1440 |
|
| 1441 |
361
|
| 1442 |
-
00:17:15,
|
| 1443 |
And that for me is a very, very big deal.
|
| 1444 |
|
| 1445 |
362
|
| 1446 |
-
00:17:18,
|
| 1447 |
And I feel like even Google hasn't really sort of
|
| 1448 |
|
| 1449 |
363
|
| 1450 |
-
00:17:22,
|
| 1451 |
thought through how useful the that modality is and what
|
| 1452 |
|
| 1453 |
364
|
| 1454 |
-
00:17:27,
|
| 1455 |
kind of use cases you can achieve with it.
|
| 1456 |
|
| 1457 |
365
|
| 1458 |
-
00:17:29,
|
| 1459 |
Because I found in the course of this year just
|
| 1460 |
|
| 1461 |
366
|
| 1462 |
-
00:17:32,
|
| 1463 |
an endless list of really kind of system prompt stuff
|
| 1464 |
|
| 1465 |
367
|
| 1466 |
-
00:17:
|
| 1467 |
that I can say, okay, I've used it to capture
|
| 1468 |
|
| 1469 |
368
|
| 1470 |
-
00:17:40,
|
| 1471 |
context data for AI, which is literally I might speak
|
| 1472 |
|
| 1473 |
369
|
| 1474 |
-
00:17:44,
|
| 1475 |
for if I wanted to have a good bank of
|
| 1476 |
|
| 1477 |
370
|
| 1478 |
-
00:17:46,
|
| 1479 |
context data about who knows my childhood.
|
| 1480 |
|
| 1481 |
371
|
| 1482 |
-
00:17:50,
|
| 1483 |
More realistically, maybe my career goals, something that would just
|
| 1484 |
|
| 1485 |
372
|
| 1486 |
-
00:17:54,
|
| 1487 |
be like really boring to type out.
|
| 1488 |
|
| 1489 |
373
|
| 1490 |
-
00:17:56,
|
| 1491 |
So I'll just like sit in my car and record
|
| 1492 |
|
| 1493 |
374
|
| 1494 |
-
00:18:00,
|
| 1495 |
it for ten minutes.
|
| 1496 |
|
| 1497 |
375
|
| 1498 |
-
00:18:01,
|
| 1499 |
And that ten minutes you get a lot of information
|
| 1500 |
|
| 1501 |
376
|
| 1502 |
-
00:18:03,
|
| 1503 |
in.
|
| 1504 |
|
| 1505 |
377
|
| 1506 |
-
00:18:05,
|
| 1507 |
Emails, which is short text.
|
| 1508 |
|
| 1509 |
378
|
| 1510 |
-
00:18:08,
|
| 1511 |
Just there is a whole bunch.
|
| 1512 |
|
| 1513 |
379
|
| 1514 |
-
00:18:10,
|
| 1515 |
And all these workflows kind of require a little bit
|
| 1516 |
|
| 1517 |
380
|
| 1518 |
-
00:18:13,
|
| 1519 |
of treatment afterwards and different treatment.
|
| 1520 |
|
| 1521 |
381
|
| 1522 |
-
00:18:15,
|
| 1523 |
My context pipeline is kind of like just extract the
|
| 1524 |
|
| 1525 |
382
|
| 1526 |
-
00:18:18,
|
| 1527 |
bare essentials.
|
| 1528 |
|
| 1529 |
383
|
| 1530 |
-
00:18:19,
|
| 1531 |
You end up with me talking very loosely about sort
|
| 1532 |
|
| 1533 |
384
|
| 1534 |
-
00:18:22,
|
| 1535 |
of what I've done in my career, where I've worked,
|
| 1536 |
|
| 1537 |
385
|
| 1538 |
-
00:18:24,
|
| 1539 |
where I might like to work.
|
| 1540 |
|
| 1541 |
386
|
| 1542 |
-
00:18:
|
| 1543 |
And it goes, it condenses that down to very robotic
|
| 1544 |
|
| 1545 |
387
|
| 1546 |
-
00:18:29,
|
| 1547 |
language that is easy to chunk parse and maybe put
|
| 1548 |
|
| 1549 |
388
|
| 1550 |
-
00:18:32,
|
| 1551 |
into a vector database.
|
| 1552 |
|
| 1553 |
389
|
| 1554 |
-
00:18:
|
| 1555 |
Daniel has worked in technology.
|
| 1556 |
|
| 1557 |
390
|
| 1558 |
-
00:18:36,
|
| 1559 |
Daniel has been working in, know, stuff like that.
|
| 1560 |
|
| 1561 |
391
|
| 1562 |
-
00:18:39,
|
| 1563 |
That's not how you would speak, but I figure it's
|
| 1564 |
|
| 1565 |
392
|
| 1566 |
-
00:18:
|
| 1567 |
probably easier to parse for, after all, robots.
|
| 1568 |
|
| 1569 |
393
|
| 1570 |
-
00:18:46,
|
| 1571 |
So we've almost got to twenty minutes and this is
|
| 1572 |
|
| 1573 |
394
|
| 1574 |
-
00:18:48,
|
| 1575 |
actually a success because I wasted twenty minutes of my
|
| 1576 |
|
| 1577 |
395
|
| 1578 |
-
00:18:53,
|
| 1579 |
of the evening speaking into you in microphone and the
|
| 1580 |
|
| 1581 |
396
|
| 1582 |
-
00:18:57,
|
| 1583 |
levels were shot and was clipping and I said I
|
| 1584 |
|
| 1585 |
397
|
| 1586 |
-
00:19:01,
|
| 1587 |
can't really do an evaluation.
|
| 1588 |
|
| 1589 |
398
|
| 1590 |
-
00:19:02,
|
| 1591 |
I have to be fair.
|
| 1592 |
|
| 1593 |
399
|
| 1594 |
-
00:19:03,
|
| 1595 |
I have to give the models a chance to do
|
| 1596 |
|
| 1597 |
400
|
| 1598 |
-
00:19:06,
|
| 1599 |
their thing.
|
| 1600 |
|
| 1601 |
401
|
| 1602 |
-
00:19:07,
|
| 1603 |
What am I hoping to achieve in this?
|
| 1604 |
|
| 1605 |
402
|
| 1606 |
-
00:19:09,
|
| 1607 |
Okay, my fine tune was a dud as mentioned.
|
| 1608 |
|
| 1609 |
403
|
| 1610 |
-
00:19:11,
|
| 1611 |
Deepgram STT, I'm really, really hopeful that this prototype will
|
| 1612 |
|
| 1613 |
404
|
| 1614 |
-
00:19:15,
|
| 1615 |
work and it's a build in public open source so
|
| 1616 |
|
| 1617 |
405
|
| 1618 |
-
00:19:
|
| 1619 |
anyone is welcome to use it if I make anything
|
| 1620 |
|
| 1621 |
406
|
| 1622 |
-
00:19:20,
|
| 1623 |
good.
|
| 1624 |
|
| 1625 |
407
|
| 1626 |
-
00:19:21,
|
| 1627 |
But that was really exciting for me last night when
|
| 1628 |
|
| 1629 |
408
|
| 1630 |
-
00:19:23,
|
| 1631 |
after hours of trying my own prototype, seeing someone just
|
| 1632 |
|
| 1633 |
409
|
| 1634 |
-
00:19:28,
|
| 1635 |
made something that works like that, you you're not gonna
|
| 1636 |
|
| 1637 |
410
|
| 1638 |
-
00:19:32,
|
| 1639 |
have to build a custom conda environment and image.
|
| 1640 |
|
| 1641 |
411
|
| 1642 |
-
00:19:36,
|
| 1643 |
I have AMD GPU which makes things much more complicated.
|
| 1644 |
|
| 1645 |
412
|
| 1646 |
-
00:19:40,
|
| 1647 |
I didn't find it and I was about to give
|
| 1648 |
|
| 1649 |
413
|
| 1650 |
-
00:19:42,
|
| 1651 |
up and I said, All right, let me just give
|
| 1652 |
|
| 1653 |
414
|
| 1654 |
-
00:19:43,
|
| 1655 |
Deepgram's Linux thing a shot.
|
| 1656 |
|
| 1657 |
415
|
| 1658 |
-
00:19:47,
|
| 1659 |
And if this doesn't work, I'm just gonna go back
|
| 1660 |
|
| 1661 |
416
|
| 1662 |
-
00:19:49,
|
| 1663 |
to trying to vibe code something myself.
|
| 1664 |
|
| 1665 |
417
|
| 1666 |
-
00:19:51,
|
| 1667 |
And when I ran the script, I was using Cloud
|
| 1668 |
|
| 1669 |
418
|
| 1670 |
-
00:19:55,
|
| 1671 |
Code to do the installation process, it ran the script
|
| 1672 |
|
| 1673 |
419
|
| 1674 |
-
00:19:59,
|
| 1675 |
and, oh my gosh, it works just like that.
|
| 1676 |
|
| 1677 |
420
|
| 1678 |
-
00:20:01,
|
| 1679 |
The tricky thing for all those who wants to know
|
| 1680 |
|
| 1681 |
421
|
| 1682 |
-
00:20:
|
| 1683 |
all the nitty, ditty, nitty gritty details was that I
|
| 1684 |
|
| 1685 |
422
|
| 1686 |
-
00:20:11,
|
| 1687 |
don't think it was actually struggling with transcription, but pasting
|
| 1688 |
|
| 1689 |
423
|
| 1690 |
-
00:20:14,
|
| 1691 |
Weyland makes life very hard.
|
| 1692 |
|
| 1693 |
424
|
| 1694 |
-
00:20:17,
|
| 1695 |
And I think there was something not running at the
|
| 1696 |
|
| 1697 |
425
|
| 1698 |
-
00:20:19,
|
| 1699 |
right time.
|
| 1700 |
|
| 1701 |
426
|
| 1702 |
-
00:20:19,
|
| 1703 |
Anyway, Deepgram, I looked at how they actually handle that
|
| 1704 |
|
| 1705 |
427
|
| 1706 |
-
00:20:
|
| 1707 |
because it worked out of the box when other stuff
|
| 1708 |
|
| 1709 |
428
|
| 1710 |
-
00:20:25,
|
| 1711 |
didn't.
|
| 1712 |
|
| 1713 |
429
|
| 1714 |
-
00:20:26,
|
| 1715 |
And it was quite a clever little mechanism.
|
| 1716 |
|
| 1717 |
430
|
| 1718 |
-
00:20:29,
|
| 1719 |
And but more so than that, the accuracy was brilliant.
|
| 1720 |
|
| 1721 |
431
|
| 1722 |
-
00:20:32,
|
| 1723 |
Now what am I what am I doing here?
|
| 1724 |
|
| 1725 |
432
|
| 1726 |
-
00:20:33,
|
| 1727 |
This is gonna be a twenty minute audio sample.
|
| 1728 |
|
| 1729 |
433
|
| 1730 |
-
00:20:38,
|
| 1731 |
And I'm I think I've done one or two of
|
| 1732 |
|
| 1733 |
434
|
| 1734 |
-
00:20:42,
|
| 1735 |
these before, but I did it with short, snappy voice
|
| 1736 |
|
| 1737 |
435
|
| 1738 |
-
00:20:47,
|
| 1739 |
notes.
|
| 1740 |
|
| 1741 |
436
|
| 1742 |
-
00:20:47,
|
| 1743 |
This is kind of long form.
|
| 1744 |
|
| 1745 |
437
|
| 1746 |
-
00:20:49,
|
| 1747 |
This actually might be a better approximation for what's useful
|
| 1748 |
|
| 1749 |
438
|
| 1750 |
-
00:20:
|
| 1751 |
to me than voice memos.
|
| 1752 |
|
| 1753 |
439
|
| 1754 |
-
00:20:53,
|
| 1755 |
Like, I need to buy three liters of milk tomorrow
|
| 1756 |
|
| 1757 |
440
|
| 1758 |
-
00:20:56,
|
| 1759 |
and peter bread, which is probably how half my voice
|
| 1760 |
|
| 1761 |
441
|
| 1762 |
-
00:21:00,
|
| 1763 |
notes sound.
|
| 1764 |
|
| 1765 |
442
|
| 1766 |
-
00:21:00,
|
| 1767 |
Like if anyone were to find my phone they'd be
|
| 1768 |
|
| 1769 |
443
|
| 1770 |
-
00:21:04,
|
| 1771 |
like this is the most boring person in the world.
|
| 1772 |
|
| 1773 |
444
|
| 1774 |
-
00:21:06,
|
| 1775 |
Although actually there are some journaling thoughts as well, but
|
| 1776 |
|
| 1777 |
445
|
| 1778 |
-
00:21:10,
|
| 1779 |
it's a lot of content like that.
|
| 1780 |
|
| 1781 |
446
|
| 1782 |
-
00:21:11,
|
| 1783 |
And the probably for the evaluation, the most useful thing
|
| 1784 |
|
| 1785 |
447
|
| 1786 |
-
00:21:14,
|
| 1787 |
is slightly obscure tech, GitHub, Nucleano, hugging face, not so
|
| 1788 |
|
| 1789 |
448
|
| 1790 |
-
00:21:21,
|
| 1791 |
obscure that it's not gonna have a chance of knowing
|
| 1792 |
|
| 1793 |
449
|
| 1794 |
-
00:21:24,
|
| 1795 |
it, but hopefully sufficiently well known that the model should
|
| 1796 |
|
| 1797 |
450
|
| 1798 |
-
00:21:27,
|
| 1799 |
get it.
|
| 1800 |
|
| 1801 |
451
|
| 1802 |
-
00:21:27,
|
| 1803 |
I tried to do a little bit of speaking really
|
| 1804 |
|
| 1805 |
452
|
| 1806 |
-
00:21:
|
| 1807 |
fast and speaking very slowly.
|
| 1808 |
|
| 1809 |
453
|
| 1810 |
-
00:21:32,
|
| 1811 |
Would say in general, I've spoken, delivered this at a
|
| 1812 |
|
| 1813 |
454
|
| 1814 |
-
00:21:35,
|
| 1815 |
faster pace than I usually would owing to strong coffee
|
| 1816 |
|
| 1817 |
455
|
| 1818 |
-
00:21:39,
|
| 1819 |
flowing through my bloodstream.
|
| 1820 |
|
| 1821 |
456
|
| 1822 |
-
00:21:41,
|
| 1823 |
And the thing that I'm not gonna get in this
|
| 1824 |
|
| 1825 |
457
|
| 1826 |
-
00:21:43,
|
| 1827 |
benchmark is background noise, which in my first take that
|
| 1828 |
|
| 1829 |
458
|
| 1830 |
-
00:21:46,
|
| 1831 |
I had to get rid of, my wife came in
|
| 1832 |
|
| 1833 |
459
|
| 1834 |
-
00:21:48,
|
| 1835 |
with my son and for a good night kiss.
|
| 1836 |
|
| 1837 |
460
|
| 1838 |
-
00:21:51,
|
| 1839 |
And that actually would have been super helpful to get
|
| 1840 |
|
| 1841 |
461
|
| 1842 |
-
00:21:55,
|
| 1843 |
in because it was non diarized or if we had
|
| 1844 |
|
| 1845 |
462
|
| 1846 |
-
00:21:57,
|
| 1847 |
diarization.
|
| 1848 |
|
| 1849 |
463
|
| 1850 |
-
00:21:59,
|
| 1851 |
A female, I could say, I want the male voice
|
| 1852 |
|
| 1853 |
464
|
| 1854 |
-
00:22:01,
|
| 1855 |
and that wasn't intended for transcription.
|
| 1856 |
|
| 1857 |
465
|
| 1858 |
-
00:22:04,
|
| 1859 |
And we're not going to get background noise like people
|
| 1860 |
|
| 1861 |
466
|
| 1862 |
-
00:22:06,
|
| 1863 |
honking their horns, which is something I've done in my
|
| 1864 |
|
| 1865 |
467
|
| 1866 |
-
00:22:09,
|
| 1867 |
main data set where I am trying to go back
|
| 1868 |
|
| 1869 |
468
|
| 1870 |
-
00:22:11,
|
| 1871 |
to some of my voice notes, annotate them and run
|
| 1872 |
|
| 1873 |
469
|
| 1874 |
-
00:22:15,
|
| 1875 |
a benchmark.
|
| 1876 |
|
| 1877 |
470
|
| 1878 |
-
00:22:15,
|
| 1879 |
But this is going to be just a pure quick
|
| 1880 |
|
| 1881 |
471
|
| 1882 |
-
00:22:18,
|
| 1883 |
test.
|
| 1884 |
|
| 1885 |
472
|
| 1886 |
-
00:22:19,
|
| 1887 |
And as someone I'm working on a voice note idea.
|
| 1888 |
|
| 1889 |
473
|
| 1890 |
-
00:22:24,
|
| 1891 |
That's my sort of end motivation besides thinking it's an
|
| 1892 |
|
| 1893 |
474
|
| 1894 |
-
00:22:28,
|
| 1895 |
absolutely outstanding technology that's coming to viability.
|
| 1896 |
|
| 1897 |
475
|
| 1898 |
-
00:22:31,
|
| 1899 |
And really, I know this sounds cheesy, can actually have
|
| 1900 |
|
| 1901 |
476
|
| 1902 |
-
00:22:34,
|
| 1903 |
a very transformative effect.
|
| 1904 |
|
| 1905 |
477
|
| 1906 |
-
00:22:
|
| 1907 |
Voice technology has been life changing for folks living with
|
| 1908 |
|
| 1909 |
478
|
| 1910 |
-
00:22:
|
| 1911 |
disabilities.
|
| 1912 |
|
| 1913 |
479
|
| 1914 |
-
00:22:
|
| 1915 |
And I think there's something really nice about the fact
|
| 1916 |
|
| 1917 |
480
|
| 1918 |
-
00:22:48,
|
| 1919 |
that it can also benefit folks who are able-bodied and
|
| 1920 |
|
| 1921 |
481
|
| 1922 |
-
00:22:52,
|
| 1923 |
we can all in different ways make this tech as
|
| 1924 |
|
| 1925 |
482
|
| 1926 |
-
00:22:57,
|
| 1927 |
useful as possible regardless of the exact way that we're
|
| 1928 |
|
| 1929 |
483
|
| 1930 |
-
00:23:00,
|
| 1931 |
using it.
|
| 1932 |
|
| 1933 |
484
|
| 1934 |
-
00:23:02,
|
| 1935 |
And I think there's something very powerful in that, and
|
| 1936 |
|
| 1937 |
485
|
| 1938 |
-
00:23:04,
|
| 1939 |
it can be very cool.
|
| 1940 |
|
| 1941 |
486
|
| 1942 |
-
00:23:06,
|
| 1943 |
I see huge potential.
|
| 1944 |
|
| 1945 |
487
|
| 1946 |
-
00:23:07,
|
| 1947 |
What excites me about voice tech?
|
| 1948 |
|
| 1949 |
488
|
| 1950 |
-
00:23:09,
|
| 1951 |
A lot of things actually.
|
| 1952 |
|
| 1953 |
489
|
| 1954 |
-
00:23:12,
|
| 1955 |
Firstly, the fact that it's cheap and accurate, as I
|
| 1956 |
|
| 1957 |
490
|
| 1958 |
-
00:23:14,
|
| 1959 |
mentioned at the very start of this, and it's getting
|
| 1960 |
|
| 1961 |
491
|
| 1962 |
-
00:23:17,
|
| 1963 |
better and better with stuff like accent handling.
|
| 1964 |
|
| 1965 |
492
|
| 1966 |
-
00:23:20,
|
| 1967 |
I'm not sure my fine tune will actually ever come
|
| 1968 |
|
| 1969 |
493
|
| 1970 |
-
00:23:23,
|
| 1971 |
to fruition in the sense that I'll use it day
|
| 1972 |
|
| 1973 |
494
|
| 1974 |
-
00:23:25,
|
| 1975 |
to day as I imagine.
|
| 1976 |
|
| 1977 |
495
|
| 1978 |
-
00:23:26,
|
| 1979 |
I get like superb, flawless words error rates because I'm
|
| 1980 |
|
| 1981 |
496
|
| 1982 |
-
00:23:30,
|
| 1983 |
just kind of skeptical about local speech to text, as
|
| 1984 |
|
| 1985 |
497
|
| 1986 |
-
00:23:
|
| 1987 |
I mentioned.
|
| 1988 |
|
| 1989 |
498
|
| 1990 |
-
00:23:36,
|
| 1991 |
And I think the pace of innovation and improvement in
|
| 1992 |
|
| 1993 |
499
|
| 1994 |
-
00:23:39,
|
| 1995 |
the models, the main reasons for fine tuning from what
|
| 1996 |
|
| 1997 |
500
|
| 1998 |
-
00:23:42,
|
| 1999 |
I've seen have been people who are something that really
|
| 2000 |
|
| 2001 |
501
|
| 2002 |
-
00:23:46,
|
| 2003 |
blows blows my mind about ASR is the idea that
|
| 2004 |
|
| 2005 |
502
|
| 2006 |
-
00:23:50,
|
| 2007 |
it's inherently ailingual or multilingual, phonetic based.
|
| 2008 |
|
| 2009 |
503
|
| 2010 |
-
00:23:56,
|
| 2011 |
So as folks who use speak very obscure languages that
|
| 2012 |
|
| 2013 |
504
|
| 2014 |
-
00:24:00,
|
| 2015 |
there may be very there might be a paucity of
|
| 2016 |
|
| 2017 |
505
|
| 2018 |
-
00:24:02,
|
| 2019 |
training data or almost none at all, and therefore the
|
| 2020 |
|
| 2021 |
506
|
| 2022 |
-
00:24:05,
|
| 2023 |
accuracy is significantly reduced.
|
| 2024 |
|
| 2025 |
507
|
| 2026 |
-
00:24:06,
|
| 2027 |
Or folks in very critical environments, I know there are
|
| 2028 |
|
| 2029 |
508
|
| 2030 |
-
00:24:11,
|
| 2031 |
this is used extensively in medical transcription and dispatcher work
|
| 2032 |
|
| 2033 |
509
|
| 2034 |
-
00:24:15,
|
| 2035 |
as, you know the call centers who send out ambulances
|
| 2036 |
|
| 2037 |
510
|
| 2038 |
-
00:24:19,
|
| 2039 |
etc.
|
| 2040 |
|
| 2041 |
511
|
| 2042 |
-
00:24:20,
|
| 2043 |
Where accuracy is absolutely paramount and in the case of
|
| 2044 |
|
| 2045 |
512
|
| 2046 |
-
00:24:23,
|
| 2047 |
doctors radiologists they might be using very specialized vocab all
|
| 2048 |
|
| 2049 |
513
|
| 2050 |
-
00:24:27,
|
| 2051 |
the time.
|
| 2052 |
|
| 2053 |
514
|
| 2054 |
-
00:24:28,
|
| 2055 |
So those are kind of the main two things, and
|
| 2056 |
|
| 2057 |
515
|
| 2058 |
-
00:24:30,
|
| 2059 |
I'm not sure that really just for trying to make
|
| 2060 |
|
| 2061 |
516
|
| 2062 |
-
00:24:32,
|
| 2063 |
it better on a few random tech words with my
|
| 2064 |
|
| 2065 |
517
|
| 2066 |
-
00:24:36,
|
| 2067 |
slightly I mean, I have an accent, but, like, not,
|
| 2068 |
|
| 2069 |
518
|
| 2070 |
-
00:24:39,
|
| 2071 |
you know, an accent that a few other million people
|
| 2072 |
|
| 2073 |
519
|
| 2074 |
-
00:24:42,
|
| 2075 |
have ish.
|
| 2076 |
|
| 2077 |
520
|
| 2078 |
-
00:24:44,
|
| 2079 |
I'm not sure that my little fine tune is gonna
|
| 2080 |
|
| 2081 |
521
|
| 2082 |
-
00:24:
|
| 2083 |
actually like, the bump in word error reduction, if I
|
| 2084 |
|
| 2085 |
522
|
| 2086 |
-
00:24:52,
|
| 2087 |
ever actually figure out how to do it and get
|
| 2088 |
|
| 2089 |
523
|
| 2090 |
-
00:24:54,
|
| 2091 |
it up to the cloud, by the time we've done
|
| 2092 |
|
| 2093 |
524
|
| 2094 |
-
00:24:56,
|
| 2095 |
that, I suspect that the next generation of ASR will
|
| 2096 |
|
| 2097 |
525
|
| 2098 |
-
00:
|
| 2099 |
just be so good that it will kind of be,
|
| 2100 |
|
| 2101 |
526
|
| 2102 |
-
00:25:
|
| 2103 |
well, that would have been cool if it worked out,
|
| 2104 |
|
| 2105 |
527
|
| 2106 |
-
00:25:
|
| 2107 |
but I'll just use this instead.
|
| 2108 |
|
| 2109 |
528
|
| 2110 |
-
00:25:05,
|
| 2111 |
So that's gonna be it for today's episode of voice
|
| 2112 |
|
| 2113 |
529
|
| 2114 |
-
00:25:10,
|
| 2115 |
training data.
|
| 2116 |
|
| 2117 |
530
|
| 2118 |
-
00:25:11,
|
| 2119 |
Single, long shot evaluation.
|
| 2120 |
|
| 2121 |
531
|
| 2122 |
-
00:25:14,
|
| 2123 |
Who am I gonna compare?
|
| 2124 |
|
| 2125 |
532
|
| 2126 |
-
00:25:16,
|
| 2127 |
Whisper is always good as a benchmark, but I'm more
|
| 2128 |
|
| 2129 |
533
|
| 2130 |
-
00:25:18,
|
| 2131 |
interested in seeing Whisper head to head with two things
|
| 2132 |
|
| 2133 |
534
|
| 2134 |
-
00:25:22,
|
| 2135 |
really.
|
| 2136 |
|
| 2137 |
535
|
| 2138 |
-
00:25:23,
|
| 2139 |
One is Whisper variants.
|
| 2140 |
|
| 2141 |
536
|
| 2142 |
-
00:25:25,
|
| 2143 |
So you've got these projects like Faster Whisper.
|
| 2144 |
|
| 2145 |
537
|
| 2146 |
-
00:25:29,
|
| 2147 |
Distill Whisper.
|
| 2148 |
|
| 2149 |
538
|
| 2150 |
-
00:25:
|
| 2151 |
It's a bit confusing.
|
| 2152 |
|
| 2153 |
539
|
| 2154 |
-
00:25:30,
|
| 2155 |
There's a whole bunch of them.
|
| 2156 |
|
| 2157 |
540
|
| 2158 |
-
00:25:32,
|
| 2159 |
And the emerging ASRs, which are also a thing.
|
| 2160 |
|
| 2161 |
541
|
| 2162 |
-
00:25:35,
|
| 2163 |
My intention for this is I'm not sure I'm gonna
|
| 2164 |
|
| 2165 |
542
|
| 2166 |
-
00:25:37,
|
| 2167 |
have the time in any point in the foreseeable future
|
| 2168 |
|
| 2169 |
543
|
| 2170 |
-
00:25:39,
|
| 2171 |
to go back to this whole episode and create a
|
| 2172 |
|
| 2173 |
544
|
| 2174 |
-
00:25:44,
|
| 2175 |
proper source truth where I fix everything.
|
| 2176 |
|
| 2177 |
545
|
| 2178 |
-
00:25:49,
|
| 2179 |
Might do it if I can get one transcription that's
|
| 2180 |
|
| 2181 |
546
|
| 2182 |
-
00:25:51,
|
| 2183 |
sufficiently close to perfection.
|
| 2184 |
|
| 2185 |
547
|
| 2186 |
-
00:25:
|
| 2187 |
But what I would actually love to do on Hugging
|
| 2188 |
|
| 2189 |
548
|
| 2190 |
-
00:25:58,
|
| 2191 |
Face, I think would be a great probably how I
|
| 2192 |
|
| 2193 |
549
|
| 2194 |
-
00:26:00,
|
| 2195 |
might visualize this is having the audio waveform play and
|
| 2196 |
|
| 2197 |
550
|
| 2198 |
-
00:26:04,
|
| 2199 |
then have the transcript for each model below it and
|
| 2200 |
|
| 2201 |
551
|
| 2202 |
-
00:26:08,
|
| 2203 |
maybe even a, like, you know, to scale and maybe
|
| 2204 |
|
| 2205 |
552
|
| 2206 |
-
00:26:13,
|
| 2207 |
even a local one as well, like local whisper versus
|
| 2208 |
|
| 2209 |
553
|
| 2210 |
-
00:26:16,
|
| 2211 |
OpenAI API, etcetera.
|
| 2212 |
|
| 2213 |
554
|
| 2214 |
-
00:26:19,
|
| 2215 |
And I can then actually listen back to segments or
|
| 2216 |
|
| 2217 |
555
|
| 2218 |
-
00:26:23,
|
| 2219 |
anyone who wants to can listen back to segments of
|
| 2220 |
|
| 2221 |
556
|
| 2222 |
-
00:26:25,
|
| 2223 |
this recording and see where a particular model struggled and
|
| 2224 |
|
| 2225 |
557
|
| 2226 |
-
00:26:30,
|
| 2227 |
others didn't as well as the sort of headline finding
|
| 2228 |
|
| 2229 |
558
|
| 2230 |
-
00:26:33,
|
| 2231 |
of which had the best W E R but that
|
| 2232 |
|
| 2233 |
559
|
| 2234 |
-
00:26:35,
|
| 2235 |
would require the source of truth.
|
| 2236 |
|
| 2237 |
560
|
| 2238 |
-
00:26:37,
|
| 2239 |
Okay, that's it.
|
| 2240 |
|
| 2241 |
561
|
| 2242 |
-
00:26:38,
|
| 2243 |
I hope this was, I don't know, maybe useful for
|
| 2244 |
|
| 2245 |
562
|
| 2246 |
-
00:26:
|
| 2247 |
other folks interested in STT.
|
| 2248 |
|
| 2249 |
563
|
| 2250 |
-
00:26:
|
| 2251 |
You want to see I always think I've just said
|
| 2252 |
|
| 2253 |
564
|
| 2254 |
-
00:26:
|
| 2255 |
it as something I didn't intend to.
|
| 2256 |
|
| 2257 |
565
|
| 2258 |
-
00:26:47,
|
| 2259 |
STT, I said for those.
|
| 2260 |
|
| 2261 |
566
|
| 2262 |
-
00:26:49,
|
| 2263 |
Listen carefully, including hopefully the models themselves.
|
| 2264 |
|
| 2265 |
567
|
| 2266 |
-
00:26:53,
|
| 2267 |
This has been myself, Daniel Rosol.
|
| 2268 |
|
| 2269 |
568
|
| 2270 |
-
00:26:55,
|
| 2271 |
For more jumbled repositories about my roving interest in AI
|
| 2272 |
|
| 2273 |
569
|
| 2274 |
-
00:26:59,
|
| 2275 |
but particularly AgenTic, MCP and VoiceTech you can find me
|
| 2276 |
|
| 2277 |
570
|
| 2278 |
-
00:27:04,
|
| 2279 |
on GitHub.
|
| 2280 |
|
| 2281 |
571
|
| 2282 |
-
00:27:
|
| 2283 |
Hugging Face.
|
| 2284 |
|
| 2285 |
572
|
| 2286 |
-
00:27:08,
|
| 2287 |
Where else?
|
| 2288 |
|
| 2289 |
573
|
| 2290 |
-
00:27:
|
| 2291 |
DanielRosel dot com, which is my personal website, as well
|
| 2292 |
|
| 2293 |
574
|
| 2294 |
-
00:27:11,
|
| 2295 |
as this podcast whose name I sadly cannot remember.
|
| 2296 |
|
| 2297 |
575
|
| 2298 |
-
00:27:15,
|
| 2299 |
Until next time.
|
| 2300 |
|
| 2301 |
576
|
| 2302 |
-
00:27:16,
|
| 2303 |
Thanks for listening.
|
| 2304 |
|
|
|
|
| 1 |
1
|
| 2 |
+
00:00:00,000 --> 00:00:06,160
|
| 3 |
Hello and welcome to a audio dataset consisting of one
|
| 4 |
|
| 5 |
2
|
| 6 |
+
00:00:06,160 --> 00:00:08,320
|
| 7 |
single episode of a nonexistent podcast.
|
| 8 |
|
| 9 |
3
|
| 10 |
+
00:00:08,720 --> 00:00:12,800
|
| 11 |
Or it I may append this to a podcast that
|
| 12 |
|
| 13 |
4
|
| 14 |
+
00:00:12,800 --> 00:00:18,734
|
| 15 |
I set up recently regarding my with my thoughts on
|
| 16 |
|
| 17 |
5
|
| 18 |
+
00:00:18,735 --> 00:00:20,735
|
| 19 |
speech tech and A.
|
| 20 |
|
| 21 |
6
|
| 22 |
+
00:00:20,735 --> 00:00:21,134
|
| 23 |
I.
|
| 24 |
|
| 25 |
7
|
| 26 |
+
00:00:21,134 --> 00:00:22,734
|
| 27 |
In particular, more A.
|
| 28 |
|
| 29 |
8
|
| 30 |
+
00:00:22,734 --> 00:00:22,974
|
| 31 |
I.
|
| 32 |
|
| 33 |
9
|
| 34 |
+
00:00:22,974 --> 00:00:23,855
|
| 35 |
And generative A.
|
| 36 |
|
| 37 |
10
|
| 38 |
+
00:00:23,855 --> 00:00:24,015
|
| 39 |
I.
|
| 40 |
|
| 41 |
11
|
| 42 |
+
00:00:24,015 --> 00:00:26,414
|
| 43 |
I would I would say.
|
| 44 |
|
| 45 |
12
|
| 46 |
+
00:00:26,734 --> 00:00:30,789
|
| 47 |
But in any event, the purpose of this voice recording
|
| 48 |
|
| 49 |
13
|
| 50 |
+
00:00:30,789 --> 00:00:35,510
|
| 51 |
is actually to create a lengthy voice sample for a
|
| 52 |
|
| 53 |
14
|
| 54 |
+
00:00:35,510 --> 00:00:38,870
|
| 55 |
quick evaluation, a back of the envelope evaluation, they might
|
| 56 |
|
| 57 |
15
|
| 58 |
+
00:00:38,870 --> 00:00:41,349
|
| 59 |
say, for different speech attacks models.
|
| 60 |
|
| 61 |
16
|
| 62 |
+
00:00:41,349 --> 00:00:43,865
|
| 63 |
I'm doing this because I thought I'd made a great
|
| 64 |
|
| 65 |
17
|
| 66 |
+
00:00:43,865 --> 00:00:47,704
|
| 67 |
breakthrough in my journey with speech tech and that was
|
| 68 |
|
| 69 |
18
|
| 70 |
+
00:00:47,704 --> 00:00:51,305
|
| 71 |
succeeding in the elusive task of fine tuning whisper.
|
| 72 |
|
| 73 |
19
|
| 74 |
+
00:00:51,624 --> 00:00:56,344
|
| 75 |
Whisper is, and I'm to just talk, I'm trying to
|
| 76 |
|
| 77 |
20
|
| 78 |
+
00:00:55,749 --> 00:00:56,709
|
| 79 |
mix up.
|
| 80 |
|
| 81 |
21
|
| 82 |
+
00:00:56,789 --> 00:01:00,310
|
| 83 |
I'm going to try a few different styles of speaking
|
| 84 |
|
| 85 |
22
|
| 86 |
+
00:01:00,310 --> 00:01:02,789
|
| 87 |
whisper something at some points as well.
|
| 88 |
|
| 89 |
23
|
| 90 |
+
00:01:03,270 --> 00:01:06,710
|
| 91 |
And I'll go back to speaking loud in in different
|
| 92 |
|
| 93 |
24
|
| 94 |
+
00:01:06,710 --> 00:01:08,950
|
| 95 |
parts are going to sound really like a crazy person
|
| 96 |
|
| 97 |
25
|
| 98 |
+
00:01:08,950 --> 00:01:12,344
|
| 99 |
because I'm also going to try to speak at different
|
| 100 |
|
| 101 |
26
|
| 102 |
+
00:01:12,904 --> 00:01:17,945
|
| 103 |
pitches and cadences in order to really try to push
|
| 104 |
|
| 105 |
27
|
| 106 |
+
00:01:18,264 --> 00:01:21,065
|
| 107 |
a speech to text model through its paces, which is
|
| 108 |
|
| 109 |
28
|
| 110 |
+
00:01:21,065 --> 00:01:24,529
|
| 111 |
trying to make sense of is this guy just rambling
|
| 112 |
|
| 113 |
29
|
| 114 |
+
00:01:24,529 --> 00:01:29,969
|
| 115 |
on incoherently in one long sentence or are these just
|
| 116 |
|
| 117 |
30
|
| 118 |
+
00:01:29,969 --> 00:01:36,370
|
| 119 |
actually a series of step standalone, standalone, standalone sentences?
|
| 120 |
|
| 121 |
31
|
| 122 |
+
00:01:36,370 --> 00:01:38,050
|
| 123 |
And how is it going to handle step alone?
|
| 124 |
|
| 125 |
32
|
| 126 |
+
00:01:38,050 --> 00:01:38,690
|
| 127 |
That's not a word.
|
| 128 |
|
| 129 |
33
|
| 130 |
+
00:01:39,624 --> 00:01:41,945
|
| 131 |
What happens when you use speech to text and you
|
| 132 |
|
| 133 |
34
|
| 134 |
+
00:01:41,945 --> 00:01:43,304
|
| 135 |
use a fake word?
|
| 136 |
|
| 137 |
35
|
| 138 |
+
00:01:43,304 --> 00:01:45,704
|
| 139 |
And then you're like, wait, that's not actually that word
|
| 140 |
|
| 141 |
36
|
| 142 |
+
00:01:45,704 --> 00:01:46,585
|
| 143 |
doesn't exist.
|
| 144 |
|
| 145 |
37
|
| 146 |
+
00:01:46,904 --> 00:01:48,504
|
| 147 |
How does AI handle that?
|
| 148 |
|
| 149 |
38
|
| 150 |
+
00:01:48,504 --> 00:01:53,670
|
| 151 |
And these and more are all the questions that I'm
|
| 152 |
|
| 153 |
39
|
| 154 |
+
00:01:53,670 --> 00:01:55,670
|
| 155 |
seeking to answer in this training data.
|
| 156 |
|
| 157 |
40
|
| 158 |
+
00:01:55,749 --> 00:01:58,469
|
| 159 |
Now, why was I trying to fine tune Whisper?
|
| 160 |
|
| 161 |
41
|
| 162 |
+
00:01:58,469 --> 00:01:59,670
|
| 163 |
And what is Whisper?
|
| 164 |
|
| 165 |
42
|
| 166 |
+
00:01:59,670 --> 00:02:02,630
|
| 167 |
As I said, I'm going to try to record this
|
| 168 |
|
| 169 |
43
|
| 170 |
+
00:02:02,630 --> 00:02:06,564
|
| 171 |
at a couple of different levels of technicality for folks
|
| 172 |
|
| 173 |
44
|
| 174 |
+
00:02:06,564 --> 00:02:11,684
|
| 175 |
who are in the normal world and not totally stuck
|
| 176 |
|
| 177 |
45
|
| 178 |
+
00:02:11,684 --> 00:02:13,684
|
| 179 |
down the rabbit hole of AI, which you have to
|
| 180 |
|
| 181 |
46
|
| 182 |
+
00:02:13,684 --> 00:02:17,605
|
| 183 |
say is a really wonderful rabbit hole to be done.
|
| 184 |
|
| 185 |
47
|
| 186 |
+
00:02:17,764 --> 00:02:20,839
|
| 187 |
It's a really interesting area and speech and voice tech
|
| 188 |
|
| 189 |
48
|
| 190 |
+
00:02:20,839 --> 00:02:24,279
|
| 191 |
is is the aspect of it that I find actually
|
| 192 |
|
| 193 |
49
|
| 194 |
+
00:02:24,279 --> 00:02:27,159
|
| 195 |
most I'm not sure I would say the most interesting
|
| 196 |
|
| 197 |
50
|
| 198 |
+
00:02:27,159 --> 00:02:30,679
|
| 199 |
because there's just so much that is fascinating in AI.
|
| 200 |
|
| 201 |
51
|
| 202 |
+
00:02:31,320 --> 00:02:34,054
|
| 203 |
But the most that I find the most personally transformative
|
| 204 |
|
| 205 |
52
|
| 206 |
+
00:02:34,054 --> 00:02:38,454
|
| 207 |
in terms of the impact that it's had on my
|
| 208 |
|
| 209 |
53
|
| 210 |
+
00:02:38,454 --> 00:02:41,174
|
| 211 |
daily work life and productivity and how I sort of
|
| 212 |
|
| 213 |
54
|
| 214 |
+
00:02:41,174 --> 00:02:41,815
|
| 215 |
work.
|
| 216 |
|
| 217 |
55
|
| 218 |
+
00:02:42,855 --> 00:02:47,420
|
| 219 |
I'm persevering hard with the task of trying to get
|
| 220 |
|
| 221 |
56
|
| 222 |
+
00:02:47,420 --> 00:02:50,859
|
| 223 |
a good solution working for Linux, which if anyone actually
|
| 224 |
|
| 225 |
57
|
| 226 |
+
00:02:50,859 --> 00:02:52,859
|
| 227 |
does listen to this, not just for the training data
|
| 228 |
|
| 229 |
58
|
| 230 |
+
00:02:52,859 --> 00:02:56,620
|
| 231 |
and for the actual content, is sparked.
|
| 232 |
|
| 233 |
59
|
| 234 |
+
00:02:56,620 --> 00:02:59,900
|
| 235 |
I had, besides the fine tune not working, well that
|
| 236 |
|
| 237 |
60
|
| 238 |
+
00:02:59,900 --> 00:03:01,305
|
| 239 |
was the failure.
|
| 240 |
|
| 241 |
61
|
| 242 |
+
00:03:02,424 --> 00:03:06,665
|
| 243 |
I used Claude code because one thinks these days that
|
| 244 |
|
| 245 |
62
|
| 246 |
+
00:03:06,665 --> 00:03:13,200
|
| 247 |
there is nothing short of solving, you know, the the
|
| 248 |
|
| 249 |
63
|
| 250 |
+
00:03:13,200 --> 00:03:17,519
|
| 251 |
reason of life or something that clause and agentic AI
|
| 252 |
|
| 253 |
64
|
| 254 |
+
00:03:17,519 --> 00:03:19,600
|
| 255 |
can't do, which is not really the case.
|
| 256 |
|
| 257 |
65
|
| 258 |
+
00:03:19,600 --> 00:03:23,119
|
| 259 |
It does seem that way sometimes, but it fails a
|
| 260 |
|
| 261 |
66
|
| 262 |
+
00:03:23,119 --> 00:03:23,679
|
| 263 |
lot as well.
|
| 264 |
|
| 265 |
67
|
| 266 |
+
00:03:23,679 --> 00:03:26,559
|
| 267 |
And this is one of those instances where last week
|
| 268 |
|
| 269 |
68
|
| 270 |
+
00:03:26,559 --> 00:03:30,744
|
| 271 |
I put together an hour of voice training data, basically
|
| 272 |
|
| 273 |
69
|
| 274 |
+
00:03:30,744 --> 00:03:33,385
|
| 275 |
speaking just random things for three minutes.
|
| 276 |
|
| 277 |
70
|
| 278 |
+
00:03:35,385 --> 00:03:38,024
|
| 279 |
It was actually kind of tedious because the texts were
|
| 280 |
|
| 281 |
71
|
| 282 |
+
00:03:38,024 --> 00:03:38,584
|
| 283 |
really weird.
|
| 284 |
|
| 285 |
72
|
| 286 |
+
00:03:38,584 --> 00:03:41,290
|
| 287 |
Some of them were, it was like it was AI
|
| 288 |
|
| 289 |
73
|
| 290 |
+
00:03:41,290 --> 00:03:42,170
|
| 291 |
generated.
|
| 292 |
|
| 293 |
74
|
| 294 |
+
00:03:42,489 --> 00:03:44,809
|
| 295 |
I tried before to read Sherlock Holmes for an hour
|
| 296 |
|
| 297 |
75
|
| 298 |
+
00:03:44,809 --> 00:03:47,609
|
| 299 |
and I just couldn't, I was so bored after ten
|
| 300 |
|
| 301 |
76
|
| 302 |
+
00:03:47,609 --> 00:03:50,489
|
| 303 |
minutes that I was like, okay, no, I'm just gonna
|
| 304 |
|
| 305 |
77
|
| 306 |
+
00:03:50,489 --> 00:03:51,850
|
| 307 |
have to find something else to read.
|
| 308 |
|
| 309 |
78
|
| 310 |
+
00:03:51,850 --> 00:03:58,204
|
| 311 |
So I used a created with AI Studio, VibeCoded, a
|
| 312 |
|
| 313 |
79
|
| 314 |
+
00:03:58,204 --> 00:04:03,084
|
| 315 |
synthetic text generator which actually I thought was probably a
|
| 316 |
|
| 317 |
80
|
| 318 |
+
00:04:03,084 --> 00:04:05,165
|
| 319 |
better way of doing it because it would give me
|
| 320 |
|
| 321 |
81
|
| 322 |
+
00:04:05,165 --> 00:04:08,989
|
| 323 |
more short samples with more varied content.
|
| 324 |
|
| 325 |
82
|
| 326 |
+
00:04:08,989 --> 00:04:11,630
|
| 327 |
So I was like, okay, give me a voice note
|
| 328 |
|
| 329 |
83
|
| 330 |
+
00:04:11,630 --> 00:04:14,829
|
| 331 |
like I'm recording an email, give me a short story
|
| 332 |
|
| 333 |
84
|
| 334 |
+
00:04:14,829 --> 00:04:18,109
|
| 335 |
to read, give me prose to read.
|
| 336 |
|
| 337 |
85
|
| 338 |
+
00:04:18,109 --> 00:04:20,554
|
| 339 |
So I came up with all these different things and
|
| 340 |
|
| 341 |
86
|
| 342 |
+
00:04:20,554 --> 00:04:22,634
|
| 343 |
they added a little timer to it so I could
|
| 344 |
|
| 345 |
87
|
| 346 |
+
00:04:22,634 --> 00:04:24,875
|
| 347 |
see how close I was to one hour.
|
| 348 |
|
| 349 |
88
|
| 350 |
+
00:04:25,835 --> 00:04:29,035
|
| 351 |
And I spent like an hour one afternoon or probably
|
| 352 |
|
| 353 |
89
|
| 354 |
+
00:04:29,035 --> 00:04:33,035
|
| 355 |
two hours by the time you do retakes and whatever
|
| 356 |
|
| 357 |
90
|
| 358 |
+
00:04:33,035 --> 00:04:36,089
|
| 359 |
because you want to it gave me a source of
|
| 360 |
|
| 361 |
91
|
| 362 |
+
00:04:36,089 --> 00:04:39,929
|
| 363 |
truth which I'm not sure if that's the scientific way
|
| 364 |
|
| 365 |
92
|
| 366 |
+
00:04:39,929 --> 00:04:44,089
|
| 367 |
to approach this topic of gathering training data but I
|
| 368 |
|
| 369 |
93
|
| 370 |
+
00:04:44,089 --> 00:04:45,369
|
| 371 |
thought made sense.
|
| 372 |
|
| 373 |
94
|
| 374 |
+
00:04:46,410 --> 00:04:49,384
|
| 375 |
I have a lot of audio data from recording voice
|
| 376 |
|
| 377 |
95
|
| 378 |
+
00:04:49,384 --> 00:04:53,464
|
| 379 |
notes which I've also kind of used, been experimenting with
|
| 380 |
|
| 381 |
96
|
| 382 |
+
00:04:53,464 --> 00:04:54,984
|
| 383 |
using for a different purpose.
|
| 384 |
|
| 385 |
97
|
| 386 |
+
00:04:55,304 --> 00:04:58,665
|
| 387 |
Slightly different annotating task types.
|
| 388 |
|
| 389 |
98
|
| 390 |
+
00:04:58,665 --> 00:05:03,170
|
| 391 |
It's more a text classification experiment or Well, it's more
|
| 392 |
|
| 393 |
99
|
| 394 |
+
00:05:03,170 --> 00:05:03,730
|
| 395 |
than that actually.
|
| 396 |
|
| 397 |
100
|
| 398 |
+
00:05:03,730 --> 00:05:04,929
|
| 399 |
I'm working on a voice app.
|
| 400 |
|
| 401 |
101
|
| 402 |
+
00:05:04,929 --> 00:05:09,249
|
| 403 |
So it's a prototype, I guess, is really more accurate.
|
| 404 |
|
| 405 |
102
|
| 406 |
+
00:05:11,329 --> 00:05:13,889
|
| 407 |
But you can do that and you can work backwards.
|
| 408 |
|
| 409 |
103
|
| 410 |
+
00:05:13,889 --> 00:05:18,274
|
| 411 |
Listen back to a voice note and you painfully go
|
| 412 |
|
| 413 |
104
|
| 414 |
+
00:05:18,274 --> 00:05:21,394
|
| 415 |
through one of those transcribing, where you start and stop
|
| 416 |
|
| 417 |
105
|
| 418 |
+
00:05:21,394 --> 00:05:23,554
|
| 419 |
and scrub around it and you fix the errors, but
|
| 420 |
|
| 421 |
106
|
| 422 |
+
00:05:23,554 --> 00:05:25,795
|
| 423 |
it's really, really pouring to do that.
|
| 424 |
|
| 425 |
107
|
| 426 |
+
00:05:26,035 --> 00:05:27,954
|
| 427 |
So I thought it would be less tedious in the
|
| 428 |
|
| 429 |
108
|
| 430 |
+
00:05:27,954 --> 00:05:31,634
|
| 431 |
long term if I just recorded the source of truth.
|
| 432 |
|
| 433 |
109
|
| 434 |
+
00:05:31,989 --> 00:05:34,309
|
| 435 |
So it gave me these three minutes snippets.
|
| 436 |
|
| 437 |
110
|
| 438 |
+
00:05:34,309 --> 00:05:37,429
|
| 439 |
I recorded them and saved an MP3 and a TXT
|
| 440 |
|
| 441 |
111
|
| 442 |
+
00:05:37,670 --> 00:05:40,230
|
| 443 |
in the same folder and I created an error that
|
| 444 |
|
| 445 |
112
|
| 446 |
+
00:05:40,230 --> 00:05:40,869
|
| 447 |
data.
|
| 448 |
|
| 449 |
113
|
| 450 |
+
00:05:41,910 --> 00:05:44,790
|
| 451 |
So I was very hopeful, quietly, a little bit hopeful
|
| 452 |
|
| 453 |
114
|
| 454 |
+
00:05:44,790 --> 00:05:46,949
|
| 455 |
that I would be able, that I could actually fine
|
| 456 |
|
| 457 |
115
|
| 458 |
+
00:05:46,949 --> 00:05:47,670
|
| 459 |
tune Whisper.
|
| 460 |
|
| 461 |
116
|
| 462 |
+
00:05:48,285 --> 00:05:51,005
|
| 463 |
I want to fine tune Whisper because when I got
|
| 464 |
|
| 465 |
117
|
| 466 |
+
00:05:51,005 --> 00:05:54,924
|
| 467 |
into voice tech last November, my wife was in the
|
| 468 |
|
| 469 |
118
|
| 470 |
+
00:05:54,924 --> 00:05:57,165
|
| 471 |
US and I was alone at home.
|
| 472 |
|
| 473 |
119
|
| 474 |
+
00:05:57,244 --> 00:06:00,924
|
| 475 |
And when crazy people like me do really wild things
|
| 476 |
|
| 477 |
120
|
| 478 |
+
00:06:00,924 --> 00:06:03,900
|
| 479 |
like use voice to tech technology.
|
| 480 |
|
| 481 |
121
|
| 482 |
+
00:06:03,900 --> 00:06:06,859
|
| 483 |
That was basically when I started doing it, I didn't
|
| 484 |
|
| 485 |
122
|
| 486 |
+
00:06:06,859 --> 00:06:09,500
|
| 487 |
feel like a crazy person speaking to myself.
|
| 488 |
|
| 489 |
123
|
| 490 |
+
00:06:09,900 --> 00:06:12,700
|
| 491 |
And my expectations weren't that high.
|
| 492 |
|
| 493 |
124
|
| 494 |
+
00:06:13,100 --> 00:06:17,605
|
| 495 |
I'd used speech tech now and again, tried it out.
|
| 496 |
|
| 497 |
125
|
| 498 |
+
00:06:17,605 --> 00:06:18,804
|
| 499 |
I was like, it'd be really cool if you could
|
| 500 |
|
| 501 |
126
|
| 502 |
+
00:06:18,804 --> 00:06:22,324
|
| 503 |
just like speak into your computer and whatever I tried
|
| 504 |
|
| 505 |
127
|
| 506 |
+
00:06:22,324 --> 00:06:25,845
|
| 507 |
out that had Linux support was just, it was not
|
| 508 |
|
| 509 |
128
|
| 510 |
+
00:06:25,845 --> 00:06:26,725
|
| 511 |
good basically.
|
| 512 |
|
| 513 |
129
|
| 514 |
+
00:06:27,285 --> 00:06:29,444
|
| 515 |
And this blew me away from the first go.
|
| 516 |
|
| 517 |
130
|
| 518 |
+
00:06:29,444 --> 00:06:32,259
|
| 519 |
I mean, it wasn't one hundred percent accurate out of
|
| 520 |
|
| 521 |
131
|
| 522 |
+
00:06:32,259 --> 00:06:34,420
|
| 523 |
the box and it took work, but it was good
|
| 524 |
|
| 525 |
132
|
| 526 |
+
00:06:34,420 --> 00:06:36,739
|
| 527 |
enough that there was a solid foundation and it kind
|
| 528 |
|
| 529 |
133
|
| 530 |
+
00:06:36,739 --> 00:06:41,059
|
| 531 |
of passed that pivot point that it's actually worth doing
|
| 532 |
|
| 533 |
134
|
| 534 |
+
00:06:41,059 --> 00:06:41,540
|
| 535 |
this.
|
| 536 |
|
| 537 |
135
|
| 538 |
+
00:06:41,859 --> 00:06:43,859
|
| 539 |
You know, there's a point where it's so like, the
|
| 540 |
|
| 541 |
136
|
| 542 |
+
00:06:43,859 --> 00:06:46,405
|
| 543 |
transcript is you don't have to get one hundred percent
|
| 544 |
|
| 545 |
137
|
| 546 |
+
00:06:46,405 --> 00:06:49,445
|
| 547 |
accuracy for it to be worth your time for speech
|
| 548 |
|
| 549 |
138
|
| 550 |
+
00:06:49,445 --> 00:06:51,845
|
| 551 |
to text to be a worthwhile addition to your productivity.
|
| 552 |
|
| 553 |
139
|
| 554 |
+
00:06:51,845 --> 00:06:53,605
|
| 555 |
But you do need to get above, let's say, I
|
| 556 |
|
| 557 |
140
|
| 558 |
+
00:06:53,605 --> 00:06:55,045
|
| 559 |
don't know, eighty five percent.
|
| 560 |
|
| 561 |
141
|
| 562 |
+
00:06:55,525 --> 00:06:58,725
|
| 563 |
If it's sixty percent or fifty percent, you inevitably say,
|
| 564 |
|
| 565 |
142
|
| 566 |
+
00:06:58,960 --> 00:07:00,239
|
| 567 |
Screw it, I'll just type it.
|
| 568 |
|
| 569 |
143
|
| 570 |
+
00:07:00,239 --> 00:07:03,600
|
| 571 |
Because you end up missing errors in the transcript and
|
| 572 |
|
| 573 |
144
|
| 574 |
+
00:07:03,600 --> 00:07:04,960
|
| 575 |
it becomes actually worse.
|
| 576 |
|
| 577 |
145
|
| 578 |
+
00:07:04,960 --> 00:07:06,640
|
| 579 |
You end up in a worse position than you started
|
| 580 |
|
| 581 |
146
|
| 582 |
+
00:07:06,640 --> 00:07:06,960
|
| 583 |
with it.
|
| 584 |
|
| 585 |
147
|
| 586 |
+
00:07:06,960 --> 00:07:08,160
|
| 587 |
That's been my experience.
|
| 588 |
|
| 589 |
148
|
| 590 |
+
00:07:08,480 --> 00:07:12,400
|
| 591 |
So I was like, Oh, this is actually really, really
|
| 592 |
|
| 593 |
149
|
| 594 |
+
00:07:12,400 --> 00:07:12,880
|
| 595 |
good now.
|
| 596 |
|
| 597 |
150
|
| 598 |
+
00:07:12,880 --> 00:07:13,600
|
| 599 |
How did that happen?
|
| 600 |
|
| 601 |
151
|
| 602 |
+
00:07:13,600 --> 00:07:17,915
|
| 603 |
And the answer is ASR, Whisper being open sourced and
|
| 604 |
|
| 605 |
152
|
| 606 |
+
00:07:18,634 --> 00:07:21,514
|
| 607 |
the transformer architecture, if you want to go back to
|
| 608 |
|
| 609 |
153
|
| 610 |
+
00:07:21,514 --> 00:07:26,314
|
| 611 |
the underpinnings, which really blows my mind and it's on
|
| 612 |
|
| 613 |
154
|
| 614 |
+
00:07:26,314 --> 00:07:29,750
|
| 615 |
my list to read through that paper.
|
| 616 |
|
| 617 |
155
|
| 618 |
+
00:07:30,309 --> 00:07:35,910
|
| 619 |
All you need is attention as attentively as can be
|
| 620 |
|
| 621 |
156
|
| 622 |
+
00:07:35,910 --> 00:07:39,270
|
| 623 |
done with my limited brain because it's super super high
|
| 624 |
|
| 625 |
157
|
| 626 |
+
00:07:39,270 --> 00:07:42,965
|
| 627 |
level stuff, super advanced stuff, mean.
|
| 628 |
|
| 629 |
158
|
| 630 |
+
00:07:43,205 --> 00:07:48,004
|
| 631 |
That I think of all the things that are fascinating
|
| 632 |
|
| 633 |
159
|
| 634 |
+
00:07:48,004 --> 00:07:52,484
|
| 635 |
about the sudden rise in AI and the dramatic capabilities,
|
| 636 |
|
| 637 |
160
|
| 638 |
+
00:07:53,259 --> 00:07:55,339
|
| 639 |
I find it fascinating that few people are like, hang
|
| 640 |
|
| 641 |
161
|
| 642 |
+
00:07:55,339 --> 00:07:58,220
|
| 643 |
on, you've got this thing that can speak to you
|
| 644 |
|
| 645 |
162
|
| 646 |
+
00:07:58,220 --> 00:07:59,980
|
| 647 |
like a chatbot, an LLM.
|
| 648 |
|
| 649 |
163
|
| 650 |
+
00:08:00,540 --> 00:08:02,780
|
| 651 |
And then you've got image generation.
|
| 652 |
|
| 653 |
164
|
| 654 |
+
00:08:02,780 --> 00:08:03,100
|
| 655 |
Okay.
|
| 656 |
|
| 657 |
165
|
| 658 |
+
00:08:03,100 --> 00:08:07,020
|
| 659 |
So firstly, two things on the surface have nothing in
|
| 660 |
|
| 661 |
166
|
| 662 |
+
00:08:07,020 --> 00:08:07,339
|
| 663 |
common.
|
| 664 |
|
| 665 |
167
|
| 666 |
+
00:08:08,285 --> 00:08:11,964
|
| 667 |
So how did that just happen all at the same
|
| 668 |
|
| 669 |
168
|
| 670 |
+
00:08:11,964 --> 00:08:12,205
|
| 671 |
time?
|
| 672 |
|
| 673 |
169
|
| 674 |
+
00:08:12,205 --> 00:08:15,884
|
| 675 |
And then when you extend that further, you're like, Suno.
|
| 676 |
|
| 677 |
170
|
| 678 |
+
00:08:15,884 --> 00:08:19,405
|
| 679 |
You can sing a song and AI will come up
|
| 680 |
|
| 681 |
171
|
| 682 |
+
00:08:19,405 --> 00:08:21,085
|
| 683 |
with an instrumental.
|
| 684 |
|
| 685 |
172
|
| 686 |
+
00:08:21,405 --> 00:08:23,405
|
| 687 |
And then you've got Whisper and you're like, Wait a
|
| 688 |
|
| 689 |
173
|
| 690 |
+
00:08:23,405 --> 00:08:23,645
|
| 691 |
second.
|
| 692 |
|
| 693 |
174
|
| 694 |
+
00:08:24,020 --> 00:08:28,100
|
| 695 |
How did all this stuff If it's all AI, there
|
| 696 |
|
| 697 |
175
|
| 698 |
+
00:08:28,100 --> 00:08:29,460
|
| 699 |
has to be some commonality.
|
| 700 |
|
| 701 |
176
|
| 702 |
+
00:08:29,460 --> 00:08:35,059
|
| 703 |
Otherwise, are totally different technologies on the surface of it.
|
| 704 |
|
| 705 |
177
|
| 706 |
+
00:08:35,140 --> 00:08:39,304
|
| 707 |
And the transformer architecture is, as far as I know,
|
| 708 |
|
| 709 |
178
|
| 710 |
+
00:08:39,304 --> 00:08:40,184
|
| 711 |
the answer.
|
| 712 |
|
| 713 |
179
|
| 714 |
+
00:08:40,184 --> 00:08:42,905
|
| 715 |
And I can't even say, can't even pretend that I
|
| 716 |
|
| 717 |
180
|
| 718 |
+
00:08:42,905 --> 00:08:47,304
|
| 719 |
really understand what the transformer architecture means in-depth.
|
| 720 |
|
| 721 |
181
|
| 722 |
+
00:08:47,304 --> 00:08:49,785
|
| 723 |
But I have scanned this and as I said, I
|
| 724 |
|
| 725 |
182
|
| 726 |
+
00:08:49,785 --> 00:08:52,799
|
| 727 |
want to print it and really kind of think over
|
| 728 |
|
| 729 |
183
|
| 730 |
+
00:08:52,799 --> 00:08:54,080
|
| 731 |
it at some point.
|
| 732 |
|
| 733 |
184
|
| 734 |
+
00:08:54,799 --> 00:08:58,000
|
| 735 |
And I'll probably feel bad about myself, I think, because
|
| 736 |
|
| 737 |
185
|
| 738 |
+
00:08:58,000 --> 00:08:59,599
|
| 739 |
weren't those guys in twenties?
|
| 740 |
|
| 741 |
186
|
| 742 |
+
00:09:00,240 --> 00:09:01,760
|
| 743 |
Like, that's crazy.
|
| 744 |
|
| 745 |
187
|
| 746 |
+
00:09:02,080 --> 00:09:06,080
|
| 747 |
I think I asked ChatGPT once who wrote that paper
|
| 748 |
|
| 749 |
188
|
| 750 |
+
00:09:06,465 --> 00:09:09,184
|
| 751 |
and how old were they when it was published in
|
| 752 |
|
| 753 |
189
|
| 754 |
+
00:09:09,184 --> 00:09:09,745
|
| 755 |
ArcSiv?
|
| 756 |
|
| 757 |
190
|
| 758 |
+
00:09:09,745 --> 00:09:13,025
|
| 759 |
And I was expecting like, I don't know, what do
|
| 760 |
|
| 761 |
191
|
| 762 |
+
00:09:13,025 --> 00:09:13,505
|
| 763 |
you imagine?
|
| 764 |
|
| 765 |
192
|
| 766 |
+
00:09:13,505 --> 00:09:15,585
|
| 767 |
I personally imagine kind of like, you you have these
|
| 768 |
|
| 769 |
193
|
| 770 |
+
00:09:15,585 --> 00:09:19,665
|
| 771 |
breakthroughs during COVID and things like that, where like these
|
| 772 |
|
| 773 |
194
|
| 774 |
+
00:09:19,665 --> 00:09:22,549
|
| 775 |
kind of really obscure scientists who are in their 50s
|
| 776 |
|
| 777 |
195
|
| 778 |
+
00:09:22,549 --> 00:09:26,790
|
| 779 |
and they've just kind of been laboring in labs and
|
| 780 |
|
| 781 |
196
|
| 782 |
+
00:09:26,790 --> 00:09:29,750
|
| 783 |
wearily in writing and publishing in kind of obscure academic
|
| 784 |
|
| 785 |
197
|
| 786 |
+
00:09:29,750 --> 00:09:30,630
|
| 787 |
publications.
|
| 788 |
|
| 789 |
198
|
| 790 |
+
00:09:30,790 --> 00:09:33,589
|
| 791 |
And they finally hit a big or win a Nobel
|
| 792 |
|
| 793 |
199
|
| 794 |
+
00:09:33,589 --> 00:09:36,155
|
| 795 |
Prize and then their household names.
|
| 796 |
|
| 797 |
200
|
| 798 |
+
00:09:36,554 --> 00:09:38,554
|
| 799 |
So that was kind of what I had in mind.
|
| 800 |
|
| 801 |
201
|
| 802 |
+
00:09:38,554 --> 00:09:42,074
|
| 803 |
That was the mental image I'd formed of the birth
|
| 804 |
|
| 805 |
202
|
| 806 |
+
00:09:42,074 --> 00:09:42,875
|
| 807 |
of ArcSim.
|
| 808 |
|
| 809 |
203
|
| 810 |
+
00:09:42,875 --> 00:09:45,515
|
| 811 |
Like I wasn't expecting twenty somethings in San Francisco.
|
| 812 |
|
| 813 |
204
|
| 814 |
+
00:09:45,515 --> 00:09:48,714
|
| 815 |
I thought that was both very funny, very cool, and
|
| 816 |
|
| 817 |
205
|
| 818 |
+
00:09:48,714 --> 00:09:49,995
|
| 819 |
actually kind of inspiring.
|
| 820 |
|
| 821 |
206
|
| 822 |
+
00:09:50,474 --> 00:09:55,150
|
| 823 |
It's nice to think that people who just you might
|
| 824 |
|
| 825 |
207
|
| 826 |
+
00:09:55,150 --> 00:09:58,429
|
| 827 |
put them in the kind of milieu or bubble or
|
| 828 |
|
| 829 |
208
|
| 830 |
+
00:09:58,429 --> 00:10:02,589
|
| 831 |
world that you are in incredibly in through a series
|
| 832 |
|
| 833 |
209
|
| 834 |
+
00:10:02,589 --> 00:10:05,755
|
| 835 |
of connections that are coming up with such literally world
|
| 836 |
|
| 837 |
210
|
| 838 |
+
00:10:05,755 --> 00:10:07,755
|
| 839 |
changing innovations.
|
| 840 |
|
| 841 |
211
|
| 842 |
+
00:10:07,834 --> 00:10:11,194
|
| 843 |
So that was I thought anyway, that's that that was
|
| 844 |
|
| 845 |
212
|
| 846 |
+
00:10:11,194 --> 00:10:11,755
|
| 847 |
cool.
|
| 848 |
|
| 849 |
213
|
| 850 |
+
00:10:12,155 --> 00:10:12,474
|
| 851 |
Okay.
|
| 852 |
|
| 853 |
214
|
| 854 |
+
00:10:12,474 --> 00:10:13,354
|
| 855 |
Voice training data.
|
| 856 |
|
| 857 |
215
|
| 858 |
+
00:10:13,354 --> 00:10:14,074
|
| 859 |
How are we doing?
|
| 860 |
|
| 861 |
216
|
| 862 |
+
00:10:14,074 --> 00:10:17,275
|
| 863 |
We're about ten minutes, and I'm still talking about voice
|
| 864 |
|
| 865 |
217
|
| 866 |
+
00:10:17,275 --> 00:10:18,155
|
| 867 |
technology.
|
| 868 |
|
| 869 |
218
|
| 870 |
+
00:10:18,554 --> 00:10:22,099
|
| 871 |
So Whisper was brilliant, and I was so excited that
|
| 872 |
|
| 873 |
219
|
| 874 |
+
00:10:22,099 --> 00:10:25,780
|
| 875 |
my first instinct was to guess, like, Oh my gosh,
|
| 876 |
|
| 877 |
220
|
| 878 |
+
00:10:25,780 --> 00:10:27,939
|
| 879 |
I have to get a really good microphone for this.
|
| 880 |
|
| 881 |
221
|
| 882 |
+
00:10:28,099 --> 00:10:31,299
|
| 883 |
So I didn't go on a spending spree because I
|
| 884 |
|
| 885 |
222
|
| 886 |
+
00:10:31,299 --> 00:10:33,219
|
| 887 |
said, I'm gonna have to just wait a month and
|
| 888 |
|
| 889 |
223
|
| 890 |
+
00:10:33,219 --> 00:10:34,660
|
| 891 |
see if I still use this.
|
| 892 |
|
| 893 |
224
|
| 894 |
+
00:10:35,140 --> 00:10:38,795
|
| 895 |
And it just kind of became it's become really part
|
| 896 |
|
| 897 |
225
|
| 898 |
+
00:10:38,795 --> 00:10:40,875
|
| 899 |
of my daily routine.
|
| 900 |
|
| 901 |
226
|
| 902 |
+
00:10:41,674 --> 00:10:44,235
|
| 903 |
Like if I'm writing an email, I'll record a voice
|
| 904 |
|
| 905 |
227
|
| 906 |
+
00:10:44,235 --> 00:10:47,515
|
| 907 |
note and then I've developed and it's nice to see
|
| 908 |
|
| 909 |
228
|
| 910 |
+
00:10:47,515 --> 00:10:50,679
|
| 911 |
that everyone is like developing the same things in parallel.
|
| 912 |
|
| 913 |
229
|
| 914 |
+
00:10:50,679 --> 00:10:53,319
|
| 915 |
That's kind of a weird thing to say, when I
|
| 916 |
|
| 917 |
230
|
| 918 |
+
00:10:53,319 --> 00:11:00,199
|
| 919 |
started working on these prototypes on GitHub, which is where
|
| 920 |
|
| 921 |
231
|
| 922 |
+
00:11:00,199 --> 00:11:03,959
|
| 923 |
I just kind of share very freely and loosely ideas
|
| 924 |
|
| 925 |
232
|
| 926 |
+
00:11:03,959 --> 00:11:06,865
|
| 927 |
and first iterations on concepts.
|
| 928 |
|
| 929 |
233
|
| 930 |
+
00:11:08,944 --> 00:11:10,624
|
| 931 |
And for want of a better word, I called it
|
| 932 |
|
| 933 |
234
|
| 934 |
+
00:11:10,624 --> 00:11:14,865
|
| 935 |
like LLM post processing or clean up or basically a
|
| 936 |
|
| 937 |
235
|
| 938 |
+
00:11:14,865 --> 00:11:17,665
|
| 939 |
system prompt that after you get back the raw text
|
| 940 |
|
| 941 |
236
|
| 942 |
+
00:11:17,665 --> 00:11:21,540
|
| 943 |
from Whisper, you run it through a model and say,
|
| 944 |
|
| 945 |
237
|
| 946 |
+
00:11:21,540 --> 00:11:26,259
|
| 947 |
okay, this is crappy text like add sentence structure and,
|
| 948 |
|
| 949 |
238
|
| 950 |
+
00:11:26,259 --> 00:11:27,379
|
| 951 |
you know, fix it up.
|
| 952 |
|
| 953 |
239
|
| 954 |
+
00:11:27,780 --> 00:11:32,499
|
| 955 |
And now when I'm exploring the different tools that are
|
| 956 |
|
| 957 |
240
|
| 958 |
+
00:11:32,499 --> 00:11:35,554
|
| 959 |
out there that people have built, I see quite a
|
| 960 |
|
| 961 |
241
|
| 962 |
+
00:11:35,554 --> 00:11:39,395
|
| 963 |
number of projects have basically done the same thing.
|
| 964 |
|
| 965 |
242
|
| 966 |
+
00:11:40,674 --> 00:11:43,155
|
| 967 |
Lest that be misconstrued, I'm not saying for a millisecond
|
| 968 |
|
| 969 |
243
|
| 970 |
+
00:11:43,155 --> 00:11:44,515
|
| 971 |
that I inspired them.
|
| 972 |
|
| 973 |
244
|
| 974 |
+
00:11:44,515 --> 00:11:47,954
|
| 975 |
I'm sure this has been a thing that's been integrated
|
| 976 |
|
| 977 |
245
|
| 978 |
+
00:11:47,954 --> 00:11:51,210
|
| 979 |
into tools for a while, but it's the kind of
|
| 980 |
|
| 981 |
246
|
| 982 |
+
00:11:51,210 --> 00:11:53,610
|
| 983 |
thing that when you start using these tools every day,
|
| 984 |
|
| 985 |
247
|
| 986 |
+
00:11:53,610 --> 00:11:57,530
|
| 987 |
the need for it is almost instantly apparent because text
|
| 988 |
|
| 989 |
248
|
| 990 |
+
00:11:57,530 --> 00:12:01,449
|
| 991 |
that doesn't have any punctuation or paragraph spacing takes a
|
| 992 |
|
| 993 |
249
|
| 994 |
+
00:12:01,449 --> 00:12:03,885
|
| 995 |
long time to, you know, it takes so long to
|
| 996 |
|
| 997 |
250
|
| 998 |
+
00:12:03,885 --> 00:12:08,924
|
| 999 |
get it into a presentable email that again, moves speech
|
| 1000 |
|
| 1001 |
251
|
| 1002 |
+
00:12:08,924 --> 00:12:13,005
|
| 1003 |
tech into that before that inflection point where you're like,
|
| 1004 |
|
| 1005 |
252
|
| 1006 |
+
00:12:13,005 --> 00:12:13,885
|
| 1007 |
nah, it's just not worth it.
|
| 1008 |
|
| 1009 |
253
|
| 1010 |
+
00:12:13,885 --> 00:12:16,844
|
| 1011 |
It's like, it'll just be quicker to type this.
|
| 1012 |
|
| 1013 |
254
|
| 1014 |
+
00:12:17,199 --> 00:12:19,760
|
| 1015 |
So it's a big, it's a little touch that actually
|
| 1016 |
|
| 1017 |
255
|
| 1018 |
+
00:12:20,000 --> 00:12:21,120
|
| 1019 |
is a big deal.
|
| 1020 |
|
| 1021 |
256
|
| 1022 |
+
00:12:21,439 --> 00:12:25,360
|
| 1023 |
So I was on Whisper and I've been using Whisper
|
| 1024 |
|
| 1025 |
257
|
| 1026 |
+
00:12:25,360 --> 00:12:27,679
|
| 1027 |
and I kind of early on found a couple of
|
| 1028 |
|
| 1029 |
258
|
| 1030 |
+
00:12:27,679 --> 00:12:28,319
|
| 1031 |
tools.
|
| 1032 |
|
| 1033 |
259
|
| 1034 |
+
00:12:28,319 --> 00:12:30,559
|
| 1035 |
I couldn't find what I was looking for on Linux,
|
| 1036 |
|
| 1037 |
260
|
| 1038 |
+
00:12:30,559 --> 00:12:35,844
|
| 1039 |
which is basically just something that'll run-in the background.
|
| 1040 |
|
| 1041 |
261
|
| 1042 |
+
00:12:35,844 --> 00:12:38,165
|
| 1043 |
You'll give it an API key and it will just
|
| 1044 |
|
| 1045 |
262
|
| 1046 |
+
00:12:38,165 --> 00:12:42,964
|
| 1047 |
like transcribe with like a little key to start and
|
| 1048 |
|
| 1049 |
263
|
| 1050 |
+
00:12:42,964 --> 00:12:43,765
|
| 1051 |
stop the dictation.
|
| 1052 |
|
| 1053 |
264
|
| 1054 |
+
00:12:45,000 --> 00:12:48,360
|
| 1055 |
And the issues where I discovered that like most people
|
| 1056 |
|
| 1057 |
265
|
| 1058 |
+
00:12:48,360 --> 00:12:51,960
|
| 1059 |
involved in creating these projects were very much focused on
|
| 1060 |
|
| 1061 |
266
|
| 1062 |
+
00:12:51,960 --> 00:12:55,720
|
| 1063 |
local models, running Whisper locally because you can.
|
| 1064 |
|
| 1065 |
267
|
| 1066 |
+
00:12:56,199 --> 00:12:58,120
|
| 1067 |
And I tried that a bunch of times and just
|
| 1068 |
|
| 1069 |
268
|
| 1070 |
+
00:12:58,120 --> 00:13:00,974
|
| 1071 |
never got results that were as good as the cloud.
|
| 1072 |
|
| 1073 |
269
|
| 1074 |
+
00:13:01,375 --> 00:13:03,535
|
| 1075 |
And when I began looking at the cost of the
|
| 1076 |
|
| 1077 |
270
|
| 1078 |
+
00:13:03,535 --> 00:13:06,574
|
| 1079 |
speech to text APIs and what I was spending, I
|
| 1080 |
|
| 1081 |
271
|
| 1082 |
+
00:13:06,574 --> 00:13:09,775
|
| 1083 |
just thought there is it's actually, in my opinion, just
|
| 1084 |
|
| 1085 |
272
|
| 1086 |
+
00:13:09,775 --> 00:13:13,080
|
| 1087 |
one of the better deals in API spending in the
|
| 1088 |
|
| 1089 |
273
|
| 1090 |
+
00:13:13,080 --> 00:13:13,400
|
| 1091 |
cloud.
|
| 1092 |
|
| 1093 |
274
|
| 1094 |
+
00:13:13,400 --> 00:13:15,640
|
| 1095 |
Like, it's just not that expensive for very, very good
|
| 1096 |
|
| 1097 |
275
|
| 1098 |
+
00:13:15,640 --> 00:13:19,559
|
| 1099 |
models that are much more, you know, you're gonna be
|
| 1100 |
|
| 1101 |
276
|
| 1102 |
+
00:13:19,559 --> 00:13:22,679
|
| 1103 |
able to run the full model, the latest model versus
|
| 1104 |
|
| 1105 |
277
|
| 1106 |
+
00:13:22,679 --> 00:13:26,525
|
| 1107 |
whatever you can run on your average GPU unless you
|
| 1108 |
|
| 1109 |
278
|
| 1110 |
+
00:13:26,525 --> 00:13:28,765
|
| 1111 |
want to buy a crazy GPU.
|
| 1112 |
|
| 1113 |
279
|
| 1114 |
+
00:13:28,765 --> 00:13:29,964
|
| 1115 |
It doesn't really make sense to me.
|
| 1116 |
|
| 1117 |
280
|
| 1118 |
+
00:13:29,964 --> 00:13:33,084
|
| 1119 |
Privacy is another concern that I know is kind of
|
| 1120 |
|
| 1121 |
281
|
| 1122 |
+
00:13:33,084 --> 00:13:35,245
|
| 1123 |
like a very much a separate thing that people just
|
| 1124 |
|
| 1125 |
282
|
| 1126 |
+
00:13:35,245 --> 00:13:38,765
|
| 1127 |
don't want their voice data and their voice leaving their
|
| 1128 |
|
| 1129 |
283
|
| 1130 |
+
00:13:38,765 --> 00:13:42,380
|
| 1131 |
local environment maybe for regulatory reasons as well.
|
| 1132 |
|
| 1133 |
284
|
| 1134 |
+
00:13:42,620 --> 00:13:43,900
|
| 1135 |
But I'm not in that.
|
| 1136 |
|
| 1137 |
285
|
| 1138 |
+
00:13:44,140 --> 00:13:48,460
|
| 1139 |
I neither really care about people listening to my, grocery
|
| 1140 |
|
| 1141 |
286
|
| 1142 |
+
00:13:48,460 --> 00:13:51,500
|
| 1143 |
list, consisting of, reminding myself that I need to buy
|
| 1144 |
|
| 1145 |
287
|
| 1146 |
+
00:13:51,500 --> 00:13:54,699
|
| 1147 |
more beer, Cheetos, and hummus, which is kind of the
|
| 1148 |
|
| 1149 |
288
|
| 1150 |
+
00:13:55,254 --> 00:13:59,494
|
| 1151 |
three staples of my diet during periods of poor nutrition.
|
| 1152 |
|
| 1153 |
289
|
| 1154 |
+
00:13:59,814 --> 00:14:02,295
|
| 1155 |
But the kind of stuff that I transcribe, it's just
|
| 1156 |
|
| 1157 |
290
|
| 1158 |
+
00:14:02,295 --> 00:14:02,614
|
| 1159 |
not.
|
| 1160 |
|
| 1161 |
291
|
| 1162 |
+
00:14:02,614 --> 00:14:07,734
|
| 1163 |
It's not a privacy thing I'm that sort of sensitive
|
| 1164 |
|
| 1165 |
292
|
| 1166 |
+
00:14:07,734 --> 00:14:13,189
|
| 1167 |
about and I don't do anything so sensitive or secure
|
| 1168 |
|
| 1169 |
293
|
| 1170 |
+
00:14:13,189 --> 00:14:14,710
|
| 1171 |
that requires air capping.
|
| 1172 |
|
| 1173 |
294
|
| 1174 |
+
00:14:15,590 --> 00:14:17,510
|
| 1175 |
I looked at the pricing and especially the kind of
|
| 1176 |
|
| 1177 |
295
|
| 1178 |
+
00:14:17,510 --> 00:14:18,870
|
| 1179 |
older model mini.
|
| 1180 |
|
| 1181 |
296
|
| 1182 |
+
00:14:19,510 --> 00:14:21,830
|
| 1183 |
Some of them are very, very affordable and I did
|
| 1184 |
|
| 1185 |
297
|
| 1186 |
+
00:14:21,830 --> 00:14:26,684
|
| 1187 |
a calculation once with ChatGPT and I was like, okay,
|
| 1188 |
|
| 1189 |
298
|
| 1190 |
+
00:14:26,684 --> 00:14:30,285
|
| 1191 |
this is the API price for I can't remember whatever
|
| 1192 |
|
| 1193 |
299
|
| 1194 |
+
00:14:30,285 --> 00:14:31,324
|
| 1195 |
the model was.
|
| 1196 |
|
| 1197 |
300
|
| 1198 |
+
00:14:31,724 --> 00:14:34,365
|
| 1199 |
Let's say I just go at it like nonstop, which
|
| 1200 |
|
| 1201 |
301
|
| 1202 |
+
00:14:34,365 --> 00:14:35,485
|
| 1203 |
rarely happens.
|
| 1204 |
|
| 1205 |
302
|
| 1206 |
+
00:14:35,564 --> 00:14:38,879
|
| 1207 |
Probably, I would say on average I might dictate thirty
|
| 1208 |
|
| 1209 |
303
|
| 1210 |
+
00:14:38,879 --> 00:14:41,679
|
| 1211 |
to sixty minutes per day if I was probably summing
|
| 1212 |
|
| 1213 |
304
|
| 1214 |
+
00:14:41,679 --> 00:14:47,920
|
| 1215 |
up the emails, documents, outlines, which is a lot, but
|
| 1216 |
|
| 1217 |
305
|
| 1218 |
+
00:14:47,920 --> 00:14:50,079
|
| 1219 |
it's it's still a fairly modest amount.
|
| 1220 |
|
| 1221 |
306
|
| 1222 |
+
00:14:50,079 --> 00:14:51,759
|
| 1223 |
And I was like, well, some days I do go
|
| 1224 |
|
| 1225 |
307
|
| 1226 |
+
00:14:51,759 --> 00:14:54,854
|
| 1227 |
on like one or two days where I've been usually
|
| 1228 |
|
| 1229 |
308
|
| 1230 |
+
00:14:54,854 --> 00:14:56,775
|
| 1231 |
when I'm like kind of out of the house and
|
| 1232 |
|
| 1233 |
309
|
| 1234 |
+
00:14:56,775 --> 00:15:00,455
|
| 1235 |
just have something like I have nothing else to do.
|
| 1236 |
|
| 1237 |
310
|
| 1238 |
+
00:15:00,455 --> 00:15:03,095
|
| 1239 |
Like if I'm at a hospital, we have a newborn
|
| 1240 |
|
| 1241 |
311
|
| 1242 |
+
00:15:03,495 --> 00:15:07,219
|
| 1243 |
and you're waiting for like eight hours and hours for
|
| 1244 |
|
| 1245 |
312
|
| 1246 |
+
00:15:07,219 --> 00:15:08,020
|
| 1247 |
an appointment.
|
| 1248 |
|
| 1249 |
313
|
| 1250 |
+
00:15:08,099 --> 00:15:11,939
|
| 1251 |
And I would probably have listened to podcasts before becoming
|
| 1252 |
|
| 1253 |
314
|
| 1254 |
+
00:15:11,939 --> 00:15:12,900
|
| 1255 |
a speech fanatic.
|
| 1256 |
|
| 1257 |
315
|
| 1258 |
+
00:15:12,900 --> 00:15:15,299
|
| 1259 |
And I'm like, Oh, wait, let me just get down.
|
| 1260 |
|
| 1261 |
316
|
| 1262 |
+
00:15:15,299 --> 00:15:17,299
|
| 1263 |
Let me just get these ideas out of my head.
|
| 1264 |
|
| 1265 |
317
|
| 1266 |
+
00:15:17,460 --> 00:15:20,665
|
| 1267 |
And that's when I'll go on my speech binges.
|
| 1268 |
|
| 1269 |
318
|
| 1270 |
+
00:15:20,665 --> 00:15:22,584
|
| 1271 |
But those are like once every few months, like not
|
| 1272 |
|
| 1273 |
319
|
| 1274 |
+
00:15:22,584 --> 00:15:23,464
|
| 1275 |
frequently.
|
| 1276 |
|
| 1277 |
320
|
| 1278 |
+
00:15:23,704 --> 00:15:25,704
|
| 1279 |
But I said, okay, let's just say if I'm going
|
| 1280 |
|
| 1281 |
321
|
| 1282 |
+
00:15:25,704 --> 00:15:28,104
|
| 1283 |
to price out cloud STT.
|
| 1284 |
|
| 1285 |
322
|
| 1286 |
+
00:15:28,905 --> 00:15:33,420
|
| 1287 |
If I was like dedicated every second of every waking
|
| 1288 |
|
| 1289 |
323
|
| 1290 |
+
00:15:33,420 --> 00:15:37,740
|
| 1291 |
hour to transcribing for some odd reason, I mean I'd
|
| 1292 |
|
| 1293 |
324
|
| 1294 |
+
00:15:37,740 --> 00:15:39,740
|
| 1295 |
have to eat and use the toilet.
|
| 1296 |
|
| 1297 |
325
|
| 1298 |
+
00:15:40,460 --> 00:15:42,620
|
| 1299 |
There's only so many hours I'm awake for.
|
| 1300 |
|
| 1301 |
326
|
| 1302 |
+
00:15:42,620 --> 00:15:46,939
|
| 1303 |
So let's just say a maximum of forty five minutes
|
| 1304 |
|
| 1305 |
327
|
| 1306 |
+
00:15:47,125 --> 00:15:49,125
|
| 1307 |
in the hour, then I said, All right, let's just
|
| 1308 |
|
| 1309 |
328
|
| 1310 |
+
00:15:49,125 --> 00:15:50,085
|
| 1311 |
say fifty.
|
| 1312 |
|
| 1313 |
329
|
| 1314 |
+
00:15:50,564 --> 00:15:51,285
|
| 1315 |
Who knows?
|
| 1316 |
|
| 1317 |
330
|
| 1318 |
+
00:15:51,285 --> 00:15:52,724
|
| 1319 |
You're dictating on the toilet.
|
| 1320 |
|
| 1321 |
331
|
| 1322 |
+
00:15:52,724 --> 00:15:53,525
|
| 1323 |
We do it.
|
| 1324 |
|
| 1325 |
332
|
| 1326 |
+
00:15:53,844 --> 00:15:56,804
|
| 1327 |
So you could just do sixty, but whatever I did
|
| 1328 |
|
| 1329 |
333
|
| 1330 |
+
00:15:57,045 --> 00:16:01,099
|
| 1331 |
and every day, like you're going flat out seven days
|
| 1332 |
|
| 1333 |
334
|
| 1334 |
+
00:16:01,099 --> 00:16:02,540
|
| 1335 |
a week dictating nonstop.
|
| 1336 |
|
| 1337 |
335
|
| 1338 |
+
00:16:02,540 --> 00:16:05,499
|
| 1339 |
I was like, What's my monthly API bill going to
|
| 1340 |
|
| 1341 |
336
|
| 1342 |
+
00:16:05,499 --> 00:16:06,620
|
| 1343 |
be at this price?
|
| 1344 |
|
| 1345 |
337
|
| 1346 |
+
00:16:06,699 --> 00:16:09,259
|
| 1347 |
And it came out to like seventy or eighty bucks.
|
| 1348 |
|
| 1349 |
338
|
| 1350 |
+
00:16:09,259 --> 00:16:12,540
|
| 1351 |
And I was like, Well, that would be an extraordinary
|
| 1352 |
|
| 1353 |
339
|
| 1354 |
+
00:16:12,860 --> 00:16:14,299
|
| 1355 |
amount of dictation.
|
| 1356 |
|
| 1357 |
340
|
| 1358 |
+
00:16:14,299 --> 00:16:18,025
|
| 1359 |
And I would hope that there was some compelling reason
|
| 1360 |
|
| 1361 |
341
|
| 1362 |
+
00:16:18,665 --> 00:16:21,704
|
| 1363 |
worth more than seventy dollars that I embarked upon that
|
| 1364 |
|
| 1365 |
342
|
| 1366 |
+
00:16:21,704 --> 00:16:22,344
|
| 1367 |
project.
|
| 1368 |
|
| 1369 |
343
|
| 1370 |
+
00:16:22,584 --> 00:16:24,505
|
| 1371 |
So given that that's kind of the max point for
|
| 1372 |
|
| 1373 |
344
|
| 1374 |
+
00:16:24,505 --> 00:16:27,224
|
| 1375 |
me I said that's actually very very affordable.
|
| 1376 |
|
| 1377 |
345
|
| 1378 |
+
00:16:27,944 --> 00:16:30,424
|
| 1379 |
Now you're gonna if you want to spec out the
|
| 1380 |
|
| 1381 |
346
|
| 1382 |
+
00:16:30,424 --> 00:16:33,829
|
| 1383 |
costs and you want to do the post processing that
|
| 1384 |
|
| 1385 |
347
|
| 1386 |
+
00:16:33,829 --> 00:16:36,709
|
| 1387 |
I really do feel is valuable, that's going to cost
|
| 1388 |
|
| 1389 |
348
|
| 1390 |
+
00:16:36,709 --> 00:16:37,670
|
| 1391 |
some more as well.
|
| 1392 |
|
| 1393 |
349
|
| 1394 |
+
00:16:37,990 --> 00:16:43,189
|
| 1395 |
Unless you're using Gemini, which needless to say is a
|
| 1396 |
|
| 1397 |
350
|
| 1398 |
+
00:16:43,189 --> 00:16:45,110
|
| 1399 |
random person sitting in Jerusalem.
|
| 1400 |
|
| 1401 |
351
|
| 1402 |
+
00:16:45,775 --> 00:16:49,375
|
| 1403 |
I have no affiliation nor with Google nor Anthropic nor
|
| 1404 |
|
| 1405 |
352
|
| 1406 |
+
00:16:49,375 --> 00:16:52,334
|
| 1407 |
Gemini nor any major tech vendor for that matter.
|
| 1408 |
|
| 1409 |
353
|
| 1410 |
+
00:16:53,775 --> 00:16:57,135
|
| 1411 |
I like Gemini not so much as a everyday model.
|
| 1412 |
|
| 1413 |
354
|
| 1414 |
+
00:16:57,375 --> 00:16:59,854
|
| 1415 |
It's kind of underwhelmed in that respect, I would say.
|
| 1416 |
|
| 1417 |
355
|
| 1418 |
+
00:17:00,299 --> 00:17:02,699
|
| 1419 |
But for multimodal, I think it's got a lot to
|
| 1420 |
|
| 1421 |
356
|
| 1422 |
+
00:17:02,699 --> 00:17:03,259
|
| 1423 |
offer.
|
| 1424 |
|
| 1425 |
357
|
| 1426 |
+
00:17:03,579 --> 00:17:07,099
|
| 1427 |
And I think that the transcribing functionality whereby it can,
|
| 1428 |
|
| 1429 |
358
|
| 1430 |
+
00:17:07,979 --> 00:17:12,300
|
| 1431 |
process audio with a system prompt and both give you
|
| 1432 |
|
| 1433 |
359
|
| 1434 |
+
00:17:12,300 --> 00:17:13,820
|
| 1435 |
transcription that's cleaned up.
|
| 1436 |
|
| 1437 |
360
|
| 1438 |
+
00:17:13,820 --> 00:17:15,259
|
| 1439 |
That reduces two steps to one.
|
| 1440 |
|
| 1441 |
361
|
| 1442 |
+
00:17:15,755 --> 00:17:18,874
|
| 1443 |
And that for me is a very, very big deal.
|
| 1444 |
|
| 1445 |
362
|
| 1446 |
+
00:17:18,875 --> 00:17:22,394
|
| 1447 |
And I feel like even Google hasn't really sort of
|
| 1448 |
|
| 1449 |
363
|
| 1450 |
+
00:17:22,475 --> 00:17:27,115
|
| 1451 |
thought through how useful the that modality is and what
|
| 1452 |
|
| 1453 |
364
|
| 1454 |
+
00:17:27,115 --> 00:17:29,620
|
| 1455 |
kind of use cases you can achieve with it.
|
| 1456 |
|
| 1457 |
365
|
| 1458 |
+
00:17:29,620 --> 00:17:32,259
|
| 1459 |
Because I found in the course of this year just
|
| 1460 |
|
| 1461 |
366
|
| 1462 |
+
00:17:32,259 --> 00:17:37,939
|
| 1463 |
an endless list of really kind of system prompt stuff
|
| 1464 |
|
| 1465 |
367
|
| 1466 |
+
00:17:37,939 --> 00:17:40,820
|
| 1467 |
that I can say, okay, I've used it to capture
|
| 1468 |
|
| 1469 |
368
|
| 1470 |
+
00:17:40,820 --> 00:17:44,035
|
| 1471 |
context data for AI, which is literally I might speak
|
| 1472 |
|
| 1473 |
369
|
| 1474 |
+
00:17:44,035 --> 00:17:46,675
|
| 1475 |
for if I wanted to have a good bank of
|
| 1476 |
|
| 1477 |
370
|
| 1478 |
+
00:17:46,675 --> 00:17:49,955
|
| 1479 |
context data about who knows my childhood.
|
| 1480 |
|
| 1481 |
371
|
| 1482 |
+
00:17:50,354 --> 00:17:54,275
|
| 1483 |
More realistically, maybe my career goals, something that would just
|
| 1484 |
|
| 1485 |
372
|
| 1486 |
+
00:17:54,275 --> 00:17:56,115
|
| 1487 |
be like really boring to type out.
|
| 1488 |
|
| 1489 |
373
|
| 1490 |
+
00:17:56,115 --> 00:18:00,420
|
| 1491 |
So I'll just like sit in my car and record
|
| 1492 |
|
| 1493 |
374
|
| 1494 |
+
00:18:00,420 --> 00:18:01,380
|
| 1495 |
it for ten minutes.
|
| 1496 |
|
| 1497 |
375
|
| 1498 |
+
00:18:01,380 --> 00:18:03,699
|
| 1499 |
And that ten minutes you get a lot of information
|
| 1500 |
|
| 1501 |
376
|
| 1502 |
+
00:18:03,699 --> 00:18:04,339
|
| 1503 |
in.
|
| 1504 |
|
| 1505 |
377
|
| 1506 |
+
00:18:05,539 --> 00:18:07,620
|
| 1507 |
Emails, which is short text.
|
| 1508 |
|
| 1509 |
378
|
| 1510 |
+
00:18:08,580 --> 00:18:10,339
|
| 1511 |
Just there is a whole bunch.
|
| 1512 |
|
| 1513 |
379
|
| 1514 |
+
00:18:10,340 --> 00:18:13,295
|
| 1515 |
And all these workflows kind of require a little bit
|
| 1516 |
|
| 1517 |
380
|
| 1518 |
+
00:18:13,295 --> 00:18:15,054
|
| 1519 |
of treatment afterwards and different treatment.
|
| 1520 |
|
| 1521 |
381
|
| 1522 |
+
00:18:15,054 --> 00:18:18,334
|
| 1523 |
My context pipeline is kind of like just extract the
|
| 1524 |
|
| 1525 |
382
|
| 1526 |
+
00:18:18,334 --> 00:18:19,215
|
| 1527 |
bare essentials.
|
| 1528 |
|
| 1529 |
383
|
| 1530 |
+
00:18:19,215 --> 00:18:22,094
|
| 1531 |
You end up with me talking very loosely about sort
|
| 1532 |
|
| 1533 |
384
|
| 1534 |
+
00:18:22,094 --> 00:18:24,414
|
| 1535 |
of what I've done in my career, where I've worked,
|
| 1536 |
|
| 1537 |
385
|
| 1538 |
+
00:18:24,414 --> 00:18:25,374
|
| 1539 |
where I might like to work.
|
| 1540 |
|
| 1541 |
386
|
| 1542 |
+
00:18:25,920 --> 00:18:29,039
|
| 1543 |
And it goes, it condenses that down to very robotic
|
| 1544 |
|
| 1545 |
387
|
| 1546 |
+
00:18:29,039 --> 00:18:32,640
|
| 1547 |
language that is easy to chunk parse and maybe put
|
| 1548 |
|
| 1549 |
388
|
| 1550 |
+
00:18:32,640 --> 00:18:33,920
|
| 1551 |
into a vector database.
|
| 1552 |
|
| 1553 |
389
|
| 1554 |
+
00:18:33,920 --> 00:18:36,160
|
| 1555 |
Daniel has worked in technology.
|
| 1556 |
|
| 1557 |
390
|
| 1558 |
+
00:18:36,160 --> 00:18:39,760
|
| 1559 |
Daniel has been working in, know, stuff like that.
|
| 1560 |
|
| 1561 |
391
|
| 1562 |
+
00:18:39,760 --> 00:18:42,975
|
| 1563 |
That's not how you would speak, but I figure it's
|
| 1564 |
|
| 1565 |
392
|
| 1566 |
+
00:18:42,975 --> 00:18:46,414
|
| 1567 |
probably easier to parse for, after all, robots.
|
| 1568 |
|
| 1569 |
393
|
| 1570 |
+
00:18:46,735 --> 00:18:48,654
|
| 1571 |
So we've almost got to twenty minutes and this is
|
| 1572 |
|
| 1573 |
394
|
| 1574 |
+
00:18:48,654 --> 00:18:53,054
|
| 1575 |
actually a success because I wasted twenty minutes of my
|
| 1576 |
|
| 1577 |
395
|
| 1578 |
+
00:18:53,455 --> 00:18:57,120
|
| 1579 |
of the evening speaking into you in microphone and the
|
| 1580 |
|
| 1581 |
396
|
| 1582 |
+
00:18:57,120 --> 00:19:01,039
|
| 1583 |
levels were shot and was clipping and I said I
|
| 1584 |
|
| 1585 |
397
|
| 1586 |
+
00:19:01,039 --> 00:19:02,320
|
| 1587 |
can't really do an evaluation.
|
| 1588 |
|
| 1589 |
398
|
| 1590 |
+
00:19:02,320 --> 00:19:03,360
|
| 1591 |
I have to be fair.
|
| 1592 |
|
| 1593 |
399
|
| 1594 |
+
00:19:03,360 --> 00:19:06,320
|
| 1595 |
I have to give the models a chance to do
|
| 1596 |
|
| 1597 |
400
|
| 1598 |
+
00:19:06,320 --> 00:19:06,880
|
| 1599 |
their thing.
|
| 1600 |
|
| 1601 |
401
|
| 1602 |
+
00:19:07,425 --> 00:19:09,505
|
| 1603 |
What am I hoping to achieve in this?
|
| 1604 |
|
| 1605 |
402
|
| 1606 |
+
00:19:09,505 --> 00:19:11,584
|
| 1607 |
Okay, my fine tune was a dud as mentioned.
|
| 1608 |
|
| 1609 |
403
|
| 1610 |
+
00:19:11,665 --> 00:19:15,185
|
| 1611 |
Deepgram STT, I'm really, really hopeful that this prototype will
|
| 1612 |
|
| 1613 |
404
|
| 1614 |
+
00:19:15,185 --> 00:19:17,985
|
| 1615 |
work and it's a build in public open source so
|
| 1616 |
|
| 1617 |
405
|
| 1618 |
+
00:19:17,985 --> 00:19:20,304
|
| 1619 |
anyone is welcome to use it if I make anything
|
| 1620 |
|
| 1621 |
406
|
| 1622 |
+
00:19:20,304 --> 00:19:20,625
|
| 1623 |
good.
|
| 1624 |
|
| 1625 |
407
|
| 1626 |
+
00:19:21,560 --> 00:19:23,800
|
| 1627 |
But that was really exciting for me last night when
|
| 1628 |
|
| 1629 |
408
|
| 1630 |
+
00:19:23,800 --> 00:19:28,840
|
| 1631 |
after hours of trying my own prototype, seeing someone just
|
| 1632 |
|
| 1633 |
409
|
| 1634 |
+
00:19:28,840 --> 00:19:32,039
|
| 1635 |
made something that works like that, you you're not gonna
|
| 1636 |
|
| 1637 |
410
|
| 1638 |
+
00:19:32,039 --> 00:19:36,374
|
| 1639 |
have to build a custom conda environment and image.
|
| 1640 |
|
| 1641 |
411
|
| 1642 |
+
00:19:36,374 --> 00:19:39,974
|
| 1643 |
I have AMD GPU which makes things much more complicated.
|
| 1644 |
|
| 1645 |
412
|
| 1646 |
+
00:19:40,214 --> 00:19:42,614
|
| 1647 |
I didn't find it and I was about to give
|
| 1648 |
|
| 1649 |
413
|
| 1650 |
+
00:19:42,614 --> 00:19:43,894
|
| 1651 |
up and I said, All right, let me just give
|
| 1652 |
|
| 1653 |
414
|
| 1654 |
+
00:19:43,894 --> 00:19:46,455
|
| 1655 |
Deepgram's Linux thing a shot.
|
| 1656 |
|
| 1657 |
415
|
| 1658 |
+
00:19:47,029 --> 00:19:49,589
|
| 1659 |
And if this doesn't work, I'm just gonna go back
|
| 1660 |
|
| 1661 |
416
|
| 1662 |
+
00:19:49,589 --> 00:19:51,349
|
| 1663 |
to trying to vibe code something myself.
|
| 1664 |
|
| 1665 |
417
|
| 1666 |
+
00:19:51,670 --> 00:19:55,509
|
| 1667 |
And when I ran the script, I was using Cloud
|
| 1668 |
|
| 1669 |
418
|
| 1670 |
+
00:19:55,509 --> 00:19:59,029
|
| 1671 |
Code to do the installation process, it ran the script
|
| 1672 |
|
| 1673 |
419
|
| 1674 |
+
00:19:59,029 --> 00:20:01,189
|
| 1675 |
and, oh my gosh, it works just like that.
|
| 1676 |
|
| 1677 |
420
|
| 1678 |
+
00:20:01,824 --> 00:20:05,985
|
| 1679 |
The tricky thing for all those who wants to know
|
| 1680 |
|
| 1681 |
421
|
| 1682 |
+
00:20:05,985 --> 00:20:11,425
|
| 1683 |
all the nitty, ditty, nitty gritty details was that I
|
| 1684 |
|
| 1685 |
422
|
| 1686 |
+
00:20:11,425 --> 00:20:14,624
|
| 1687 |
don't think it was actually struggling with transcription, but pasting
|
| 1688 |
|
| 1689 |
423
|
| 1690 |
+
00:20:14,705 --> 00:20:17,539
|
| 1691 |
Weyland makes life very hard.
|
| 1692 |
|
| 1693 |
424
|
| 1694 |
+
00:20:17,539 --> 00:20:19,140
|
| 1695 |
And I think there was something not running at the
|
| 1696 |
|
| 1697 |
425
|
| 1698 |
+
00:20:19,140 --> 00:20:19,699
|
| 1699 |
right time.
|
| 1700 |
|
| 1701 |
426
|
| 1702 |
+
00:20:19,699 --> 00:20:22,979
|
| 1703 |
Anyway, Deepgram, I looked at how they actually handle that
|
| 1704 |
|
| 1705 |
427
|
| 1706 |
+
00:20:22,979 --> 00:20:25,140
|
| 1707 |
because it worked out of the box when other stuff
|
| 1708 |
|
| 1709 |
428
|
| 1710 |
+
00:20:25,140 --> 00:20:25,779
|
| 1711 |
didn't.
|
| 1712 |
|
| 1713 |
429
|
| 1714 |
+
00:20:26,100 --> 00:20:28,900
|
| 1715 |
And it was quite a clever little mechanism.
|
| 1716 |
|
| 1717 |
430
|
| 1718 |
+
00:20:29,495 --> 00:20:32,135
|
| 1719 |
And but more so than that, the accuracy was brilliant.
|
| 1720 |
|
| 1721 |
431
|
| 1722 |
+
00:20:32,135 --> 00:20:33,574
|
| 1723 |
Now what am I what am I doing here?
|
| 1724 |
|
| 1725 |
432
|
| 1726 |
+
00:20:33,574 --> 00:20:37,175
|
| 1727 |
This is gonna be a twenty minute audio sample.
|
| 1728 |
|
| 1729 |
433
|
| 1730 |
+
00:20:38,375 --> 00:20:42,410
|
| 1731 |
And I'm I think I've done one or two of
|
| 1732 |
|
| 1733 |
434
|
| 1734 |
+
00:20:42,410 --> 00:20:47,130
|
| 1735 |
these before, but I did it with short, snappy voice
|
| 1736 |
|
| 1737 |
435
|
| 1738 |
+
00:20:47,130 --> 00:20:47,610
|
| 1739 |
notes.
|
| 1740 |
|
| 1741 |
436
|
| 1742 |
+
00:20:47,610 --> 00:20:49,370
|
| 1743 |
This is kind of long form.
|
| 1744 |
|
| 1745 |
437
|
| 1746 |
+
00:20:49,449 --> 00:20:51,929
|
| 1747 |
This actually might be a better approximation for what's useful
|
| 1748 |
|
| 1749 |
438
|
| 1750 |
+
00:20:51,929 --> 00:20:53,849
|
| 1751 |
to me than voice memos.
|
| 1752 |
|
| 1753 |
439
|
| 1754 |
+
00:20:53,849 --> 00:20:56,894
|
| 1755 |
Like, I need to buy three liters of milk tomorrow
|
| 1756 |
|
| 1757 |
440
|
| 1758 |
+
00:20:56,894 --> 00:21:00,175
|
| 1759 |
and peter bread, which is probably how half my voice
|
| 1760 |
|
| 1761 |
441
|
| 1762 |
+
00:21:00,175 --> 00:21:00,735
|
| 1763 |
notes sound.
|
| 1764 |
|
| 1765 |
442
|
| 1766 |
+
00:21:00,735 --> 00:21:04,094
|
| 1767 |
Like if anyone were to find my phone they'd be
|
| 1768 |
|
| 1769 |
443
|
| 1770 |
+
00:21:04,094 --> 00:21:05,934
|
| 1771 |
like this is the most boring person in the world.
|
| 1772 |
|
| 1773 |
444
|
| 1774 |
+
00:21:06,015 --> 00:21:10,050
|
| 1775 |
Although actually there are some journaling thoughts as well, but
|
| 1776 |
|
| 1777 |
445
|
| 1778 |
+
00:21:10,050 --> 00:21:11,810
|
| 1779 |
it's a lot of content like that.
|
| 1780 |
|
| 1781 |
446
|
| 1782 |
+
00:21:11,810 --> 00:21:14,610
|
| 1783 |
And the probably for the evaluation, the most useful thing
|
| 1784 |
|
| 1785 |
447
|
| 1786 |
+
00:21:14,610 --> 00:21:21,834
|
| 1787 |
is slightly obscure tech, GitHub, Nucleano, hugging face, not so
|
| 1788 |
|
| 1789 |
448
|
| 1790 |
+
00:21:21,834 --> 00:21:24,474
|
| 1791 |
obscure that it's not gonna have a chance of knowing
|
| 1792 |
|
| 1793 |
449
|
| 1794 |
+
00:21:24,474 --> 00:21:27,194
|
| 1795 |
it, but hopefully sufficiently well known that the model should
|
| 1796 |
|
| 1797 |
450
|
| 1798 |
+
00:21:27,194 --> 00:21:27,834
|
| 1799 |
get it.
|
| 1800 |
|
| 1801 |
451
|
| 1802 |
+
00:21:27,914 --> 00:21:29,995
|
| 1803 |
I tried to do a little bit of speaking really
|
| 1804 |
|
| 1805 |
452
|
| 1806 |
+
00:21:29,995 --> 00:21:32,394
|
| 1807 |
fast and speaking very slowly.
|
| 1808 |
|
| 1809 |
453
|
| 1810 |
+
00:21:32,394 --> 00:21:35,529
|
| 1811 |
Would say in general, I've spoken, delivered this at a
|
| 1812 |
|
| 1813 |
454
|
| 1814 |
+
00:21:35,529 --> 00:21:39,130
|
| 1815 |
faster pace than I usually would owing to strong coffee
|
| 1816 |
|
| 1817 |
455
|
| 1818 |
+
00:21:39,130 --> 00:21:40,570
|
| 1819 |
flowing through my bloodstream.
|
| 1820 |
|
| 1821 |
456
|
| 1822 |
+
00:21:41,130 --> 00:21:43,529
|
| 1823 |
And the thing that I'm not gonna get in this
|
| 1824 |
|
| 1825 |
457
|
| 1826 |
+
00:21:43,529 --> 00:21:46,090
|
| 1827 |
benchmark is background noise, which in my first take that
|
| 1828 |
|
| 1829 |
458
|
| 1830 |
+
00:21:46,090 --> 00:21:48,455
|
| 1831 |
I had to get rid of, my wife came in
|
| 1832 |
|
| 1833 |
459
|
| 1834 |
+
00:21:48,455 --> 00:21:51,495
|
| 1835 |
with my son and for a good night kiss.
|
| 1836 |
|
| 1837 |
460
|
| 1838 |
+
00:21:51,574 --> 00:21:55,094
|
| 1839 |
And that actually would have been super helpful to get
|
| 1840 |
|
| 1841 |
461
|
| 1842 |
+
00:21:55,094 --> 00:21:57,814
|
| 1843 |
in because it was non diarized or if we had
|
| 1844 |
|
| 1845 |
462
|
| 1846 |
+
00:21:57,814 --> 00:21:58,695
|
| 1847 |
diarization.
|
| 1848 |
|
| 1849 |
463
|
| 1850 |
+
00:21:59,334 --> 00:22:01,414
|
| 1851 |
A female, I could say, I want the male voice
|
| 1852 |
|
| 1853 |
464
|
| 1854 |
+
00:22:01,414 --> 00:22:03,094
|
| 1855 |
and that wasn't intended for transcription.
|
| 1856 |
|
| 1857 |
465
|
| 1858 |
+
00:22:04,509 --> 00:22:06,269
|
| 1859 |
And we're not going to get background noise like people
|
| 1860 |
|
| 1861 |
466
|
| 1862 |
+
00:22:06,269 --> 00:22:08,989
|
| 1863 |
honking their horns, which is something I've done in my
|
| 1864 |
|
| 1865 |
467
|
| 1866 |
+
00:22:09,150 --> 00:22:11,870
|
| 1867 |
main data set where I am trying to go back
|
| 1868 |
|
| 1869 |
468
|
| 1870 |
+
00:22:11,870 --> 00:22:15,070
|
| 1871 |
to some of my voice notes, annotate them and run
|
| 1872 |
|
| 1873 |
469
|
| 1874 |
+
00:22:15,070 --> 00:22:15,709
|
| 1875 |
a benchmark.
|
| 1876 |
|
| 1877 |
470
|
| 1878 |
+
00:22:15,709 --> 00:22:18,265
|
| 1879 |
But this is going to be just a pure quick
|
| 1880 |
|
| 1881 |
471
|
| 1882 |
+
00:22:18,265 --> 00:22:19,064
|
| 1883 |
test.
|
| 1884 |
|
| 1885 |
472
|
| 1886 |
+
00:22:19,785 --> 00:22:24,025
|
| 1887 |
And as someone I'm working on a voice note idea.
|
| 1888 |
|
| 1889 |
473
|
| 1890 |
+
00:22:24,025 --> 00:22:28,185
|
| 1891 |
That's my sort of end motivation besides thinking it's an
|
| 1892 |
|
| 1893 |
474
|
| 1894 |
+
00:22:28,185 --> 00:22:31,785
|
| 1895 |
absolutely outstanding technology that's coming to viability.
|
| 1896 |
|
| 1897 |
475
|
| 1898 |
+
00:22:31,785 --> 00:22:34,400
|
| 1899 |
And really, I know this sounds cheesy, can actually have
|
| 1900 |
|
| 1901 |
476
|
| 1902 |
+
00:22:34,400 --> 00:22:36,479
|
| 1903 |
a very transformative effect.
|
| 1904 |
|
| 1905 |
477
|
| 1906 |
+
00:22:37,920 --> 00:22:43,120
|
| 1907 |
Voice technology has been life changing for folks living with
|
| 1908 |
|
| 1909 |
478
|
| 1910 |
+
00:22:43,999 --> 00:22:45,039
|
| 1911 |
disabilities.
|
| 1912 |
|
| 1913 |
479
|
| 1914 |
+
00:22:45,920 --> 00:22:48,545
|
| 1915 |
And I think there's something really nice about the fact
|
| 1916 |
|
| 1917 |
480
|
| 1918 |
+
00:22:48,545 --> 00:22:52,545
|
| 1919 |
that it can also benefit folks who are able-bodied and
|
| 1920 |
|
| 1921 |
481
|
| 1922 |
+
00:22:52,545 --> 00:22:57,904
|
| 1923 |
we can all in different ways make this tech as
|
| 1924 |
|
| 1925 |
482
|
| 1926 |
+
00:22:57,904 --> 00:23:00,705
|
| 1927 |
useful as possible regardless of the exact way that we're
|
| 1928 |
|
| 1929 |
483
|
| 1930 |
+
00:23:00,705 --> 00:23:01,025
|
| 1931 |
using it.
|
| 1932 |
|
| 1933 |
484
|
| 1934 |
+
00:23:02,199 --> 00:23:04,439
|
| 1935 |
And I think there's something very powerful in that, and
|
| 1936 |
|
| 1937 |
485
|
| 1938 |
+
00:23:04,439 --> 00:23:05,559
|
| 1939 |
it can be very cool.
|
| 1940 |
|
| 1941 |
486
|
| 1942 |
+
00:23:06,120 --> 00:23:07,559
|
| 1943 |
I see huge potential.
|
| 1944 |
|
| 1945 |
487
|
| 1946 |
+
00:23:07,559 --> 00:23:09,319
|
| 1947 |
What excites me about voice tech?
|
| 1948 |
|
| 1949 |
488
|
| 1950 |
+
00:23:09,719 --> 00:23:11,159
|
| 1951 |
A lot of things actually.
|
| 1952 |
|
| 1953 |
489
|
| 1954 |
+
00:23:12,120 --> 00:23:14,839
|
| 1955 |
Firstly, the fact that it's cheap and accurate, as I
|
| 1956 |
|
| 1957 |
490
|
| 1958 |
+
00:23:14,839 --> 00:23:17,785
|
| 1959 |
mentioned at the very start of this, and it's getting
|
| 1960 |
|
| 1961 |
491
|
| 1962 |
+
00:23:17,785 --> 00:23:20,104
|
| 1963 |
better and better with stuff like accent handling.
|
| 1964 |
|
| 1965 |
492
|
| 1966 |
+
00:23:20,745 --> 00:23:23,304
|
| 1967 |
I'm not sure my fine tune will actually ever come
|
| 1968 |
|
| 1969 |
493
|
| 1970 |
+
00:23:23,304 --> 00:23:25,225
|
| 1971 |
to fruition in the sense that I'll use it day
|
| 1972 |
|
| 1973 |
494
|
| 1974 |
+
00:23:25,225 --> 00:23:26,584
|
| 1975 |
to day as I imagine.
|
| 1976 |
|
| 1977 |
495
|
| 1978 |
+
00:23:26,664 --> 00:23:30,505
|
| 1979 |
I get like superb, flawless words error rates because I'm
|
| 1980 |
|
| 1981 |
496
|
| 1982 |
+
00:23:30,505 --> 00:23:34,949
|
| 1983 |
just kind of skeptical about local speech to text, as
|
| 1984 |
|
| 1985 |
497
|
| 1986 |
+
00:23:34,949 --> 00:23:35,670
|
| 1987 |
I mentioned.
|
| 1988 |
|
| 1989 |
498
|
| 1990 |
+
00:23:36,070 --> 00:23:39,830
|
| 1991 |
And I think the pace of innovation and improvement in
|
| 1992 |
|
| 1993 |
499
|
| 1994 |
+
00:23:39,830 --> 00:23:42,310
|
| 1995 |
the models, the main reasons for fine tuning from what
|
| 1996 |
|
| 1997 |
500
|
| 1998 |
+
00:23:42,310 --> 00:23:46,150
|
| 1999 |
I've seen have been people who are something that really
|
| 2000 |
|
| 2001 |
501
|
| 2002 |
+
00:23:46,150 --> 00:23:50,375
|
| 2003 |
blows blows my mind about ASR is the idea that
|
| 2004 |
|
| 2005 |
502
|
| 2006 |
+
00:23:50,375 --> 00:23:55,574
|
| 2007 |
it's inherently ailingual or multilingual, phonetic based.
|
| 2008 |
|
| 2009 |
503
|
| 2010 |
+
00:23:56,295 --> 00:24:00,375
|
| 2011 |
So as folks who use speak very obscure languages that
|
| 2012 |
|
| 2013 |
504
|
| 2014 |
+
00:24:00,375 --> 00:24:03,094
|
| 2015 |
there may be very there might be a paucity of
|
| 2016 |
|
| 2017 |
505
|
| 2018 |
+
00:24:02,229 --> 00:24:05,030
|
| 2019 |
training data or almost none at all, and therefore the
|
| 2020 |
|
| 2021 |
506
|
| 2022 |
+
00:24:05,030 --> 00:24:06,790
|
| 2023 |
accuracy is significantly reduced.
|
| 2024 |
|
| 2025 |
507
|
| 2026 |
+
00:24:06,790 --> 00:24:11,350
|
| 2027 |
Or folks in very critical environments, I know there are
|
| 2028 |
|
| 2029 |
508
|
| 2030 |
+
00:24:11,510 --> 00:24:15,350
|
| 2031 |
this is used extensively in medical transcription and dispatcher work
|
| 2032 |
|
| 2033 |
509
|
| 2034 |
+
00:24:15,350 --> 00:24:19,064
|
| 2035 |
as, you know the call centers who send out ambulances
|
| 2036 |
|
| 2037 |
510
|
| 2038 |
+
00:24:19,064 --> 00:24:19,864
|
| 2039 |
etc.
|
| 2040 |
|
| 2041 |
511
|
| 2042 |
+
00:24:20,265 --> 00:24:23,545
|
| 2043 |
Where accuracy is absolutely paramount and in the case of
|
| 2044 |
|
| 2045 |
512
|
| 2046 |
+
00:24:23,545 --> 00:24:27,545
|
| 2047 |
doctors radiologists they might be using very specialized vocab all
|
| 2048 |
|
| 2049 |
513
|
| 2050 |
+
00:24:27,545 --> 00:24:27,865
|
| 2051 |
the time.
|
| 2052 |
|
| 2053 |
514
|
| 2054 |
+
00:24:28,630 --> 00:24:30,229
|
| 2055 |
So those are kind of the main two things, and
|
| 2056 |
|
| 2057 |
515
|
| 2058 |
+
00:24:30,229 --> 00:24:32,150
|
| 2059 |
I'm not sure that really just for trying to make
|
| 2060 |
|
| 2061 |
516
|
| 2062 |
+
00:24:32,150 --> 00:24:36,390
|
| 2063 |
it better on a few random tech words with my
|
| 2064 |
|
| 2065 |
517
|
| 2066 |
+
00:24:36,390 --> 00:24:39,429
|
| 2067 |
slightly I mean, I have an accent, but, like, not,
|
| 2068 |
|
| 2069 |
518
|
| 2070 |
+
00:24:39,429 --> 00:24:42,469
|
| 2071 |
you know, an accent that a few other million people
|
| 2072 |
|
| 2073 |
519
|
| 2074 |
+
00:24:42,870 --> 00:24:43,910
|
| 2075 |
have ish.
|
| 2076 |
|
| 2077 |
520
|
| 2078 |
+
00:24:44,685 --> 00:24:47,965
|
| 2079 |
I'm not sure that my little fine tune is gonna
|
| 2080 |
|
| 2081 |
521
|
| 2082 |
+
00:24:47,965 --> 00:24:52,604
|
| 2083 |
actually like, the bump in word error reduction, if I
|
| 2084 |
|
| 2085 |
522
|
| 2086 |
+
00:24:52,604 --> 00:24:54,205
|
| 2087 |
ever actually figure out how to do it and get
|
| 2088 |
|
| 2089 |
523
|
| 2090 |
+
00:24:54,205 --> 00:24:56,365
|
| 2091 |
it up to the cloud, by the time we've done
|
| 2092 |
|
| 2093 |
524
|
| 2094 |
+
00:24:56,365 --> 00:24:59,959
|
| 2095 |
that, I suspect that the next generation of ASR will
|
| 2096 |
|
| 2097 |
525
|
| 2098 |
+
00:24:59,959 --> 00:25:01,719
|
| 2099 |
just be so good that it will kind of be,
|
| 2100 |
|
| 2101 |
526
|
| 2102 |
+
00:25:01,959 --> 00:25:03,959
|
| 2103 |
well, that would have been cool if it worked out,
|
| 2104 |
|
| 2105 |
527
|
| 2106 |
+
00:25:03,959 --> 00:25:05,479
|
| 2107 |
but I'll just use this instead.
|
| 2108 |
|
| 2109 |
528
|
| 2110 |
+
00:25:05,719 --> 00:25:10,679
|
| 2111 |
So that's gonna be it for today's episode of voice
|
| 2112 |
|
| 2113 |
529
|
| 2114 |
+
00:25:10,679 --> 00:25:11,640
|
| 2115 |
training data.
|
| 2116 |
|
| 2117 |
530
|
| 2118 |
+
00:25:11,880 --> 00:25:14,255
|
| 2119 |
Single, long shot evaluation.
|
| 2120 |
|
| 2121 |
531
|
| 2122 |
+
00:25:14,495 --> 00:25:15,694
|
| 2123 |
Who am I gonna compare?
|
| 2124 |
|
| 2125 |
532
|
| 2126 |
+
00:25:16,414 --> 00:25:18,574
|
| 2127 |
Whisper is always good as a benchmark, but I'm more
|
| 2128 |
|
| 2129 |
533
|
| 2130 |
+
00:25:18,574 --> 00:25:22,175
|
| 2131 |
interested in seeing Whisper head to head with two things
|
| 2132 |
|
| 2133 |
534
|
| 2134 |
+
00:25:22,175 --> 00:25:22,894
|
| 2135 |
really.
|
| 2136 |
|
| 2137 |
535
|
| 2138 |
+
00:25:23,295 --> 00:25:25,134
|
| 2139 |
One is Whisper variants.
|
| 2140 |
|
| 2141 |
536
|
| 2142 |
+
00:25:25,134 --> 00:25:27,695
|
| 2143 |
So you've got these projects like Faster Whisper.
|
| 2144 |
|
| 2145 |
537
|
| 2146 |
+
00:25:29,110 --> 00:25:29,989
|
| 2147 |
Distill Whisper.
|
| 2148 |
|
| 2149 |
538
|
| 2150 |
+
00:25:29,989 --> 00:25:30,709
|
| 2151 |
It's a bit confusing.
|
| 2152 |
|
| 2153 |
539
|
| 2154 |
+
00:25:30,709 --> 00:25:31,909
|
| 2155 |
There's a whole bunch of them.
|
| 2156 |
|
| 2157 |
540
|
| 2158 |
+
00:25:32,150 --> 00:25:35,110
|
| 2159 |
And the emerging ASRs, which are also a thing.
|
| 2160 |
|
| 2161 |
541
|
| 2162 |
+
00:25:35,269 --> 00:25:37,110
|
| 2163 |
My intention for this is I'm not sure I'm gonna
|
| 2164 |
|
| 2165 |
542
|
| 2166 |
+
00:25:37,110 --> 00:25:39,910
|
| 2167 |
have the time in any point in the foreseeable future
|
| 2168 |
|
| 2169 |
543
|
| 2170 |
+
00:25:39,910 --> 00:25:44,775
|
| 2171 |
to go back to this whole episode and create a
|
| 2172 |
|
| 2173 |
544
|
| 2174 |
+
00:25:44,775 --> 00:25:48,294
|
| 2175 |
proper source truth where I fix everything.
|
| 2176 |
|
| 2177 |
545
|
| 2178 |
+
00:25:49,255 --> 00:25:51,894
|
| 2179 |
Might do it if I can get one transcription that's
|
| 2180 |
|
| 2181 |
546
|
| 2182 |
+
00:25:51,894 --> 00:25:54,134
|
| 2183 |
sufficiently close to perfection.
|
| 2184 |
|
| 2185 |
547
|
| 2186 |
+
00:25:54,934 --> 00:25:58,400
|
| 2187 |
But what I would actually love to do on Hugging
|
| 2188 |
|
| 2189 |
548
|
| 2190 |
+
00:25:58,400 --> 00:26:00,479
|
| 2191 |
Face, I think would be a great probably how I
|
| 2192 |
|
| 2193 |
549
|
| 2194 |
+
00:26:00,479 --> 00:26:04,400
|
| 2195 |
might visualize this is having the audio waveform play and
|
| 2196 |
|
| 2197 |
550
|
| 2198 |
+
00:26:04,400 --> 00:26:08,880
|
| 2199 |
then have the transcript for each model below it and
|
| 2200 |
|
| 2201 |
551
|
| 2202 |
+
00:26:08,880 --> 00:26:13,765
|
| 2203 |
maybe even a, like, you know, to scale and maybe
|
| 2204 |
|
| 2205 |
552
|
| 2206 |
+
00:26:13,765 --> 00:26:16,644
|
| 2207 |
even a local one as well, like local whisper versus
|
| 2208 |
|
| 2209 |
553
|
| 2210 |
+
00:26:16,644 --> 00:26:19,684
|
| 2211 |
OpenAI API, etcetera.
|
| 2212 |
|
| 2213 |
554
|
| 2214 |
+
00:26:19,765 --> 00:26:23,124
|
| 2215 |
And I can then actually listen back to segments or
|
| 2216 |
|
| 2217 |
555
|
| 2218 |
+
00:26:23,124 --> 00:26:25,285
|
| 2219 |
anyone who wants to can listen back to segments of
|
| 2220 |
|
| 2221 |
556
|
| 2222 |
+
00:26:25,285 --> 00:26:30,219
|
| 2223 |
this recording and see where a particular model struggled and
|
| 2224 |
|
| 2225 |
557
|
| 2226 |
+
00:26:30,219 --> 00:26:33,099
|
| 2227 |
others didn't as well as the sort of headline finding
|
| 2228 |
|
| 2229 |
558
|
| 2230 |
+
00:26:33,099 --> 00:26:35,579
|
| 2231 |
of which had the best W E R but that
|
| 2232 |
|
| 2233 |
559
|
| 2234 |
+
00:26:35,579 --> 00:26:37,659
|
| 2235 |
would require the source of truth.
|
| 2236 |
|
| 2237 |
560
|
| 2238 |
+
00:26:37,660 --> 00:26:38,459
|
| 2239 |
Okay, that's it.
|
| 2240 |
|
| 2241 |
561
|
| 2242 |
+
00:26:38,425 --> 00:26:40,985
|
| 2243 |
I hope this was, I don't know, maybe useful for
|
| 2244 |
|
| 2245 |
562
|
| 2246 |
+
00:26:40,985 --> 00:26:42,904
|
| 2247 |
other folks interested in STT.
|
| 2248 |
|
| 2249 |
563
|
| 2250 |
+
00:26:42,985 --> 00:26:45,945
|
| 2251 |
You want to see I always think I've just said
|
| 2252 |
|
| 2253 |
564
|
| 2254 |
+
00:26:45,945 --> 00:26:47,624
|
| 2255 |
it as something I didn't intend to.
|
| 2256 |
|
| 2257 |
565
|
| 2258 |
+
00:26:47,864 --> 00:26:49,624
|
| 2259 |
STT, I said for those.
|
| 2260 |
|
| 2261 |
566
|
| 2262 |
+
00:26:49,624 --> 00:26:53,049
|
| 2263 |
Listen carefully, including hopefully the models themselves.
|
| 2264 |
|
| 2265 |
567
|
| 2266 |
+
00:26:53,289 --> 00:26:55,049
|
| 2267 |
This has been myself, Daniel Rosol.
|
| 2268 |
|
| 2269 |
568
|
| 2270 |
+
00:26:55,049 --> 00:26:59,370
|
| 2271 |
For more jumbled repositories about my roving interest in AI
|
| 2272 |
|
| 2273 |
569
|
| 2274 |
+
00:26:59,370 --> 00:27:04,009
|
| 2275 |
but particularly AgenTic, MCP and VoiceTech you can find me
|
| 2276 |
|
| 2277 |
570
|
| 2278 |
+
00:27:04,009 --> 00:27:05,689
|
| 2279 |
on GitHub.
|
| 2280 |
|
| 2281 |
571
|
| 2282 |
+
00:27:05,929 --> 00:27:06,650
|
| 2283 |
Hugging Face.
|
| 2284 |
|
| 2285 |
572
|
| 2286 |
+
00:27:08,045 --> 00:27:08,924
|
| 2287 |
Where else?
|
| 2288 |
|
| 2289 |
573
|
| 2290 |
+
00:27:08,925 --> 00:27:11,725
|
| 2291 |
DanielRosel dot com, which is my personal website, as well
|
| 2292 |
|
| 2293 |
574
|
| 2294 |
+
00:27:11,725 --> 00:27:15,485
|
| 2295 |
as this podcast whose name I sadly cannot remember.
|
| 2296 |
|
| 2297 |
575
|
| 2298 |
+
00:27:15,644 --> 00:27:16,685
|
| 2299 |
Until next time.
|
| 2300 |
|
| 2301 |
576
|
| 2302 |
+
00:27:16,685 --> 00:27:17,324
|
| 2303 |
Thanks for listening.
|
| 2304 |
|
srt-out/speechmatics.srt
CHANGED
|
@@ -1,2069 +1,2069 @@
|
|
| 1 |
1
|
| 2 |
-
00:00:00,
|
| 3 |
Hello and welcome to a audio data
|
| 4 |
set consisting of one single
|
| 5 |
|
| 6 |
2
|
| 7 |
-
00:00:06,
|
| 8 |
episode of a non-existent podcast.
|
| 9 |
Or it, uh, I may append this to a
|
| 10 |
|
| 11 |
3
|
| 12 |
-
00:00:12,
|
| 13 |
podcast that I set up recently.
|
| 14 |
Um, regarding my, uh,
|
| 15 |
|
| 16 |
4
|
| 17 |
-
00:00:16,
|
| 18 |
with my thoughts on speech,
|
| 19 |
tech and AI in particular,
|
| 20 |
|
| 21 |
5
|
| 22 |
-
00:00:22,
|
| 23 |
more AI and generative AI, I would,
|
| 24 |
uh, I would say, but in any event,
|
| 25 |
|
| 26 |
6
|
| 27 |
-
00:00:27,
|
| 28 |
the purpose of this, um,
|
| 29 |
voice recording is actually to create
|
| 30 |
|
| 31 |
7
|
| 32 |
-
00:00:32,
|
| 33 |
a lengthy voice sample for a quick
|
| 34 |
evaluation, a back of the envelope
|
| 35 |
|
| 36 |
8
|
| 37 |
-
00:00:37,
|
| 38 |
evaluation, as they might say,
|
| 39 |
for different speech to text models.
|
| 40 |
|
| 41 |
9
|
| 42 |
-
00:00:41,
|
| 43 |
And I'm doing this because I,
|
| 44 |
uh, I thought I'd made a great
|
| 45 |
|
| 46 |
10
|
| 47 |
-
00:00:43,
|
| 48 |
breakthrough in my journey with
|
| 49 |
speech tech, and that was succeeding
|
| 50 |
|
| 51 |
11
|
| 52 |
-
00:00:48,
|
| 53 |
in the elusive task of fine tuning.
|
| 54 |
Whisper, whisper is.
|
| 55 |
|
| 56 |
12
|
| 57 |
-
00:00:52,
|
| 58 |
And I'm going to just talk.
|
| 59 |
I'm trying to mix up, uh,
|
| 60 |
|
| 61 |
13
|
| 62 |
-
00:00:56,
|
| 63 |
I'm going to try a few different
|
| 64 |
styles of speaking.
|
| 65 |
|
| 66 |
14
|
| 67 |
-
00:01:00,
|
| 68 |
I might whisper something at
|
| 69 |
some point as well,
|
| 70 |
|
| 71 |
15
|
| 72 |
-
00:01:03,
|
| 73 |
and I'll go back to speaking loud in,
|
| 74 |
uh, in different parts.
|
| 75 |
|
| 76 |
16
|
| 77 |
-
00:01:07,
|
| 78 |
I'm going to sound really like a
|
| 79 |
crazy person, because I'm also
|
| 80 |
|
| 81 |
17
|
| 82 |
-
00:01:09,
|
| 83 |
going to try to speak at different
|
| 84 |
pitches and cadences in order to
|
| 85 |
|
| 86 |
18
|
| 87 |
-
00:01:15,
|
| 88 |
really try to put a speech to
|
| 89 |
text model through its paces,
|
| 90 |
|
| 91 |
19
|
| 92 |
-
00:01:20,
|
| 93 |
which is trying to make sense of,
|
| 94 |
is this guy just on incoherently in
|
| 95 |
|
| 96 |
20
|
| 97 |
-
00:01:25,
|
| 98 |
one long sentence, or are these just
|
| 99 |
actually a series of step standalone,
|
| 100 |
|
| 101 |
21
|
| 102 |
-
00:01:34,
|
| 103 |
standalone, standalone sentences?
|
| 104 |
And how is it going to handle
|
| 105 |
|
| 106 |
22
|
| 107 |
-
00:01:37,
|
| 108 |
step alone? That's not a word.
|
| 109 |
Uh, what happens when you use
|
| 110 |
|
| 111 |
23
|
| 112 |
-
00:01:40,
|
| 113 |
speech to text and you use a fake
|
| 114 |
word and then you're like, wait,
|
| 115 |
|
| 116 |
24
|
| 117 |
-
00:01:
|
| 118 |
that's not actually that word doesn't
|
| 119 |
exist. How does AI handle that?
|
| 120 |
|
| 121 |
25
|
| 122 |
-
00:01:48,
|
| 123 |
And, uh, these and more are all
|
| 124 |
the questions that I'm seeking
|
| 125 |
|
| 126 |
26
|
| 127 |
-
00:01:53,
|
| 128 |
to answer in this training data.
|
| 129 |
Now, why did why was it trying
|
| 130 |
|
| 131 |
27
|
| 132 |
-
00:01:57,
|
| 133 |
to fine tune a whisper?
|
| 134 |
And what is whisper?
|
| 135 |
|
| 136 |
28
|
| 137 |
-
00:01:59,
|
| 138 |
As I said, I'm gonna try to, uh,
|
| 139 |
record this at a couple of different
|
| 140 |
|
| 141 |
29
|
| 142 |
-
00:02:03,
|
| 143 |
levels of technicality for folks who
|
| 144 |
are, uh, you know, in the normal, uh,
|
| 145 |
|
| 146 |
30
|
| 147 |
-
00:02:
|
| 148 |
world and not totally stuck down
|
| 149 |
the rabbit hole of AI, uh, which I
|
| 150 |
|
| 151 |
31
|
| 152 |
-
00:02:13,
|
| 153 |
have to say is a really wonderful,
|
| 154 |
uh, rabbit hole to be to be down.
|
| 155 |
|
| 156 |
32
|
| 157 |
-
00:02:17,
|
| 158 |
Um, it's a really interesting area.
|
| 159 |
And speech and voice tech is is
|
| 160 |
|
| 161 |
33
|
| 162 |
-
00:02:21,
|
| 163 |
the aspect of it that I find
|
| 164 |
actually most.
|
| 165 |
|
| 166 |
34
|
| 167 |
-
00:02:25,
|
| 168 |
I'm not sure I would say the most
|
| 169 |
interesting, because there's just
|
| 170 |
|
| 171 |
35
|
| 172 |
-
00:02:28,
|
| 173 |
so much that is fascinating in AI.
|
| 174 |
Uh, but the most that I find the
|
| 175 |
|
| 176 |
36
|
| 177 |
-
00:02:32,
|
| 178 |
most personally transformative
|
| 179 |
in terms of the impact that it's
|
| 180 |
|
| 181 |
37
|
| 182 |
-
00:02:36,
|
| 183 |
had on my daily work life and
|
| 184 |
productivity and how I sort of work.
|
| 185 |
|
| 186 |
38
|
| 187 |
-
00:02:41,
|
| 188 |
And I'm persevering hard with the
|
| 189 |
task of trying to guess a good
|
| 190 |
|
| 191 |
39
|
| 192 |
-
00:02:
|
| 193 |
solution working for Linux, which if
|
| 194 |
anyone actually does listen to this,
|
| 195 |
|
| 196 |
40
|
| 197 |
-
00:02:51,
|
| 198 |
not just for the training data
|
| 199 |
and for the actual content, uh,
|
| 200 |
|
| 201 |
41
|
| 202 |
-
00:02:55,
|
| 203 |
this is this is has sparked I had
|
| 204 |
besides the fine tune not working.
|
| 205 |
|
| 206 |
42
|
| 207 |
-
00:02:59,
|
| 208 |
Well, that was the failure.
|
| 209 |
Um, I used clod code because one
|
| 210 |
|
| 211 |
43
|
| 212 |
-
00:03:05,
|
| 213 |
thinks these days that there is
|
| 214 |
nothing short of solving,
|
| 215 |
|
| 216 |
44
|
| 217 |
-
00:03:
|
| 218 |
you know, the, uh,
|
| 219 |
the reason of life or something.
|
| 220 |
|
| 221 |
45
|
| 222 |
-
00:03:
|
| 223 |
Uh, that clod and agentic AI can't
|
| 224 |
do, uh, which is not really the case.
|
| 225 |
|
| 226 |
46
|
| 227 |
-
00:03:19,
|
| 228 |
Uh, it does seem that way sometimes,
|
| 229 |
but it fails a lot as well.
|
| 230 |
|
| 231 |
47
|
| 232 |
-
00:03:23,
|
| 233 |
And this is one of those, uh,
|
| 234 |
instances where last week I put
|
| 235 |
|
| 236 |
48
|
| 237 |
-
00:03:26,
|
| 238 |
together an hour of voice training
|
| 239 |
data, basically speaking just
|
| 240 |
|
| 241 |
49
|
| 242 |
-
00:03:31,
|
| 243 |
random things for three minutes.
|
| 244 |
And, um,
|
| 245 |
|
| 246 |
50
|
| 247 |
-
00:03:35,
|
| 248 |
it was actually kind of tedious
|
| 249 |
because the texts were really weird.
|
| 250 |
|
| 251 |
51
|
| 252 |
-
00:03:38,
|
| 253 |
Some of them were it was like it
|
| 254 |
was AI generated.
|
| 255 |
|
| 256 |
52
|
| 257 |
-
00:03:42,
|
| 258 |
Um, I tried before to read
|
| 259 |
Sherlock Holmes for an hour and
|
| 260 |
|
| 261 |
53
|
| 262 |
-
00:03:44,
|
| 263 |
I just couldn't.
|
| 264 |
I was so bored, uh,
|
| 265 |
|
| 266 |
54
|
| 267 |
-
00:03:
|
| 268 |
after ten minutes that I was like,
|
| 269 |
okay, now I'm just gonna have to
|
| 270 |
|
| 271 |
55
|
| 272 |
-
00:03:50,
|
| 273 |
find something else to read.
|
| 274 |
So I used a created with AI
|
| 275 |
|
| 276 |
56
|
| 277 |
-
00:03:56,
|
| 278 |
studio vibe coded.
|
| 279 |
A synthetic text generator.
|
| 280 |
|
| 281 |
57
|
| 282 |
-
00:04:00,
|
| 283 |
Um, which actually I thought was
|
| 284 |
probably a better way of doing it
|
| 285 |
|
| 286 |
58
|
| 287 |
-
00:04:03,
|
| 288 |
because it would give me more short
|
| 289 |
samples with more varied content.
|
| 290 |
|
| 291 |
59
|
| 292 |
-
00:04:08,
|
| 293 |
So I was like, okay, give me a voice
|
| 294 |
note, like I'm recording an email,
|
| 295 |
|
| 296 |
60
|
| 297 |
-
00:04:13,
|
| 298 |
give me a short story to read,
|
| 299 |
give me prose, um, to read.
|
| 300 |
|
| 301 |
61
|
| 302 |
-
00:04:
|
| 303 |
So I came up with all these
|
| 304 |
different things, and I added a
|
| 305 |
|
| 306 |
62
|
| 307 |
-
00:04:21,
|
| 308 |
little timer to it so I could
|
| 309 |
see how close I was to one hour.
|
| 310 |
|
| 311 |
63
|
| 312 |
-
00:04:24,
|
| 313 |
Um, and, uh, I spent like an hour one
|
| 314 |
afternoon or probably two hours by
|
| 315 |
|
| 316 |
64
|
| 317 |
-
00:04:29,
|
| 318 |
the time you, um, you do retakes
|
| 319 |
or whatever because you want to.
|
| 320 |
|
| 321 |
65
|
| 322 |
-
00:04:34,
|
| 323 |
It gave me a source of truth,
|
| 324 |
which I'm not sure if that's the
|
| 325 |
|
| 326 |
66
|
| 327 |
-
00:04:39,
|
| 328 |
scientific way to approach this topic
|
| 329 |
of gathering, uh, training data,
|
| 330 |
|
| 331 |
67
|
| 332 |
-
00:04:43,
|
| 333 |
but I thought it made sense.
|
| 334 |
Um, I have a lot of audio data
|
| 335 |
|
| 336 |
68
|
| 337 |
-
00:04:
|
| 338 |
from recording voice notes,
|
| 339 |
which I've also kind of used, um,
|
| 340 |
|
| 341 |
69
|
| 342 |
-
00:04:
|
| 343 |
been experimenting with using for
|
| 344 |
a different purpose, slightly
|
| 345 |
|
| 346 |
70
|
| 347 |
-
00:04:55,
|
| 348 |
different annotating task types.
|
| 349 |
It's more text classification
|
| 350 |
|
| 351 |
71
|
| 352 |
-
00:05:00,
|
| 353 |
experiment or uh, well,
|
| 354 |
it's more than that, actually.
|
| 355 |
|
| 356 |
72
|
| 357 |
-
00:05:03,
|
| 358 |
I'm working on a voice app,
|
| 359 |
so it's a prototype I guess is
|
| 360 |
|
| 361 |
73
|
| 362 |
-
00:05:
|
| 363 |
really more accurate.
|
| 364 |
Um, but you can do that and you
|
| 365 |
|
| 366 |
74
|
| 367 |
-
00:05:12,
|
| 368 |
can work backwards.
|
| 369 |
You're like,
|
| 370 |
|
| 371 |
75
|
| 372 |
-
00:05:14,
|
| 373 |
you listen back to a voice note
|
| 374 |
and you painfully go through one
|
| 375 |
|
| 376 |
76
|
| 377 |
-
00:05:18,
|
| 378 |
of those transcribing, you know,
|
| 379 |
where you start and stop and scrub
|
| 380 |
|
| 381 |
77
|
| 382 |
-
00:05:21,
|
| 383 |
around it and you fix the errors.
|
| 384 |
But it's really,
|
| 385 |
|
| 386 |
78
|
| 387 |
-
00:05:
|
| 388 |
really boring to do that.
|
| 389 |
So I thought it would be less
|
| 390 |
|
| 391 |
79
|
| 392 |
-
00:05:27,
|
| 393 |
tedious in the long term if I just
|
| 394 |
recorded The Source of truth.
|
| 395 |
|
| 396 |
80
|
| 397 |
-
00:05:32,
|
| 398 |
So it gave me these three minute
|
| 399 |
snippets.
|
| 400 |
|
| 401 |
81
|
| 402 |
-
00:05:34,
|
| 403 |
I recorded them and saved an MP3
|
| 404 |
and a txt in the same folder,
|
| 405 |
|
| 406 |
82
|
| 407 |
-
00:05:38,
|
| 408 |
and I created an hour of that data.
|
| 409 |
Uh, so I was very hopeful, quietly,
|
| 410 |
|
| 411 |
83
|
| 412 |
-
00:05:43,
|
| 413 |
you know, a little bit hopeful
|
| 414 |
that I would be able that I could
|
| 415 |
|
| 416 |
84
|
| 417 |
-
00:05:46,
|
| 418 |
actually fine tune, whisper.
|
| 419 |
Um, I want to fine tune whisper
|
| 420 |
|
| 421 |
85
|
| 422 |
-
00:05:49,
|
| 423 |
because when I got into voice tech
|
| 424 |
last November, my wife was in
|
| 425 |
|
| 426 |
86
|
| 427 |
-
00:05:54,
|
| 428 |
the US and I was alone at home.
|
| 429 |
And you know, when crazy people
|
| 430 |
|
| 431 |
87
|
| 432 |
-
00:05:59,
|
| 433 |
like me do really wild things like
|
| 434 |
use voice to tech, uh, technology.
|
| 435 |
|
| 436 |
88
|
| 437 |
-
00:06:03,
|
| 438 |
That was basically, um,
|
| 439 |
when I started doing it,
|
| 440 |
|
| 441 |
89
|
| 442 |
-
00:06:06,
|
| 443 |
I didn't feel like a crazy person
|
| 444 |
speaking to myself, and my
|
| 445 |
|
| 446 |
90
|
| 447 |
-
00:06:10,
|
| 448 |
expectations weren't that high.
|
| 449 |
Uh, I used speech tech now and again.
|
| 450 |
|
| 451 |
91
|
| 452 |
-
00:06:16,
|
| 453 |
Um, tried it out.
|
| 454 |
I was like, it'd be really cool
|
| 455 |
|
| 456 |
92
|
| 457 |
-
00:06:18,
|
| 458 |
if you could just, like,
|
| 459 |
speak into your computer.
|
| 460 |
|
| 461 |
93
|
| 462 |
-
00:06:20,
|
| 463 |
And whatever I tried out that
|
| 464 |
had Linux support was just.
|
| 465 |
|
| 466 |
94
|
| 467 |
-
00:06:25,
|
| 468 |
It was not good, basically.
|
| 469 |
Um, and this blew me away from
|
| 470 |
|
| 471 |
95
|
| 472 |
-
00:06:28,
|
| 473 |
the first go.
|
| 474 |
I mean, it wasn't 100% accurate
|
| 475 |
|
| 476 |
96
|
| 477 |
-
00:06:
|
| 478 |
out of the box and it took work,
|
| 479 |
but it was good enough that there was
|
| 480 |
|
| 481 |
97
|
| 482 |
-
00:06:35,
|
| 483 |
a solid foundation and it kind of
|
| 484 |
passed that, uh, pivot point that
|
| 485 |
|
| 486 |
98
|
| 487 |
-
00:06:39,
|
| 488 |
it's actually worth doing this.
|
| 489 |
You know, there's a point where
|
| 490 |
|
| 491 |
99
|
| 492 |
-
00:06:42,
|
| 493 |
it's so like the transcript is you
|
| 494 |
don't have to get 100% accuracy
|
| 495 |
|
| 496 |
100
|
| 497 |
-
00:06:46,
|
| 498 |
for it to be worth your time for
|
| 499 |
speech to text to be a worthwhile
|
| 500 |
|
| 501 |
101
|
| 502 |
-
00:06:50,
|
| 503 |
addition to your productivity.
|
| 504 |
But you do need to get above.
|
| 505 |
|
| 506 |
102
|
| 507 |
-
00:06:
|
| 508 |
Let's say, I don't know, 85%.
|
| 509 |
If it's 60% or 50%,
|
| 510 |
|
| 511 |
103
|
| 512 |
-
00:06:57,
|
| 513 |
you inevitably say, screw it.
|
| 514 |
I'll just type it because you end up
|
| 515 |
|
| 516 |
104
|
| 517 |
-
00:07:00,
|
| 518 |
missing errors in the transcript
|
| 519 |
and it becomes actually worse.
|
| 520 |
|
| 521 |
105
|
| 522 |
-
00:07:
|
| 523 |
You end up in a worse position
|
| 524 |
than you started with.
|
| 525 |
|
| 526 |
106
|
| 527 |
-
00:07:06,
|
| 528 |
And that's been my experience.
|
| 529 |
So, um, I was like, oh,
|
| 530 |
|
| 531 |
107
|
| 532 |
-
00:07:
|
| 533 |
this is actually really, really good.
|
| 534 |
Now how did that happen?
|
| 535 |
|
| 536 |
108
|
| 537 |
-
00:07:13,
|
| 538 |
And the answer is ASR whisper
|
| 539 |
being open sourced and the
|
| 540 |
|
| 541 |
109
|
| 542 |
-
00:07:18,
|
| 543 |
transformer architecture,
|
| 544 |
if you want to go back to the,
|
| 545 |
|
| 546 |
110
|
| 547 |
-
00:07:22,
|
| 548 |
um, to the underpinnings, which
|
| 549 |
really blows my mind and it's on my
|
| 550 |
|
| 551 |
111
|
| 552 |
-
00:07:26,
|
| 553 |
list to read through that paper.
|
| 554 |
Um, all you need is attention as
|
| 555 |
|
| 556 |
112
|
| 557 |
-
00:07:33,
|
| 558 |
attentively as can be done with my
|
| 559 |
limited brain because it's super,
|
| 560 |
|
| 561 |
113
|
| 562 |
-
00:07:38,
|
| 563 |
super high level stuff.
|
| 564 |
Um, super advanced stuff.
|
| 565 |
|
| 566 |
114
|
| 567 |
-
00:07:42,
|
| 568 |
I mean, uh, but that I think of all
|
| 569 |
the things that are fascinating
|
| 570 |
|
| 571 |
115
|
| 572 |
-
00:07:48,
|
| 573 |
about the sudden rise in AI and
|
| 574 |
the dramatic capabilities.
|
| 575 |
|
| 576 |
116
|
| 577 |
-
00:07:53,
|
| 578 |
I find it fascinating that few
|
| 579 |
people are like, hang on,
|
| 580 |
|
| 581 |
117
|
| 582 |
-
00:07:55,
|
| 583 |
you've got this thing that can speak
|
| 584 |
to you like a chatbot, an LLM,
|
| 585 |
|
| 586 |
118
|
| 587 |
-
00:08:00,
|
| 588 |
and then you've got image generation.
|
| 589 |
Okay, so firstly, those two things on
|
| 590 |
|
| 591 |
119
|
| 592 |
-
00:08:05,
|
| 593 |
the surface have nothing in common.
|
| 594 |
Um, so like how are they how did that
|
| 595 |
|
| 596 |
120
|
| 597 |
-
00:08:10,
|
| 598 |
just happen all at the same time.
|
| 599 |
And then when you extend that
|
| 600 |
|
| 601 |
121
|
| 602 |
-
00:08:
|
| 603 |
further, um, you're like sooner,
|
| 604 |
right?
|
| 605 |
|
| 606 |
122
|
| 607 |
-
00:08:16,
|
| 608 |
You can sing a song and AI will like,
|
| 609 |
come up with an instrumental and then
|
| 610 |
|
| 611 |
123
|
| 612 |
-
00:08:21,
|
| 613 |
you've got whisper and you're like,
|
| 614 |
wait a second,
|
| 615 |
|
| 616 |
124
|
| 617 |
-
00:08:
|
| 618 |
how did all this stuff, like,
|
| 619 |
if it's all AI, what's like there
|
| 620 |
|
| 621 |
125
|
| 622 |
-
00:08:
|
| 623 |
has to be some commonality.
|
| 624 |
Otherwise these are four.
|
| 625 |
|
| 626 |
126
|
| 627 |
-
00:08:30,
|
| 628 |
These are totally different
|
| 629 |
technologies on the surface of it.
|
| 630 |
|
| 631 |
127
|
| 632 |
-
00:08:34,
|
| 633 |
And, uh, the transformer architecture
|
| 634 |
is, as far as I know, the answer.
|
| 635 |
|
| 636 |
128
|
| 637 |
-
00:08:40,
|
| 638 |
And I can't even say can't even
|
| 639 |
pretend that I really understand
|
| 640 |
|
| 641 |
129
|
| 642 |
-
00:08:44,
|
| 643 |
what the transformer
|
| 644 |
architecture means in depth,
|
| 645 |
|
| 646 |
130
|
| 647 |
-
00:08:47,
|
| 648 |
but I have scanned it and as I said,
|
| 649 |
I want to print it and really kind
|
| 650 |
|
| 651 |
131
|
| 652 |
-
00:08:51,
|
| 653 |
of think over it at some point,
|
| 654 |
and I'll probably feel bad about
|
| 655 |
|
| 656 |
132
|
| 657 |
-
00:08:56,
|
| 658 |
myself, I think,
|
| 659 |
because weren't those guys in their
|
| 660 |
|
| 661 |
133
|
| 662 |
-
00:08:59,
|
| 663 |
in their 20s like, that's crazy.
|
| 664 |
I think I asked ChatGPT once who
|
| 665 |
|
| 666 |
134
|
| 667 |
-
00:09:
|
| 668 |
were the who wrote that paper
|
| 669 |
and how old were they when it
|
| 670 |
|
| 671 |
135
|
| 672 |
-
00:09:08,
|
| 673 |
was published in arXiv?
|
| 674 |
And I was expecting like,
|
| 675 |
|
| 676 |
136
|
| 677 |
-
00:09:11,
|
| 678 |
I don't know,
|
| 679 |
what do you what do you imagine?
|
| 680 |
|
| 681 |
137
|
| 682 |
-
00:09:13,
|
| 683 |
I personally imagine kind of like,
|
| 684 |
you know,
|
| 685 |
|
| 686 |
138
|
| 687 |
-
00:09:
|
| 688 |
you have these breakthroughs during
|
| 689 |
Covid and things like that where
|
| 690 |
|
| 691 |
139
|
| 692 |
-
00:09:19,
|
| 693 |
like these kind of really obscure
|
| 694 |
scientists who are like in their
|
| 695 |
|
| 696 |
140
|
| 697 |
-
00:09:22,
|
| 698 |
50s and they've just kind of been
|
| 699 |
laboring in labs and, uh, wearily
|
| 700 |
|
| 701 |
141
|
| 702 |
-
00:09:27,
|
| 703 |
and writing in publishing in kind
|
| 704 |
of obscure academic publications.
|
| 705 |
|
| 706 |
142
|
| 707 |
-
00:09:30,
|
| 708 |
And they finally, like,
|
| 709 |
hit a big or win a Nobel Prize and
|
| 710 |
|
| 711 |
143
|
| 712 |
-
00:09:
|
| 713 |
then their household household names.
|
| 714 |
Uh, so that was kind of what I
|
| 715 |
|
| 716 |
144
|
| 717 |
-
00:09:37,
|
| 718 |
had in mind.
|
| 719 |
That was the mental image I'd
|
| 720 |
|
| 721 |
145
|
| 722 |
-
00:09:39,
|
| 723 |
formed of the birth of arXiv.
|
| 724 |
Like, I wasn't expecting 20
|
| 725 |
|
| 726 |
146
|
| 727 |
-
00:09:
|
| 728 |
somethings in San Francisco,
|
| 729 |
though I thought that was both very,
|
| 730 |
|
| 731 |
147
|
| 732 |
-
00:09:47,
|
| 733 |
very funny, very cool,
|
| 734 |
and actually kind of inspiring.
|
| 735 |
|
| 736 |
148
|
| 737 |
-
00:09:50,
|
| 738 |
It's nice to think that people who,
|
| 739 |
you know, just you might put them
|
| 740 |
|
| 741 |
149
|
| 742 |
-
00:09:55,
|
| 743 |
in the kind of milieu or bubble or
|
| 744 |
world that you are in or credibly in,
|
| 745 |
|
| 746 |
150
|
| 747 |
-
00:10:
|
| 748 |
through, you know,
|
| 749 |
a series of connections that are
|
| 750 |
|
| 751 |
151
|
| 752 |
-
00:10:03,
|
| 753 |
coming up with such literally
|
| 754 |
world changing, um, innovations.
|
| 755 |
|
| 756 |
152
|
| 757 |
-
00:10:07,
|
| 758 |
Uh, so that was, I thought,
|
| 759 |
anyway, that, that that was cool.
|
| 760 |
|
| 761 |
153
|
| 762 |
-
00:10:12,
|
| 763 |
Okay. Voice training data.
|
| 764 |
How are we doing?
|
| 765 |
|
| 766 |
154
|
| 767 |
-
00:10:
|
| 768 |
We're about ten minutes, and I'm
|
| 769 |
still talking about voice technology.
|
| 770 |
|
| 771 |
155
|
| 772 |
-
00:10:18,
|
| 773 |
Um, so whisper was brilliant,
|
| 774 |
and I was so excited that I was.
|
| 775 |
|
| 776 |
156
|
| 777 |
-
00:10:22,
|
| 778 |
My first instinct was to, like,
|
| 779 |
get like, oh, my gosh,
|
| 780 |
|
| 781 |
157
|
| 782 |
-
00:10:25,
|
| 783 |
I have to get, like,
|
| 784 |
a really good microphone for this.
|
| 785 |
|
| 786 |
158
|
| 787 |
-
00:10:
|
| 788 |
So, um, I didn't go on a
|
| 789 |
spending spree because I said,
|
| 790 |
|
| 791 |
159
|
| 792 |
-
00:10:31,
|
| 793 |
I'm gonna have to just wait a
|
| 794 |
month and see if I still use this.
|
| 795 |
|
| 796 |
160
|
| 797 |
-
00:10:
|
| 798 |
And it just kind of became it's
|
| 799 |
become really part of my daily
|
| 800 |
|
| 801 |
161
|
| 802 |
-
00:10:
|
| 803 |
routine.
|
| 804 |
Like, if I'm writing an email,
|
| 805 |
|
| 806 |
162
|
| 807 |
-
00:10:
|
| 808 |
I'll record a voice note.
|
| 809 |
And then I've developed and it's
|
| 810 |
|
| 811 |
163
|
| 812 |
-
00:10:47,
|
| 813 |
nice to see that everyone is
|
| 814 |
like developing the same things
|
| 815 |
|
| 816 |
164
|
| 817 |
-
00:10:
|
| 818 |
in parallel.
|
| 819 |
Like, that's kind of a weird thing
|
| 820 |
|
| 821 |
165
|
| 822 |
-
00:10:
|
| 823 |
to say, but when I look, I kind of
|
| 824 |
came when I started working on this,
|
| 825 |
|
| 826 |
166
|
| 827 |
-
00:10:57,
|
| 828 |
these prototypes on GitHub,
|
| 829 |
which is where I just kind of
|
| 830 |
|
| 831 |
167
|
| 832 |
-
00:11:00,
|
| 833 |
share very freely and loosely,
|
| 834 |
uh, ideas and, you know,
|
| 835 |
|
| 836 |
168
|
| 837 |
-
00:11:04,
|
| 838 |
first iterations on, on concepts,
|
| 839 |
um, and for want of a better word,
|
| 840 |
|
| 841 |
169
|
| 842 |
-
00:11:10,
|
| 843 |
I called it like, uh,
|
| 844 |
lm post-processing or cleanup or
|
| 845 |
|
| 846 |
170
|
| 847 |
-
00:11:14,
|
| 848 |
basically a system prompt that after
|
| 849 |
you get back the raw text from
|
| 850 |
|
| 851 |
171
|
| 852 |
-
00:11:18,
|
| 853 |
whisper, you run it through a model
|
| 854 |
and say, okay, this is crappy text,
|
| 855 |
|
| 856 |
172
|
| 857 |
-
00:11:24,
|
| 858 |
like add sentence structure and,
|
| 859 |
you know, fix it up.
|
| 860 |
|
| 861 |
173
|
| 862 |
-
00:11:27,
|
| 863 |
And, um, now when I'm exploring the
|
| 864 |
different tools that are out there
|
| 865 |
|
| 866 |
174
|
| 867 |
-
00:11:32,
|
| 868 |
that people have built, I see, uh,
|
| 869 |
quite a number of projects have
|
| 870 |
|
| 871 |
175
|
| 872 |
-
00:11:37,
|
| 873 |
basically done the same thing,
|
| 874 |
um, less that be misconstrued.
|
| 875 |
|
| 876 |
176
|
| 877 |
-
00:11:41,
|
| 878 |
I'm not saying for a millisecond
|
| 879 |
that I inspired them.
|
| 880 |
|
| 881 |
177
|
| 882 |
-
00:11:44,
|
| 883 |
I'm sure this has been a thing that's
|
| 884 |
been integrated into tools for a
|
| 885 |
|
| 886 |
178
|
| 887 |
-
00:11:
|
| 888 |
while, but it's it's the kind of
|
| 889 |
thing that when you start using these
|
| 890 |
|
| 891 |
179
|
| 892 |
-
00:11:52,
|
| 893 |
tools every day, the need for it
|
| 894 |
is almost instantly apparent, uh,
|
| 895 |
|
| 896 |
180
|
| 897 |
-
00:11:56,
|
| 898 |
because text that doesn't have any
|
| 899 |
punctuation or paragraph spacing
|
| 900 |
|
| 901 |
181
|
| 902 |
-
00:12:00,
|
| 903 |
takes a long time to, you know,
|
| 904 |
it takes so long to get it into
|
| 905 |
|
| 906 |
182
|
| 907 |
-
00:12:04,
|
| 908 |
a presentable email that again,
|
| 909 |
it's it's it moves speech tech
|
| 910 |
|
| 911 |
183
|
| 912 |
-
00:12:09,
|
| 913 |
into that before that inflection
|
| 914 |
point where you're like, no,
|
| 915 |
|
| 916 |
184
|
| 917 |
-
00:12:
|
| 918 |
it's just not worth it.
|
| 919 |
It's like it'll just be quicker
|
| 920 |
|
| 921 |
185
|
| 922 |
-
00:12:16,
|
| 923 |
to type this.
|
| 924 |
So it's a big it's a little touch.
|
| 925 |
|
| 926 |
186
|
| 927 |
-
00:12:18,
|
| 928 |
That actually is a big deal.
|
| 929 |
Uh, so I was on whisper and I've
|
| 930 |
|
| 931 |
187
|
| 932 |
-
00:12:24,
|
| 933 |
been using whisper and I kind of
|
| 934 |
early on found a couple of tools.
|
| 935 |
|
| 936 |
188
|
| 937 |
-
00:12:28,
|
| 938 |
I couldn't find what I was
|
| 939 |
looking for on Linux, which is,
|
| 940 |
|
| 941 |
189
|
| 942 |
-
00:12:31,
|
| 943 |
um, basically just something
|
| 944 |
that'll run in the background.
|
| 945 |
|
| 946 |
190
|
| 947 |
-
00:12:35,
|
| 948 |
You'll give it an API key and it
|
| 949 |
will just transcribe. Um.
|
| 950 |
|
| 951 |
191
|
| 952 |
-
00:12:41,
|
| 953 |
with, like, a little key to
|
| 954 |
start and stop the dictation.
|
| 955 |
|
| 956 |
192
|
| 957 |
-
00:12:44,
|
| 958 |
Uh, and the issues were I discovered
|
| 959 |
that, like most people involved in
|
| 960 |
|
| 961 |
193
|
| 962 |
-
00:12:49,
|
| 963 |
creating these projects were very
|
| 964 |
much focused on local models running
|
| 965 |
|
| 966 |
194
|
| 967 |
-
00:12:
|
| 968 |
whisper locally, because you can.
|
| 969 |
And I tried that a bunch of
|
| 970 |
|
| 971 |
195
|
| 972 |
-
00:12:57,
|
| 973 |
times and just never got results
|
| 974 |
that were as good as the cloud.
|
| 975 |
|
| 976 |
196
|
| 977 |
-
00:13:01,
|
| 978 |
And when I began looking at the
|
| 979 |
cost of the speech to text APIs
|
| 980 |
|
| 981 |
197
|
| 982 |
-
00:13:04,
|
| 983 |
and what I was spending,
|
| 984 |
I just thought there's it's actually,
|
| 985 |
|
| 986 |
198
|
| 987 |
-
00:13:08,
|
| 988 |
in my opinion, just one of the better
|
| 989 |
deals in API spending and in cloud.
|
| 990 |
|
| 991 |
199
|
| 992 |
-
00:13:13,
|
| 993 |
Like it's just not that expensive
|
| 994 |
for very, very good models that are
|
| 995 |
|
| 996 |
200
|
| 997 |
-
00:13:17,
|
| 998 |
much more, you know, you're going
|
| 999 |
to be able to run the full model,
|
| 1000 |
|
| 1001 |
201
|
| 1002 |
-
00:13:21,
|
| 1003 |
the latest model versus whatever
|
| 1004 |
you can run on your average GPU.
|
| 1005 |
|
| 1006 |
202
|
| 1007 |
-
00:13:26,
|
| 1008 |
Unless you want to buy a crazy GPU.
|
| 1009 |
It doesn't really make sense to me.
|
| 1010 |
|
| 1011 |
203
|
| 1012 |
-
00:13:29,
|
| 1013 |
Now, privacy is another concern.
|
| 1014 |
Um, that I know is kind of like a
|
| 1015 |
|
| 1016 |
204
|
| 1017 |
-
00:13:33,
|
| 1018 |
very much a separate thing that
|
| 1019 |
people just don't want their voice,
|
| 1020 |
|
| 1021 |
205
|
| 1022 |
-
00:13:
|
| 1023 |
data, and their voice leaving
|
| 1024 |
their local environment,
|
| 1025 |
|
| 1026 |
206
|
| 1027 |
-
00:13:40,
|
| 1028 |
maybe for regulatory reasons as well.
|
| 1029 |
Um, but I'm not in that.
|
| 1030 |
|
| 1031 |
207
|
| 1032 |
-
00:13:
|
| 1033 |
Um, I'm neither really care about
|
| 1034 |
people listening to my, uh,
|
| 1035 |
|
| 1036 |
208
|
| 1037 |
-
00:13:
|
| 1038 |
grocery list consisting of, uh,
|
| 1039 |
reminding myself that I need to
|
| 1040 |
|
| 1041 |
209
|
| 1042 |
-
00:13:51,
|
| 1043 |
buy more beer, Cheetos and hummus,
|
| 1044 |
which is kind of the three,
|
| 1045 |
|
| 1046 |
210
|
| 1047 |
-
00:13:
|
| 1048 |
three staples of my diet.
|
| 1049 |
Um, during periods of poor nutrition.
|
| 1050 |
|
| 1051 |
211
|
| 1052 |
-
00:13:59,
|
| 1053 |
Uh, but the kind of stuff that I
|
| 1054 |
transcribe, it's just not it's not a,
|
| 1055 |
|
| 1056 |
212
|
| 1057 |
-
00:14:
|
| 1058 |
it's not a privacy thing and that
|
| 1059 |
sort of sensitive about and, uh,
|
| 1060 |
|
| 1061 |
213
|
| 1062 |
-
00:14:09,
|
| 1063 |
I don't do anything so,
|
| 1064 |
you know, sensitive or secure,
|
| 1065 |
|
| 1066 |
214
|
| 1067 |
-
00:14:13,
|
| 1068 |
that requires air gapping.
|
| 1069 |
So, um, I looked at the pricing and
|
| 1070 |
|
| 1071 |
215
|
| 1072 |
-
00:14:16,
|
| 1073 |
especially the kind of older models,
|
| 1074 |
mini, um, some of them are very,
|
| 1075 |
|
| 1076 |
216
|
| 1077 |
-
00:14:20,
|
| 1078 |
very affordable.
|
| 1079 |
And I did a back of the I did a
|
| 1080 |
|
| 1081 |
217
|
| 1082 |
-
00:14:23,
|
| 1083 |
calculation once with ChatGPT
|
| 1084 |
and I was like, okay, this is a,
|
| 1085 |
|
| 1086 |
218
|
| 1087 |
-
00:14:27,
|
| 1088 |
this is the API price for I can't
|
| 1089 |
remember whatever the model was.
|
| 1090 |
|
| 1091 |
219
|
| 1092 |
-
00:14:31,
|
| 1093 |
Uh, let's say I just go at it
|
| 1094 |
like nonstop,
|
| 1095 |
|
| 1096 |
220
|
| 1097 |
-
00:14:34,
|
| 1098 |
which it rarely happens. Probably.
|
| 1099 |
I would say on average,
|
| 1100 |
|
| 1101 |
221
|
| 1102 |
-
00:14:37,
|
| 1103 |
I might dictate 30 to 60 minutes per
|
| 1104 |
day if I was probably summing up
|
| 1105 |
|
| 1106 |
222
|
| 1107 |
-
00:14:
|
| 1108 |
the emails, documents, outlines,
|
| 1109 |
um, which is a lot, but it's it's
|
| 1110 |
|
| 1111 |
223
|
| 1112 |
-
00:14:48,
|
| 1113 |
still a fairly modest amount.
|
| 1114 |
And I was like, well,
|
| 1115 |
|
| 1116 |
224
|
| 1117 |
-
00:14:50,
|
| 1118 |
some days I do go on like 1 or 2
|
| 1119 |
days where I've been.
|
| 1120 |
|
| 1121 |
225
|
| 1122 |
-
00:14:54,
|
| 1123 |
Usually when I'm like kind of out of
|
| 1124 |
the house and just have something
|
| 1125 |
|
| 1126 |
226
|
| 1127 |
-
00:14:59,
|
| 1128 |
like, I have nothing else to do.
|
| 1129 |
Like if I'm at a hospital with a
|
| 1130 |
|
| 1131 |
227
|
| 1132 |
-
00:15:02,
|
| 1133 |
newborn, uh, and you're waiting
|
| 1134 |
for like eight hours and hours
|
| 1135 |
|
| 1136 |
228
|
| 1137 |
-
00:15:
|
| 1138 |
for an appointment, and I would
|
| 1139 |
probably have listened to podcasts
|
| 1140 |
|
| 1141 |
229
|
| 1142 |
-
00:15:10,
|
| 1143 |
before becoming a speech fanatic.
|
| 1144 |
And I'm like, oh, wait,
|
| 1145 |
|
| 1146 |
230
|
| 1147 |
-
00:15:14,
|
| 1148 |
let me just get down.
|
| 1149 |
Let me just get these ideas out
|
| 1150 |
|
| 1151 |
231
|
| 1152 |
-
00:15:16,
|
| 1153 |
of my head.
|
| 1154 |
And that's when I'll go on my
|
| 1155 |
|
| 1156 |
232
|
| 1157 |
-
00:15:19,
|
| 1158 |
speech binges.
|
| 1159 |
But those are like once every
|
| 1160 |
|
| 1161 |
233
|
| 1162 |
-
00:15:21,
|
| 1163 |
few months, like not frequently.
|
| 1164 |
But I said, okay, let's just say
|
| 1165 |
|
| 1166 |
234
|
| 1167 |
-
00:15:
|
| 1168 |
if I'm gonna price out.
|
| 1169 |
Cloud asked if I was like, dedicated
|
| 1170 |
|
| 1171 |
235
|
| 1172 |
-
00:15:30,
|
| 1173 |
every second of every waking hour to
|
| 1174 |
transcribing for some odd reason. Um.
|
| 1175 |
|
| 1176 |
236
|
| 1177 |
-
00:15:37,
|
| 1178 |
I mean, it'd have to, like,
|
| 1179 |
eat and use the toilet and,
|
| 1180 |
|
| 1181 |
237
|
| 1182 |
-
00:15:39,
|
| 1183 |
like, you know, there's only so
|
| 1184 |
many hours I'm awake for.
|
| 1185 |
|
| 1186 |
238
|
| 1187 |
-
00:15:42,
|
| 1188 |
So, like,
|
| 1189 |
let's just say a maximum of, like,
|
| 1190 |
|
| 1191 |
239
|
| 1192 |
-
00:15:44,
|
| 1193 |
40 hours, 45 minutes in the hour.
|
| 1194 |
Then I said, all right,
|
| 1195 |
|
| 1196 |
240
|
| 1197 |
-
00:15:48,
|
| 1198 |
let's just say 50. Who knows?
|
| 1199 |
You're dictating on the toilet.
|
| 1200 |
|
| 1201 |
241
|
| 1202 |
-
00:15:52,
|
| 1203 |
We do it.
|
| 1204 |
Uh,
|
| 1205 |
|
| 1206 |
242
|
| 1207 |
-
00:15:
|
| 1208 |
so it could be you could just do 60.
|
| 1209 |
But whatever I did, and every day,
|
| 1210 |
|
| 1211 |
243
|
| 1212 |
-
00:15:58,
|
| 1213 |
like, you're going flat out seven
|
| 1214 |
days a week dictating non-stop.
|
| 1215 |
|
| 1216 |
244
|
| 1217 |
-
00:16:02,
|
| 1218 |
I was like, what's my monthly API
|
| 1219 |
bill going to be at this price?
|
| 1220 |
|
| 1221 |
245
|
| 1222 |
-
00:16:06,
|
| 1223 |
And it came out to like 70 or 80
|
| 1224 |
bucks.
|
| 1225 |
|
| 1226 |
246
|
| 1227 |
-
00:16:09,
|
| 1228 |
And I was like, well, that would be
|
| 1229 |
an extraordinary amount of dictation.
|
| 1230 |
|
| 1231 |
247
|
| 1232 |
-
00:16:14,
|
| 1233 |
And I would hope that there was
|
| 1234 |
some compelling reason,
|
| 1235 |
|
| 1236 |
248
|
| 1237 |
-
00:16:18,
|
| 1238 |
more worth more than $70,
|
| 1239 |
that I embarked upon that project.
|
| 1240 |
|
| 1241 |
249
|
| 1242 |
-
00:16:22,
|
| 1243 |
Uh, so given that that's kind of the
|
| 1244 |
max point for me, I said, that's
|
| 1245 |
|
| 1246 |
250
|
| 1247 |
-
00:16:25,
|
| 1248 |
actually very, very affordable.
|
| 1249 |
Um, now you're gonna if you want
|
| 1250 |
|
| 1251 |
251
|
| 1252 |
-
00:16:29,
|
| 1253 |
to spec out the costs and you want
|
| 1254 |
to do the post-processing that I
|
| 1255 |
|
| 1256 |
252
|
| 1257 |
-
00:16:34,
|
| 1258 |
really do feel is valuable.
|
| 1259 |
Um, that's going to cost some more as
|
| 1260 |
|
| 1261 |
253
|
| 1262 |
-
00:16:37,
|
| 1263 |
well, unless you're using Gemini,
|
| 1264 |
which, uh, needless to say, is a
|
| 1265 |
|
| 1266 |
254
|
| 1267 |
-
00:16:43,
|
| 1268 |
random person sitting in Jerusalem.
|
| 1269 |
Uh, I have no affiliation,
|
| 1270 |
|
| 1271 |
255
|
| 1272 |
-
00:16:
|
| 1273 |
nor with Google, nor anthropic,
|
| 1274 |
nor Gemini, nor any major tech vendor
|
| 1275 |
|
| 1276 |
256
|
| 1277 |
-
00:16:51,
|
| 1278 |
for that matter. Um, I like Gemini.
|
| 1279 |
Not so much as a everyday model.
|
| 1280 |
|
| 1281 |
257
|
| 1282 |
-
00:16:56,
|
| 1283 |
Um, it's kind of underwhelmed in
|
| 1284 |
that respect, I would say.
|
| 1285 |
|
| 1286 |
258
|
| 1287 |
-
00:17:00,
|
| 1288 |
But for multimodal,
|
| 1289 |
I think it's got a lot to offer.
|
| 1290 |
|
| 1291 |
259
|
| 1292 |
-
00:17:03,
|
| 1293 |
And I think that the transcribing
|
| 1294 |
functionality whereby it can,
|
| 1295 |
|
| 1296 |
260
|
| 1297 |
-
00:17:07,
|
| 1298 |
um, process audio with a system
|
| 1299 |
prompt and both give you
|
| 1300 |
|
| 1301 |
261
|
| 1302 |
-
00:17:12,
|
| 1303 |
transcription that's cleaned up,
|
| 1304 |
that reduces two steps to one.
|
| 1305 |
|
| 1306 |
262
|
| 1307 |
-
00:17:15,
|
| 1308 |
And that for me is a very,
|
| 1309 |
very big deal.
|
| 1310 |
|
| 1311 |
263
|
| 1312 |
-
00:17:18,
|
| 1313 |
And, uh, I feel like even Google
|
| 1314 |
has haven't really sort of thought
|
| 1315 |
|
| 1316 |
264
|
| 1317 |
-
00:17:
|
| 1318 |
through how useful the that
|
| 1319 |
modality is and what kind of use
|
| 1320 |
|
| 1321 |
265
|
| 1322 |
-
00:17:27,
|
| 1323 |
cases you can achieve with it.
|
| 1324 |
Because I found in the course of
|
| 1325 |
|
| 1326 |
266
|
| 1327 |
-
00:17:30,
|
| 1328 |
this year just an endless list
|
| 1329 |
of really kind of system prompt,
|
| 1330 |
|
| 1331 |
267
|
| 1332 |
-
00:17:36,
|
| 1333 |
system prompt stuff that I can say,
|
| 1334 |
okay, I've used it to capture context
|
| 1335 |
|
| 1336 |
268
|
| 1337 |
-
00:17:41,
|
| 1338 |
data for AI, which is literally I
|
| 1339 |
might speak for if I wanted to have a
|
| 1340 |
|
| 1341 |
269
|
| 1342 |
-
00:17:45,
|
| 1343 |
good bank of context data about,
|
| 1344 |
who knows, my childhood.
|
| 1345 |
|
| 1346 |
270
|
| 1347 |
-
00:17:50,
|
| 1348 |
Uh, more realistically,
|
| 1349 |
maybe my career goals, uh,
|
| 1350 |
|
| 1351 |
271
|
| 1352 |
-
00:17:53,
|
| 1353 |
something that would just be,
|
| 1354 |
like, really boring to type out.
|
| 1355 |
|
| 1356 |
272
|
| 1357 |
-
00:17:56,
|
| 1358 |
So I'll just, like, sit in my car
|
| 1359 |
and record it for ten minutes.
|
| 1360 |
|
| 1361 |
273
|
| 1362 |
-
00:18:01,
|
| 1363 |
And that ten minutes,
|
| 1364 |
you get a lot of information in,
|
| 1365 |
|
| 1366 |
274
|
| 1367 |
-
00:18:04,
|
| 1368 |
um, emails, which is short text.
|
| 1369 |
Um, just there is a whole bunch.
|
| 1370 |
|
| 1371 |
275
|
| 1372 |
-
00:18:10,
|
| 1373 |
And all these workflows kind of
|
| 1374 |
require a little bit of treatment
|
| 1375 |
|
| 1376 |
276
|
| 1377 |
-
00:18:13,
|
| 1378 |
afterwards and different treatment.
|
| 1379 |
My context pipeline is kind of like
|
| 1380 |
|
| 1381 |
277
|
| 1382 |
-
00:18:17,
|
| 1383 |
just extract the bare essentials.
|
| 1384 |
So you end up with me talking very
|
| 1385 |
|
| 1386 |
278
|
| 1387 |
-
00:18:21,
|
| 1388 |
loosely about sort of what I've done
|
| 1389 |
in my career, where I've worked,
|
| 1390 |
|
| 1391 |
279
|
| 1392 |
-
00:18:24,
|
| 1393 |
where I might like to work,
|
| 1394 |
and it goes it condenses that
|
| 1395 |
|
| 1396 |
280
|
| 1397 |
-
00:18:27,
|
| 1398 |
down to very robotic language
|
| 1399 |
that is easy to chunk, parse,
|
| 1400 |
|
| 1401 |
281
|
| 1402 |
-
00:18:31,
|
| 1403 |
and maybe put into a vector database.
|
| 1404 |
Daniel has worked in technology,
|
| 1405 |
|
| 1406 |
282
|
| 1407 |
-
00:18:36,
|
| 1408 |
Daniel is a has been working in,
|
| 1409 |
you know, stuff like that.
|
| 1410 |
|
| 1411 |
283
|
| 1412 |
-
00:18:39,
|
| 1413 |
That's not how you would speak.
|
| 1414 |
Um, but I figure it's probably easier
|
| 1415 |
|
| 1416 |
284
|
| 1417 |
-
00:18:43,
|
| 1418 |
to parse for, after all, robots.
|
| 1419 |
So we've almost got to 20 minutes.
|
| 1420 |
|
| 1421 |
285
|
| 1422 |
-
00:18:48,
|
| 1423 |
And this is actually a success
|
| 1424 |
because I wasted 20 minutes of my,
|
| 1425 |
|
| 1426 |
286
|
| 1427 |
-
00:18:52,
|
| 1428 |
uh, of the evening speaking into
|
| 1429 |
a microphone, and, uh,
|
| 1430 |
|
| 1431 |
287
|
| 1432 |
-
00:18:
|
| 1433 |
the levels were shot and, uh, it,
|
| 1434 |
uh, it was clipping and I said,
|
| 1435 |
|
| 1436 |
288
|
| 1437 |
-
00:19:00,
|
| 1438 |
I can't really do an evaluation.
|
| 1439 |
I have to be fair.
|
| 1440 |
|
| 1441 |
289
|
| 1442 |
-
00:19:03,
|
| 1443 |
I have to give the models a
|
| 1444 |
chance to do their thing.
|
| 1445 |
|
| 1446 |
290
|
| 1447 |
-
00:19:07,
|
| 1448 |
Uh,
|
| 1449 |
what am I hoping to achieve in this?
|
| 1450 |
|
| 1451 |
291
|
| 1452 |
-
00:19:09,
|
| 1453 |
Okay, my fine tune was a dud,
|
| 1454 |
as mentioned Deepgram SVT.
|
| 1455 |
|
| 1456 |
292
|
| 1457 |
-
00:19:12,
|
| 1458 |
I'm really, really hopeful that
|
| 1459 |
this prototype will work.
|
| 1460 |
|
| 1461 |
293
|
| 1462 |
-
00:19:15,
|
| 1463 |
And it's a built in public open
|
| 1464 |
source, so anyone is welcome to
|
| 1465 |
|
| 1466 |
294
|
| 1467 |
-
00:19:19,
|
| 1468 |
use it if I make anything good.
|
| 1469 |
Um, but that was really exciting for
|
| 1470 |
|
| 1471 |
295
|
| 1472 |
-
00:19:
|
| 1473 |
me last night when after hours of,
|
| 1474 |
um, trying my own prototype,
|
| 1475 |
|
| 1476 |
296
|
| 1477 |
-
00:19:27,
|
| 1478 |
seeing someone just made
|
| 1479 |
something that works like that.
|
| 1480 |
|
| 1481 |
297
|
| 1482 |
-
00:19:31,
|
| 1483 |
You know,
|
| 1484 |
you're not going to have to build a
|
| 1485 |
|
| 1486 |
298
|
| 1487 |
-
00:19:32,
|
| 1488 |
custom conda environment and image.
|
| 1489 |
I have AMD GPU, which makes
|
| 1490 |
|
| 1491 |
299
|
| 1492 |
-
00:19:38,
|
| 1493 |
things much more complicated.
|
| 1494 |
I didn't find it and I was about
|
| 1495 |
|
| 1496 |
300
|
| 1497 |
-
00:19:42,
|
| 1498 |
to give up and I said,
|
| 1499 |
all right, let me just give deep
|
| 1500 |
|
| 1501 |
301
|
| 1502 |
-
00:19:
|
| 1503 |
grams Linux thing a shot.
|
| 1504 |
And if this doesn't work, um,
|
| 1505 |
|
| 1506 |
302
|
| 1507 |
-
00:19:48,
|
| 1508 |
I'm just going to go back to
|
| 1509 |
trying to code something myself.
|
| 1510 |
|
| 1511 |
303
|
| 1512 |
-
00:19:51,
|
| 1513 |
And when I ran the script,
|
| 1514 |
I was using cloud code to do the
|
| 1515 |
|
| 1516 |
304
|
| 1517 |
-
00:19:56,
|
| 1518 |
installation process.
|
| 1519 |
It ran the script and oh my gosh,
|
| 1520 |
|
| 1521 |
305
|
| 1522 |
-
00:20:00,
|
| 1523 |
it works just like that.
|
| 1524 |
Uh, the tricky thing for all those
|
| 1525 |
|
| 1526 |
306
|
| 1527 |
-
00:20:05,
|
| 1528 |
who wants to know all the nitty
|
| 1529 |
gritty, nitty gritty details, um, was
|
| 1530 |
|
| 1531 |
307
|
| 1532 |
-
00:20:10,
|
| 1533 |
that I don't think it was actually
|
| 1534 |
struggling with transcription, but
|
| 1535 |
|
| 1536 |
308
|
| 1537 |
-
00:20:13,
|
| 1538 |
pasting Wayland makes life very hard,
|
| 1539 |
and I think there was something not
|
| 1540 |
|
| 1541 |
309
|
| 1542 |
-
00:20:18,
|
| 1543 |
running in the right time anyway.
|
| 1544 |
Deepgram I looked at how they
|
| 1545 |
|
| 1546 |
310
|
| 1547 |
-
00:20:21,
|
| 1548 |
actually handle that because it
|
| 1549 |
worked out of the box when other
|
| 1550 |
|
| 1551 |
311
|
| 1552 |
-
00:20:24,
|
| 1553 |
stuff didn't, and it was quite a
|
| 1554 |
clever little mechanism,
|
| 1555 |
|
| 1556 |
312
|
| 1557 |
-
00:20:29,
|
| 1558 |
and but more so than that,
|
| 1559 |
the accuracy was brilliant.
|
| 1560 |
|
| 1561 |
313
|
| 1562 |
-
00:20:32,
|
| 1563 |
Now, what am I doing here?
|
| 1564 |
This is going to be a 20 minute
|
| 1565 |
|
| 1566 |
314
|
| 1567 |
-
00:20:35,
|
| 1568 |
audio sample, and I'm I think
|
| 1569 |
I've done 1 or 2 of these before,
|
| 1570 |
|
| 1571 |
315
|
| 1572 |
-
00:20:
|
| 1573 |
but I did it with short, snappy voice
|
| 1574 |
notes. This is kind of long form.
|
| 1575 |
|
| 1576 |
316
|
| 1577 |
-
00:20:49,
|
| 1578 |
This actually might be a better
|
| 1579 |
approximation for what's useful
|
| 1580 |
|
| 1581 |
317
|
| 1582 |
-
00:20:51,
|
| 1583 |
to me than voice memos.
|
| 1584 |
Like I need to buy three liters
|
| 1585 |
|
| 1586 |
318
|
| 1587 |
-
00:20:56,
|
| 1588 |
of milk tomorrow, and pita bread,
|
| 1589 |
which is probably how like half
|
| 1590 |
|
| 1591 |
319
|
| 1592 |
-
00:20:59,
|
| 1593 |
my voice voice notes sound like
|
| 1594 |
if anyone were to, I don't know,
|
| 1595 |
|
| 1596 |
320
|
| 1597 |
-
00:21:02,
|
| 1598 |
like find my phone,
|
| 1599 |
they'd be like, this is the most
|
| 1600 |
|
| 1601 |
321
|
| 1602 |
-
00:21:04,
|
| 1603 |
boring person in the world.
|
| 1604 |
Although actually there are some
|
| 1605 |
|
| 1606 |
322
|
| 1607 |
-
00:21:07,
|
| 1608 |
like kind of, uh,
|
| 1609 |
journaling thoughts as well.
|
| 1610 |
|
| 1611 |
323
|
| 1612 |
-
00:21:09,
|
| 1613 |
But it's a lot of content like that.
|
| 1614 |
And the probably for the evaluation,
|
| 1615 |
|
| 1616 |
324
|
| 1617 |
-
00:21:13,
|
| 1618 |
the most useful thing is slightly
|
| 1619 |
obscure tech GitHub uh, hugging face
|
| 1620 |
|
| 1621 |
325
|
| 1622 |
-
00:21:21,
|
| 1623 |
not so obscure that it's not going
|
| 1624 |
to have a chance of knowing it,
|
| 1625 |
|
| 1626 |
326
|
| 1627 |
-
00:21:24,
|
| 1628 |
but hopefully sufficiently well
|
| 1629 |
known that the model should get it.
|
| 1630 |
|
| 1631 |
327
|
| 1632 |
-
00:21:28,
|
| 1633 |
I tried to do a little bit of
|
| 1634 |
speaking really fast and
|
| 1635 |
|
| 1636 |
328
|
| 1637 |
-
00:21:30,
|
| 1638 |
speaking very slowly.
|
| 1639 |
I would say in general,
|
| 1640 |
|
| 1641 |
329
|
| 1642 |
-
00:21:33,
|
| 1643 |
I've spoken, delivered this at a
|
| 1644 |
faster pace than I usually would
|
| 1645 |
|
| 1646 |
330
|
| 1647 |
-
00:21:
|
| 1648 |
owing to strong coffee flowing
|
| 1649 |
through my bloodstream.
|
| 1650 |
|
| 1651 |
331
|
| 1652 |
-
00:21:
|
| 1653 |
And the thing that I'm not going
|
| 1654 |
to get in this benchmark is
|
| 1655 |
|
| 1656 |
332
|
| 1657 |
-
00:21:44,
|
| 1658 |
background noise, which in my first
|
| 1659 |
take that I had to get rid of,
|
| 1660 |
|
| 1661 |
333
|
| 1662 |
-
00:21:47,
|
| 1663 |
my wife came in with my son and
|
| 1664 |
for a good night kiss.
|
| 1665 |
|
| 1666 |
334
|
| 1667 |
-
00:21:51,
|
| 1668 |
And that actually would have
|
| 1669 |
been super helpful to get in
|
| 1670 |
|
| 1671 |
335
|
| 1672 |
-
00:21:55,
|
| 1673 |
because it was not diarised.
|
| 1674 |
Or if we had diarisation a female,
|
| 1675 |
|
| 1676 |
336
|
| 1677 |
-
00:
|
| 1678 |
I could say I want the male
|
| 1679 |
voice and that wasn't intended
|
| 1680 |
|
| 1681 |
337
|
| 1682 |
-
00:22:02,
|
| 1683 |
for transcription.
|
| 1684 |
Um, and we're not going to get
|
| 1685 |
|
| 1686 |
338
|
| 1687 |
-
00:22:05,
|
| 1688 |
background noise like people
|
| 1689 |
honking their horns,
|
| 1690 |
|
| 1691 |
339
|
| 1692 |
-
00:22:
|
| 1693 |
which is something I've done in my
|
| 1694 |
main data set where I am trying to
|
| 1695 |
|
| 1696 |
340
|
| 1697 |
-
00:22:11,
|
| 1698 |
go back to some of my voice notes,
|
| 1699 |
annotate them, and run a benchmark.
|
| 1700 |
|
| 1701 |
341
|
| 1702 |
-
00:22:15,
|
| 1703 |
But this is going to be just a
|
| 1704 |
pure quick test.
|
| 1705 |
|
| 1706 |
342
|
| 1707 |
-
00:22:19,
|
| 1708 |
And as someone I'm working on a
|
| 1709 |
voice note idea,
|
| 1710 |
|
| 1711 |
343
|
| 1712 |
-
00:22:
|
| 1713 |
that's my sort of end motivation.
|
| 1714 |
Besides thinking it's an
|
| 1715 |
|
| 1716 |
344
|
| 1717 |
-
00:22:28,
|
| 1718 |
absolutely outstanding technology
|
| 1719 |
that's coming to viability.
|
| 1720 |
|
| 1721 |
345
|
| 1722 |
-
00:22:31,
|
| 1723 |
And really, I know this sounds
|
| 1724 |
cheesy can actually have a very
|
| 1725 |
|
| 1726 |
346
|
| 1727 |
-
00:22:34,
|
| 1728 |
transformative effect.
|
| 1729 |
Um, it's, you know, voice technology
|
| 1730 |
|
| 1731 |
347
|
| 1732 |
-
00:22:38,
|
| 1733 |
has been life changing for, uh,
|
| 1734 |
folks living with, um, disabilities.
|
| 1735 |
|
| 1736 |
348
|
| 1737 |
-
00:22:45,
|
| 1738 |
And I think there's something
|
| 1739 |
really nice about the fact that
|
| 1740 |
|
| 1741 |
349
|
| 1742 |
-
00:22:48,
|
| 1743 |
it can also benefit, you know,
|
| 1744 |
folks who are able bodied and like,
|
| 1745 |
|
| 1746 |
350
|
| 1747 |
-
00:22:52,
|
| 1748 |
we can all in different ways, um,
|
| 1749 |
make this tech as useful as possible,
|
| 1750 |
|
| 1751 |
351
|
| 1752 |
-
00:22:
|
| 1753 |
regardless of the exact way that
|
| 1754 |
we're using it.
|
| 1755 |
|
| 1756 |
352
|
| 1757 |
-
00:23:01,
|
| 1758 |
Um, and I think there's something
|
| 1759 |
very powerful in that, and it can be
|
| 1760 |
|
| 1761 |
353
|
| 1762 |
-
00:23:04,
|
| 1763 |
very cool. Um, I see use potential.
|
| 1764 |
What excites me about voice tech?
|
| 1765 |
|
| 1766 |
354
|
| 1767 |
-
00:23:09,
|
| 1768 |
A lot of things, actually.
|
| 1769 |
Firstly, the fact that it's cheap
|
| 1770 |
|
| 1771 |
355
|
| 1772 |
-
00:23:13,
|
| 1773 |
and accurate, as I mentioned at
|
| 1774 |
the very start of this, um,
|
| 1775 |
|
| 1776 |
356
|
| 1777 |
-
00:23:17,
|
| 1778 |
and it's getting better and better
|
| 1779 |
with stuff like accent handling, um,
|
| 1780 |
|
| 1781 |
357
|
| 1782 |
-
00:23:20,
|
| 1783 |
I'm not sure my, my fine tune will
|
| 1784 |
actually ever come to fruition in the
|
| 1785 |
|
| 1786 |
358
|
| 1787 |
-
00:23:24,
|
| 1788 |
sense that I'll use it day to day,
|
| 1789 |
as I imagine I get like superb,
|
| 1790 |
|
| 1791 |
359
|
| 1792 |
-
00:23:27,
|
| 1793 |
flawless word error rates because I'm
|
| 1794 |
just kind of skeptical about local
|
| 1795 |
|
| 1796 |
360
|
| 1797 |
-
00:23:33,
|
| 1798 |
speech to texts, as I mentioned.
|
| 1799 |
And I think the pace of innovation
|
| 1800 |
|
| 1801 |
361
|
| 1802 |
-
00:23:38,
|
| 1803 |
and improvement in the models,
|
| 1804 |
the main reasons for fine tuning from
|
| 1805 |
|
| 1806 |
362
|
| 1807 |
-
00:23:42,
|
| 1808 |
what I've seen have been people who
|
| 1809 |
are something that really blows,
|
| 1810 |
|
| 1811 |
363
|
| 1812 |
-
00:23:46,
|
| 1813 |
blows my mind about ASR is the idea
|
| 1814 |
that it's inherently a lingual
|
| 1815 |
|
| 1816 |
364
|
| 1817 |
-
00:23:
|
| 1818 |
or multilingual phonetic based.
|
| 1819 |
So as folks who use speak very
|
| 1820 |
|
| 1821 |
365
|
| 1822 |
-
00:23:59,
|
| 1823 |
obscure languages that there may
|
| 1824 |
be there might be a paucity of
|
| 1825 |
|
| 1826 |
366
|
| 1827 |
-
00:24:02,
|
| 1828 |
training data or almost none at all,
|
| 1829 |
and therefore the accuracy is
|
| 1830 |
|
| 1831 |
367
|
| 1832 |
-
00:24:05,
|
| 1833 |
significantly reduced or folks
|
| 1834 |
in very critical environments.
|
| 1835 |
|
| 1836 |
368
|
| 1837 |
-
00:24:10,
|
| 1838 |
I know there are.
|
| 1839 |
This is used extensively in medical
|
| 1840 |
|
| 1841 |
369
|
| 1842 |
-
00:24:13,
|
| 1843 |
transcription and dispatcher work as,
|
| 1844 |
um, you know, the call centers who
|
| 1845 |
|
| 1846 |
370
|
| 1847 |
-
00:24:18,
|
| 1848 |
send out ambulances, etc., where
|
| 1849 |
accuracy is absolutely paramount.
|
| 1850 |
|
| 1851 |
371
|
| 1852 |
-
00:24:22,
|
| 1853 |
And in the case of doctors,
|
| 1854 |
radiologists, they might be using
|
| 1855 |
|
| 1856 |
372
|
| 1857 |
-
00:24:26,
|
| 1858 |
very specialized vocab all the time.
|
| 1859 |
So those are kind of the main
|
| 1860 |
|
| 1861 |
373
|
| 1862 |
-
00:24:29,
|
| 1863 |
two things.
|
| 1864 |
And I'm not sure that really just for
|
| 1865 |
|
| 1866 |
374
|
| 1867 |
-
00:24:31,
|
| 1868 |
trying to make it better on a few
|
| 1869 |
random tech words with my slightly.
|
| 1870 |
|
| 1871 |
375
|
| 1872 |
-
00:24:37,
|
| 1873 |
I mean, I have an accent, but like,
|
| 1874 |
not, you know, an accent that a few
|
| 1875 |
|
| 1876 |
376
|
| 1877 |
-
00:24:41,
|
| 1878 |
other million people have. Ish.
|
| 1879 |
I'm not sure that my little fine
|
| 1880 |
|
| 1881 |
377
|
| 1882 |
-
00:24:47,
|
| 1883 |
tune is going to actually like the
|
| 1884 |
bump in word error rate reduction.
|
| 1885 |
|
| 1886 |
378
|
| 1887 |
-
00:24:52,
|
| 1888 |
If I ever actually figure out how
|
| 1889 |
to do it and get it up to the
|
| 1890 |
|
| 1891 |
379
|
| 1892 |
-
00:24:54,
|
| 1893 |
cloud by the time I've done that.
|
| 1894 |
I suspect that the next
|
| 1895 |
|
| 1896 |
380
|
| 1897 |
-
00:24:58,
|
| 1898 |
generation of ASR will just be
|
| 1899 |
so good that it will kind of be.
|
| 1900 |
|
| 1901 |
381
|
| 1902 |
-
00:25:
|
| 1903 |
Ah, well,
|
| 1904 |
that would be cool if it worked out,
|
| 1905 |
|
| 1906 |
382
|
| 1907 |
-
00:25:03,
|
| 1908 |
but I'll just use this instead.
|
| 1909 |
So that's going to be it for today's
|
| 1910 |
|
| 1911 |
383
|
| 1912 |
-
00:25:08,
|
| 1913 |
episode of, uh, voice training data.
|
| 1914 |
Single long shot evaluation.
|
| 1915 |
|
| 1916 |
384
|
| 1917 |
-
00:25:14,
|
| 1918 |
Who am I going to compare?
|
| 1919 |
Whisper is always good as a
|
| 1920 |
|
| 1921 |
385
|
| 1922 |
-
00:25:17,
|
| 1923 |
benchmark, but I'm more
|
| 1924 |
interested in seeing Whisperer
|
| 1925 |
|
| 1926 |
386
|
| 1927 |
-
00:25:20,
|
| 1928 |
head to head with two things,
|
| 1929 |
really. One is whisper variance.
|
| 1930 |
|
| 1931 |
387
|
| 1932 |
-
00:25:25,
|
| 1933 |
So you've got these projects like
|
| 1934 |
faster Whisper, Still whisper.
|
| 1935 |
|
| 1936 |
388
|
| 1937 |
-
00:25:
|
| 1938 |
It's a bit confusing.
|
| 1939 |
There's a whole bunch of them
|
| 1940 |
|
| 1941 |
389
|
| 1942 |
-
00:25:
|
| 1943 |
and the emerging acers,
|
| 1944 |
which are also a thing.
|
| 1945 |
|
| 1946 |
390
|
| 1947 |
-
00:25:35,
|
| 1948 |
My intention for this is I'm not
|
| 1949 |
sure I'm going to have the time
|
| 1950 |
|
| 1951 |
391
|
| 1952 |
-
00:25:37,
|
| 1953 |
in any point in the foreseeable
|
| 1954 |
future to go back through this whole
|
| 1955 |
|
| 1956 |
392
|
| 1957 |
-
00:25:41,
|
| 1958 |
episode and create a proper source,
|
| 1959 |
truth or a fix.
|
| 1960 |
|
| 1961 |
393
|
| 1962 |
-
00:25:47,
|
| 1963 |
Everything might do it if I can
|
| 1964 |
get one transcription that
|
| 1965 |
|
| 1966 |
394
|
| 1967 |
-
00:25:51,
|
| 1968 |
sufficiently close to perfection.
|
| 1969 |
But what I would actually love
|
| 1970 |
|
| 1971 |
395
|
| 1972 |
-
00:25:56,
|
| 1973 |
to do on Hugging Face I think
|
| 1974 |
would be a great.
|
| 1975 |
|
| 1976 |
396
|
| 1977 |
-
00:25:59,
|
| 1978 |
Probably how I might visualize this
|
| 1979 |
is having the audio waveform play,
|
| 1980 |
|
| 1981 |
397
|
| 1982 |
-
00:26:04,
|
| 1983 |
and then have the transcript for each
|
| 1984 |
model below it, and maybe even a,
|
| 1985 |
|
| 1986 |
398
|
| 1987 |
-
00:26:10,
|
| 1988 |
um, like, you know, two scale and
|
| 1989 |
maybe even a local one as well,
|
| 1990 |
|
| 1991 |
399
|
| 1992 |
-
00:26:15,
|
| 1993 |
like local whisper versus open
|
| 1994 |
AI API, Etc. and, um, I can then
|
| 1995 |
|
| 1996 |
400
|
| 1997 |
-
00:26:21,
|
| 1998 |
actually listen back to segments
|
| 1999 |
or anyone who wants to can listen
|
| 2000 |
|
| 2001 |
401
|
| 2002 |
-
00:26:24,
|
| 2003 |
back to segments of this recording
|
| 2004 |
and see where a particular model
|
| 2005 |
|
| 2006 |
402
|
| 2007 |
-
00:26:29,
|
| 2008 |
struggled and others didn't, as well
|
| 2009 |
as the sort of headline finding
|
| 2010 |
|
| 2011 |
403
|
| 2012 |
-
00:26:
|
| 2013 |
of which had the best, uh, wer.
|
| 2014 |
But that would require the source
|
| 2015 |
|
| 2016 |
404
|
| 2017 |
-
00:26:36,
|
| 2018 |
of truth. Okay. That's it.
|
| 2019 |
Hope this was, I don't know,
|
| 2020 |
|
| 2021 |
405
|
| 2022 |
-
00:26:40,
|
| 2023 |
maybe useful for other folks
|
| 2024 |
interested in stuff you want to see.
|
| 2025 |
|
| 2026 |
406
|
| 2027 |
-
00:26:
|
| 2028 |
I always feel think I've just said
|
| 2029 |
something I didn't intend to say.
|
| 2030 |
|
| 2031 |
407
|
| 2032 |
-
00:26:48,
|
| 2033 |
I said for those, listen carefully.
|
| 2034 |
Including, hopefully,
|
| 2035 |
|
| 2036 |
408
|
| 2037 |
-
00:26:51,
|
| 2038 |
the models themselves.
|
| 2039 |
This has been myself,
|
| 2040 |
|
| 2041 |
409
|
| 2042 |
-
00:26:54,
|
| 2043 |
Daniel Rosehill, for more, um,
|
| 2044 |
jumbled repositories about my,
|
| 2045 |
|
| 2046 |
410
|
| 2047 |
-
00:26:
|
| 2048 |
uh, roving interest in AI,
|
| 2049 |
but particularly Agentic,
|
| 2050 |
|
| 2051 |
411
|
| 2052 |
-
00:27:01,
|
| 2053 |
MCP and voice tech.
|
| 2054 |
Uh, you can find me on GitHub.
|
| 2055 |
|
| 2056 |
412
|
| 2057 |
-
00:27:05,
|
| 2058 |
Hugging face. Where else?
|
| 2059 |
Daniel, which is my personal website,
|
| 2060 |
|
| 2061 |
413
|
| 2062 |
-
00:27:11,
|
| 2063 |
as well as this podcast whose
|
| 2064 |
name I sadly cannot remember.
|
| 2065 |
|
| 2066 |
414
|
| 2067 |
-
00:27:15,
|
| 2068 |
Until next time.
|
| 2069 |
Thanks for listening.
|
|
|
|
| 1 |
1
|
| 2 |
+
00:00:00,000 --> 00:00:06,400
|
| 3 |
Hello and welcome to a audio data
|
| 4 |
set consisting of one single
|
| 5 |
|
| 6 |
2
|
| 7 |
+
00:00:06,400 --> 00:00:12,000
|
| 8 |
episode of a non-existent podcast.
|
| 9 |
Or it, uh, I may append this to a
|
| 10 |
|
| 11 |
3
|
| 12 |
+
00:00:12,000 --> 00:00:16,520
|
| 13 |
podcast that I set up recently.
|
| 14 |
Um, regarding my, uh,
|
| 15 |
|
| 16 |
4
|
| 17 |
+
00:00:16,560 --> 00:00:21,840
|
| 18 |
with my thoughts on speech,
|
| 19 |
tech and AI in particular,
|
| 20 |
|
| 21 |
5
|
| 22 |
+
00:00:22,120 --> 00:00:27,840
|
| 23 |
more AI and generative AI, I would,
|
| 24 |
uh, I would say, but in any event,
|
| 25 |
|
| 26 |
6
|
| 27 |
+
00:00:27,840 --> 00:00:32,360
|
| 28 |
the purpose of this, um,
|
| 29 |
voice recording is actually to create
|
| 30 |
|
| 31 |
7
|
| 32 |
+
00:00:32,560 --> 00:00:37,440
|
| 33 |
a lengthy voice sample for a quick
|
| 34 |
evaluation, a back of the envelope
|
| 35 |
|
| 36 |
8
|
| 37 |
+
00:00:37,440 --> 00:00:41,040
|
| 38 |
evaluation, as they might say,
|
| 39 |
for different speech to text models.
|
| 40 |
|
| 41 |
9
|
| 42 |
+
00:00:41,040 --> 00:00:43,680
|
| 43 |
And I'm doing this because I,
|
| 44 |
uh, I thought I'd made a great
|
| 45 |
|
| 46 |
10
|
| 47 |
+
00:00:43,680 --> 00:00:48,200
|
| 48 |
breakthrough in my journey with
|
| 49 |
speech tech, and that was succeeding
|
| 50 |
|
| 51 |
11
|
| 52 |
+
00:00:48,200 --> 00:00:52,600
|
| 53 |
in the elusive task of fine tuning.
|
| 54 |
Whisper, whisper is.
|
| 55 |
|
| 56 |
12
|
| 57 |
+
00:00:52,720 --> 00:00:56,840
|
| 58 |
And I'm going to just talk.
|
| 59 |
I'm trying to mix up, uh,
|
| 60 |
|
| 61 |
13
|
| 62 |
+
00:00:56,840 --> 00:01:00,350
|
| 63 |
I'm going to try a few different
|
| 64 |
styles of speaking.
|
| 65 |
|
| 66 |
14
|
| 67 |
+
00:01:00,350 --> 00:01:02,510
|
| 68 |
I might whisper something at
|
| 69 |
some point as well,
|
| 70 |
|
| 71 |
15
|
| 72 |
+
00:01:03,070 --> 00:01:07,030
|
| 73 |
and I'll go back to speaking loud in,
|
| 74 |
uh, in different parts.
|
| 75 |
|
| 76 |
16
|
| 77 |
+
00:01:07,030 --> 00:01:09,590
|
| 78 |
I'm going to sound really like a
|
| 79 |
crazy person, because I'm also
|
| 80 |
|
| 81 |
17
|
| 82 |
+
00:01:09,590 --> 00:01:15,750
|
| 83 |
going to try to speak at different
|
| 84 |
pitches and cadences in order to
|
| 85 |
|
| 86 |
18
|
| 87 |
+
00:01:15,790 --> 00:01:20,510
|
| 88 |
really try to put a speech to
|
| 89 |
text model through its paces,
|
| 90 |
|
| 91 |
19
|
| 92 |
+
00:01:20,510 --> 00:01:25,750
|
| 93 |
which is trying to make sense of,
|
| 94 |
is this guy just on incoherently in
|
| 95 |
|
| 96 |
20
|
| 97 |
+
00:01:25,750 --> 00:01:34,230
|
| 98 |
one long sentence, or are these just
|
| 99 |
actually a series of step standalone,
|
| 100 |
|
| 101 |
21
|
| 102 |
+
00:01:34,230 --> 00:01:37,390
|
| 103 |
standalone, standalone sentences?
|
| 104 |
And how is it going to handle
|
| 105 |
|
| 106 |
22
|
| 107 |
+
00:01:37,390 --> 00:01:40,630
|
| 108 |
step alone? That's not a word.
|
| 109 |
Uh, what happens when you use
|
| 110 |
|
| 111 |
23
|
| 112 |
+
00:01:40,630 --> 00:01:43,910
|
| 113 |
speech to text and you use a fake
|
| 114 |
word and then you're like, wait,
|
| 115 |
|
| 116 |
24
|
| 117 |
+
00:01:43,910 --> 00:01:48,230
|
| 118 |
that's not actually that word doesn't
|
| 119 |
exist. How does AI handle that?
|
| 120 |
|
| 121 |
25
|
| 122 |
+
00:01:48,270 --> 00:01:53,790
|
| 123 |
And, uh, these and more are all
|
| 124 |
the questions that I'm seeking
|
| 125 |
|
| 126 |
26
|
| 127 |
+
00:01:53,790 --> 00:01:57,230
|
| 128 |
to answer in this training data.
|
| 129 |
Now, why did why was it trying
|
| 130 |
|
| 131 |
27
|
| 132 |
+
00:01:57,230 --> 00:01:59,620
|
| 133 |
to fine tune a whisper?
|
| 134 |
And what is whisper?
|
| 135 |
|
| 136 |
28
|
| 137 |
+
00:01:59,660 --> 00:02:03,420
|
| 138 |
As I said, I'm gonna try to, uh,
|
| 139 |
record this at a couple of different
|
| 140 |
|
| 141 |
29
|
| 142 |
+
00:02:03,420 --> 00:02:08,940
|
| 143 |
levels of technicality for folks who
|
| 144 |
are, uh, you know, in the normal, uh,
|
| 145 |
|
| 146 |
30
|
| 147 |
+
00:02:08,940 --> 00:02:13,340
|
| 148 |
world and not totally stuck down
|
| 149 |
the rabbit hole of AI, uh, which I
|
| 150 |
|
| 151 |
31
|
| 152 |
+
00:02:13,340 --> 00:02:17,340
|
| 153 |
have to say is a really wonderful,
|
| 154 |
uh, rabbit hole to be to be down.
|
| 155 |
|
| 156 |
32
|
| 157 |
+
00:02:17,460 --> 00:02:21,580
|
| 158 |
Um, it's a really interesting area.
|
| 159 |
And speech and voice tech is is
|
| 160 |
|
| 161 |
33
|
| 162 |
+
00:02:21,820 --> 00:02:24,860
|
| 163 |
the aspect of it that I find
|
| 164 |
actually most.
|
| 165 |
|
| 166 |
34
|
| 167 |
+
00:02:25,060 --> 00:02:28,220
|
| 168 |
I'm not sure I would say the most
|
| 169 |
interesting, because there's just
|
| 170 |
|
| 171 |
35
|
| 172 |
+
00:02:28,220 --> 00:02:32,580
|
| 173 |
so much that is fascinating in AI.
|
| 174 |
Uh, but the most that I find the
|
| 175 |
|
| 176 |
36
|
| 177 |
+
00:02:32,580 --> 00:02:36,100
|
| 178 |
most personally transformative
|
| 179 |
in terms of the impact that it's
|
| 180 |
|
| 181 |
37
|
| 182 |
+
00:02:36,100 --> 00:02:41,540
|
| 183 |
had on my daily work life and
|
| 184 |
productivity and how I sort of work.
|
| 185 |
|
| 186 |
38
|
| 187 |
+
00:02:41,820 --> 00:02:47,900
|
| 188 |
And I'm persevering hard with the
|
| 189 |
task of trying to guess a good
|
| 190 |
|
| 191 |
39
|
| 192 |
+
00:02:47,900 --> 00:02:51,580
|
| 193 |
solution working for Linux, which if
|
| 194 |
anyone actually does listen to this,
|
| 195 |
|
| 196 |
40
|
| 197 |
+
00:02:51,580 --> 00:02:54,980
|
| 198 |
not just for the training data
|
| 199 |
and for the actual content, uh,
|
| 200 |
|
| 201 |
41
|
| 202 |
+
00:02:55,020 --> 00:02:59,480
|
| 203 |
this is this is has sparked I had
|
| 204 |
besides the fine tune not working.
|
| 205 |
|
| 206 |
42
|
| 207 |
+
00:02:59,480 --> 00:03:05,440
|
| 208 |
Well, that was the failure.
|
| 209 |
Um, I used clod code because one
|
| 210 |
|
| 211 |
43
|
| 212 |
+
00:03:05,440 --> 00:03:10,040
|
| 213 |
thinks these days that there is
|
| 214 |
nothing short of solving,
|
| 215 |
|
| 216 |
44
|
| 217 |
+
00:03:10,920 --> 00:03:14,560
|
| 218 |
you know, the, uh,
|
| 219 |
the reason of life or something.
|
| 220 |
|
| 221 |
45
|
| 222 |
+
00:03:14,960 --> 00:03:19,440
|
| 223 |
Uh, that clod and agentic AI can't
|
| 224 |
do, uh, which is not really the case.
|
| 225 |
|
| 226 |
46
|
| 227 |
+
00:03:19,480 --> 00:03:23,480
|
| 228 |
Uh, it does seem that way sometimes,
|
| 229 |
but it fails a lot as well.
|
| 230 |
|
| 231 |
47
|
| 232 |
+
00:03:23,480 --> 00:03:26,840
|
| 233 |
And this is one of those, uh,
|
| 234 |
instances where last week I put
|
| 235 |
|
| 236 |
48
|
| 237 |
+
00:03:26,840 --> 00:03:31,280
|
| 238 |
together an hour of voice training
|
| 239 |
data, basically speaking just
|
| 240 |
|
| 241 |
49
|
| 242 |
+
00:03:31,280 --> 00:03:34,920
|
| 243 |
random things for three minutes.
|
| 244 |
And, um,
|
| 245 |
|
| 246 |
50
|
| 247 |
+
00:03:35,600 --> 00:03:38,400
|
| 248 |
it was actually kind of tedious
|
| 249 |
because the texts were really weird.
|
| 250 |
|
| 251 |
51
|
| 252 |
+
00:03:38,400 --> 00:03:42,000
|
| 253 |
Some of them were it was like it
|
| 254 |
was AI generated.
|
| 255 |
|
| 256 |
52
|
| 257 |
+
00:03:42,200 --> 00:03:44,800
|
| 258 |
Um, I tried before to read
|
| 259 |
Sherlock Holmes for an hour and
|
| 260 |
|
| 261 |
53
|
| 262 |
+
00:03:44,800 --> 00:03:46,880
|
| 263 |
I just couldn't.
|
| 264 |
I was so bored, uh,
|
| 265 |
|
| 266 |
54
|
| 267 |
+
00:03:46,920 --> 00:03:50,680
|
| 268 |
after ten minutes that I was like,
|
| 269 |
okay, now I'm just gonna have to
|
| 270 |
|
| 271 |
55
|
| 272 |
+
00:03:50,680 --> 00:03:56,350
|
| 273 |
find something else to read.
|
| 274 |
So I used a created with AI
|
| 275 |
|
| 276 |
56
|
| 277 |
+
00:03:56,390 --> 00:04:00,030
|
| 278 |
studio vibe coded.
|
| 279 |
A synthetic text generator.
|
| 280 |
|
| 281 |
57
|
| 282 |
+
00:04:00,270 --> 00:04:03,870
|
| 283 |
Um, which actually I thought was
|
| 284 |
probably a better way of doing it
|
| 285 |
|
| 286 |
58
|
| 287 |
+
00:04:03,870 --> 00:04:08,750
|
| 288 |
because it would give me more short
|
| 289 |
samples with more varied content.
|
| 290 |
|
| 291 |
59
|
| 292 |
+
00:04:08,750 --> 00:04:13,190
|
| 293 |
So I was like, okay, give me a voice
|
| 294 |
note, like I'm recording an email,
|
| 295 |
|
| 296 |
60
|
| 297 |
+
00:04:13,190 --> 00:04:17,990
|
| 298 |
give me a short story to read,
|
| 299 |
give me prose, um, to read.
|
| 300 |
|
| 301 |
61
|
| 302 |
+
00:04:17,990 --> 00:04:21,190
|
| 303 |
So I came up with all these
|
| 304 |
different things, and I added a
|
| 305 |
|
| 306 |
62
|
| 307 |
+
00:04:21,190 --> 00:04:24,630
|
| 308 |
little timer to it so I could
|
| 309 |
see how close I was to one hour.
|
| 310 |
|
| 311 |
63
|
| 312 |
+
00:04:24,870 --> 00:04:29,710
|
| 313 |
Um, and, uh, I spent like an hour one
|
| 314 |
afternoon or probably two hours by
|
| 315 |
|
| 316 |
64
|
| 317 |
+
00:04:29,710 --> 00:04:34,070
|
| 318 |
the time you, um, you do retakes
|
| 319 |
or whatever because you want to.
|
| 320 |
|
| 321 |
65
|
| 322 |
+
00:04:34,870 --> 00:04:39,070
|
| 323 |
It gave me a source of truth,
|
| 324 |
which I'm not sure if that's the
|
| 325 |
|
| 326 |
66
|
| 327 |
+
00:04:39,070 --> 00:04:43,430
|
| 328 |
scientific way to approach this topic
|
| 329 |
of gathering, uh, training data,
|
| 330 |
|
| 331 |
67
|
| 332 |
+
00:04:43,430 --> 00:04:47,950
|
| 333 |
but I thought it made sense.
|
| 334 |
Um, I have a lot of audio data
|
| 335 |
|
| 336 |
68
|
| 337 |
+
00:04:47,950 --> 00:04:51,950
|
| 338 |
from recording voice notes,
|
| 339 |
which I've also kind of used, um,
|
| 340 |
|
| 341 |
69
|
| 342 |
+
00:04:51,950 --> 00:04:55,660
|
| 343 |
been experimenting with using for
|
| 344 |
a different purpose, slightly
|
| 345 |
|
| 346 |
70
|
| 347 |
+
00:04:55,660 --> 00:05:00,700
|
| 348 |
different annotating task types.
|
| 349 |
It's more text classification
|
| 350 |
|
| 351 |
71
|
| 352 |
+
00:05:00,700 --> 00:05:03,620
|
| 353 |
experiment or uh, well,
|
| 354 |
it's more than that, actually.
|
| 355 |
|
| 356 |
72
|
| 357 |
+
00:05:03,620 --> 00:05:07,980
|
| 358 |
I'm working on a voice app,
|
| 359 |
so it's a prototype I guess is
|
| 360 |
|
| 361 |
73
|
| 362 |
+
00:05:07,980 --> 00:05:12,660
|
| 363 |
really more accurate.
|
| 364 |
Um, but you can do that and you
|
| 365 |
|
| 366 |
74
|
| 367 |
+
00:05:12,660 --> 00:05:14,100
|
| 368 |
can work backwards.
|
| 369 |
You're like,
|
| 370 |
|
| 371 |
75
|
| 372 |
+
00:05:14,140 --> 00:05:18,500
|
| 373 |
you listen back to a voice note
|
| 374 |
and you painfully go through one
|
| 375 |
|
| 376 |
76
|
| 377 |
+
00:05:18,500 --> 00:05:21,860
|
| 378 |
of those transcribing, you know,
|
| 379 |
where you start and stop and scrub
|
| 380 |
|
| 381 |
77
|
| 382 |
+
00:05:21,860 --> 00:05:23,980
|
| 383 |
around it and you fix the errors.
|
| 384 |
But it's really,
|
| 385 |
|
| 386 |
78
|
| 387 |
+
00:05:23,980 --> 00:05:27,100
|
| 388 |
really boring to do that.
|
| 389 |
So I thought it would be less
|
| 390 |
|
| 391 |
79
|
| 392 |
+
00:05:27,100 --> 00:05:31,740
|
| 393 |
tedious in the long term if I just
|
| 394 |
recorded The Source of truth.
|
| 395 |
|
| 396 |
80
|
| 397 |
+
00:05:32,060 --> 00:05:34,180
|
| 398 |
So it gave me these three minute
|
| 399 |
snippets.
|
| 400 |
|
| 401 |
81
|
| 402 |
+
00:05:34,180 --> 00:05:38,660
|
| 403 |
I recorded them and saved an MP3
|
| 404 |
and a txt in the same folder,
|
| 405 |
|
| 406 |
82
|
| 407 |
+
00:05:38,660 --> 00:05:43,700
|
| 408 |
and I created an hour of that data.
|
| 409 |
Uh, so I was very hopeful, quietly,
|
| 410 |
|
| 411 |
83
|
| 412 |
+
00:05:43,740 --> 00:05:46,260
|
| 413 |
you know, a little bit hopeful
|
| 414 |
that I would be able that I could
|
| 415 |
|
| 416 |
84
|
| 417 |
+
00:05:46,260 --> 00:05:49,580
|
| 418 |
actually fine tune, whisper.
|
| 419 |
Um, I want to fine tune whisper
|
| 420 |
|
| 421 |
85
|
| 422 |
+
00:05:49,580 --> 00:05:54,720
|
| 423 |
because when I got into voice tech
|
| 424 |
last November, my wife was in
|
| 425 |
|
| 426 |
86
|
| 427 |
+
00:05:54,720 --> 00:05:59,480
|
| 428 |
the US and I was alone at home.
|
| 429 |
And you know, when crazy people
|
| 430 |
|
| 431 |
87
|
| 432 |
+
00:05:59,480 --> 00:06:03,640
|
| 433 |
like me do really wild things like
|
| 434 |
use voice to tech, uh, technology.
|
| 435 |
|
| 436 |
88
|
| 437 |
+
00:06:03,640 --> 00:06:06,400
|
| 438 |
That was basically, um,
|
| 439 |
when I started doing it,
|
| 440 |
|
| 441 |
89
|
| 442 |
+
00:06:06,400 --> 00:06:10,160
|
| 443 |
I didn't feel like a crazy person
|
| 444 |
speaking to myself, and my
|
| 445 |
|
| 446 |
90
|
| 447 |
+
00:06:10,160 --> 00:06:16,000
|
| 448 |
expectations weren't that high.
|
| 449 |
Uh, I used speech tech now and again.
|
| 450 |
|
| 451 |
91
|
| 452 |
+
00:06:16,080 --> 00:06:18,360
|
| 453 |
Um, tried it out.
|
| 454 |
I was like, it'd be really cool
|
| 455 |
|
| 456 |
92
|
| 457 |
+
00:06:18,360 --> 00:06:20,400
|
| 458 |
if you could just, like,
|
| 459 |
speak into your computer.
|
| 460 |
|
| 461 |
93
|
| 462 |
+
00:06:20,760 --> 00:06:24,600
|
| 463 |
And whatever I tried out that
|
| 464 |
had Linux support was just.
|
| 465 |
|
| 466 |
94
|
| 467 |
+
00:06:25,320 --> 00:06:28,520
|
| 468 |
It was not good, basically.
|
| 469 |
Um, and this blew me away from
|
| 470 |
|
| 471 |
95
|
| 472 |
+
00:06:28,520 --> 00:06:31,920
|
| 473 |
the first go.
|
| 474 |
I mean, it wasn't 100% accurate
|
| 475 |
|
| 476 |
96
|
| 477 |
+
00:06:31,960 --> 00:06:35,040
|
| 478 |
out of the box and it took work,
|
| 479 |
but it was good enough that there was
|
| 480 |
|
| 481 |
97
|
| 482 |
+
00:06:35,040 --> 00:06:39,600
|
| 483 |
a solid foundation and it kind of
|
| 484 |
passed that, uh, pivot point that
|
| 485 |
|
| 486 |
98
|
| 487 |
+
00:06:39,600 --> 00:06:42,760
|
| 488 |
it's actually worth doing this.
|
| 489 |
You know, there's a point where
|
| 490 |
|
| 491 |
99
|
| 492 |
+
00:06:42,760 --> 00:06:46,800
|
| 493 |
it's so like the transcript is you
|
| 494 |
don't have to get 100% accuracy
|
| 495 |
|
| 496 |
100
|
| 497 |
+
00:06:46,800 --> 00:06:50,510
|
| 498 |
for it to be worth your time for
|
| 499 |
speech to text to be a worthwhile
|
| 500 |
|
| 501 |
101
|
| 502 |
+
00:06:50,510 --> 00:06:52,950
|
| 503 |
addition to your productivity.
|
| 504 |
But you do need to get above.
|
| 505 |
|
| 506 |
102
|
| 507 |
+
00:06:52,990 --> 00:06:57,630
|
| 508 |
Let's say, I don't know, 85%.
|
| 509 |
If it's 60% or 50%,
|
| 510 |
|
| 511 |
103
|
| 512 |
+
00:06:57,630 --> 00:07:00,670
|
| 513 |
you inevitably say, screw it.
|
| 514 |
I'll just type it because you end up
|
| 515 |
|
| 516 |
104
|
| 517 |
+
00:07:00,670 --> 00:07:04,950
|
| 518 |
missing errors in the transcript
|
| 519 |
and it becomes actually worse.
|
| 520 |
|
| 521 |
105
|
| 522 |
+
00:07:04,950 --> 00:07:06,710
|
| 523 |
You end up in a worse position
|
| 524 |
than you started with.
|
| 525 |
|
| 526 |
106
|
| 527 |
+
00:07:06,710 --> 00:07:10,910
|
| 528 |
And that's been my experience.
|
| 529 |
So, um, I was like, oh,
|
| 530 |
|
| 531 |
107
|
| 532 |
+
00:07:10,950 --> 00:07:13,430
|
| 533 |
this is actually really, really good.
|
| 534 |
Now how did that happen?
|
| 535 |
|
| 536 |
108
|
| 537 |
+
00:07:13,430 --> 00:07:18,790
|
| 538 |
And the answer is ASR whisper
|
| 539 |
being open sourced and the
|
| 540 |
|
| 541 |
109
|
| 542 |
+
00:07:18,790 --> 00:07:21,790
|
| 543 |
transformer architecture,
|
| 544 |
if you want to go back to the,
|
| 545 |
|
| 546 |
110
|
| 547 |
+
00:07:22,390 --> 00:07:26,630
|
| 548 |
um, to the underpinnings, which
|
| 549 |
really blows my mind and it's on my
|
| 550 |
|
| 551 |
111
|
| 552 |
+
00:07:26,630 --> 00:07:32,310
|
| 553 |
list to read through that paper.
|
| 554 |
Um, all you need is attention as
|
| 555 |
|
| 556 |
112
|
| 557 |
+
00:07:33,350 --> 00:07:38,350
|
| 558 |
attentively as can be done with my
|
| 559 |
limited brain because it's super,
|
| 560 |
|
| 561 |
113
|
| 562 |
+
00:07:38,350 --> 00:07:42,190
|
| 563 |
super high level stuff.
|
| 564 |
Um, super advanced stuff.
|
| 565 |
|
| 566 |
114
|
| 567 |
+
00:07:42,230 --> 00:07:47,950
|
| 568 |
I mean, uh, but that I think of all
|
| 569 |
the things that are fascinating
|
| 570 |
|
| 571 |
115
|
| 572 |
+
00:07:48,060 --> 00:07:52,700
|
| 573 |
about the sudden rise in AI and
|
| 574 |
the dramatic capabilities.
|
| 575 |
|
| 576 |
116
|
| 577 |
+
00:07:53,300 --> 00:07:55,580
|
| 578 |
I find it fascinating that few
|
| 579 |
people are like, hang on,
|
| 580 |
|
| 581 |
117
|
| 582 |
+
00:07:55,740 --> 00:07:59,620
|
| 583 |
you've got this thing that can speak
|
| 584 |
to you like a chatbot, an LLM,
|
| 585 |
|
| 586 |
118
|
| 587 |
+
00:08:00,300 --> 00:08:05,460
|
| 588 |
and then you've got image generation.
|
| 589 |
Okay, so firstly, those two things on
|
| 590 |
|
| 591 |
119
|
| 592 |
+
00:08:05,460 --> 00:08:10,740
|
| 593 |
the surface have nothing in common.
|
| 594 |
Um, so like how are they how did that
|
| 595 |
|
| 596 |
120
|
| 597 |
+
00:08:10,740 --> 00:08:12,980
|
| 598 |
just happen all at the same time.
|
| 599 |
And then when you extend that
|
| 600 |
|
| 601 |
121
|
| 602 |
+
00:08:12,980 --> 00:08:16,060
|
| 603 |
further, um, you're like sooner,
|
| 604 |
right?
|
| 605 |
|
| 606 |
122
|
| 607 |
+
00:08:16,060 --> 00:08:21,580
|
| 608 |
You can sing a song and AI will like,
|
| 609 |
come up with an instrumental and then
|
| 610 |
|
| 611 |
123
|
| 612 |
+
00:08:21,580 --> 00:08:23,740
|
| 613 |
you've got whisper and you're like,
|
| 614 |
wait a second,
|
| 615 |
|
| 616 |
124
|
| 617 |
+
00:08:23,940 --> 00:08:27,980
|
| 618 |
how did all this stuff, like,
|
| 619 |
if it's all AI, what's like there
|
| 620 |
|
| 621 |
125
|
| 622 |
+
00:08:27,980 --> 00:08:30,580
|
| 623 |
has to be some commonality.
|
| 624 |
Otherwise these are four.
|
| 625 |
|
| 626 |
126
|
| 627 |
+
00:08:30,660 --> 00:08:34,660
|
| 628 |
These are totally different
|
| 629 |
technologies on the surface of it.
|
| 630 |
|
| 631 |
127
|
| 632 |
+
00:08:34,660 --> 00:08:40,100
|
| 633 |
And, uh, the transformer architecture
|
| 634 |
is, as far as I know, the answer.
|
| 635 |
|
| 636 |
128
|
| 637 |
+
00:08:40,100 --> 00:08:43,740
|
| 638 |
And I can't even say can't even
|
| 639 |
pretend that I really understand
|
| 640 |
|
| 641 |
129
|
| 642 |
+
00:08:44,020 --> 00:08:47,170
|
| 643 |
what the transformer
|
| 644 |
architecture means in depth,
|
| 645 |
|
| 646 |
130
|
| 647 |
+
00:08:47,170 --> 00:08:51,690
|
| 648 |
but I have scanned it and as I said,
|
| 649 |
I want to print it and really kind
|
| 650 |
|
| 651 |
131
|
| 652 |
+
00:08:51,690 --> 00:08:56,650
|
| 653 |
of think over it at some point,
|
| 654 |
and I'll probably feel bad about
|
| 655 |
|
| 656 |
132
|
| 657 |
+
00:08:56,650 --> 00:08:58,970
|
| 658 |
myself, I think,
|
| 659 |
because weren't those guys in their
|
| 660 |
|
| 661 |
133
|
| 662 |
+
00:08:59,010 --> 00:09:03,890
|
| 663 |
in their 20s like, that's crazy.
|
| 664 |
I think I asked ChatGPT once who
|
| 665 |
|
| 666 |
134
|
| 667 |
+
00:09:03,930 --> 00:09:08,250
|
| 668 |
were the who wrote that paper
|
| 669 |
and how old were they when it
|
| 670 |
|
| 671 |
135
|
| 672 |
+
00:09:08,250 --> 00:09:11,170
|
| 673 |
was published in arXiv?
|
| 674 |
And I was expecting like,
|
| 675 |
|
| 676 |
136
|
| 677 |
+
00:09:11,410 --> 00:09:13,330
|
| 678 |
I don't know,
|
| 679 |
what do you what do you imagine?
|
| 680 |
|
| 681 |
137
|
| 682 |
+
00:09:13,330 --> 00:09:14,930
|
| 683 |
I personally imagine kind of like,
|
| 684 |
you know,
|
| 685 |
|
| 686 |
138
|
| 687 |
+
00:09:14,970 --> 00:09:19,090
|
| 688 |
you have these breakthroughs during
|
| 689 |
Covid and things like that where
|
| 690 |
|
| 691 |
139
|
| 692 |
+
00:09:19,130 --> 00:09:22,090
|
| 693 |
like these kind of really obscure
|
| 694 |
scientists who are like in their
|
| 695 |
|
| 696 |
140
|
| 697 |
+
00:09:22,090 --> 00:09:27,130
|
| 698 |
50s and they've just kind of been
|
| 699 |
laboring in labs and, uh, wearily
|
| 700 |
|
| 701 |
141
|
| 702 |
+
00:09:27,130 --> 00:09:30,530
|
| 703 |
and writing in publishing in kind
|
| 704 |
of obscure academic publications.
|
| 705 |
|
| 706 |
142
|
| 707 |
+
00:09:30,730 --> 00:09:33,930
|
| 708 |
And they finally, like,
|
| 709 |
hit a big or win a Nobel Prize and
|
| 710 |
|
| 711 |
143
|
| 712 |
+
00:09:33,930 --> 00:09:37,810
|
| 713 |
then their household household names.
|
| 714 |
Uh, so that was kind of what I
|
| 715 |
|
| 716 |
144
|
| 717 |
+
00:09:37,810 --> 00:09:39,650
|
| 718 |
had in mind.
|
| 719 |
That was the mental image I'd
|
| 720 |
|
| 721 |
145
|
| 722 |
+
00:09:39,650 --> 00:09:43,890
|
| 723 |
formed of the birth of arXiv.
|
| 724 |
Like, I wasn't expecting 20
|
| 725 |
|
| 726 |
146
|
| 727 |
+
00:09:43,930 --> 00:09:47,310
|
| 728 |
somethings in San Francisco,
|
| 729 |
though I thought that was both very,
|
| 730 |
|
| 731 |
147
|
| 732 |
+
00:09:47,310 --> 00:09:49,870
|
| 733 |
very funny, very cool,
|
| 734 |
and actually kind of inspiring.
|
| 735 |
|
| 736 |
148
|
| 737 |
+
00:09:50,390 --> 00:09:55,510
|
| 738 |
It's nice to think that people who,
|
| 739 |
you know, just you might put them
|
| 740 |
|
| 741 |
149
|
| 742 |
+
00:09:55,510 --> 00:10:00,910
|
| 743 |
in the kind of milieu or bubble or
|
| 744 |
world that you are in or credibly in,
|
| 745 |
|
| 746 |
150
|
| 747 |
+
00:10:00,950 --> 00:10:03,590
|
| 748 |
through, you know,
|
| 749 |
a series of connections that are
|
| 750 |
|
| 751 |
151
|
| 752 |
+
00:10:03,590 --> 00:10:07,630
|
| 753 |
coming up with such literally
|
| 754 |
world changing, um, innovations.
|
| 755 |
|
| 756 |
152
|
| 757 |
+
00:10:07,670 --> 00:10:11,430
|
| 758 |
Uh, so that was, I thought,
|
| 759 |
anyway, that, that that was cool.
|
| 760 |
|
| 761 |
153
|
| 762 |
+
00:10:12,070 --> 00:10:13,950
|
| 763 |
Okay. Voice training data.
|
| 764 |
How are we doing?
|
| 765 |
|
| 766 |
154
|
| 767 |
+
00:10:13,950 --> 00:10:17,990
|
| 768 |
We're about ten minutes, and I'm
|
| 769 |
still talking about voice technology.
|
| 770 |
|
| 771 |
155
|
| 772 |
+
00:10:18,190 --> 00:10:22,350
|
| 773 |
Um, so whisper was brilliant,
|
| 774 |
and I was so excited that I was.
|
| 775 |
|
| 776 |
156
|
| 777 |
+
00:10:22,350 --> 00:10:25,630
|
| 778 |
My first instinct was to, like,
|
| 779 |
get like, oh, my gosh,
|
| 780 |
|
| 781 |
157
|
| 782 |
+
00:10:25,630 --> 00:10:27,710
|
| 783 |
I have to get, like,
|
| 784 |
a really good microphone for this.
|
| 785 |
|
| 786 |
158
|
| 787 |
+
00:10:27,950 --> 00:10:31,630
|
| 788 |
So, um, I didn't go on a
|
| 789 |
spending spree because I said,
|
| 790 |
|
| 791 |
159
|
| 792 |
+
00:10:31,670 --> 00:10:34,470
|
| 793 |
I'm gonna have to just wait a
|
| 794 |
month and see if I still use this.
|
| 795 |
|
| 796 |
160
|
| 797 |
+
00:10:34,910 --> 00:10:39,990
|
| 798 |
And it just kind of became it's
|
| 799 |
become really part of my daily
|
| 800 |
|
| 801 |
161
|
| 802 |
+
00:10:39,990 --> 00:10:42,990
|
| 803 |
routine.
|
| 804 |
Like, if I'm writing an email,
|
| 805 |
|
| 806 |
162
|
| 807 |
+
00:10:42,990 --> 00:10:47,020
|
| 808 |
I'll record a voice note.
|
| 809 |
And then I've developed and it's
|
| 810 |
|
| 811 |
163
|
| 812 |
+
00:10:47,020 --> 00:10:49,900
|
| 813 |
nice to see that everyone is
|
| 814 |
like developing the same things
|
| 815 |
|
| 816 |
164
|
| 817 |
+
00:10:49,900 --> 00:10:51,900
|
| 818 |
in parallel.
|
| 819 |
Like, that's kind of a weird thing
|
| 820 |
|
| 821 |
165
|
| 822 |
+
00:10:51,940 --> 00:10:57,340
|
| 823 |
to say, but when I look, I kind of
|
| 824 |
came when I started working on this,
|
| 825 |
|
| 826 |
166
|
| 827 |
+
00:10:57,380 --> 00:11:00,700
|
| 828 |
these prototypes on GitHub,
|
| 829 |
which is where I just kind of
|
| 830 |
|
| 831 |
167
|
| 832 |
+
00:11:00,740 --> 00:11:04,740
|
| 833 |
share very freely and loosely,
|
| 834 |
uh, ideas and, you know,
|
| 835 |
|
| 836 |
168
|
| 837 |
+
00:11:04,780 --> 00:11:10,020
|
| 838 |
first iterations on, on concepts,
|
| 839 |
um, and for want of a better word,
|
| 840 |
|
| 841 |
169
|
| 842 |
+
00:11:10,020 --> 00:11:13,900
|
| 843 |
I called it like, uh,
|
| 844 |
lm post-processing or cleanup or
|
| 845 |
|
| 846 |
170
|
| 847 |
+
00:11:14,140 --> 00:11:18,100
|
| 848 |
basically a system prompt that after
|
| 849 |
you get back the raw text from
|
| 850 |
|
| 851 |
171
|
| 852 |
+
00:11:18,420 --> 00:11:24,100
|
| 853 |
whisper, you run it through a model
|
| 854 |
and say, okay, this is crappy text,
|
| 855 |
|
| 856 |
172
|
| 857 |
+
00:11:24,140 --> 00:11:27,140
|
| 858 |
like add sentence structure and,
|
| 859 |
you know, fix it up.
|
| 860 |
|
| 861 |
173
|
| 862 |
+
00:11:27,580 --> 00:11:32,660
|
| 863 |
And, um, now when I'm exploring the
|
| 864 |
different tools that are out there
|
| 865 |
|
| 866 |
174
|
| 867 |
+
00:11:32,700 --> 00:11:36,580
|
| 868 |
that people have built, I see, uh,
|
| 869 |
quite a number of projects have
|
| 870 |
|
| 871 |
175
|
| 872 |
+
00:11:37,180 --> 00:11:41,700
|
| 873 |
basically done the same thing,
|
| 874 |
um, less that be misconstrued.
|
| 875 |
|
| 876 |
176
|
| 877 |
+
00:11:41,700 --> 00:11:44,370
|
| 878 |
I'm not saying for a millisecond
|
| 879 |
that I inspired them.
|
| 880 |
|
| 881 |
177
|
| 882 |
+
00:11:44,370 --> 00:11:48,890
|
| 883 |
I'm sure this has been a thing that's
|
| 884 |
been integrated into tools for a
|
| 885 |
|
| 886 |
178
|
| 887 |
+
00:11:48,930 --> 00:11:52,290
|
| 888 |
while, but it's it's the kind of
|
| 889 |
thing that when you start using these
|
| 890 |
|
| 891 |
179
|
| 892 |
+
00:11:52,290 --> 00:11:56,730
|
| 893 |
tools every day, the need for it
|
| 894 |
is almost instantly apparent, uh,
|
| 895 |
|
| 896 |
180
|
| 897 |
+
00:11:56,730 --> 00:12:00,770
|
| 898 |
because text that doesn't have any
|
| 899 |
punctuation or paragraph spacing
|
| 900 |
|
| 901 |
181
|
| 902 |
+
00:12:00,810 --> 00:12:04,250
|
| 903 |
takes a long time to, you know,
|
| 904 |
it takes so long to get it into
|
| 905 |
|
| 906 |
182
|
| 907 |
+
00:12:04,250 --> 00:12:09,370
|
| 908 |
a presentable email that again,
|
| 909 |
it's it's it moves speech tech
|
| 910 |
|
| 911 |
183
|
| 912 |
+
00:12:09,410 --> 00:12:12,930
|
| 913 |
into that before that inflection
|
| 914 |
point where you're like, no,
|
| 915 |
|
| 916 |
184
|
| 917 |
+
00:12:12,930 --> 00:12:16,250
|
| 918 |
it's just not worth it.
|
| 919 |
It's like it'll just be quicker
|
| 920 |
|
| 921 |
185
|
| 922 |
+
00:12:16,250 --> 00:12:18,850
|
| 923 |
to type this.
|
| 924 |
So it's a big it's a little touch.
|
| 925 |
|
| 926 |
186
|
| 927 |
+
00:12:18,850 --> 00:12:24,090
|
| 928 |
That actually is a big deal.
|
| 929 |
Uh, so I was on whisper and I've
|
| 930 |
|
| 931 |
187
|
| 932 |
+
00:12:24,090 --> 00:12:28,170
|
| 933 |
been using whisper and I kind of
|
| 934 |
early on found a couple of tools.
|
| 935 |
|
| 936 |
188
|
| 937 |
+
00:12:28,210 --> 00:12:30,930
|
| 938 |
I couldn't find what I was
|
| 939 |
looking for on Linux, which is,
|
| 940 |
|
| 941 |
189
|
| 942 |
+
00:12:31,370 --> 00:12:35,770
|
| 943 |
um, basically just something
|
| 944 |
that'll run in the background.
|
| 945 |
|
| 946 |
190
|
| 947 |
+
00:12:35,810 --> 00:12:40,130
|
| 948 |
You'll give it an API key and it
|
| 949 |
will just transcribe. Um.
|
| 950 |
|
| 951 |
191
|
| 952 |
+
00:12:41,280 --> 00:12:44,000
|
| 953 |
with, like, a little key to
|
| 954 |
start and stop the dictation.
|
| 955 |
|
| 956 |
192
|
| 957 |
+
00:12:44,600 --> 00:12:49,040
|
| 958 |
Uh, and the issues were I discovered
|
| 959 |
that, like most people involved in
|
| 960 |
|
| 961 |
193
|
| 962 |
+
00:12:49,040 --> 00:12:53,920
|
| 963 |
creating these projects were very
|
| 964 |
much focused on local models running
|
| 965 |
|
| 966 |
194
|
| 967 |
+
00:12:53,920 --> 00:12:57,400
|
| 968 |
whisper locally, because you can.
|
| 969 |
And I tried that a bunch of
|
| 970 |
|
| 971 |
195
|
| 972 |
+
00:12:57,400 --> 00:13:00,840
|
| 973 |
times and just never got results
|
| 974 |
that were as good as the cloud.
|
| 975 |
|
| 976 |
196
|
| 977 |
+
00:13:01,160 --> 00:13:04,640
|
| 978 |
And when I began looking at the
|
| 979 |
cost of the speech to text APIs
|
| 980 |
|
| 981 |
197
|
| 982 |
+
00:13:04,640 --> 00:13:08,520
|
| 983 |
and what I was spending,
|
| 984 |
I just thought there's it's actually,
|
| 985 |
|
| 986 |
198
|
| 987 |
+
00:13:08,720 --> 00:13:13,200
|
| 988 |
in my opinion, just one of the better
|
| 989 |
deals in API spending and in cloud.
|
| 990 |
|
| 991 |
199
|
| 992 |
+
00:13:13,240 --> 00:13:17,280
|
| 993 |
Like it's just not that expensive
|
| 994 |
for very, very good models that are
|
| 995 |
|
| 996 |
200
|
| 997 |
+
00:13:17,400 --> 00:13:20,840
|
| 998 |
much more, you know, you're going
|
| 999 |
to be able to run the full model,
|
| 1000 |
|
| 1001 |
201
|
| 1002 |
+
00:13:21,360 --> 00:13:25,960
|
| 1003 |
the latest model versus whatever
|
| 1004 |
you can run on your average GPU.
|
| 1005 |
|
| 1006 |
202
|
| 1007 |
+
00:13:26,000 --> 00:13:29,760
|
| 1008 |
Unless you want to buy a crazy GPU.
|
| 1009 |
It doesn't really make sense to me.
|
| 1010 |
|
| 1011 |
203
|
| 1012 |
+
00:13:29,760 --> 00:13:33,480
|
| 1013 |
Now, privacy is another concern.
|
| 1014 |
Um, that I know is kind of like a
|
| 1015 |
|
| 1016 |
204
|
| 1017 |
+
00:13:33,520 --> 00:13:36,920
|
| 1018 |
very much a separate thing that
|
| 1019 |
people just don't want their voice,
|
| 1020 |
|
| 1021 |
205
|
| 1022 |
+
00:13:36,920 --> 00:13:39,790
|
| 1023 |
data, and their voice leaving
|
| 1024 |
their local environment,
|
| 1025 |
|
| 1026 |
206
|
| 1027 |
+
00:13:40,110 --> 00:13:43,830
|
| 1028 |
maybe for regulatory reasons as well.
|
| 1029 |
Um, but I'm not in that.
|
| 1030 |
|
| 1031 |
207
|
| 1032 |
+
00:13:43,910 --> 00:13:47,910
|
| 1033 |
Um, I'm neither really care about
|
| 1034 |
people listening to my, uh,
|
| 1035 |
|
| 1036 |
208
|
| 1037 |
+
00:13:47,950 --> 00:13:51,190
|
| 1038 |
grocery list consisting of, uh,
|
| 1039 |
reminding myself that I need to
|
| 1040 |
|
| 1041 |
209
|
| 1042 |
+
00:13:51,230 --> 00:13:54,790
|
| 1043 |
buy more beer, Cheetos and hummus,
|
| 1044 |
which is kind of the three,
|
| 1045 |
|
| 1046 |
210
|
| 1047 |
+
00:13:54,990 --> 00:13:59,310
|
| 1048 |
three staples of my diet.
|
| 1049 |
Um, during periods of poor nutrition.
|
| 1050 |
|
| 1051 |
211
|
| 1052 |
+
00:13:59,590 --> 00:14:03,310
|
| 1053 |
Uh, but the kind of stuff that I
|
| 1054 |
transcribe, it's just not it's not a,
|
| 1055 |
|
| 1056 |
212
|
| 1057 |
+
00:14:03,990 --> 00:14:09,350
|
| 1058 |
it's not a privacy thing and that
|
| 1059 |
sort of sensitive about and, uh,
|
| 1060 |
|
| 1061 |
213
|
| 1062 |
+
00:14:09,350 --> 00:14:13,070
|
| 1063 |
I don't do anything so,
|
| 1064 |
you know, sensitive or secure,
|
| 1065 |
|
| 1066 |
214
|
| 1067 |
+
00:14:13,070 --> 00:14:16,590
|
| 1068 |
that requires air gapping.
|
| 1069 |
So, um, I looked at the pricing and
|
| 1070 |
|
| 1071 |
215
|
| 1072 |
+
00:14:16,590 --> 00:14:20,270
|
| 1073 |
especially the kind of older models,
|
| 1074 |
mini, um, some of them are very,
|
| 1075 |
|
| 1076 |
216
|
| 1077 |
+
00:14:20,270 --> 00:14:23,110
|
| 1078 |
very affordable.
|
| 1079 |
And I did a back of the I did a
|
| 1080 |
|
| 1081 |
217
|
| 1082 |
+
00:14:23,110 --> 00:14:27,150
|
| 1083 |
calculation once with ChatGPT
|
| 1084 |
and I was like, okay, this is a,
|
| 1085 |
|
| 1086 |
218
|
| 1087 |
+
00:14:27,150 --> 00:14:31,070
|
| 1088 |
this is the API price for I can't
|
| 1089 |
remember whatever the model was.
|
| 1090 |
|
| 1091 |
219
|
| 1092 |
+
00:14:31,550 --> 00:14:33,910
|
| 1093 |
Uh, let's say I just go at it
|
| 1094 |
like nonstop,
|
| 1095 |
|
| 1096 |
220
|
| 1097 |
+
00:14:34,030 --> 00:14:37,410
|
| 1098 |
which it rarely happens. Probably.
|
| 1099 |
I would say on average,
|
| 1100 |
|
| 1101 |
221
|
| 1102 |
+
00:14:37,410 --> 00:14:41,890
|
| 1103 |
I might dictate 30 to 60 minutes per
|
| 1104 |
day if I was probably summing up
|
| 1105 |
|
| 1106 |
222
|
| 1107 |
+
00:14:41,890 --> 00:14:48,490
|
| 1108 |
the emails, documents, outlines,
|
| 1109 |
um, which is a lot, but it's it's
|
| 1110 |
|
| 1111 |
223
|
| 1112 |
+
00:14:48,490 --> 00:14:50,730
|
| 1113 |
still a fairly modest amount.
|
| 1114 |
And I was like, well,
|
| 1115 |
|
| 1116 |
224
|
| 1117 |
+
00:14:50,770 --> 00:14:53,930
|
| 1118 |
some days I do go on like 1 or 2
|
| 1119 |
days where I've been.
|
| 1120 |
|
| 1121 |
225
|
| 1122 |
+
00:14:54,450 --> 00:14:58,450
|
| 1123 |
Usually when I'm like kind of out of
|
| 1124 |
the house and just have something
|
| 1125 |
|
| 1126 |
226
|
| 1127 |
+
00:14:59,090 --> 00:15:02,250
|
| 1128 |
like, I have nothing else to do.
|
| 1129 |
Like if I'm at a hospital with a
|
| 1130 |
|
| 1131 |
227
|
| 1132 |
+
00:15:02,250 --> 00:15:06,970
|
| 1133 |
newborn, uh, and you're waiting
|
| 1134 |
for like eight hours and hours
|
| 1135 |
|
| 1136 |
228
|
| 1137 |
+
00:15:06,970 --> 00:15:10,210
|
| 1138 |
for an appointment, and I would
|
| 1139 |
probably have listened to podcasts
|
| 1140 |
|
| 1141 |
229
|
| 1142 |
+
00:15:10,490 --> 00:15:14,010
|
| 1143 |
before becoming a speech fanatic.
|
| 1144 |
And I'm like, oh, wait,
|
| 1145 |
|
| 1146 |
230
|
| 1147 |
+
00:15:14,050 --> 00:15:16,370
|
| 1148 |
let me just get down.
|
| 1149 |
Let me just get these ideas out
|
| 1150 |
|
| 1151 |
231
|
| 1152 |
+
00:15:16,410 --> 00:15:19,610
|
| 1153 |
of my head.
|
| 1154 |
And that's when I'll go on my
|
| 1155 |
|
| 1156 |
232
|
| 1157 |
+
00:15:19,650 --> 00:15:21,530
|
| 1158 |
speech binges.
|
| 1159 |
But those are like once every
|
| 1160 |
|
| 1161 |
233
|
| 1162 |
+
00:15:21,530 --> 00:15:24,970
|
| 1163 |
few months, like not frequently.
|
| 1164 |
But I said, okay, let's just say
|
| 1165 |
|
| 1166 |
234
|
| 1167 |
+
00:15:24,970 --> 00:15:30,650
|
| 1168 |
if I'm gonna price out.
|
| 1169 |
Cloud asked if I was like, dedicated
|
| 1170 |
|
| 1171 |
235
|
| 1172 |
+
00:15:30,650 --> 00:15:36,880
|
| 1173 |
every second of every waking hour to
|
| 1174 |
transcribing for some odd reason. Um.
|
| 1175 |
|
| 1176 |
236
|
| 1177 |
+
00:15:37,200 --> 00:15:39,680
|
| 1178 |
I mean, it'd have to, like,
|
| 1179 |
eat and use the toilet and,
|
| 1180 |
|
| 1181 |
237
|
| 1182 |
+
00:15:39,720 --> 00:15:42,520
|
| 1183 |
like, you know, there's only so
|
| 1184 |
many hours I'm awake for.
|
| 1185 |
|
| 1186 |
238
|
| 1187 |
+
00:15:42,520 --> 00:15:44,680
|
| 1188 |
So, like,
|
| 1189 |
let's just say a maximum of, like,
|
| 1190 |
|
| 1191 |
239
|
| 1192 |
+
00:15:44,720 --> 00:15:48,680
|
| 1193 |
40 hours, 45 minutes in the hour.
|
| 1194 |
Then I said, all right,
|
| 1195 |
|
| 1196 |
240
|
| 1197 |
+
00:15:48,680 --> 00:15:52,600
|
| 1198 |
let's just say 50. Who knows?
|
| 1199 |
You're dictating on the toilet.
|
| 1200 |
|
| 1201 |
241
|
| 1202 |
+
00:15:52,640 --> 00:15:53,880
|
| 1203 |
We do it.
|
| 1204 |
Uh,
|
| 1205 |
|
| 1206 |
242
|
| 1207 |
+
00:15:53,880 --> 00:15:58,720
|
| 1208 |
so it could be you could just do 60.
|
| 1209 |
But whatever I did, and every day,
|
| 1210 |
|
| 1211 |
243
|
| 1212 |
+
00:15:58,760 --> 00:16:02,440
|
| 1213 |
like, you're going flat out seven
|
| 1214 |
days a week dictating non-stop.
|
| 1215 |
|
| 1216 |
244
|
| 1217 |
+
00:16:02,480 --> 00:16:06,440
|
| 1218 |
I was like, what's my monthly API
|
| 1219 |
bill going to be at this price?
|
| 1220 |
|
| 1221 |
245
|
| 1222 |
+
00:16:06,720 --> 00:16:09,120
|
| 1223 |
And it came out to like 70 or 80
|
| 1224 |
bucks.
|
| 1225 |
|
| 1226 |
246
|
| 1227 |
+
00:16:09,120 --> 00:16:14,080
|
| 1228 |
And I was like, well, that would be
|
| 1229 |
an extraordinary amount of dictation.
|
| 1230 |
|
| 1231 |
247
|
| 1232 |
+
00:16:14,080 --> 00:16:17,840
|
| 1233 |
And I would hope that there was
|
| 1234 |
some compelling reason,
|
| 1235 |
|
| 1236 |
248
|
| 1237 |
+
00:16:18,040 --> 00:16:22,200
|
| 1238 |
more worth more than $70,
|
| 1239 |
that I embarked upon that project.
|
| 1240 |
|
| 1241 |
249
|
| 1242 |
+
00:16:22,400 --> 00:16:25,200
|
| 1243 |
Uh, so given that that's kind of the
|
| 1244 |
max point for me, I said, that's
|
| 1245 |
|
| 1246 |
250
|
| 1247 |
+
00:16:25,240 --> 00:16:29,000
|
| 1248 |
actually very, very affordable.
|
| 1249 |
Um, now you're gonna if you want
|
| 1250 |
|
| 1251 |
251
|
| 1252 |
+
00:16:29,040 --> 00:16:34,080
|
| 1253 |
to spec out the costs and you want
|
| 1254 |
to do the post-processing that I
|
| 1255 |
|
| 1256 |
252
|
| 1257 |
+
00:16:34,150 --> 00:16:37,110
|
| 1258 |
really do feel is valuable.
|
| 1259 |
Um, that's going to cost some more as
|
| 1260 |
|
| 1261 |
253
|
| 1262 |
+
00:16:37,110 --> 00:16:43,110
|
| 1263 |
well, unless you're using Gemini,
|
| 1264 |
which, uh, needless to say, is a
|
| 1265 |
|
| 1266 |
254
|
| 1267 |
+
00:16:43,110 --> 00:16:46,950
|
| 1268 |
random person sitting in Jerusalem.
|
| 1269 |
Uh, I have no affiliation,
|
| 1270 |
|
| 1271 |
255
|
| 1272 |
+
00:16:46,950 --> 00:16:51,350
|
| 1273 |
nor with Google, nor anthropic,
|
| 1274 |
nor Gemini, nor any major tech vendor
|
| 1275 |
|
| 1276 |
256
|
| 1277 |
+
00:16:51,350 --> 00:16:56,790
|
| 1278 |
for that matter. Um, I like Gemini.
|
| 1279 |
Not so much as a everyday model.
|
| 1280 |
|
| 1281 |
257
|
| 1282 |
+
00:16:56,870 --> 00:16:59,830
|
| 1283 |
Um, it's kind of underwhelmed in
|
| 1284 |
that respect, I would say.
|
| 1285 |
|
| 1286 |
258
|
| 1287 |
+
00:17:00,230 --> 00:17:03,030
|
| 1288 |
But for multimodal,
|
| 1289 |
I think it's got a lot to offer.
|
| 1290 |
|
| 1291 |
259
|
| 1292 |
+
00:17:03,310 --> 00:17:06,870
|
| 1293 |
And I think that the transcribing
|
| 1294 |
functionality whereby it can,
|
| 1295 |
|
| 1296 |
260
|
| 1297 |
+
00:17:07,270 --> 00:17:12,150
|
| 1298 |
um, process audio with a system
|
| 1299 |
prompt and both give you
|
| 1300 |
|
| 1301 |
261
|
| 1302 |
+
00:17:12,190 --> 00:17:15,390
|
| 1303 |
transcription that's cleaned up,
|
| 1304 |
that reduces two steps to one.
|
| 1305 |
|
| 1306 |
262
|
| 1307 |
+
00:17:15,710 --> 00:17:18,630
|
| 1308 |
And that for me is a very,
|
| 1309 |
very big deal.
|
| 1310 |
|
| 1311 |
263
|
| 1312 |
+
00:17:18,630 --> 00:17:22,990
|
| 1313 |
And, uh, I feel like even Google
|
| 1314 |
has haven't really sort of thought
|
| 1315 |
|
| 1316 |
264
|
| 1317 |
+
00:17:22,990 --> 00:17:27,430
|
| 1318 |
through how useful the that
|
| 1319 |
modality is and what kind of use
|
| 1320 |
|
| 1321 |
265
|
| 1322 |
+
00:17:27,430 --> 00:17:30,790
|
| 1323 |
cases you can achieve with it.
|
| 1324 |
Because I found in the course of
|
| 1325 |
|
| 1326 |
266
|
| 1327 |
+
00:17:30,790 --> 00:17:36,490
|
| 1328 |
this year just an endless list
|
| 1329 |
of really kind of system prompt,
|
| 1330 |
|
| 1331 |
267
|
| 1332 |
+
00:17:36,730 --> 00:17:41,290
|
| 1333 |
system prompt stuff that I can say,
|
| 1334 |
okay, I've used it to capture context
|
| 1335 |
|
| 1336 |
268
|
| 1337 |
+
00:17:41,290 --> 00:17:45,570
|
| 1338 |
data for AI, which is literally I
|
| 1339 |
might speak for if I wanted to have a
|
| 1340 |
|
| 1341 |
269
|
| 1342 |
+
00:17:45,570 --> 00:17:49,730
|
| 1343 |
good bank of context data about,
|
| 1344 |
who knows, my childhood.
|
| 1345 |
|
| 1346 |
270
|
| 1347 |
+
00:17:50,010 --> 00:17:53,450
|
| 1348 |
Uh, more realistically,
|
| 1349 |
maybe my career goals, uh,
|
| 1350 |
|
| 1351 |
271
|
| 1352 |
+
00:17:53,450 --> 00:17:56,010
|
| 1353 |
something that would just be,
|
| 1354 |
like, really boring to type out.
|
| 1355 |
|
| 1356 |
272
|
| 1357 |
+
00:17:56,130 --> 00:18:01,130
|
| 1358 |
So I'll just, like, sit in my car
|
| 1359 |
and record it for ten minutes.
|
| 1360 |
|
| 1361 |
273
|
| 1362 |
+
00:18:01,130 --> 00:18:04,090
|
| 1363 |
And that ten minutes,
|
| 1364 |
you get a lot of information in,
|
| 1365 |
|
| 1366 |
274
|
| 1367 |
+
00:18:04,530 --> 00:18:10,090
|
| 1368 |
um, emails, which is short text.
|
| 1369 |
Um, just there is a whole bunch.
|
| 1370 |
|
| 1371 |
275
|
| 1372 |
+
00:18:10,090 --> 00:18:13,570
|
| 1373 |
And all these workflows kind of
|
| 1374 |
require a little bit of treatment
|
| 1375 |
|
| 1376 |
276
|
| 1377 |
+
00:18:13,570 --> 00:18:17,490
|
| 1378 |
afterwards and different treatment.
|
| 1379 |
My context pipeline is kind of like
|
| 1380 |
|
| 1381 |
277
|
| 1382 |
+
00:18:17,490 --> 00:18:21,210
|
| 1383 |
just extract the bare essentials.
|
| 1384 |
So you end up with me talking very
|
| 1385 |
|
| 1386 |
278
|
| 1387 |
+
00:18:21,210 --> 00:18:24,250
|
| 1388 |
loosely about sort of what I've done
|
| 1389 |
in my career, where I've worked,
|
| 1390 |
|
| 1391 |
279
|
| 1392 |
+
00:18:24,250 --> 00:18:27,610
|
| 1393 |
where I might like to work,
|
| 1394 |
and it goes it condenses that
|
| 1395 |
|
| 1396 |
280
|
| 1397 |
+
00:18:27,610 --> 00:18:31,600
|
| 1398 |
down to very robotic language
|
| 1399 |
that is easy to chunk, parse,
|
| 1400 |
|
| 1401 |
281
|
| 1402 |
+
00:18:31,600 --> 00:18:35,960
|
| 1403 |
and maybe put into a vector database.
|
| 1404 |
Daniel has worked in technology,
|
| 1405 |
|
| 1406 |
282
|
| 1407 |
+
00:18:36,000 --> 00:18:39,640
|
| 1408 |
Daniel is a has been working in,
|
| 1409 |
you know, stuff like that.
|
| 1410 |
|
| 1411 |
283
|
| 1412 |
+
00:18:39,640 --> 00:18:43,600
|
| 1413 |
That's not how you would speak.
|
| 1414 |
Um, but I figure it's probably easier
|
| 1415 |
|
| 1416 |
284
|
| 1417 |
+
00:18:43,600 --> 00:18:48,120
|
| 1418 |
to parse for, after all, robots.
|
| 1419 |
So we've almost got to 20 minutes.
|
| 1420 |
|
| 1421 |
285
|
| 1422 |
+
00:18:48,120 --> 00:18:52,640
|
| 1423 |
And this is actually a success
|
| 1424 |
because I wasted 20 minutes of my,
|
| 1425 |
|
| 1426 |
286
|
| 1427 |
+
00:18:52,800 --> 00:18:56,880
|
| 1428 |
uh, of the evening speaking into
|
| 1429 |
a microphone, and, uh,
|
| 1430 |
|
| 1431 |
287
|
| 1432 |
+
00:18:56,920 --> 00:19:00,840
|
| 1433 |
the levels were shot and, uh, it,
|
| 1434 |
uh, it was clipping and I said,
|
| 1435 |
|
| 1436 |
288
|
| 1437 |
+
00:19:00,840 --> 00:19:03,200
|
| 1438 |
I can't really do an evaluation.
|
| 1439 |
I have to be fair.
|
| 1440 |
|
| 1441 |
289
|
| 1442 |
+
00:19:03,200 --> 00:19:07,000
|
| 1443 |
I have to give the models a
|
| 1444 |
chance to do their thing.
|
| 1445 |
|
| 1446 |
290
|
| 1447 |
+
00:19:07,520 --> 00:19:09,360
|
| 1448 |
Uh,
|
| 1449 |
what am I hoping to achieve in this?
|
| 1450 |
|
| 1451 |
291
|
| 1452 |
+
00:19:09,400 --> 00:19:12,600
|
| 1453 |
Okay, my fine tune was a dud,
|
| 1454 |
as mentioned Deepgram SVT.
|
| 1455 |
|
| 1456 |
292
|
| 1457 |
+
00:19:12,640 --> 00:19:15,520
|
| 1458 |
I'm really, really hopeful that
|
| 1459 |
this prototype will work.
|
| 1460 |
|
| 1461 |
293
|
| 1462 |
+
00:19:15,800 --> 00:19:18,960
|
| 1463 |
And it's a built in public open
|
| 1464 |
source, so anyone is welcome to
|
| 1465 |
|
| 1466 |
294
|
| 1467 |
+
00:19:19,000 --> 00:19:22,920
|
| 1468 |
use it if I make anything good.
|
| 1469 |
Um, but that was really exciting for
|
| 1470 |
|
| 1471 |
295
|
| 1472 |
+
00:19:22,920 --> 00:19:27,400
|
| 1473 |
me last night when after hours of,
|
| 1474 |
um, trying my own prototype,
|
| 1475 |
|
| 1476 |
296
|
| 1477 |
+
00:19:27,400 --> 00:19:31,230
|
| 1478 |
seeing someone just made
|
| 1479 |
something that works like that.
|
| 1480 |
|
| 1481 |
297
|
| 1482 |
+
00:19:31,270 --> 00:19:32,670
|
| 1483 |
You know,
|
| 1484 |
you're not going to have to build a
|
| 1485 |
|
| 1486 |
298
|
| 1487 |
+
00:19:32,670 --> 00:19:38,230
|
| 1488 |
custom conda environment and image.
|
| 1489 |
I have AMD GPU, which makes
|
| 1490 |
|
| 1491 |
299
|
| 1492 |
+
00:19:38,230 --> 00:19:42,310
|
| 1493 |
things much more complicated.
|
| 1494 |
I didn't find it and I was about
|
| 1495 |
|
| 1496 |
300
|
| 1497 |
+
00:19:42,310 --> 00:19:43,990
|
| 1498 |
to give up and I said,
|
| 1499 |
all right, let me just give deep
|
| 1500 |
|
| 1501 |
301
|
| 1502 |
+
00:19:43,990 --> 00:19:48,750
|
| 1503 |
grams Linux thing a shot.
|
| 1504 |
And if this doesn't work, um,
|
| 1505 |
|
| 1506 |
302
|
| 1507 |
+
00:19:48,750 --> 00:19:51,150
|
| 1508 |
I'm just going to go back to
|
| 1509 |
trying to code something myself.
|
| 1510 |
|
| 1511 |
303
|
| 1512 |
+
00:19:51,510 --> 00:19:56,190
|
| 1513 |
And when I ran the script,
|
| 1514 |
I was using cloud code to do the
|
| 1515 |
|
| 1516 |
304
|
| 1517 |
+
00:19:56,190 --> 00:20:00,030
|
| 1518 |
installation process.
|
| 1519 |
It ran the script and oh my gosh,
|
| 1520 |
|
| 1521 |
305
|
| 1522 |
+
00:20:00,070 --> 00:20:05,350
|
| 1523 |
it works just like that.
|
| 1524 |
Uh, the tricky thing for all those
|
| 1525 |
|
| 1526 |
306
|
| 1527 |
+
00:20:05,350 --> 00:20:10,310
|
| 1528 |
who wants to know all the nitty
|
| 1529 |
gritty, nitty gritty details, um, was
|
| 1530 |
|
| 1531 |
307
|
| 1532 |
+
00:20:10,310 --> 00:20:13,750
|
| 1533 |
that I don't think it was actually
|
| 1534 |
struggling with transcription, but
|
| 1535 |
|
| 1536 |
308
|
| 1537 |
+
00:20:13,750 --> 00:20:18,550
|
| 1538 |
pasting Wayland makes life very hard,
|
| 1539 |
and I think there was something not
|
| 1540 |
|
| 1541 |
309
|
| 1542 |
+
00:20:18,550 --> 00:20:21,870
|
| 1543 |
running in the right time anyway.
|
| 1544 |
Deepgram I looked at how they
|
| 1545 |
|
| 1546 |
310
|
| 1547 |
+
00:20:21,870 --> 00:20:24,710
|
| 1548 |
actually handle that because it
|
| 1549 |
worked out of the box when other
|
| 1550 |
|
| 1551 |
311
|
| 1552 |
+
00:20:24,710 --> 00:20:29,140
|
| 1553 |
stuff didn't, and it was quite a
|
| 1554 |
clever little mechanism,
|
| 1555 |
|
| 1556 |
312
|
| 1557 |
+
00:20:29,460 --> 00:20:32,100
|
| 1558 |
and but more so than that,
|
| 1559 |
the accuracy was brilliant.
|
| 1560 |
|
| 1561 |
313
|
| 1562 |
+
00:20:32,140 --> 00:20:35,020
|
| 1563 |
Now, what am I doing here?
|
| 1564 |
This is going to be a 20 minute
|
| 1565 |
|
| 1566 |
314
|
| 1567 |
+
00:20:35,260 --> 00:20:42,980
|
| 1568 |
audio sample, and I'm I think
|
| 1569 |
I've done 1 or 2 of these before,
|
| 1570 |
|
| 1571 |
315
|
| 1572 |
+
00:20:42,980 --> 00:20:49,180
|
| 1573 |
but I did it with short, snappy voice
|
| 1574 |
notes. This is kind of long form.
|
| 1575 |
|
| 1576 |
316
|
| 1577 |
+
00:20:49,460 --> 00:20:51,740
|
| 1578 |
This actually might be a better
|
| 1579 |
approximation for what's useful
|
| 1580 |
|
| 1581 |
317
|
| 1582 |
+
00:20:51,740 --> 00:20:56,100
|
| 1583 |
to me than voice memos.
|
| 1584 |
Like I need to buy three liters
|
| 1585 |
|
| 1586 |
318
|
| 1587 |
+
00:20:56,100 --> 00:20:59,180
|
| 1588 |
of milk tomorrow, and pita bread,
|
| 1589 |
which is probably how like half
|
| 1590 |
|
| 1591 |
319
|
| 1592 |
+
00:20:59,180 --> 00:21:02,820
|
| 1593 |
my voice voice notes sound like
|
| 1594 |
if anyone were to, I don't know,
|
| 1595 |
|
| 1596 |
320
|
| 1597 |
+
00:21:02,860 --> 00:21:04,580
|
| 1598 |
like find my phone,
|
| 1599 |
they'd be like, this is the most
|
| 1600 |
|
| 1601 |
321
|
| 1602 |
+
00:21:04,580 --> 00:21:07,420
|
| 1603 |
boring person in the world.
|
| 1604 |
Although actually there are some
|
| 1605 |
|
| 1606 |
322
|
| 1607 |
+
00:21:07,460 --> 00:21:09,700
|
| 1608 |
like kind of, uh,
|
| 1609 |
journaling thoughts as well.
|
| 1610 |
|
| 1611 |
323
|
| 1612 |
+
00:21:09,700 --> 00:21:13,700
|
| 1613 |
But it's a lot of content like that.
|
| 1614 |
And the probably for the evaluation,
|
| 1615 |
|
| 1616 |
324
|
| 1617 |
+
00:21:13,700 --> 00:21:20,660
|
| 1618 |
the most useful thing is slightly
|
| 1619 |
obscure tech GitHub uh, hugging face
|
| 1620 |
|
| 1621 |
325
|
| 1622 |
+
00:21:21,180 --> 00:21:24,660
|
| 1623 |
not so obscure that it's not going
|
| 1624 |
to have a chance of knowing it,
|
| 1625 |
|
| 1626 |
326
|
| 1627 |
+
00:21:24,660 --> 00:21:27,640
|
| 1628 |
but hopefully sufficiently well
|
| 1629 |
known that the model should get it.
|
| 1630 |
|
| 1631 |
327
|
| 1632 |
+
00:21:28,200 --> 00:21:30,760
|
| 1633 |
I tried to do a little bit of
|
| 1634 |
speaking really fast and
|
| 1635 |
|
| 1636 |
328
|
| 1637 |
+
00:21:30,760 --> 00:21:33,200
|
| 1638 |
speaking very slowly.
|
| 1639 |
I would say in general,
|
| 1640 |
|
| 1641 |
329
|
| 1642 |
+
00:21:33,200 --> 00:21:36,880
|
| 1643 |
I've spoken, delivered this at a
|
| 1644 |
faster pace than I usually would
|
| 1645 |
|
| 1646 |
330
|
| 1647 |
+
00:21:36,920 --> 00:21:40,280
|
| 1648 |
owing to strong coffee flowing
|
| 1649 |
through my bloodstream.
|
| 1650 |
|
| 1651 |
331
|
| 1652 |
+
00:21:40,920 --> 00:21:44,200
|
| 1653 |
And the thing that I'm not going
|
| 1654 |
to get in this benchmark is
|
| 1655 |
|
| 1656 |
332
|
| 1657 |
+
00:21:44,200 --> 00:21:46,880
|
| 1658 |
background noise, which in my first
|
| 1659 |
take that I had to get rid of,
|
| 1660 |
|
| 1661 |
333
|
| 1662 |
+
00:21:47,680 --> 00:21:51,240
|
| 1663 |
my wife came in with my son and
|
| 1664 |
for a good night kiss.
|
| 1665 |
|
| 1666 |
334
|
| 1667 |
+
00:21:51,440 --> 00:21:55,120
|
| 1668 |
And that actually would have
|
| 1669 |
been super helpful to get in
|
| 1670 |
|
| 1671 |
335
|
| 1672 |
+
00:21:55,120 --> 00:21:59,760
|
| 1673 |
because it was not diarised.
|
| 1674 |
Or if we had diarisation a female,
|
| 1675 |
|
| 1676 |
336
|
| 1677 |
+
00:21:59,880 --> 00:22:02,280
|
| 1678 |
I could say I want the male
|
| 1679 |
voice and that wasn't intended
|
| 1680 |
|
| 1681 |
337
|
| 1682 |
+
00:22:02,280 --> 00:22:05,280
|
| 1683 |
for transcription.
|
| 1684 |
Um, and we're not going to get
|
| 1685 |
|
| 1686 |
338
|
| 1687 |
+
00:22:05,280 --> 00:22:06,960
|
| 1688 |
background noise like people
|
| 1689 |
honking their horns,
|
| 1690 |
|
| 1691 |
339
|
| 1692 |
+
00:22:06,960 --> 00:22:11,280
|
| 1693 |
which is something I've done in my
|
| 1694 |
main data set where I am trying to
|
| 1695 |
|
| 1696 |
340
|
| 1697 |
+
00:22:11,440 --> 00:22:15,520
|
| 1698 |
go back to some of my voice notes,
|
| 1699 |
annotate them, and run a benchmark.
|
| 1700 |
|
| 1701 |
341
|
| 1702 |
+
00:22:15,520 --> 00:22:18,960
|
| 1703 |
But this is going to be just a
|
| 1704 |
pure quick test.
|
| 1705 |
|
| 1706 |
342
|
| 1707 |
+
00:22:19,440 --> 00:22:23,880
|
| 1708 |
And as someone I'm working on a
|
| 1709 |
voice note idea,
|
| 1710 |
|
| 1711 |
343
|
| 1712 |
+
00:22:23,880 --> 00:22:28,230
|
| 1713 |
that's my sort of end motivation.
|
| 1714 |
Besides thinking it's an
|
| 1715 |
|
| 1716 |
344
|
| 1717 |
+
00:22:28,230 --> 00:22:31,590
|
| 1718 |
absolutely outstanding technology
|
| 1719 |
that's coming to viability.
|
| 1720 |
|
| 1721 |
345
|
| 1722 |
+
00:22:31,590 --> 00:22:34,670
|
| 1723 |
And really, I know this sounds
|
| 1724 |
cheesy can actually have a very
|
| 1725 |
|
| 1726 |
346
|
| 1727 |
+
00:22:34,670 --> 00:22:38,830
|
| 1728 |
transformative effect.
|
| 1729 |
Um, it's, you know, voice technology
|
| 1730 |
|
| 1731 |
347
|
| 1732 |
+
00:22:38,870 --> 00:22:44,910
|
| 1733 |
has been life changing for, uh,
|
| 1734 |
folks living with, um, disabilities.
|
| 1735 |
|
| 1736 |
348
|
| 1737 |
+
00:22:45,630 --> 00:22:48,550
|
| 1738 |
And I think there's something
|
| 1739 |
really nice about the fact that
|
| 1740 |
|
| 1741 |
349
|
| 1742 |
+
00:22:48,550 --> 00:22:52,710
|
| 1743 |
it can also benefit, you know,
|
| 1744 |
folks who are able bodied and like,
|
| 1745 |
|
| 1746 |
350
|
| 1747 |
+
00:22:52,750 --> 00:22:58,950
|
| 1748 |
we can all in different ways, um,
|
| 1749 |
make this tech as useful as possible,
|
| 1750 |
|
| 1751 |
351
|
| 1752 |
+
00:22:58,990 --> 00:23:01,110
|
| 1753 |
regardless of the exact way that
|
| 1754 |
we're using it.
|
| 1755 |
|
| 1756 |
352
|
| 1757 |
+
00:23:01,510 --> 00:23:04,710
|
| 1758 |
Um, and I think there's something
|
| 1759 |
very powerful in that, and it can be
|
| 1760 |
|
| 1761 |
353
|
| 1762 |
+
00:23:04,710 --> 00:23:08,910
|
| 1763 |
very cool. Um, I see use potential.
|
| 1764 |
What excites me about voice tech?
|
| 1765 |
|
| 1766 |
354
|
| 1767 |
+
00:23:09,750 --> 00:23:13,550
|
| 1768 |
A lot of things, actually.
|
| 1769 |
Firstly, the fact that it's cheap
|
| 1770 |
|
| 1771 |
355
|
| 1772 |
+
00:23:13,550 --> 00:23:17,110
|
| 1773 |
and accurate, as I mentioned at
|
| 1774 |
the very start of this, um,
|
| 1775 |
|
| 1776 |
356
|
| 1777 |
+
00:23:17,110 --> 00:23:20,790
|
| 1778 |
and it's getting better and better
|
| 1779 |
with stuff like accent handling, um,
|
| 1780 |
|
| 1781 |
357
|
| 1782 |
+
00:23:20,790 --> 00:23:24,180
|
| 1783 |
I'm not sure my, my fine tune will
|
| 1784 |
actually ever come to fruition in the
|
| 1785 |
|
| 1786 |
358
|
| 1787 |
+
00:23:24,180 --> 00:23:27,860
|
| 1788 |
sense that I'll use it day to day,
|
| 1789 |
as I imagine I get like superb,
|
| 1790 |
|
| 1791 |
359
|
| 1792 |
+
00:23:27,860 --> 00:23:33,540
|
| 1793 |
flawless word error rates because I'm
|
| 1794 |
just kind of skeptical about local
|
| 1795 |
|
| 1796 |
360
|
| 1797 |
+
00:23:33,540 --> 00:23:38,100
|
| 1798 |
speech to texts, as I mentioned.
|
| 1799 |
And I think the pace of innovation
|
| 1800 |
|
| 1801 |
361
|
| 1802 |
+
00:23:38,100 --> 00:23:42,060
|
| 1803 |
and improvement in the models,
|
| 1804 |
the main reasons for fine tuning from
|
| 1805 |
|
| 1806 |
362
|
| 1807 |
+
00:23:42,060 --> 00:23:46,340
|
| 1808 |
what I've seen have been people who
|
| 1809 |
are something that really blows,
|
| 1810 |
|
| 1811 |
363
|
| 1812 |
+
00:23:46,380 --> 00:23:52,940
|
| 1813 |
blows my mind about ASR is the idea
|
| 1814 |
that it's inherently a lingual
|
| 1815 |
|
| 1816 |
364
|
| 1817 |
+
00:23:52,940 --> 00:23:59,100
|
| 1818 |
or multilingual phonetic based.
|
| 1819 |
So as folks who use speak very
|
| 1820 |
|
| 1821 |
365
|
| 1822 |
+
00:23:59,140 --> 00:24:02,220
|
| 1823 |
obscure languages that there may
|
| 1824 |
be there might be a paucity of
|
| 1825 |
|
| 1826 |
366
|
| 1827 |
+
00:24:02,220 --> 00:24:05,500
|
| 1828 |
training data or almost none at all,
|
| 1829 |
and therefore the accuracy is
|
| 1830 |
|
| 1831 |
367
|
| 1832 |
+
00:24:05,500 --> 00:24:10,660
|
| 1833 |
significantly reduced or folks
|
| 1834 |
in very critical environments.
|
| 1835 |
|
| 1836 |
368
|
| 1837 |
+
00:24:10,700 --> 00:24:13,380
|
| 1838 |
I know there are.
|
| 1839 |
This is used extensively in medical
|
| 1840 |
|
| 1841 |
369
|
| 1842 |
+
00:24:13,380 --> 00:24:18,140
|
| 1843 |
transcription and dispatcher work as,
|
| 1844 |
um, you know, the call centers who
|
| 1845 |
|
| 1846 |
370
|
| 1847 |
+
00:24:18,140 --> 00:24:22,490
|
| 1848 |
send out ambulances, etc., where
|
| 1849 |
accuracy is absolutely paramount.
|
| 1850 |
|
| 1851 |
371
|
| 1852 |
+
00:24:22,490 --> 00:24:26,050
|
| 1853 |
And in the case of doctors,
|
| 1854 |
radiologists, they might be using
|
| 1855 |
|
| 1856 |
372
|
| 1857 |
+
00:24:26,050 --> 00:24:29,610
|
| 1858 |
very specialized vocab all the time.
|
| 1859 |
So those are kind of the main
|
| 1860 |
|
| 1861 |
373
|
| 1862 |
+
00:24:29,610 --> 00:24:31,530
|
| 1863 |
two things.
|
| 1864 |
And I'm not sure that really just for
|
| 1865 |
|
| 1866 |
374
|
| 1867 |
+
00:24:31,530 --> 00:24:37,290
|
| 1868 |
trying to make it better on a few
|
| 1869 |
random tech words with my slightly.
|
| 1870 |
|
| 1871 |
375
|
| 1872 |
+
00:24:37,330 --> 00:24:41,250
|
| 1873 |
I mean, I have an accent, but like,
|
| 1874 |
not, you know, an accent that a few
|
| 1875 |
|
| 1876 |
376
|
| 1877 |
+
00:24:41,290 --> 00:24:47,210
|
| 1878 |
other million people have. Ish.
|
| 1879 |
I'm not sure that my little fine
|
| 1880 |
|
| 1881 |
377
|
| 1882 |
+
00:24:47,210 --> 00:24:52,250
|
| 1883 |
tune is going to actually like the
|
| 1884 |
bump in word error rate reduction.
|
| 1885 |
|
| 1886 |
378
|
| 1887 |
+
00:24:52,250 --> 00:24:54,570
|
| 1888 |
If I ever actually figure out how
|
| 1889 |
to do it and get it up to the
|
| 1890 |
|
| 1891 |
379
|
| 1892 |
+
00:24:54,570 --> 00:24:58,610
|
| 1893 |
cloud by the time I've done that.
|
| 1894 |
I suspect that the next
|
| 1895 |
|
| 1896 |
380
|
| 1897 |
+
00:24:58,610 --> 00:25:01,410
|
| 1898 |
generation of ASR will just be
|
| 1899 |
so good that it will kind of be.
|
| 1900 |
|
| 1901 |
381
|
| 1902 |
+
00:25:01,930 --> 00:25:03,770
|
| 1903 |
Ah, well,
|
| 1904 |
that would be cool if it worked out,
|
| 1905 |
|
| 1906 |
382
|
| 1907 |
+
00:25:03,770 --> 00:25:08,730
|
| 1908 |
but I'll just use this instead.
|
| 1909 |
So that's going to be it for today's
|
| 1910 |
|
| 1911 |
383
|
| 1912 |
+
00:25:08,730 --> 00:25:14,130
|
| 1913 |
episode of, uh, voice training data.
|
| 1914 |
Single long shot evaluation.
|
| 1915 |
|
| 1916 |
384
|
| 1917 |
+
00:25:14,410 --> 00:25:17,330
|
| 1918 |
Who am I going to compare?
|
| 1919 |
Whisper is always good as a
|
| 1920 |
|
| 1921 |
385
|
| 1922 |
+
00:25:17,330 --> 00:25:20,600
|
| 1923 |
benchmark, but I'm more
|
| 1924 |
interested in seeing Whisperer
|
| 1925 |
|
| 1926 |
386
|
| 1927 |
+
00:25:20,600 --> 00:25:25,080
|
| 1928 |
head to head with two things,
|
| 1929 |
really. One is whisper variance.
|
| 1930 |
|
| 1931 |
387
|
| 1932 |
+
00:25:25,080 --> 00:25:29,880
|
| 1933 |
So you've got these projects like
|
| 1934 |
faster Whisper, Still whisper.
|
| 1935 |
|
| 1936 |
388
|
| 1937 |
+
00:25:29,880 --> 00:25:31,640
|
| 1938 |
It's a bit confusing.
|
| 1939 |
There's a whole bunch of them
|
| 1940 |
|
| 1941 |
389
|
| 1942 |
+
00:25:31,920 --> 00:25:34,800
|
| 1943 |
and the emerging acers,
|
| 1944 |
which are also a thing.
|
| 1945 |
|
| 1946 |
390
|
| 1947 |
+
00:25:35,200 --> 00:25:37,680
|
| 1948 |
My intention for this is I'm not
|
| 1949 |
sure I'm going to have the time
|
| 1950 |
|
| 1951 |
391
|
| 1952 |
+
00:25:37,680 --> 00:25:41,640
|
| 1953 |
in any point in the foreseeable
|
| 1954 |
future to go back through this whole
|
| 1955 |
|
| 1956 |
392
|
| 1957 |
+
00:25:41,640 --> 00:25:46,560
|
| 1958 |
episode and create a proper source,
|
| 1959 |
truth or a fix.
|
| 1960 |
|
| 1961 |
393
|
| 1962 |
+
00:25:47,320 --> 00:25:51,680
|
| 1963 |
Everything might do it if I can
|
| 1964 |
get one transcription that
|
| 1965 |
|
| 1966 |
394
|
| 1967 |
+
00:25:51,680 --> 00:25:56,720
|
| 1968 |
sufficiently close to perfection.
|
| 1969 |
But what I would actually love
|
| 1970 |
|
| 1971 |
395
|
| 1972 |
+
00:25:56,720 --> 00:25:59,800
|
| 1973 |
to do on Hugging Face I think
|
| 1974 |
would be a great.
|
| 1975 |
|
| 1976 |
396
|
| 1977 |
+
00:25:59,800 --> 00:26:03,560
|
| 1978 |
Probably how I might visualize this
|
| 1979 |
is having the audio waveform play,
|
| 1980 |
|
| 1981 |
397
|
| 1982 |
+
00:26:04,040 --> 00:26:09,800
|
| 1983 |
and then have the transcript for each
|
| 1984 |
model below it, and maybe even a,
|
| 1985 |
|
| 1986 |
398
|
| 1987 |
+
00:26:10,480 --> 00:26:15,120
|
| 1988 |
um, like, you know, two scale and
|
| 1989 |
maybe even a local one as well,
|
| 1990 |
|
| 1991 |
399
|
| 1992 |
+
00:26:15,160 --> 00:26:21,700
|
| 1993 |
like local whisper versus open
|
| 1994 |
AI API, Etc. and, um, I can then
|
| 1995 |
|
| 1996 |
400
|
| 1997 |
+
00:26:21,700 --> 00:26:24,380
|
| 1998 |
actually listen back to segments
|
| 1999 |
or anyone who wants to can listen
|
| 2000 |
|
| 2001 |
401
|
| 2002 |
+
00:26:24,380 --> 00:26:29,420
|
| 2003 |
back to segments of this recording
|
| 2004 |
and see where a particular model
|
| 2005 |
|
| 2006 |
402
|
| 2007 |
+
00:26:29,460 --> 00:26:32,940
|
| 2008 |
struggled and others didn't, as well
|
| 2009 |
as the sort of headline finding
|
| 2010 |
|
| 2011 |
403
|
| 2012 |
+
00:26:32,980 --> 00:26:36,780
|
| 2013 |
of which had the best, uh, wer.
|
| 2014 |
But that would require the source
|
| 2015 |
|
| 2016 |
404
|
| 2017 |
+
00:26:36,780 --> 00:26:40,020
|
| 2018 |
of truth. Okay. That's it.
|
| 2019 |
Hope this was, I don't know,
|
| 2020 |
|
| 2021 |
405
|
| 2022 |
+
00:26:40,180 --> 00:26:43,460
|
| 2023 |
maybe useful for other folks
|
| 2024 |
interested in stuff you want to see.
|
| 2025 |
|
| 2026 |
406
|
| 2027 |
+
00:26:43,940 --> 00:26:48,100
|
| 2028 |
I always feel think I've just said
|
| 2029 |
something I didn't intend to say.
|
| 2030 |
|
| 2031 |
407
|
| 2032 |
+
00:26:48,660 --> 00:26:51,020
|
| 2033 |
I said for those, listen carefully.
|
| 2034 |
Including, hopefully,
|
| 2035 |
|
| 2036 |
408
|
| 2037 |
+
00:26:51,020 --> 00:26:54,060
|
| 2038 |
the models themselves.
|
| 2039 |
This has been myself,
|
| 2040 |
|
| 2041 |
409
|
| 2042 |
+
00:26:54,100 --> 00:26:57,900
|
| 2043 |
Daniel Rosehill, for more, um,
|
| 2044 |
jumbled repositories about my,
|
| 2045 |
|
| 2046 |
410
|
| 2047 |
+
00:26:57,940 --> 00:27:00,820
|
| 2048 |
uh, roving interest in AI,
|
| 2049 |
but particularly Agentic,
|
| 2050 |
|
| 2051 |
411
|
| 2052 |
+
00:27:01,180 --> 00:27:05,340
|
| 2053 |
MCP and voice tech.
|
| 2054 |
Uh, you can find me on GitHub.
|
| 2055 |
|
| 2056 |
412
|
| 2057 |
+
00:27:05,820 --> 00:27:11,140
|
| 2058 |
Hugging face. Where else?
|
| 2059 |
Daniel, which is my personal website,
|
| 2060 |
|
| 2061 |
413
|
| 2062 |
+
00:27:11,140 --> 00:27:15,260
|
| 2063 |
as well as this podcast whose
|
| 2064 |
name I sadly cannot remember.
|
| 2065 |
|
| 2066 |
414
|
| 2067 |
+
00:27:15,700 --> 00:27:17,420
|
| 2068 |
Until next time.
|
| 2069 |
Thanks for listening.
|