Paul Bird
committed on
Upload RunWhisper.cs
Browse files- RunWhisper.cs +18 -12
RunWhisper.cs
CHANGED
@@ -36,6 +36,7 @@ public class RunWhisper : MonoBehaviour
|
|
36 |
// Link your audioclip here. Format must be 16kHz mono non-compressed.
|
37 |
public AudioClip audioClip;
|
38 |
|
|
|
39 |
const int maxTokens = 100;
|
40 |
|
41 |
//Special tokens
|
@@ -56,19 +57,22 @@ public class RunWhisper : MonoBehaviour
|
|
56 |
int[] outputTokens = new int[maxTokens];
|
57 |
|
58 |
// Used for special character decoding
|
59 |
-
int[]
|
60 |
|
61 |
TensorFloat encodedAudio;
|
62 |
|
63 |
bool transcribe = false;
|
64 |
string outputString = "";
|
65 |
|
|
|
|
|
|
|
66 |
void Start()
|
67 |
{
|
68 |
allocator = new TensorCachingAllocator();
|
69 |
ops = WorkerFactory.CreateOps(backend, allocator);
|
70 |
|
71 |
-
|
72 |
|
73 |
GetTokens();
|
74 |
|
@@ -117,9 +121,7 @@ public class RunWhisper : MonoBehaviour
|
|
117 |
|
118 |
void EncodeAudio()
|
119 |
{
|
120 |
-
var input = new TensorFloat(new TensorShape(1, numSamples), data);
|
121 |
-
|
122 |
-
int maxSamples = 30 * 16000;
|
123 |
if (numSamples > maxSamples)
|
124 |
{
|
125 |
Debug.Log("The AudioClip is too long.");
|
@@ -127,7 +129,7 @@ public class RunWhisper : MonoBehaviour
|
|
127 |
}
|
128 |
|
129 |
// Pad out to 30 seconds at 16kHz if necessary
|
130 |
-
var input30seconds = ops.Pad(input, new int[] { 0, 0, 0,
|
131 |
|
132 |
spectroEngine.Execute(input30seconds);
|
133 |
var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
|
@@ -142,7 +144,7 @@ public class RunWhisper : MonoBehaviour
|
|
142 |
{
|
143 |
if (transcribe && currentToken < outputTokens.Length - 1)
|
144 |
{
|
145 |
-
var tokensSoFar = new TensorInt(new TensorShape(1, outputTokens.Length), outputTokens);
|
146 |
|
147 |
var inputs = new Dictionary<string, Tensor>
|
148 |
{
|
@@ -153,7 +155,7 @@ public class RunWhisper : MonoBehaviour
|
|
153 |
decoderEngine.Execute(inputs);
|
154 |
var tokensOut = decoderEngine.PeekOutput() as TensorFloat;
|
155 |
|
156 |
-
var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
|
157 |
tokensPredictions.MakeReadable();
|
158 |
|
159 |
int ID = tokensPredictions[currentToken];
|
@@ -165,7 +167,10 @@ public class RunWhisper : MonoBehaviour
|
|
165 |
{
|
166 |
transcribe = false;
|
167 |
}
|
168 |
-
else if (ID >= tokens.Length)
|
|
|
|
|
|
|
169 |
else outputString += GetUnicodeText(tokens[ID]);
|
170 |
|
171 |
Debug.Log(outputString);
|
@@ -185,16 +190,16 @@ public class RunWhisper : MonoBehaviour
|
|
185 |
foreach (char letter in text)
|
186 |
{
|
187 |
outText += ((int)letter <= 256) ? letter :
|
188 |
-
(char)
|
189 |
}
|
190 |
return outText;
|
191 |
}
|
192 |
|
193 |
-
void
|
194 |
{
|
195 |
for (int i = 0, n = 0; i < 256; i++)
|
196 |
{
|
197 |
-
if (IsWhiteSpace((char)i))
|
198 |
}
|
199 |
}
|
200 |
|
@@ -209,5 +214,6 @@ public class RunWhisper : MonoBehaviour
|
|
209 |
encoderEngine?.Dispose();
|
210 |
spectroEngine?.Dispose();
|
211 |
ops?.Dispose();
|
|
|
212 |
}
|
213 |
}
|
|
|
36 |
// Link your audioclip here. Format must be 16kHz mono non-compressed.
|
37 |
public AudioClip audioClip;
|
38 |
|
39 |
+
// This is how many tokens you want. It can be adjusted.
|
40 |
const int maxTokens = 100;
|
41 |
|
42 |
//Special tokens
|
|
|
57 |
int[] outputTokens = new int[maxTokens];
|
58 |
|
59 |
// Used for special character decoding
|
60 |
+
int[] whiteSpaceCharacters = new int[256];
|
61 |
|
62 |
TensorFloat encodedAudio;
|
63 |
|
64 |
bool transcribe = false;
|
65 |
string outputString = "";
|
66 |
|
67 |
+
// Maximum size of audioClip (30s at 16kHz)
|
68 |
+
const int maxSamples = 30 * 16000;
|
69 |
+
|
70 |
void Start()
|
71 |
{
|
72 |
allocator = new TensorCachingAllocator();
|
73 |
ops = WorkerFactory.CreateOps(backend, allocator);
|
74 |
|
75 |
+
SetupWhiteSpaceShifts();
|
76 |
|
77 |
GetTokens();
|
78 |
|
|
|
121 |
|
122 |
void EncodeAudio()
|
123 |
{
|
124 |
+
using var input = new TensorFloat(new TensorShape(1, numSamples), data);
|
|
|
|
|
125 |
if (numSamples > maxSamples)
|
126 |
{
|
127 |
Debug.Log("The AudioClip is too long.");
|
|
|
129 |
}
|
130 |
|
131 |
// Pad out to 30 seconds at 16kHz if necessary
|
132 |
+
using var input30seconds = ops.Pad(input, new int[] { 0, 0, 0, maxSamples - numSamples });
|
133 |
|
134 |
spectroEngine.Execute(input30seconds);
|
135 |
var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
|
|
|
144 |
{
|
145 |
if (transcribe && currentToken < outputTokens.Length - 1)
|
146 |
{
|
147 |
+
using var tokensSoFar = new TensorInt(new TensorShape(1, outputTokens.Length), outputTokens);
|
148 |
|
149 |
var inputs = new Dictionary<string, Tensor>
|
150 |
{
|
|
|
155 |
decoderEngine.Execute(inputs);
|
156 |
var tokensOut = decoderEngine.PeekOutput() as TensorFloat;
|
157 |
|
158 |
+
using var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
|
159 |
tokensPredictions.MakeReadable();
|
160 |
|
161 |
int ID = tokensPredictions[currentToken];
|
|
|
167 |
{
|
168 |
transcribe = false;
|
169 |
}
|
170 |
+
else if (ID >= tokens.Length)
|
171 |
+
{
|
172 |
+
outputString += $"(time={(ID - START_TIME) * 0.02f})";
|
173 |
+
}
|
174 |
else outputString += GetUnicodeText(tokens[ID]);
|
175 |
|
176 |
Debug.Log(outputString);
|
|
|
190 |
foreach (char letter in text)
|
191 |
{
|
192 |
outText += ((int)letter <= 256) ? letter :
|
193 |
+
(char)whiteSpaceCharacters[(int)(letter - 256)];
|
194 |
}
|
195 |
return outText;
|
196 |
}
|
197 |
|
198 |
+
void SetupWhiteSpaceShifts()
|
199 |
{
|
200 |
for (int i = 0, n = 0; i < 256; i++)
|
201 |
{
|
202 |
+
if (IsWhiteSpace((char)i)) whiteSpaceCharacters[n++] = i;
|
203 |
}
|
204 |
}
|
205 |
|
|
|
214 |
encoderEngine?.Dispose();
|
215 |
spectroEngine?.Dispose();
|
216 |
ops?.Dispose();
|
217 |
+
allocator?.Dispose();
|
218 |
}
|
219 |
}
|