In order to generate subtitles for my videos, I converted them to audio files and used the Cloud Speech-to-Text. It works, but it only generates transcriptions, whereas what I need is a *.srt/*.vtt/similar file.
What I need is what YouTube does: to generate transcriptions and sync them with the video, like a subtitle format, ie.: transcriptions with the times when captions should appear.
Although I could upload them to YouTube and then download their auto-generated captions, it doesn't seem very correct.
Is there a way to generate an SRT file (or similar) using Google Cloud Speech?
There's no way really to do this directly from the Speech-to-Text API. What you could try to do is some post-processing on the speech recognition result.
For example, here's a request to the REST API using a model meant to transcribe video, with a public google-provided sample file:
# Submit an asynchronous (speech:longrunningrecognize) request for the public sample WAV file.
# The config asks for per-word time offsets and automatic punctuation using the 'video' model.
# The bearer token is obtained from gcloud application-default credentials.
curl -s -H "Content-Type: application/json" \
-H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
https://speech.googleapis.com/v1p1beta1/speech:longrunningrecognize \
--data "{
'config': {
'encoding': 'LINEAR16',
'sampleRateHertz': 16000,
'languageCode': 'en-US',
'enableWordTimeOffsets': true,
'enableAutomaticPunctuation': true,
'model': 'video'
},
'audio': {
'uri':'gs://cloud-samples-tests/speech/Google_Gnome.wav'
}
}"
The above uses asynchronous recognition (speech:longrunningrecognize), which is more fitting for larger files. Enabling punctuation ('enableAutomaticPunctuation': true) in combination with the start and end times of words ('enableWordTimeOffsets': true) near the start and end of each sentence (which you'd also have to convert from nanos to timestamps) could allow you to provide a text file in the srt format. You would probably also have to include some rules about the maximum length of a sentence appearing on the screen at any given time.
The above should not be too difficult to implement, however, there's a strong possibility that you would still encounter timing/synchronization issues.
There is no way to do it using Google Cloud itself, but, as suggested, you may post-process the result.
Here is some quick code that roughly does the job. You may want to adapt it to your needs:
/**
 * Convert a Speech-to-Text long-running-recognize JSON response into SubRip (.srt) text.
 *
 * One subtitle entry is produced per result: it starts at the start time of the
 * result's first word, ends at the end time of its last word, and shows the
 * transcript of the first alternative.
 *
 * @param {string} string - JSON text whose `response.results` array holds the results.
 * @returns {string} The SRT document ('' when there is nothing to subtitle).
 */
function convertGSTTToSRT(string) {
  var obj = JSON.parse(string);
  var results = (obj.response && obj.response.results) || [];
  var index = 1;
  var result = '';
  for (const line of results) {
    var alternative = line.alternatives && line.alternatives[0];
    var words = (alternative && alternative.words) || [];
    // A result without word timings cannot be placed on the timeline; skip it
    // instead of throwing on words[0].
    if (words.length === 0) {
      continue;
    }
    var start = convertSecondStringToRealtime(words[0].startTime);
    var end = convertSecondStringToRealtime(words[words.length - 1].endTime);
    result += index++ + '\n';
    result += formatTime(start) + ' --> ' + formatTime(end) + '\n';
    result += alternative.transcript + '\n\n';
  }
  return result;
}
/**
 * Format a broken-down time as an SRT timestamp `HH:MM:SS,mmm`.
 *
 * @param {{hours: number, minutes: number, seconds: number, milliseconds?: number}} time
 *   `milliseconds` is optional; when absent the fraction is written as `000`.
 * @returns {string} The formatted timestamp.
 */
function formatTime(time) {
  // Treat a missing or non-numeric milliseconds field as zero so callers that
  // only supply hours/minutes/seconds keep getting ",000".
  var millis = Math.floor(Number(time.milliseconds) || 0);
  return String(time.hours).padStart(2, '0') + ':' +
    String(time.minutes).padStart(2, '0') + ':' +
    String(time.seconds).padStart(2, '0') + ',' +
    String(millis).padStart(3, '0');
}
/**
 * Split a duration string such as "3725.500s" into hours, minutes, seconds
 * and milliseconds.
 *
 * @param {string} string - Seconds, optionally fractional, optionally followed by "s".
 * @returns {{hours: number, minutes: number, seconds: number, milliseconds: number}}
 *   All fields are zero when the input cannot be parsed.
 */
function convertSecondStringToRealtime(string) {
  // parseFloat ignores the trailing "s"; working in whole milliseconds keeps the
  // sub-second part instead of flooring it away.
  var totalMs = Math.round(parseFloat(string) * 1000) || 0;
  var hours = Math.floor(totalMs / 3600000);
  var minutes = Math.floor(totalMs % 3600000 / 60000);
  var seconds = Math.floor(totalMs % 60000 / 1000);
  var milliseconds = totalMs % 1000;
  return {
    hours, minutes, seconds, milliseconds
  };
}
here is the code I used
import math
import json
import datetime
def to_hms(s):
    """Format a whole number of seconds as ``HH:MM:SS``.

    Hours are zero-padded to two digits, as SRT timestamps require.
    """
    m, s = divmod(s, 60)
    h, m = divmod(m, 60)
    return '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s)
def _offset_to_ms(offset):
    """Convert an offset string such as ``'12.345s'`` or ``'7s'`` to integer milliseconds."""
    whole, _, fraction = offset.strip().rstrip('s').partition('.')
    # Right-pad / truncate the fraction so '1.5s' and '1.500000s' both yield 500 ms.
    return int(whole or 0) * 1000 + int((fraction + '000')[:3])


def _format_srt_time(total_ms):
    """Format integer milliseconds as an SRT timestamp ``HH:MM:SS,mmm``."""
    seconds, millis = divmod(total_ms, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:02d}:{:02d}:{:02d},{:03d}'.format(hours, minutes, seconds, millis)


def _srt_entry(index, start_ms, end_ms, text):
    """Build one numbered SRT entry, followed by the blank-line separator."""
    return '{}\n{} --> {}\n{}\n\n\n'.format(
        index, _format_srt_time(start_ms), _format_srt_time(end_ms), text)


def srt_generation(filepath, filename, chunk_seconds=3):
    """Generate two SRT files from a speech-transcription JSON file.

    Reads ``<filepath><filename>.json`` and takes the transcriptions from
    ``response.annotationResults[0].speechTranscriptions``. It writes:

    * ``<filepath><filename>.srt`` with one entry per transcription, timed
      from its first word's start to its last word's end;
    * ``<filepath><filename>_<chunk_seconds>_Sec_Custom.srt`` with the words
      regrouped into chunks of roughly ``chunk_seconds`` seconds.

    Requires Python 3. Output is written as UTF-8 and transcript text is kept
    as-is (non-ASCII characters are no longer dropped).

    Args:
        filepath: Directory prefix, including the trailing separator.
        filename: Base file name without extension.
        chunk_seconds: Target chunk duration in seconds for the second file.
    """
    with open('{}{}.json'.format(filepath, filename), 'r', encoding='utf-8') as file:
        payload = json.load(file)
    results = payload['response']['annotationResults'][0]['speechTranscriptions']

    lines = []
    wordlist = []
    for transcription in results:
        alternatives = transcription.get('alternatives') or [{}]
        alternative = alternatives[0]
        words = alternative.get('words') or []
        # Entries without a transcript or without word timings cannot be timed.
        if 'transcript' not in alternative or not words:
            continue
        lines.append(_srt_entry(
            len(lines) + 1,
            _offset_to_ms(words[0].get('startTime', '0s')),
            _offset_to_ms(words[-1].get('endTime', '0s')),
            alternative['transcript']))
        wordlist.extend(words)

    with open('{}{}.srt'.format(filepath, filename), 'w', encoding='utf-8') as srtfile:
        srtfile.writelines(lines)

    # Now regroup the individual words into chunks of about chunk_seconds each.
    chunk_lines = []
    chunk_words = []
    chunk_start = chunk_end = 0
    for word in wordlist:
        start_ms = _offset_to_ms(word.get('startTime', '0s'))
        end_ms = _offset_to_ms(word.get('endTime', '0s'))
        # The comparison uses whole seconds, matching the original chunk
        # boundaries; a word that would overrun the chunk starts a new one.
        if chunk_words and end_ms // 1000 - chunk_start // 1000 > chunk_seconds:
            chunk_lines.append(_srt_entry(
                len(chunk_lines) + 1, chunk_start, chunk_end, ' '.join(chunk_words)))
            chunk_words = []
        if not chunk_words:
            chunk_start = start_ms
        chunk_words.append(word['word'])
        chunk_end = end_ms
    if chunk_words:
        chunk_lines.append(_srt_entry(
            len(chunk_lines) + 1, chunk_start, chunk_end, ' '.join(chunk_words)))

    custom_path = '{}{}_{}_Sec_Custom.srt'.format(filepath, filename, chunk_seconds)
    with open(custom_path, 'w', encoding='utf-8') as srtfile:
        srtfile.writelines(chunk_lines)
Use this request parameter "enable_word_time_offsets: True" to get the time stamps for the word groups. Then create an srt programmatically.
If you require a *.vtt file, here is a snippet to convert the API response received from GCP speech-to-text client into a valid *.vtt. Some answers above are for *.srt so sharing this here.
// Run the recognition request; the response is the first element of the resolved array.
const client = new speech.SpeechClient();
const [response] = await client.recognize(request);
// createVTT returns the WebVTT text, so keep the result instead of discarding it.
const vtt = createVTT(response);
/**
 * Build a WebVTT document from a recognize response that contains word timings.
 *
 * Words from the first alternative of every result are grouped into cues of
 * `phraseLength` words; a final shorter cue holds any leftover words. Cue times
 * come from the `seconds` part of each word's start/end time.
 *
 * @param {object} response - Response whose `results[].alternatives[0].words`
 *   items carry `startTime`, `endTime` and `word`.
 * @param {number} [phraseLength=10] - Number of words per cue.
 * @returns {string} The WebVTT text (just the header when there are no words).
 */
function createVTT(response, phraseLength = 10) {
  // Collect the words of all results, not only the first one, so longer audio
  // that is returned as several results is subtitled completely.
  const wordsArray = [];
  const results = (response && response.results) || [];
  for (const result of results) {
    const alternative = result.alternatives && result.alternatives[0];
    for (const wordItem of (alternative && alternative.words) || []) {
      wordsArray.push(wordItem);
    }
  }

  const wordsPerCue = Math.max(1, phraseLength);
  let VTT = 'WEBVTT\n\n';
  let buffer = [];
  let startPointer = '00:00:00';
  let lastEnd = 0;

  // Append the buffered words as one cue ending at `end`, then reset the buffer.
  const emitCue = (end) => {
    const endPointer = secondsToFormat(end);
    VTT += `${startPointer + ' --> ' + endPointer}\n`;
    VTT += `${buffer.join(' ')}\n\n`;
    buffer = [];
  };

  wordsArray.forEach((wordItem) => {
    const { startTime, endTime, word } = wordItem;
    if (buffer.length === 0) {
      // first word of the phrase
      startPointer = secondsToFormat(startTime.seconds);
    }
    buffer.push(word);
    lastEnd = endTime.seconds;
    if (buffer.length >= wordsPerCue) {
      emitCue(lastEnd);
    }
  });

  if (buffer.length) {
    // handle the left over buffer items
    emitCue(lastEnd);
  }
  return VTT;
}
/**
 * Format a number of seconds as a WebVTT timestamp `HH:MM:SS.mmm`.
 *
 * @param {number|string} seconds - Seconds, possibly fractional or given as a
 *   numeric string; unparsable input is treated as 0.
 * @returns {string} The formatted timestamp.
 */
function secondsToFormat(seconds) {
  const totalMs = Math.round((Number(seconds) || 0) * 1000);
  const pad = (value, width) => String(value).padStart(width, '0');
  const timeHours = Math.floor(totalMs / 3600000);
  // Minutes must wrap at 60; otherwise 3700 s would print as 01:61:40.
  const timeMinutes = Math.floor((totalMs % 3600000) / 60000);
  const timeSeconds = Math.floor((totalMs % 60000) / 1000);
  const timeMillis = totalMs % 1000;
  return pad(timeHours, 2) + ':' + pad(timeMinutes, 2) + ':' +
    pad(timeSeconds, 2) + '.' + pad(timeMillis, 3);
}
Note: enableWordTimeOffsets: true must be set but that's already answered above. This answer is for people who want .vtt copy.
Hope this was helpful to someone :)
I am trying to design a parser using Ragel with C++ as the host language.
There is a particular case where a parameter can be defined in two formats :
a. Integer : eg. SignalValue = 24
b. Hexadecimal : eg. SignalValue = 0x18
I have the below code to parse such a parameter :
# Decimal literal: incr_Count is embedded on every transition over a digit,
# get_int is the leaving action, and int_error is attached as an error action
# (Ragel `>!`) -- NOTE(review): confirm the intended error-action scope.
INT = ((digit+)$incr_Count) %get_int >!(int_error); #[0-9]
# Hexadecimal literal: "0x" followed by one or more of 0-9 / upper-case A-F
# (lower-case a-f is not accepted by this character class).
HEX = (([0].'x'.[0-9A-F]+)$incr_Count) %get_hex >!(hex_error); #[hexadecimal]
# A signal value is either form; getSignalValue is attached with `%/`,
# Ragel's EOF-action operator for final states.
SIGNAL_VAL = ( INT | HEX ) %/getSignalValue;
However in the above defined parser command, only the integer values(as defined in section a) gets recognized and parsed correctly.
If a hexadecimal number (e.g. 0x24) is provided, the number gets stored as '0'. No error action is called for the hexadecimal number: the parser recognizes it, but the value stored is '0'.
I seem to be missing out some minor details with Ragel. Has anyone faced a similar situation?
The remaining part of the code:
// Global: holds the most recently parsed signal value; starts at -1 and is
// overwritten by the get_int / get_hex actions.
int lInt = -1;
action incr_Count {
    // One more character of the current token has been consumed.
    iGenrlCount += 1;
}
action get_int {
    // Copy the last iGenrlCount characters before p into a stream and parse
    // them as a decimal integer into lInt. The counter is left at zero.
    std::stringstream digits;
    for (; iGenrlCount > 0; iGenrlCount--)
    {
        digits << *(p - iGenrlCount);
    }
    digits >> lInt; //push the values
}
action get_hex {
// Intended to parse the last iGenrlCount characters before p as a hexadecimal
// number into lInt.
// NOTE(review): std::hex is applied while *inserting* single characters, where
// it has no effect; the later extraction into lInt is still decimal, so for
// input such as "0x24" it reads "0" and stops at 'x'. That matches the
// reported symptom of the value always being stored as 0.
std::stringstream str;
while(iGenrlCount > 0)
{
str << std::hex << *(p - iGenrlCount);
iGenrlCount--;
}
str >> lInt; //push the values
}
action getSignalValue {
    // Print the value currently held in lInt.
    cout << "lInt = ";
    cout << lInt << endl;
}
It's not a problem with your FSM (which looks fine for the task you have), it's more of a C++ coding issue. Try this implementation of get_hex():
action get_hex {
    // Use the stream purely as a text buffer for the last iGenrlCount
    // characters before p (e.g. "0x245") ...
    std::stringstream str;
    while(iGenrlCount > 0)
    {
        str << *(p - iGenrlCount);
        iGenrlCount--;
    }
    // ... and apply std::hex on extraction, so the text is parsed as base 16.
    str >> std::hex >> lInt; //push the values
}
Notice that it uses str just as a string buffer and applies std::hex to >> from std::stringstream to int. So in the end you get:
$ ./a.out 245
lInt = 245
$ ./a.out 0x245
lInt = 581
Which probably is what you want.
Is there a way to convert or create a new [[bracket style string]] based on an existing 'quote style string'?
s = "one\ntwo" -- how the string was created: "\n" here is a real newline character
s2 = [[one\ntwo]] -- what I want the new string to be: a literal backslash followed by "n"
Escaping the escape sequence seems to achieve the desired effect, at least in this case.
-- Replace every real newline in s with the two characters backslash + "n".
local escaped = string.gsub(s, "\n", "\\n")
s2 = escaped
> print(s2)
one\ntwo
One way is to make a table that has all the possible escape sequences:
-- Maps each special character to the escape sequence that denotes it. The
-- values are long-bracket literals, so the backslashes are kept verbatim.
local t = {
  ["\a"] = [[\a]],
  ["\b"] = [[\b]],
  ["\f"] = [[\f]],
  ["\n"] = [[\n]],
  ["\r"] = [[\r]],
  ["\t"] = [[\t]],
  ["\v"] = [[\v]], -- was a second "\r" entry; vertical tab was missing
  ["\\"] = [[\\]],
  -- Quotes need no escaping inside long brackets, so they map to themselves.
  ["\""] = [["]],
  ["\'"] = [[']],
}
-- gsub looks each character up in t; characters without an entry stay unchanged.
local s2 = s:gsub(".", t)
I'm trying to integrate with ybp.com, a vendor of proprietary software for managing book ordering workflows in large libraries. It keeps feeding me URLs that contain characters encoded with an extra "25" in them. Like this book title:
VOLATILE KNOWING%253a PARENTS%252c TEACHERS%252c AND THE CENSORED STORY OF ACCOUNTABILITY IN AMERICA%2527S PUBLIC SCHOOLS.
The encoded characters in this sample are as follows:
%253a = %3A = a colon
%252c = %2C = a comma
%2527 = %27 = an apostrophe (non-curly)
I need to convert these encodings to a format my internal apps can recognize, and the extra 25 is throwing things off kilter. The final two digits of the hex encoded characters appear to be identical to standard URL encodings, so a brute force method would be to replace "%25" with "%". But I'm leery of doing that because it would be sure to haunt me later when an actual %25 shows up for some reason.
So, what standard is this? Is there an official algorithm for converting values like this to other encodings?
%25 is actually a % character. My guess is that the external website is URLEncoding their output twice accidentally.
If that's the case, it is safe to replace %25 with % (or just URLDecode twice)
The ASCII code 37 (25 in hexadecimal) is %, so the URL encoding of % is %25.
It looks like your data got URL encoded twice: , -> %2C -> %252C
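Replacing every %25 with % should not cause any problems, as an actual %25 in the original text would have been double-encoded to %252525 (% -> %25 -> %2525 for the percent sign, followed by the unchanged "25").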
Create a counter that increments one by one for next two characters, and if you found modulus, you go back, assign the previous counter the '%' char and proceed again. Something like this.
char *str, *newstr; // Fill up with some memory before proceeding below..
....
int k = 0, j = 0;
short modulus = 0;
char first = 0, second = 0;
short proceed = 0;
for(k=0,j=0; k<some_size; j++,k++) {
if(str[k] == '%') {
++k; first = str[k];
++k; second = str[k];
proceed = 1;
} else if(modulus == 1) {
modulus = 0;
--j; first = str[k];
++k; second = str[k];
newstr[j] = '%';
proceed = 1;
} else proceed = 0; // Do not do decoding..
if(proceed == 1) {
if(first == '2' && second == '5') {
newstr[j] = '%';
modulus = 1;
......
So I've got a fairly deep hierarchy of record definitions:
%% Nested record hierarchy: enchilada > stamped_packet > packet > cat / mat.
%% Every field defaults to '_' except snarky_comment, which defaults to "".
-record(cat, {name = '_', attitude = '_'}).
-record(mat, {color = '_', fabric = '_'}).
-record(packet, {cat = '_', mat = '_'}).
-record(stamped_packet, {packet = '_', timestamp = '_'}).
-record(enchilada, {stamped_packet = '_', snarky_comment = ""}).
And now I've got an enchilada, and I want to make a new one that's
just like it except for the value of one of the subsubsubrecords.
Here's what I've been doing.
%% Return a copy of Ench0 whose innermost cat record has attitude = NewState.
%% Every intermediate record is unpacked and rebuilt explicitly.
update_attitude(Ench0, NewState)
  when is_record(Ench0, enchilada) ->
    %% Pick the old one apart.
    #enchilada{stamped_packet = SP0} = Ench0,
    #stamped_packet{packet = PK0} = SP0,
    %% Bind the cat as Tude0, the name the rebuild step below uses.
    #packet{cat = Tude0} = PK0,
    %% Build up the new one.
    Tude1 = Tude0#cat{attitude = NewState},
    PK1 = PK0#packet{cat = Tude1},
    SP1 = SP0#stamped_packet{packet = PK1},
    %% Thank God that's over.
    Ench0#enchilada{stamped_packet = SP1}.
Just thinking about this is painful. Is there a better way?
As Hynek suggests, you can elide the temporary variables and do:
%% Same update without temporary variables: the function head destructures all
%% three nesting levels (enchilada -> stamped_packet -> packet), and the body
%% rebuilds them around the modified cat.
update_attitude(E = #enchilada{stamped_packet =
                    (SP = #stamped_packet{packet = (P = #packet{cat = C})})},
                NewAttitude) ->
    E#enchilada{stamped_packet =
        SP#stamped_packet{packet = P#packet{cat = C#cat{attitude = NewAttitude}}}}.
Yariv Sadan got frustrated with the same issue and wrote Recless, a type inferring parse transform for records which would allow you to write:
%% Enables the third-party Recless parse transform for this module; it must be
%% available at compile time.
-compile({parse_transform, recless}).
%% With Recless, the nested field is addressed through a dotted path instead of
%% unpacking each record level by hand (syntax provided by the parse transform,
%% not by standard Erlang).
update_attitude(Enchilada = #enchilada{}, Attitude) ->
Enchilada.stamped_packet.packet.cat.attitude = Attitude.
Try this:
%% Return a copy of the enchilada whose innermost cat has attitude = NewState.
%% The head binds each nesting level; the body rebuilds them from the inside out.
update_attitude(Ench0 = #enchilada{stamped_packet =
                    (Stamped0 = #stamped_packet{packet =
                        (Packet0 = #packet{cat = Cat0})})},
                NewState) ->
    Cat1 = Cat0#cat{attitude = NewState},
    Packet1 = Packet0#packet{cat = Cat1},
    Stamped1 = Stamped0#stamped_packet{packet = Packet1},
    Ench0#enchilada{stamped_packet = Stamped1}.
Anyway, record structures are not the most powerful part of Erlang.