diff options
author | jstebbins <[email protected]> | 2009-10-03 16:51:58 +0000 |
---|---|---|
committer | jstebbins <[email protected]> | 2009-10-03 16:51:58 +0000 |
commit | 3f0772a709a839258eee5b4a8fcf3f5bb830b81a (patch) | |
tree | 9f2debb517c5395037faf1be03eda5851f56c683 /libhb/decsrtsub.c | |
parent | d3a040aafa87d4f56dc6e6b2531afe407432f0b2 (diff) |
fix reading of UTF-16 SRT subtitle files (and other wide charsets)
The entire file is encoded in the chosen charset, not just the subtitle text.
So we must read and convert the text prior to any parsing. Using fgets() to
read a line doesn't work because wide charsets can have 0x0a in the high byte
of some char.
git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@2863 b64f7644-9d1e-0410-96f1-a4d463321fa5
Diffstat (limited to 'libhb/decsrtsub.c')
-rw-r--r-- | libhb/decsrtsub.c | 151 |
1 files changed, 119 insertions, 32 deletions
```diff
diff --git a/libhb/decsrtsub.c b/libhb/decsrtsub.c
index 5c99218e5..bf35c9e45 100644
--- a/libhb/decsrtsub.c
+++ b/libhb/decsrtsub.c
@@ -25,6 +25,7 @@ typedef struct srt_entry_s {
     long offset, duration;
     long start, stop;
     char text[1024];
+    int pos;
 } srt_entry_t;
 
 /*
@@ -32,8 +33,14 @@ typedef struct srt_entry_s {
  */
 struct hb_work_private_s
 {
-    hb_job_t *job;
-    FILE *file;
+    hb_job_t * job;
+    FILE * file;
+    char buf[1024];
+    int pos;
+    int end;
+    char utf8_buf[2048];
+    int utf8_pos;
+    int utf8_end;
     unsigned long current_time;
     unsigned long number_of_entries;
     unsigned long current_state;
@@ -60,12 +67,107 @@ static struct start_and_end read_time_from_string( const char* timeString )
     return result;
 }
 
+static int utf8_fill( hb_work_private_t * pv )
+{
+    int bytes, conversion = 0;
+    size_t out_size;
+
+    /* Align utf8 data to beginning of the buffer so that we can
+     * fill the buffer to its maximum */
+    memmove( pv->utf8_buf, pv->utf8_buf + pv->utf8_pos, pv->utf8_end - pv->utf8_pos );
+    pv->utf8_end -= pv->utf8_pos;
+    pv->utf8_pos = 0;
+    out_size = 2048 - pv->utf8_end;
+    while( out_size )
+    {
+        char *p, *q;
+        size_t in_size, retval;
+
+        if( pv->end == pv->pos )
+        {
+            bytes = fread( pv->buf, 1, 1024, pv->file );
+            pv->pos = 0;
+            pv->end = bytes;
+            if( bytes == 0 )
+                return 0;
+        }
+
+        p = pv->buf + pv->pos;
+        q = pv->utf8_buf + pv->utf8_end;
+        in_size = pv->end - pv->pos;
+
+        retval = iconv( pv->iconv_context, &p, &in_size, &q, &out_size);
+        if( q != pv->utf8_buf + pv->utf8_pos )
+            conversion = 1;
+
+        pv->utf8_end = q - pv->utf8_buf;
+        pv->pos = p - pv->buf;
+
+        if( ( retval == -1 ) && ( errno == EINVAL ) )
+        {
+            /* Incomplete multibyte sequence, read more data */
+            memmove( pv->buf, p, pv->end - pv->pos );
+            pv->end -= pv->pos;
+            pv->pos = 0;
+            bytes = fread( pv->buf + pv->end, 1, 1024 - pv->end, pv->file );
+            if( bytes == 0 )
+            {
+                if( !conversion )
+                    return 0;
+                else
+                    return 1;
+            }
+            pv->end += bytes;
+        } else if ( ( retval == -1 ) && ( errno == EILSEQ ) )
+        {
+            hb_error( "Invalid byte for codeset in input, discard byte" );
+            /* Try the next byte of the input */
+            pv->pos++;
+        } else if ( ( retval == -1 ) && ( errno == E2BIG ) )
+        {
+            /* buffer full */
+            return conversion;
+        }
+    }
+    return 1;
+}
+
+static int get_line( hb_work_private_t * pv, char *buf, int size )
+{
+    int i;
+    char c;
+
+    /* Find newline in converted UTF-8 buffer */
+    for( i = 0; i < size - 1; i++ )
+    {
+        if( pv->utf8_pos >= pv->utf8_end )
+        {
+            if( !utf8_fill( pv ) )
+            {
+                if( i )
+                    return 1;
+                else
+                    return 0;
+            }
+        }
+        c = pv->utf8_buf[pv->utf8_pos++];
+        if( c == '\n' )
+        {
+            buf[i] = '\n';
+            buf[i+1] = '\0';
+            return 1;
+        }
+        buf[i] = c;
+    }
+    buf[0] = '\0';
+    return 1;
+}
+
 /*
  * Read the SRT file and put the entries into the subtitle fifo for all to read
  */
 static hb_buffer_t *srt_read( hb_work_private_t *pv )
 {
-
     char line_buffer[1024];
 
     if( !pv->file )
@@ -73,7 +175,7 @@ static hb_buffer_t *srt_read( hb_work_private_t *pv )
         return NULL;
     }
 
-    while( fgets( line_buffer, sizeof( line_buffer ), pv->file ) )
+    while( get_line( pv, line_buffer, sizeof( line_buffer ) ) )
     {
         switch (pv->current_state)
         {
@@ -94,10 +196,8 @@ static hb_buffer_t *srt_read( hb_work_private_t *pv )
 
         case k_state_inEntry:
         {
-            char *p, *q;
-            size_t in_size;
-            size_t out_size;
-            size_t retval;
+            char *q;
+            int size, len;
 
             // If the current line is empty, we assume this is the
             // seperation betwene two entries. In case we are wrong,
@@ -107,28 +207,12 @@ static hb_buffer_t *srt_read( hb_work_private_t *pv )
                 continue;
             }
 
-
-            for( q = pv->current_entry.text; (q < pv->current_entry.text+1024) && *q; q++);
-
-            p = line_buffer;
-
-            in_size = strlen(line_buffer);
-            out_size = (pv->current_entry.text+1024) - q;
-
-            retval = iconv( pv->iconv_context, &p, &in_size, &q, &out_size);
-            *q = '\0';
-
-            if( ( retval == -1 ) && ( errno == EINVAL ) )
-            {
-                hb_error( "Invalid shift sequence" );
-            } else if ( ( retval == -1 ) && ( errno == EILSEQ ) )
-            {
-                hb_error( "Invalid byte for codeset in input, %"PRId64" bytes discarded", (int64_t)in_size);
-            } else if ( ( retval == -1 ) && ( errno == E2BIG ) )
-            {
-                hb_error( "Not enough space in output buffer");
-            }
-
+            q = pv->current_entry.text + pv->current_entry.pos;
+            len = strlen( line_buffer );
+            size = MIN(1024 - pv->current_entry.pos - 1, len );
+            memcpy(q, line_buffer, size);
+            pv->current_entry.pos += size;
+            pv->current_entry.text[pv->current_entry.pos] = '\0';
             break;
         }
 
@@ -140,7 +224,8 @@ static hb_buffer_t *srt_read( hb_work_private_t *pv )
             /*
              * Is this really new next entry begin?
              */
-            if (potential_entry_number == pv->number_of_entries + 1) {
+            if (potential_entry_number == pv->number_of_entries + 1)
+            {
                 /*
                  * We found the next entry - or a really rare error condition
                  */
@@ -190,7 +275,9 @@ static hb_buffer_t *srt_read( hb_work_private_t *pv )
                     return buffer;
                 }
                 continue;
-            } else {
+            }
+            else
+            {
                 /*
                  * Well.. looks like we are in the wrong mode.. lets add the
                  * newline we misinterpreted...
```