summaryrefslogtreecommitdiffstats
path: root/libhb/decssasub.c
blob: 2b350dc476190a76d8bfd0f7583ec7b1eb9248f6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
/* 
   This file is part of the HandBrake source code.
   Homepage: <http://handbrake.fr/>.
   It may be used under the terms of the GNU General Public License. */

/*
 * Converts SSA subtitles to UTF-8 subtitles with limited HTML-style markup (<b>, <i>, <u>).
 * 
 * SSA format references:
 *   http://www.matroska.org/technical/specs/subtitles/ssa.html
 *   http://moodub.free.fr/video/ass-specs.doc
 *   vlc-1.0.4/modules/codec/subtitles/subsass.c:ParseSSAString
 * 
 * @author David Foster (davidfstr)
 */

#include <stdlib.h>
#include <stdio.h>
#include "hb.h"

typedef enum {
    BOLD        = 0x01,
    ITALIC      = 0x02,
    UNDERLINE   = 0x04
} StyleSet;

// "<b></b>".len + "<i></i>".len + "<u></u>".len
#define MAX_OVERHEAD_PER_OVERRIDE (7 * 3)

#define SSA_2_HB_TIME(hr,min,sec,centi) \
    ( 90L * ( hr    * 1000L * 60 * 60 +\
              min   * 1000L * 60 +\
              sec   * 1000L +\
              centi * 10L ) )

static StyleSet ssa_parse_style_override( uint8_t *pos, StyleSet prevStyles )
{
    StyleSet nextStyles = prevStyles;
    for (;;)
    {
        // Skip over leading '{' or last '\\'
        pos++;
        
        // Scan for next \code
        while ( *pos != '\\' && *pos != '}' && *pos != '\0' ) pos++;
        if ( *pos != '\\' )
        {
            // End of style override block
            break;
        }
        
        // If next chars are \[biu][01], interpret it
        if ( strchr("biu", pos[1]) && strchr("01", pos[2]) )
        {
            StyleSet styleID =
                pos[1] == 'b' ? BOLD :
                pos[1] == 'i' ? ITALIC :
                pos[1] == 'u' ? UNDERLINE : 0;
            int enabled = (pos[2] == '1');
            
            if (enabled)
            {
                nextStyles |= styleID;
            }
            else
            {
                nextStyles &= ~styleID;
            }
        }
    }
    return nextStyles;
}

static void ssa_append_html_tags_for_style_change(
    uint8_t **dst, StyleSet prevStyles, StyleSet nextStyles )
{
    #define APPEND(str) { \
        char *src = str; \
        while (*src) { *(*dst)++ = *src++; } \
    }

    // Reverse-order close all previous styles
    if (prevStyles & UNDERLINE) APPEND("</u>");
    if (prevStyles & ITALIC)    APPEND("</i>");
    if (prevStyles & BOLD)      APPEND("</b>");
    
    // Forward-order open all next styles
    if (nextStyles & BOLD)      APPEND("<b>");
    if (nextStyles & ITALIC)    APPEND("<i>");
    if (nextStyles & UNDERLINE) APPEND("<u>");
    
    #undef APPEND
}

static hb_buffer_t *ssa_decode_to_utf8_line( uint8_t *in_data, int in_size );

/*
 * SSA packet format:
 * ( Dialogue: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text CR LF ) +
 *             1      2     3   4     5    6       7       8       9      10
 */
static hb_buffer_t *ssa_decode_to_utf8( hb_buffer_t *in )
{
    // Store NULL after the end of the buffer to make using string processing safe
    hb_buffer_realloc( in, in->size + 1 );
    in->data[in->size] = '\0';
    
    hb_buffer_t *out_list = NULL;
    hb_buffer_t **nextPtr = &out_list;
    
    const char *EOL = "\r\n";
    char *curLine, *curLine_parserData;
    for ( curLine = strtok_r( (char *) in->data, EOL, &curLine_parserData );
          curLine;
          curLine = strtok_r( NULL, EOL, &curLine_parserData ) )
    {
        // Skip empty lines and spaces between adjacent CR and LF
        if (curLine[0] == '\0')
            continue;
        
        // Decode an individual SSA line
        hb_buffer_t *out = ssa_decode_to_utf8_line( (uint8_t*)curLine, strlen( curLine ) );
        
        // We shouldn't be storing the extra NULL character,
        // but the MP4 muxer expects this, unfortunately.
        if ( out->size > 0 && out->data[out->size - 1] != '\0' ) {
            // NOTE: out->size remains unchanged
            hb_buffer_realloc( out, out->size + 1 );
            out->data[out->size] = '\0';
        }
        
        // If the input packet was non-empty, do not pass through
        // an empty output packet (even if the subtitle was empty),
        // as this would be interpreted as an end-of-stream
        if ( in->size > 0 && out->size == 0 ) {
            hb_buffer_close(&out);
            continue;
        }
        
        // Append 'out' to 'out_list'
        *nextPtr = out;
        nextPtr = &out->next;
    }
    
    return out_list;
}

/*
 * SSA line format:
 *   Dialogue: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text '\0'
 *             1      2     3   4     5    6       7       8       9      10
 */
static hb_buffer_t *ssa_decode_to_utf8_line( uint8_t *in_data, int in_size )
{
    uint8_t *pos = in_data;
    uint8_t *end = in_data + in_size;
    
    /*
     * Parse Start and End fields for timing information
     */
    int start_hr, start_min, start_sec, start_centi;
    int   end_hr,   end_min,   end_sec,   end_centi;
    int numPartsRead = sscanf( (char *) in_data, "%*128[^,],"
        "%d:%d:%d.%d,"  // Start
        "%d:%d:%d.%d,", // End
        &start_hr, &start_min, &start_sec, &start_centi,
          &end_hr,   &end_min,   &end_sec,   &end_centi );
    if ( numPartsRead != 8 )
        goto fail;
    
    int64_t in_start = SSA_2_HB_TIME(start_hr, start_min, start_sec, start_centi);
    int64_t in_stop  = SSA_2_HB_TIME(  end_hr,   end_min,   end_sec,   end_centi);
    
    /*
     * Advance 'pos' to the beginning of the Text field
     */
    int curFieldID = 1;
    while (pos < end)
    {
        if ( *pos++ == ',' )
        {
            curFieldID++;
            if ( curFieldID == 10 ) // Text
                break;
        }
    }
    if ( curFieldID != 10 )
        goto fail;
    uint8_t *textFieldPos = pos;
    
    // Count the number of style overrides in the Text field
    int numStyleOverrides = 0;
    while ( pos < end )
    {
        if (*pos++ == '{')
        {
            numStyleOverrides++;
        }
    }
    
    int maxOutputSize = (end - textFieldPos) + ((numStyleOverrides + 1) * MAX_OVERHEAD_PER_OVERRIDE);
    hb_buffer_t *out = hb_buffer_init( maxOutputSize );
    if ( out == NULL )
        return NULL;
    
    /*
     * The Text field contains plain text marked up with:
     * (1) '\n' -> space
     * (2) '\N' -> newline
     * (3) curly-brace control codes like '{\k44}' -> empty (strip them)
     * 
     * Perform the above conversions and copy it to the output packet
     */
    StyleSet prevStyles = 0;
    uint8_t *dst = out->data;
    pos = textFieldPos;
    while ( pos < end )
    {
        if ( pos[0] == '\\' && pos[1] == 'n' )
        {
            *dst++ = ' ';
            pos += 2;
        }
        else if ( pos[0] == '\\' && pos[1] == 'N' )
        {
            *dst++ = '\n';
            pos += 2;
        }
        else if ( pos[0] == '{' )
        {
            // Parse SSA style overrides and append appropriate HTML style tags
            StyleSet nextStyles = ssa_parse_style_override( pos, prevStyles );
            ssa_append_html_tags_for_style_change( &dst, prevStyles, nextStyles );
            prevStyles = nextStyles;
            
            // Skip past SSA control code
            while ( pos < end && *pos != '}' ) pos++;
            if    ( pos < end && *pos == '}' ) pos++;
        }
        else
        {
            // Copy raw character
            *dst++ = *pos++;
        }
    }
    
    // Append closing HTML style tags
    ssa_append_html_tags_for_style_change( &dst, prevStyles, 0 );
    
    // Trim output buffer to the actual amount of data written
    out->size = dst - out->data;
    
    // Copy metadata from the input packet to the output packet
    out->start = in_start;
    out->stop = in_stop;
    
    return out;
    
fail:
    hb_log( "decssasub: malformed SSA subtitle packet: %.*s\n", in_size, in_data );
    return NULL;
}

static int decssaInit( hb_work_object_t * w, hb_job_t * job )
{
    return 0;
}

static int decssaWork( hb_work_object_t * w, hb_buffer_t ** buf_in,
                        hb_buffer_t ** buf_out )
{
    hb_buffer_t * in = *buf_in;
    hb_buffer_t * out_list = NULL;
    
    if ( in->size > 0 ) {
        out_list = ssa_decode_to_utf8(in);
    } else {
        out_list = hb_buffer_init( 0 );
    }
    
    // Dispose the input packet, as it is no longer needed
    hb_buffer_close(&in);
    
    *buf_in = NULL;
    *buf_out = out_list;
    return HB_WORK_OK;
}

static void decssaClose( hb_work_object_t * w )
{
    // nothing
}

hb_work_object_t hb_decssasub =
{
    WORK_DECSSASUB,
    "SSA Subtitle Decoder",
    decssaInit,
    decssaWork,
    decssaClose
};