Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
input.c
1/*************************************************************************/
2/* */
3/* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
4/* University of Edinburgh. */
5/* */
6/* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */
7/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
8/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
9/* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
10/* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */
11/* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
12/* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
13/* */
14/*************************************************************************/
15/*
16 * This code is in a distressed state due to hackery for windoze.
17 * See comment in url.c.
18 */
19
20#include <stdio.h>
21#include <stdlib.h>
22#include <assert.h>
23
24#ifdef FOR_LT
25
26#include "lt-memory.h"
27#include "nsllib.h"
28
29#define ERR(m) LT_ERROR(NECHAR,m)
30#define ERR1(m,x) LT_ERROR1(NECHAR,m,x)
31#define ERR2(m,x,y) LT_ERROR2(NECHAR,m,x,y)
32#define ERR3(m,x,y,z) LT_ERROR3(NECHAR,m,x,y,z)
33
34#define Malloc salloc
35#define Realloc srealloc
36#define Free sfree
37
38#else
39
40#include "system.h"
41#define ERR(m) fprintf(stderr,m)
42#define ERR1(m,x) fprintf(stderr,m,x)
43#define ERR2(m,x,y) fprintf(stderr,m,x,y)
44#define ERR3(m,x,y,z) fprintf(stderr,m,x,y,z)
45
46#endif
47
48#include "charset.h"
49#include "string16.h"
50#include "dtd.h"
51#include "input.h"
52#include "url.h"
53#include "ctype16.h"
54
55static int get_translated_line1(InputSource s);
56
57InputSource SourceFromStream(const char8 *description, FILE *file)
58{
59 Entity e;
60
61 e = NewExternalEntity(0, 0, description, 0, 0);
62 if(!strchr8(description, '/'))
63 EntitySetBaseURL(e, default_base_url());
64
65 return NewInputSource(e, MakeFILE16FromFILE(file, "r"));
66}
67
68InputSource EntityOpen(Entity e)
69{
70 FILE16 *f16;
71
72 if(e->type == ET_external)
73 {
74 const char8 *url = EntityURL(e);
75
76 if(!url || !(f16 = url_open(url, 0, "r", 0)))
77 return 0;
78 }
79 else
80 {
81 f16 = MakeFILE16FromString((char *)e->text, -1, "r");
82 }
83
84 return NewInputSource(e, f16);
85}
86
87
88InputSource NewInputSource(Entity e, FILE16 *f16)
89{
90 InputSource source;
91
92 if(!(source = Malloc(sizeof(*source))))
93 return 0;
94
95 source->line = 0;
96 source->line_alloc = 0;
97 source->line_length = 0;
98 source->next = 0;
99 source->seen_eoe = 0;
100
101 source->entity = e;
102
103 source->file16 = f16;
104
105 source->bytes_consumed = 0;
106 source->bytes_before_current_line = 0;
107 source->line_end_was_cr = 0;
108 source->line_number = 0;
109 source->not_read_yet = 1;
110
111 source->nextin = source->insize = 0;
112
113 source->parent = 0;
114
115 return source;
116}
117
118int SourceLineAndChar(InputSource s, int *linenum, int *charnum)
119{
120 Entity e = s->entity, f = e->parent;
121
122 if(e->type == ET_external)
123 {
124 *linenum = s->line_number;
125 *charnum = s->next;
126 return 1;
127 }
128
129 if(f && f->type == ET_external)
130 {
131 if(e->matches_parent_text)
132 {
133 *linenum = e->line_offset + s->line_number;
134 *charnum = (s->line_number == 0 ? e->line1_char_offset : 0) +
135 s->next;
136 return 1;
137 }
138 else
139 {
140 *linenum = e->line_offset;
141 *charnum = e->line1_char_offset;
142 return 0;
143 }
144 }
145
146 if(f && f->matches_parent_text)
147 {
148 *linenum = f->line_offset + e->line_offset;
149 *charnum = (e->line_offset == 0 ? f->line1_char_offset : 0) +
150 e->line1_char_offset;
151 return 0;
152 }
153
154 return -1;
155}
156
157void SourcePosition(InputSource s, Entity *entity, int *byte_offset)
158{
159 *entity = s->entity;
160 *byte_offset = SourceTell(s);
161}
162
163int SourceTell(InputSource s)
164{
165#if CHAR_SIZE == 8
166 return s->bytes_before_current_line + s->next;
167#else
168 switch(s->entity->encoding)
169 {
170 case CE_ISO_10646_UCS_2B:
171 case CE_UTF_16B:
172 case CE_ISO_10646_UCS_2L:
173 case CE_UTF_16L:
174 return s->bytes_before_current_line + 2 * s->next;
175 case CE_ISO_8859_1:
176 case CE_ISO_8859_2:
177 case CE_ISO_8859_3:
178 case CE_ISO_8859_4:
179 case CE_ISO_8859_5:
180 case CE_ISO_8859_6:
181 case CE_ISO_8859_7:
182 case CE_ISO_8859_8:
183 case CE_ISO_8859_9:
184 case CE_unspecified_ascii_superset:
185 return s->bytes_before_current_line + s->next;
186 case CE_UTF_8:
187 if(s->complicated_utf8_line)
188 {
189 /* examine earlier chars in line to see how many bytes they used */
190 int i, c, n=0;
191 for(i = 0; i < s->next; i++)
192 {
193 c = s->line[i];
194 if(c <= 0x7f)
195 n += 1;
196 else if(c <= 0x7ff)
197 n += 2;
198 else if(c >= 0xd800 && c <= 0xdfff)
199 /* One of a surrogate pair, count 2 each */
200 n += 2;
201 else if(c <= 0xffff)
202 n += 3;
203 else if(c <= 0x1ffff)
204 n += 4;
205 else if(c <= 0x3ffffff)
206 n += 5;
207 else
208 n += 6;
209
210 }
211 return s->bytes_before_current_line + n;
212 }
213 else
214 return s->bytes_before_current_line + s->next;
215 default:
216 return -1;
217 }
218#endif
219}
220
221int SourceSeek(InputSource s, int offset)
222{
223 s->line_length = 0;
224 s->next = 0;
225 s->seen_eoe = 0;
226 s->bytes_consumed = s->bytes_before_current_line = offset;
227 s->nextin = s->insize = 0;
228 /* XXX line number will be wrong! */
229 s->line_number = -999999;
230 return Fseek(s->file16, offset, SEEK_SET);
231}
232
233static int get_translated_line(InputSource s)
234{
235 /* This is a hack, pending some reorganisation */
236
237 struct _FILE16 {
238 void *handle;
239 int handle2, handle3;
240 /* we don't need the rest here */
241 };
242
243 Entity e = s->entity;
244 Char *p;
245 struct _FILE16 *f16 = (struct _FILE16 *)s->file16;
246
247
248 if(e->type == ET_external)
249 return get_translated_line1(s);
250
251 if(!*(Char *)((char *)f16->handle + f16->handle2))
252 {
253 s->line_length = 0;
254 return 0;
255 }
256
257 s->line = (Char *)((char *)f16->handle + f16->handle2);
258 for(p=s->line; *p && *p != '\n'; p++)
259 ;
260 if(*p)
261 p++;
262 f16->handle2 = (char *)p - (char *)f16->handle;
263 s->line_length = p - s->line;
264
265 s->bytes_before_current_line = f16->handle2;
266
267 return 0;
268}
269
270static int get_translated_line1(InputSource s)
271{
272 unsigned int c; /* can't use Char, it might be >0x10000 */
273 unsigned char *inbuf = s->inbuf;
274 int nextin = s->nextin, insize = s->insize;
275 int startin = s->nextin;
276 Char *outbuf = s->line;
277 int outsize = s->line_alloc;
278 int nextout = 0;
279 int remaining = 0;
280 int ignore_linefeed = s->line_end_was_cr;
281
282#if CHAR_SIZE == 16
283
284 int *to_unicode = 0; /* initialize to shut gcc up */
285 CharacterEncoding enc = s->entity->encoding;
286 int more, i;
287 s->complicated_utf8_line = 0;
288
289 if(enc >= CE_ISO_8859_2 && enc <= CE_ISO_8859_9)
290 to_unicode = iso_to_unicode[enc - CE_ISO_8859_2];
291
292#endif
293
294 s->line_end_was_cr = 0;
295 s->bytes_before_current_line = s->bytes_consumed;
296
297 while(1)
298 {
299 /* There are never more characters than bytes in the input */
300 if(outsize < nextout + (insize - nextin))
301 {
302 outsize = nextout + (insize - nextin);
303 outbuf = Realloc(outbuf, outsize * sizeof(Char));
304 }
305
306 while(nextin < insize)
307 {
308#if CHAR_SIZE == 8
309 c = inbuf[nextin++];
310#else
311 switch(enc)
312 {
313 case CE_ISO_10646_UCS_2B:
314 case CE_UTF_16B:
315 if(nextin+2 > insize)
316 goto more_bytes;
317 c = (inbuf[nextin] << 8) + inbuf[nextin+1];
318 nextin += 2;
319 break;
320 case CE_ISO_10646_UCS_2L:
321 case CE_UTF_16L:
322 if(nextin+2 > insize)
323 goto more_bytes;
324 c = (inbuf[nextin+1] << 8) + inbuf[nextin];
325 nextin += 2;
326 break;
327 case CE_ISO_8859_1:
328 case CE_unspecified_ascii_superset:
329 c = inbuf[nextin++];
330 break;
331 case CE_ISO_8859_2:
332 case CE_ISO_8859_3:
333 case CE_ISO_8859_4:
334 case CE_ISO_8859_5:
335 case CE_ISO_8859_6:
336 case CE_ISO_8859_7:
337 case CE_ISO_8859_8:
338 case CE_ISO_8859_9:
339 c = to_unicode[inbuf[nextin++]];
340 if(c == (unsigned int)-1)
341 ERR3("Illegal %s character <0x%x> "
342 "at file offset %d\n",
343 CharacterEncodingName[enc], inbuf[nextin-1],
344 s->bytes_consumed + nextin - 1 - startin);
345 break;
346 case CE_UTF_8:
347 c = inbuf[nextin++];
348 if(c <= 0x7f)
349 break;
350 if(c <= 0xc0 || c >= 0xfe)
351 {
352 ERR2("Illegal UTF-8 start byte <0x%x> "
353 "at file offset %d\n",
354 c, s->bytes_consumed + nextin - 1 - startin);
355 return -1;
356 }
357 if(c <= 0xdf)
358 {
359 c &= 0x1f;
360 more = 1;
361 }
362 else if(c <= 0xef)
363 {
364 c &= 0x0f;
365 more = 2;
366 }
367 else if(c <= 0xf7)
368 {
369 c &= 0x07;
370 more = 3;
371 }
372 else if(c <= 0xfb)
373 {
374 c &= 0x03;
375 more = 4;
376 }
377 else
378 {
379 c &= 0x01;
380 more = 5;
381 }
382 if(nextin+more > insize)
383 {
384 nextin--;
385 goto more_bytes;
386 }
387 s->complicated_utf8_line = 1;
388 for(i=0; i<more; i++)
389 c = (c << 6) + (inbuf[nextin++] & 0x3f);
390 break;
391 default:
392 ERR("read from entity with unsupported encoding!\n");
393 return -1;
394 }
395
396 if(c > 0x110000 || (c < 0x10000 && !is_xml_legal(c)))
397 if(!(enc == CE_UTF_16L || enc == CE_UTF_16B) ||
398 c < 0xd800 || c > 0xdfff)
399 /* We treat the surrogates as legal because we didn't
400 combine them when translating from UTF-16. XXX */
401 {
402 ERR2("Error: illegal character <0x%x> "
403 "immediately before file offset %d\n",
404 c, s->bytes_consumed + nextin - startin);
405 return -1;
406 }
407#endif
408 if(c == '\n' && ignore_linefeed)
409 {
410 /* Ignore lf at start of line if last line ended with cr */
411 ignore_linefeed = 0;
412 s->bytes_before_current_line += (nextin - startin);
413 }
414 else
415 {
416 ignore_linefeed = 0;
417 if(c == '\r')
418 {
419 s->line_end_was_cr = 1;
420 c = '\n';
421 }
422
423#if CHAR_SIZE == 16
424 if(c >= 0x10000)
425 {
426 /* Use surrogates */
427 outbuf[nextout++] = ((c - 0x10000) >> 10) + 0xd800;
428 outbuf[nextout++] = ((c - 0x10000) & 0x3ff) + 0xdc00;
429 }
430 else
431 outbuf[nextout++] = c;
432#else
433 outbuf[nextout++] = c;
434#endif
435
436 if(c == '\n')
437 {
438 s->nextin = nextin;
439 s->insize = insize;
440 s->bytes_consumed += (nextin - startin);
441 s->line = outbuf;
442 s->line_alloc = outsize;
443 s->line_length = nextout;
444 return 0;
445 }
446 }
447 }
448
449#if CHAR_SIZE == 16
451 /* Copy down any partial character */
452
453 remaining = insize - nextin;
454 for(i=0; i<remaining; i++)
455 inbuf[i] = inbuf[nextin + i];
456#endif
457
458 /* Get another block */
459
460 s->bytes_consumed += (nextin - startin);
461
462 insize = Readu(s->file16,
463 inbuf+insize-nextin, sizeof(s->inbuf)-remaining);
464 nextin = startin = 0;
465
466 if(insize <= 0)
467 {
468 s->nextin = nextin;
469 s->insize = 0;
470 s->line = outbuf;
471 s->line_alloc = outsize;
472 s->line_length = nextout;
473 return insize;
474 }
475
476 insize += remaining;
477 }
478}
479
480void determine_character_encoding(InputSource s)
481{
482 Entity e = s->entity;
483 int nread;
484 unsigned char *b = (unsigned char *)s->inbuf;
485
486 b[0] = b[1] = b[2] = b[3] = 0;
487
488 while(s->insize < 4)
489 {
490 nread = Readu(s->file16, s->inbuf + s->insize, 4 - s->insize);
491 if(nread == -1)
492 return;
493 if(nread == 0)
494 break;
495 s->insize += nread;
496 }
497
498#if 0
499 if(b[0] == 0 && b[1] == 0 && b[2] == 0 && b[3] == '<')
500 e->encoding = CE_ISO_10646_UCS_4B;
501 else if(b[0] == '<' && b[1] == 0 && b[2] == 0 && b[3] == 0)
502 e->encoding = CE_ISO_10646_UCS_4L;
503 else
504#endif
505 if(b[0] == 0xfe && b[1] == 0xff)
506 {
507 e->encoding = CE_UTF_16B;
508 s->nextin = 2;
509 }
510 else if(b[0] == 0 && b[1] == '<' && b[2] == 0 && b[3] == '?')
511 e->encoding = CE_UTF_16B;
512 else if(b[0] == 0xff && b[1] == 0xfe)
513 {
514 e->encoding = CE_UTF_16L;
515 s->nextin = 2;
516 }
517 else if(b[0] == '<' && b[1] == 0 && b[2] == '?' && b[3] == 0)
518 e->encoding = CE_UTF_16L;
519 else
520 {
521#if CHAR_SIZE == 8
522 e->encoding = CE_unspecified_ascii_superset;
523#else
524 e->encoding = CE_UTF_8;
525#endif
526 }
527}
528
529int get_with_fill(InputSource s)
530{
531 assert(!s->seen_eoe);
532
533 if(get_translated_line(s) != 0)
534 {
535 /* It would be nice to pass this up to the parser, but we don't
536 know anything about parsers here! */
537 ERR1("I/O error on stream <%s>, ignore further errors\n",
538 EntityDescription(s->entity));
539
540 /* Restore old line and return EOE (is this the best thing to do?) */
541 s->line_length = s->next;
542 s->seen_eoe = 1;
543 return XEOE;
544 }
545
546 if(s->line_length == 0)
547 {
548 /* Restore old line */
549 s->line_length = s->next;
550 s->seen_eoe = 1;
551 return XEOE;
552 }
553
554 s->next = 0;
555
556 if(s->not_read_yet)
557 s->not_read_yet = 0;
558 else
559 s->line_number++;
560
561 return s->line[s->next++];
562}
Definition dtd.h:71