00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #include "file.h"
00040 #include "magic.h"
00041 #include <stdio.h>
00042 #include <string.h>
00043 #include <memory.h>
00044 #include <ctype.h>
00045 #include <stdlib.h>
00046 #ifdef HAVE_UNISTD_H
00047 #include <unistd.h>
00048 #endif
00049 #include "names.h"
00050
00051 #ifndef lint
00052 FILE_RCSID("@(#)$Id: ascmagic.c,v 1.43 2005/06/25 15:52:14 christos Exp $")
00053 #endif
00054
/* Holds one decoded character produced by the looks_*() decoders below. */
typedef unsigned long unichar;

/* A line with more characters than this is reported as "very long". */
#define MAXLINELEN 300
/*
 * True for space, tab, CR, LF, 0x85 (counted as NEL by file_ascmagic)
 * and form feed.  Note that the argument is evaluated several times.
 */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		|| (x) == 0x85 || (x) == '\f')

/*
 * Character-set probes: each returns non-zero when buf looks like the
 * named encoding and stores the decoded characters in ubuf / *ulen.
 */
private int looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
;
private int looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
;
private int looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
;
private int looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
;
private int looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
;
/* Translates nbytes bytes of buf through the EBCDIC table into out. */
private void from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
;
/* Compares a NUL-terminated byte string with ulen decoded characters. */
private int ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
;
00075
00076
00077 protected int
00078 file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
00079 {
00080 size_t i;
00081 unsigned char *nbuf = NULL;
00082 unichar *ubuf = NULL;
00083 size_t ulen;
00084 struct names *p;
00085 int rv = -1;
00086
00087 const char *code = NULL;
00088 const char *code_mime = NULL;
00089 const char *type = NULL;
00090 const char *subtype = NULL;
00091 const char *subtype_mime = NULL;
00092
00093 int has_escapes = 0;
00094 int has_backspace = 0;
00095 int seen_cr = 0;
00096
00097 int n_crlf = 0;
00098 int n_lf = 0;
00099 int n_cr = 0;
00100 int n_nel = 0;
00101
00102 int last_line_end = -1;
00103 int has_long_lines = 0;
00104
00105
00106
00107
00108
00109 while (nbytes > 1 && buf[nbytes - 1] == '\0')
00110 nbytes--;
00111
00112 if ((nbuf = malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
00113 goto done;
00114 if ((ubuf = malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
00115 goto done;
00116
00117
00118
00119
00120
00121
00122
00123 if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
00124 code = "ASCII";
00125 code_mime = "us-ascii";
00126 type = "text";
00127 } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
00128 code = "UTF-8 Unicode";
00129 code_mime = "utf-8";
00130 type = "text";
00131 } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
00132 if (i == 1)
00133 code = "Little-endian UTF-16 Unicode";
00134 else
00135 code = "Big-endian UTF-16 Unicode";
00136
00137 type = "character data";
00138 code_mime = "utf-16";
00139 } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
00140 code = "ISO-8859";
00141 type = "text";
00142 code_mime = "iso-8859-1";
00143 } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
00144 code = "Non-ISO extended-ASCII";
00145 type = "text";
00146 code_mime = "unknown";
00147 } else {
00148 from_ebcdic(buf, nbytes, nbuf);
00149
00150 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
00151 code = "EBCDIC";
00152 type = "character data";
00153 code_mime = "ebcdic";
00154 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
00155 code = "International EBCDIC";
00156 type = "character data";
00157 code_mime = "ebcdic";
00158 } else {
00159 rv = 0;
00160 goto done;
00161 }
00162 }
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172 if (*ubuf == '.') {
00173 unichar *tp = ubuf + 1;
00174
00175 while (ISSPC(*tp))
00176 ++tp;
00177 if ((tp[0] == '\\' && tp[1] == '\"') ||
00178 (isascii((unsigned char)tp[0]) &&
00179 isalnum((unsigned char)tp[0]) &&
00180 isascii((unsigned char)tp[1]) &&
00181 isalnum((unsigned char)tp[1]) &&
00182 ISSPC(tp[2]))) {
00183 subtype_mime = "text/troff";
00184 subtype = "troff or preprocessor input";
00185 goto subtype_identified;
00186 }
00187 }
00188
00189 if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
00190 subtype_mime = "text/fortran";
00191 subtype = "fortran program";
00192 goto subtype_identified;
00193 }
00194
00195
00196
00197 i = 0;
00198 while (i < ulen) {
00199 size_t end;
00200
00201
00202
00203
00204 while (i < ulen && ISSPC(ubuf[i]))
00205 i++;
00206 if (i >= ulen)
00207 break;
00208
00209
00210
00211
00212 for (end = i + 1; end < nbytes; end++)
00213 if (ISSPC(ubuf[end]))
00214 break;
00215
00216
00217
00218
00219 for (p = names; p < names + NNAMES; p++) {
00220 if (ascmatch((const unsigned char *)p->name, ubuf + i,
00221 end - i)) {
00222 subtype = types[p->type].human;
00223 subtype_mime = types[p->type].mime;
00224 goto subtype_identified;
00225 }
00226 }
00227
00228 i = end;
00229 }
00230
00231 subtype_identified:
00232
00233
00234
00235
00236 for (i = 0; i < ulen; i++) {
00237 if (ubuf[i] == '\n') {
00238 if (seen_cr)
00239 n_crlf++;
00240 else
00241 n_lf++;
00242 last_line_end = i;
00243 } else if (seen_cr)
00244 n_cr++;
00245
00246 seen_cr = (ubuf[i] == '\r');
00247 if (seen_cr)
00248 last_line_end = i;
00249
00250 if (ubuf[i] == 0x85) {
00251 n_nel++;
00252 last_line_end = i;
00253 }
00254
00255
00256 if (i > last_line_end + MAXLINELEN)
00257 has_long_lines = 1;
00258
00259 if (ubuf[i] == '\033')
00260 has_escapes = 1;
00261 if (ubuf[i] == '\b')
00262 has_backspace = 1;
00263 }
00264
00265
00266
00267
00268
00269 if (seen_cr && nbytes < HOWMANY)
00270 n_cr++;
00271
00272 if ((ms->flags & MAGIC_MIME)) {
00273 if (subtype_mime) {
00274 if (file_printf(ms, subtype_mime) == -1)
00275 goto done;
00276 } else {
00277 if (file_printf(ms, "text/plain") == -1)
00278 goto done;
00279 }
00280
00281 if (code_mime) {
00282 if (file_printf(ms, "; charset=") == -1)
00283 goto done;
00284 if (file_printf(ms, code_mime) == -1)
00285 goto done;
00286 }
00287 } else {
00288 if (file_printf(ms, code) == -1)
00289 goto done;
00290
00291 if (subtype) {
00292 if (file_printf(ms, " ") == -1)
00293 goto done;
00294 if (file_printf(ms, subtype) == -1)
00295 goto done;
00296 }
00297
00298 if (file_printf(ms, " ") == -1)
00299 goto done;
00300 if (file_printf(ms, type) == -1)
00301 goto done;
00302
00303 if (has_long_lines)
00304 if (file_printf(ms, ", with very long lines") == -1)
00305 goto done;
00306
00307
00308
00309
00310
00311 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00312 (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00313 if (file_printf(ms, ", with") == -1)
00314 goto done;
00315
00316 if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
00317 if (file_printf(ms, " no") == -1)
00318 goto done;
00319 } else {
00320 if (n_crlf) {
00321 if (file_printf(ms, " CRLF") == -1)
00322 goto done;
00323 if (n_cr || n_lf || n_nel)
00324 if (file_printf(ms, ",") == -1)
00325 goto done;
00326 }
00327 if (n_cr) {
00328 if (file_printf(ms, " CR") == -1)
00329 goto done;
00330 if (n_lf || n_nel)
00331 if (file_printf(ms, ",") == -1)
00332 goto done;
00333 }
00334 if (n_lf) {
00335 if (file_printf(ms, " LF") == -1)
00336 goto done;
00337 if (n_nel)
00338 if (file_printf(ms, ",") == -1)
00339 goto done;
00340 }
00341 if (n_nel)
00342 if (file_printf(ms, " NEL") == -1)
00343 goto done;
00344 }
00345
00346 if (file_printf(ms, " line terminators") == -1)
00347 goto done;
00348 }
00349
00350 if (has_escapes)
00351 if (file_printf(ms, ", with escape sequences") == -1)
00352 goto done;
00353 if (has_backspace)
00354 if (file_printf(ms, ", with overstriking") == -1)
00355 goto done;
00356 }
00357 rv = 1;
00358 done:
00359 if (nbuf)
00360 free(nbuf);
00361 if (ubuf)
00362 free(ubuf);
00363
00364 return rv;
00365 }
00366
00367 private int
00368 ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
00369 {
00370 size_t i;
00371
00372 for (i = 0; i < ulen; i++) {
00373 if (s[i] != us[i])
00374 return 0;
00375 }
00376
00377 if (s[i])
00378 return 0;
00379 else
00380 return 1;
00381 }
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435 #define F 0
00436 #define T 1
00437 #define I 2
00438 #define X 3
00439
00440
00441 private char text_chars[256] = {
00442
00443 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,
00444
00445 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,
00446 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00447 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00448 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00449 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00450 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00451 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,
00452
00453 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,
00454 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
00455 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00456 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00457 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00458 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00459 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00460 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I
00461 };
00462
00463 private int
00464 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00465 size_t *ulen)
00466 {
00467 int i;
00468
00469 *ulen = 0;
00470
00471 for (i = 0; i < nbytes; i++) {
00472 int t = text_chars[buf[i]];
00473
00474 if (t != T)
00475 return 0;
00476
00477 ubuf[(*ulen)++] = buf[i];
00478 }
00479
00480 return 1;
00481 }
00482
00483 private int
00484 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00485 {
00486 int i;
00487
00488 *ulen = 0;
00489
00490 for (i = 0; i < nbytes; i++) {
00491 int t = text_chars[buf[i]];
00492
00493 if (t != T && t != I)
00494 return 0;
00495
00496 ubuf[(*ulen)++] = buf[i];
00497 }
00498
00499 return 1;
00500 }
00501
00502 private int
00503 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00504 size_t *ulen)
00505 {
00506 int i;
00507
00508 *ulen = 0;
00509
00510 for (i = 0; i < nbytes; i++) {
00511 int t = text_chars[buf[i]];
00512
00513 if (t != T && t != I && t != X)
00514 return 0;
00515
00516 ubuf[(*ulen)++] = buf[i];
00517 }
00518
00519 return 1;
00520 }
00521
00522 private int
00523 looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00524 {
00525 int i, n;
00526 unichar c;
00527 int gotone = 0;
00528
00529 *ulen = 0;
00530
00531 for (i = 0; i < nbytes; i++) {
00532 if ((buf[i] & 0x80) == 0) {
00533
00534
00535
00536
00537
00538 if (text_chars[buf[i]] != T)
00539 return 0;
00540
00541 ubuf[(*ulen)++] = buf[i];
00542 } else if ((buf[i] & 0x40) == 0) {
00543 return 0;
00544 } else {
00545 int following;
00546
00547 if ((buf[i] & 0x20) == 0) {
00548 c = buf[i] & 0x1f;
00549 following = 1;
00550 } else if ((buf[i] & 0x10) == 0) {
00551 c = buf[i] & 0x0f;
00552 following = 2;
00553 } else if ((buf[i] & 0x08) == 0) {
00554 c = buf[i] & 0x07;
00555 following = 3;
00556 } else if ((buf[i] & 0x04) == 0) {
00557 c = buf[i] & 0x03;
00558 following = 4;
00559 } else if ((buf[i] & 0x02) == 0) {
00560 c = buf[i] & 0x01;
00561 following = 5;
00562 } else
00563 return 0;
00564
00565 for (n = 0; n < following; n++) {
00566 i++;
00567 if (i >= nbytes)
00568 goto done;
00569
00570 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
00571 return 0;
00572
00573 c = (c << 6) + (buf[i] & 0x3f);
00574 }
00575
00576 ubuf[(*ulen)++] = c;
00577 gotone = 1;
00578 }
00579 }
00580 done:
00581 return gotone;
00582 }
00583
00584 private int
00585 looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00586 size_t *ulen)
00587 {
00588 int bigend;
00589 int i;
00590
00591 if (nbytes < 2)
00592 return 0;
00593
00594 if (buf[0] == 0xff && buf[1] == 0xfe)
00595 bigend = 0;
00596 else if (buf[0] == 0xfe && buf[1] == 0xff)
00597 bigend = 1;
00598 else
00599 return 0;
00600
00601 *ulen = 0;
00602
00603 for (i = 2; i + 1 < nbytes; i += 2) {
00604
00605
00606 if (bigend)
00607 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
00608 else
00609 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
00610
00611 if (ubuf[*ulen - 1] == 0xfffe)
00612 return 0;
00613 if (ubuf[*ulen - 1] < 128 &&
00614 text_chars[(size_t)ubuf[*ulen - 1]] != T)
00615 return 0;
00616 }
00617
00618 return 1 + bigend;
00619 }
00620
00621 #undef F
00622 #undef T
00623 #undef I
00624 #undef X
00625
00626
00627
00628
00629
00630
00631
00632
00633
00634
00635
00636
00637
00638
00639
00640
00641
00642
00643
00644
00645
00646
00647
00648
00649 private unsigned char ebcdic_to_ascii[] = {
00650 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
00651 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
00652 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
00653 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
00654 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
00655 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
00656 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
00657 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
00658 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
00659 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
00660 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
00661 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
00662 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
00663 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
00664 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
00665 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
00666 };
00667
#ifdef notdef
/*
 * Alternative 256-entry byte translation table.  It is compiled only
 * when "notdef" is defined and nothing in this file refers to it.
 * NOTE(review): going by its name this maps EBCDIC code page 1047 to
 * ISO 8859 -- confirm before enabling it.
 * Declared const, like the live table, because it is read-only data.
 */
private const unsigned char ebcdic_1047_to_8859[] = {
	0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
	0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
	0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
	0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
	0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
	0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
	0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
	0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
	0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
	0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
	0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
	0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
	0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
	0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
	0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
00703
00704
00705
00706
00707 private void
00708 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
00709 {
00710 int i;
00711
00712 for (i = 0; i < nbytes; i++) {
00713 out[i] = ebcdic_to_ascii[buf[i]];
00714 }
00715 }