From d5bf11e5b58b3ff5a523257d2729e54790cdda24 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 4 Sep 2008 16:15:12 +0000 Subject: ISO-8859-n codec. This needs some testing. svn path=/trunk/libparserutils/; revision=5233 --- src/charset/codecs/8859_tables.h | 241 ++++++++++++++++ src/charset/codecs/Makefile | 2 +- src/charset/codecs/codec_8859.c | 583 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 825 insertions(+), 1 deletion(-) create mode 100644 src/charset/codecs/8859_tables.h create mode 100644 src/charset/codecs/codec_8859.c (limited to 'src/charset/codecs') diff --git a/src/charset/codecs/8859_tables.h b/src/charset/codecs/8859_tables.h new file mode 100644 index 0000000..d8d7525 --- /dev/null +++ b/src/charset/codecs/8859_tables.h @@ -0,0 +1,241 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2008 John-Mark Bell + */ + +#ifndef parserutils_charset_codecs_8859tables_h_ +#define parserutils_charset_codecs_8859tables_h_ + +/* Mapping tables for ISO-8859-n -> UCS4. + * Undefined characters are mapped to U+FFFF, + * which is a guaranteed non-character + */ + +static uint32_t t1[96] = { + 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, + 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, + 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, + 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, +}; + +static uint32_t t2[96] = { + 0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7, + 0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B, + 0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7, + 0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C, + 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7, + 0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E, + 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7, + 0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF, + 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7, + 0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F, + 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7, + 0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9, +}; + +static uint32_t t3[96] = { + 0x00A0, 0x0126, 0x02D8, 0x00A3, 0x00A4, 0xFFFF, 0x0124, 0x00A7, + 0x00A8, 0x0130, 0x015E, 0x011E, 0x0134, 0x00AD, 0xFFFF, 0x017B, + 0x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x0125, 0x00B7, + 0x00B8, 0x0131, 0x015F, 0x011F, 0x0135, 0x00BD, 0xFFFF, 0x017C, + 0x00C0, 0x00C1, 0x00C2, 0xFFFF, 0x00C4, 0x010A, 0x0108, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0xFFFF, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7, + 0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x016C, 0x015C, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0xFFFF, 0x00E4, 0x010B, 0x0109, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0xFFFF, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7, + 0x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D, 0x02D9, +}; + +static uint32_t t4[96] = { + 0x00A0, 0x0104, 0x0138, 0x0156, 0x00A4, 0x0128, 0x013B, 0x00A7, + 0x00A8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00AD, 0x017D, 0x00AF, + 0x00B0, 0x0105, 0x02DB, 0x0157, 0x00B4, 0x0129, 0x013C, 0x02C7, + 0x00B8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014A, 0x017E, 0x014B, + 0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E, + 0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x012A, + 0x0110, 0x0145, 0x014C, 0x0136, 0x00D4, 0x00D5, 0x00D6, 0x00D7, + 0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x0168, 0x016A, 0x00DF, + 0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F, + 0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x012B, + 0x0111, 0x0146, 0x014D, 0x0137, 0x00F4, 0x00F5, 0x00F6, 0x00F7, + 0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x0169, 0x016B, 0x02D9, +}; + +static uint32_t t5[96] = { + 0x00A0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040A, 0x040B, 0x040C, 0x00AD, 0x040E, 0x040F, + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, + 0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x00A7, 0x045E, 0x045F, +}; + +static uint32_t t6[96] = { + 0x00A0, 0xFFFF, 0xFFFF, 0xFFFF, 0x00A4, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x060C, 0x00AD, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x061B, 0xFFFF, 0xFFFF, 0xFFFF, 0x061F, + 0xFFFF, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, + 0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F, + 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, + 0x0638, 0x0639, 0x063A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, + 0x0648, 0x0649, 0x064A, 0x064B, 0x064C, 0x064D, 0x064E, 0x064F, + 0x0650, 0x0651, 0x0652, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, +}; + +static uint32_t t7[96] = { + 0x00A0, 0x2018, 0x2019, 0x00A3, 0x20AC, 0x20AF, 0x00A6, 0x00A7, + 0x00A8, 0x00A9, 0x037A, 0x00AB, 0x00AC, 0x00AD, 0xFFFF, 0x2015, + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x0385, 0x0386, 0x00B7, + 0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F, + 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F, + 0x03A0, 0x03A1, 0xFFFF, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, + 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF, + 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, + 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, + 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFF, +}; + +static uint32_t t8[96] = { + 0x00A0, 0xFFFF, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, + 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, + 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2017, + 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, + 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF, + 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, + 0x05E8, 0x05E9, 0x05EA, 0xFFFF, 0xFFFF, 0x200E, 0x200F, 0xFFFF, +}; + +static uint32_t t9[96] = { + 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, + 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, + 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, + 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x011F, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF, +}; + +static uint32_t t10[96] = { + 0x00A0, 0x0104, 0x0112, 0x0122, 0x012A, 0x0128, 0x0136, 0x00A7, + 0x013B, 0x0110, 0x0160, 0x0166, 0x017D, 0x00AD, 0x016A, 0x014A, + 0x00B0, 0x0105, 0x0113, 0x0123, 0x012B, 0x0129, 0x0137, 0x00B7, + 0x013C, 0x0111, 0x0161, 0x0167, 0x017E, 0x2015, 0x016B, 0x014B, + 0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E, + 0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x00CF, + 0x00D0, 0x0145, 0x014C, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0168, + 0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, + 0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F, + 0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x00EF, + 0x00F0, 0x0146, 0x014D, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0169, + 0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x0138, +}; + +static uint32_t t11[96] = { + 0x00A0, 0x0E01, 0x0E02, 0x0E03, 0x0E04, 0x0E05, 0x0E06, 0x0E07, + 0x0E08, 0x0E09, 0x0E0A, 0x0E0B, 0x0E0C, 0x0E0D, 0x0E0E, 0x0E0F, + 0x0E10, 0x0E11, 0x0E12, 0x0E13, 0x0E14, 0x0E15, 0x0E16, 0x0E17, + 0x0E18, 0x0E19, 0x0E1A, 0x0E1B, 0x0E1C, 0x0E1D, 0x0E1E, 0x0E1F, + 0x0E20, 0x0E21, 0x0E22, 0x0E23, 0x0E24, 0x0E25, 0x0E26, 0x0E27, + 0x0E28, 0x0E29, 0x0E2A, 0x0E2B, 0x0E2C, 0x0E2D, 0x0E2E, 0x0E2F, + 0x0E30, 0x0E31, 0x0E32, 0x0E33, 0x0E34, 0x0E35, 0x0E36, 0x0E37, + 0x0E38, 0x0E39, 0x0E3A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0E3F, + 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0x0E45, 0x0E46, 0x0E47, + 0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F, + 0x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57, + 0x0E58, 0x0E59, 0x0E5A, 0x0E5B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, +}; + +static uint32_t t13[96] = { + 0x00A0, 0x201D, 0x00A2, 0x00A3, 0x00A4, 0x201E, 0x00A6, 0x00A7, + 0x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6, + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x201C, 0x00B5, 0x00B6, 0x00B7, + 0x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6, + 0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112, + 0x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B, + 0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7, + 0x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF, + 0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113, + 0x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C, + 0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7, + 0x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x2019, +}; + +static uint32_t t14[96] = { + 0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7, + 0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178, + 0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56, + 0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61, + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A, + 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B, + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF, +}; + +static uint32_t t15[96] = { + 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AC, 0x00A5, 0x0160, 0x00A7, + 0x0161, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x017D, 0x00B5, 0x00B6, 0x00B7, + 0x017E, 0x00B9, 0x00BA, 0x00BB, 0x0152, 0x0153, 0x0178, 0x00BF, + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, + 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, +}; + +static uint32_t t16[96] = { + 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7, + 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B, + 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7, + 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C, + 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A, + 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B, + 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF, +}; + +#endif diff --git a/src/charset/codecs/Makefile b/src/charset/codecs/Makefile index 6d3b78e..fd0365b 100644 --- a/src/charset/codecs/Makefile +++ b/src/charset/codecs/Makefile @@ -32,7 +32,7 @@ dirstack_$(sp) := $(d) d := $(DIR) # Sources -SRCS_$(d) := codec_iconv.c codec_utf8.c codec_utf16.c +SRCS_$(d) := codec_8859.c codec_iconv.c codec_utf8.c codec_utf16.c # Append to sources for component SOURCES += $(addprefix $(d), $(SRCS_$(d))) diff --git a/src/charset/codecs/codec_8859.c b/src/charset/codecs/codec_8859.c new file mode 100644 index 0000000..16ad0ef --- /dev/null +++ b/src/charset/codecs/codec_8859.c @@ -0,0 +1,583 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2008 John-Mark Bell + */ + +#include +#include +#include + +/* These two are for htonl / ntohl */ +#include +#include + +#include + +#include "charset/codecs/codec_impl.h" +#include "utils/utils.h" + +#include "charset/codecs/8859_tables.h" + +static struct { + uint16_t mib; + const char *name; + size_t len; + uint32_t *table; +} known_charsets[] = { + { 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 }, + { 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 }, + { 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 }, + { 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 }, + { 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 }, + { 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 }, + { 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 }, + { 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 }, + { 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 }, + { 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 }, + { 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 }, + { 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 }, + { 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 }, + { 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 }, + { 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 } +}; + +/** + * ISO-8859-n charset codec + */ +typedef struct charset_8859_codec { + parserutils_charset_codec base; /**< Base class */ + + uint32_t *table; /**< Mapping table for 0xA0-0xFF */ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + * (host-endian) */ + size_t read_len; /**< Character length of read_buf */ + +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + * (host-endian) */ + size_t write_len; /**< Character length of write_buf */ + +} charset_8859_codec; + +static bool charset_8859_codec_handles_charset(const char *charset); +static parserutils_charset_codec *charset_8859_codec_create(const char *charset, + parserutils_alloc alloc, void *pw); +static void charset_8859_codec_destroy (parserutils_charset_codec *codec); +static parserutils_error charset_8859_codec_encode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error charset_8859_codec_decode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error charset_8859_codec_reset( + parserutils_charset_codec *codec); +static inline parserutils_error charset_8859_codec_read_char( + charset_8859_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static inline parserutils_error charset_8859_codec_output_decoded_char( + charset_8859_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); +static inline parserutils_error charset_8859_from_ucs4(charset_8859_codec *c, + uint32_t ucs4, uint8_t **s, size_t *len); +static inline parserutils_error charset_8859_to_ucs4(charset_8859_codec *c, + const uint8_t *s, size_t len, uint32_t *ucs4); + +/** + * Determine whether this codec handles a specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool charset_8859_codec_handles_charset(const char *charset) +{ + uint16_t match = parserutils_charset_mibenum_from_name(charset, + strlen(charset)); + + if (known_charsets[0].mib == 0) { + for (uint32_t i = 0; i < N_ELEMENTS(known_charsets); i++) { + known_charsets[i].mib = + parserutils_charset_mibenum_from_name( + known_charsets[i].name, + known_charsets[i].len); + } + } + + for (uint32_t i = 0; i < N_ELEMENTS(known_charsets); i++) { + if (known_charsets[i].mib == match) + return true; + } + + return false; +} + +/** + * Create an ISO-8859-n codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +parserutils_charset_codec *charset_8859_codec_create(const char *charset, + parserutils_alloc alloc, void *pw) +{ + charset_8859_codec *codec; + uint16_t match = parserutils_charset_mibenum_from_name( + charset, strlen(charset)); + uint32_t *table = NULL; + + for (uint32_t i = 0; i < N_ELEMENTS(known_charsets); i++) { + if (known_charsets[i].mib == match) { + table = known_charsets[i].table; + break; + } + } + + assert(table != NULL); + + codec = alloc(NULL, sizeof(charset_8859_codec), pw); + if (codec == NULL) + return NULL; + + codec->table = table; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = charset_8859_codec_destroy; + codec->base.handler.encode = charset_8859_codec_encode; + codec->base.handler.decode = charset_8859_codec_decode; + codec->base.handler.reset = charset_8859_codec_reset; + + return (parserutils_charset_codec *) codec; +} + +/** + * Destroy an ISO-8859-n codec + * + * \param codec The codec to destroy + */ +void charset_8859_codec_destroy (parserutils_charset_codec *codec) +{ + UNUSED(codec); +} + +/** + * Encode a chunk of UCS-4 (big endian) data into ISO-8859-n + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error charset_8859_codec_encode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_8859_codec *c = (charset_8859_codec *) codec; + uint32_t ucs4; + uint32_t *towrite; + size_t towritelen; + parserutils_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + + while (c->write_len > 0) { + error = charset_8859_from_ucs4(c, pwrite[0], + dest, destlen); + if (error != PARSERUTILS_OK) { + assert(error == PARSERUTILS_NOMEM); + + for (uint32_t len = 0; + len < c->write_len; len++) { + c->write_buf[len] = pwrite[len]; + } + + return error; + } + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + ucs4 = ntohl(*((uint32_t *) (void *) *source)); + towrite = &ucs4; + towritelen = 1; + + /* Output current characters */ + while (towritelen > 0) { + error = charset_8859_from_ucs4(c, towrite[0], dest, + destlen); + if (error != PARSERUTILS_OK) { + if (error != PARSERUTILS_NOMEM) { + return error; + } + + /* Insufficient output space */ + if (towritelen >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen; + + /* Copy pending chars to save area, for + * processing next call. */ + for (uint32_t len = 0; len < towritelen; len++) + c->write_buf[len] = towrite[len]; + + /* Claim character we've just buffered, + * so it's not reprocessed */ + *source += 4; + *sourcelen -= 4; + + return PARSERUTILS_NOMEM; + } + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return PARSERUTILS_OK; +} + +/** + * Decode a chunk of ISO-8859-n data into UCS-4 (big endian) + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. + */ +parserutils_error charset_8859_codec_decode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_8859_codec *c = (charset_8859_codec *) codec; + parserutils_error error; + + if (c->read_len > 0) { + /* Output left over from last decode */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = htonl(pread[0]); + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Ran out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) + c->read_buf[i] = pread[i]; + + return PARSERUTILS_NOMEM; + } + } + + /* Finally, the "normal" case; process all outstanding characters */ + while (*sourcelen > 0) { + error = charset_8859_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != PARSERUTILS_OK) { + return error; + } + } + + return PARSERUTILS_OK; +} + +/** + * Clear an ISO-8859-n codec's encoding state + * + * \param codec The codec to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error charset_8859_codec_reset(parserutils_charset_codec *codec) +{ + charset_8859_codec *c = (charset_8859_codec *) codec; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return PARSERUTILS_OK; +} + + +/** + * Read a character from the ISO-8859-n to UCS-4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error charset_8859_codec_read_char(charset_8859_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + uint32_t ucs4; + parserutils_error error; + + /* Convert a single character */ + error = charset_8859_to_ucs4(c, *source, *sourcelen, &ucs4); + if (error == PARSERUTILS_OK) { + /* Read a character */ + error = charset_8859_codec_output_decoded_char(c, + ucs4, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += 1; + *sourcelen -= 1; + } + + return error; + } else if (error == PARSERUTILS_NEEDDATA) { + /* Can only happen if sourcelen == 0 */ + return error; + } else if (error == PARSERUTILS_INVALID) { + /* Illegal input sequence */ + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { + return PARSERUTILS_INVALID; + } + + /* output U+FFFD and continue processing. */ + error = charset_8859_codec_output_decoded_char(c, + 0xFFFD, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += 1; + *sourcelen -= 1; + } + + return error; + } + + return PARSERUTILS_OK; +} + +/** + * Output a UCS-4 character (big endian) + * + * \param c Codec to use + * \param ucs4 UCS-4 character (host endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + */ +parserutils_error charset_8859_codec_output_decoded_char(charset_8859_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (*destlen < 4) { + /* Run out of output buffer */ + c->read_len = 1; + c->read_buf[0] = ucs4; + + return PARSERUTILS_NOMEM; + } + + *((uint32_t *) (void *) *dest) = htonl(ucs4); + *dest += 4; + *destlen -= 4; + + return PARSERUTILS_OK; +} + +/** + * Convert a UCS4 (host endian) character to ISO-8859-n + * + * \param c The codec instance + * \param ucs4 The UCS4 character to convert + * \param s Pointer to pointer to destination buffer + * \param len Pointer to destination buffer length + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if there's insufficient space in the output buffer, + * PARSERUTILS_INVALID if the character cannot be represented + * + * _INVALID will only be returned if the codec's conversion mode is STRICT. + * Otherwise, '?' will be output. + * + * On successful conversion, *s and *len will be updated. + */ +parserutils_error charset_8859_from_ucs4(charset_8859_codec *c, + uint32_t ucs4, uint8_t **s, size_t *len) +{ + uint8_t out = 0; + + if (*len < 1) + return PARSERUTILS_NOMEM; + + if (ucs4 < 0x80) { + /* ASCII */ + out = ucs4; + } else { + uint32_t i; + + for (i = 0; i < 96; i++) { + if (ucs4 == c->table[i]) + break; + } + + if (i == 96) { + if (c->base.errormode == + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) + return PARSERUTILS_INVALID; + else + out = '?'; + } else { + out = c->table[i]; + } + } + + *(*s++) = out; + *len--; + + return PARSERUTILS_OK; +} + +/** + * Convert an ISO-8859-n character to UCS4 (host endian) + * + * \param c The codec instance + * \param s Pointer to source buffer + * \param len Source buffer length + * \param ucs4 Pointer to destination buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NEEDDATA if there's insufficient input data + * PARSERUTILS_INVALID if the character cannot be represented + */ +parserutils_error charset_8859_to_ucs4(charset_8859_codec *c, + const uint8_t *s, size_t len, uint32_t *ucs4) +{ + uint32_t out; + + if (len < 1) + return PARSERUTILS_NEEDDATA; + + if (*s < 0x80) { + out = *s; + } else if (*s >= 0xA0) { + if (c->table[*s] == 0xFFFF) + return PARSERUTILS_INVALID; + + out = c->table[*s]; + } else { + return PARSERUTILS_INVALID; + } + + *ucs4 = out; + + return PARSERUTILS_OK; +} + +const parserutils_charset_handler charset_8859_codec_handler = { + charset_8859_codec_handles_charset, + charset_8859_codec_create +}; + -- cgit v1.2.3