summaryrefslogtreecommitdiff
path: root/normtest.c
blob: ac464d31a35f203ba8ffd82493d571f1bb46292b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>

#include "mojibake.h"

/* Number of lines read from the test file so far.  main() increments it
   once per line read, and check() prefixes every failure message with it. */
size_t lineno = 0;

/* Test assertion helper.

   When cond is zero, writes "line <lineno>: " followed by the printf-style
   message described by format and the remaining arguments, then a newline,
   to stderr, and terminates the program with exit status 1.  When cond is
   non-zero, does nothing. */
void check(int cond, const char *format, ...)
{
     if (!cond) {
          va_list args;
          /* lineno is a size_t: %zu is the matching conversion (%zd is for
             the corresponding signed type) */
          fprintf(stderr, "line %zu: ", lineno);
          va_start(args, format);
          vfprintf(stderr, format, args);
          va_end(args);
          fprintf(stderr, "\n");
          exit(1);
     }
}

/* Decodes one field of whitespace-separated hexadecimal codepoints.

   Starting at buf, reads codepoints written as hexadecimal strings and
   separated by whitespace, stopping at the first character that is neither
   whitespace nor in [0-9a-fA-F].  Each codepoint is passed to
   utf8proc_encode_char() and the output is accumulated in dest, which is
   NUL-terminated before returning.

   dest must be large enough for the encoded field plus the terminator; its
   size is not passed in, so no bounds checking is done here.

   Returns the number of bytes consumed from buf, including the terminating
   character.  The one exception is when the terminator is the string's own
   NUL: it is not stepped over, so a follow-up call on buf + result never
   reads past the end of the string.

   Exits through check() if a run of hex digits cannot be parsed. */
size_t encode(char *dest, const char *buf)
{
     size_t i = 0, j, d = 0;
     for (;;) {
          unsigned int c; /* %x stores through an unsigned int pointer */

          /* <ctype.h> functions take an unsigned char value (or EOF);
             passing a negative plain char is undefined behavior */
          while (isspace((unsigned char) buf[i])) ++i; /* skip whitespace */
          for (j = i; isxdigit((unsigned char) buf[j]); ++j)
               ; /* find end of hex input */
          if (j == i) { /* no codepoint found */
               dest[d] = 0; /* NUL-terminate destination string */
               /* consume the field terminator, but never the final NUL */
               return buf[i] ? i + 1 : i;
          }
          check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
          i = j; /* skip to char after hex input */
          d += utf8proc_encode_char((int) c, (uint8_t *) (dest + d));
     }
}

/* CHECK_NORM(NRM, norm, src): passes the string src to utf8proc_<NRM>() and
   fails the test, via check(), unless the result compares equal (strcmp)
   to the expected string norm.  The result string is released with free().

   The body is wrapped in do { } while (0) so that an invocation followed by
   ';' forms exactly one statement, including inside unbraced if/else.
   Arguments are parenthesized; src is expanded more than once, so pass
   expressions without side effects.

   NOTE(review): the NULL guard assumes utf8proc_<NRM>() may return NULL on
   failure -- confirm against mojibake.h.  Without it a NULL result would
   reach strcmp(), which is undefined behavior. */
#define CHECK_NORM(NRM, norm, src) do {                                   \
    char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) (src));         \
    check(src_norm != NULL,                                              \
          "normalization returned NULL for %s", (src));                  \
    check(!strcmp((norm), src_norm),                                     \
          "normalization failed for %s -> %s", (src), (norm));           \
    free(src_norm);                                                      \
} while (0)

/* Runs the normalization checks over every data line of
   "NormalizationTest.txt" (opened relative to the current directory).

   Per line:
     - a line starting with '@' is echoed to stdout (without the '@');
     - a line starting with '#' is skipped;
     - any other line is split by five consecutive encode() calls into the
       strings source, NFC, NFD, NFKC and NFKD, which are then run through
       the CHECK_NORM comparisons below.

   Any failed check() terminates the program with exit status 1; returns 0
   after the whole file has been processed. */
int main(void)
{
     char *buf = NULL;
     size_t bufsize = 0;
     FILE *f = fopen("NormalizationTest.txt", "r");
     /* NOTE(review): encode() writes into these without a bounds check;
        1024 bytes is assumed to be ample for any field -- confirm */
     char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024];

     check(f != NULL, "error opening NormalizationTest.txt");
     while (getline(&buf, &bufsize, f) > 0) {
          size_t offset;
          lineno += 1;

          if (buf[0] == '@') {
               printf("line %zu: %s", lineno, buf + 1);
               continue;
          }
          else if (lineno % 1000 == 0)
               printf("checking line %zu...\n", lineno); /* progress report */

          if (buf[0] == '#') continue;

          /* each encode() call returns how far it advanced, so the next
             field starts at buf + offset */
          offset = encode(source, buf);
          offset += encode(NFC, buf + offset);
          offset += encode(NFD, buf + offset);
          offset += encode(NFKC, buf + offset);
          offset += encode(NFKD, buf + offset);

          CHECK_NORM(NFC, NFC, source);
          CHECK_NORM(NFC, NFC, NFC);
          CHECK_NORM(NFC, NFC, NFD);
          CHECK_NORM(NFC, NFKC, NFKC);
          CHECK_NORM(NFC, NFKC, NFKD);

          CHECK_NORM(NFD, NFD, source);
          CHECK_NORM(NFD, NFD, NFC);
          CHECK_NORM(NFD, NFD, NFD);
          CHECK_NORM(NFD, NFKD, NFKC);
          CHECK_NORM(NFD, NFKD, NFKD);

          CHECK_NORM(NFKC, NFKC, source);
          CHECK_NORM(NFKC, NFKC, NFC);
          CHECK_NORM(NFKC, NFKC, NFD);
          CHECK_NORM(NFKC, NFKC, NFKC);
          CHECK_NORM(NFKC, NFKC, NFKD);

          CHECK_NORM(NFKD, NFKD, source);
          CHECK_NORM(NFKD, NFKD, NFC);
          CHECK_NORM(NFKD, NFKD, NFD);
          CHECK_NORM(NFKD, NFKD, NFKC);
          CHECK_NORM(NFKD, NFKD, NFKD);
     }
     free(buf); /* getline() allocated (and possibly regrew) this buffer */
     fclose(f);
     printf("Passed tests after %zu lines!\n", lineno);
     return 0;
}