From f5b08ded5898003ba8d81fc50f2231f4a49bffda Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Wed, 18 Jun 2008 10:59:30 +0000
Subject: Correctly decode UTF-16 surrogates

svn path=/trunk/json-c/; revision=4384
---
 json-c/json_tokener.c | 78 ++++++++++++++++++++++++++++++++++++++++++---------
 json-c/json_tokener.h |  3 +-
 json-c/test1.c        | 12 ++++++++
 3 files changed, 79 insertions(+), 14 deletions(-)

(limited to 'json-c')

diff --git a/json-c/json_tokener.c b/json-c/json_tokener.c
index c904f48..d594569 100644
--- a/json-c/json_tokener.c
+++ b/json-c/json_tokener.c
@@ -124,6 +124,39 @@ char* strndup(const char* str, size_t n)
 }
 #endif
 
+static void json_tokener_output_ucs(struct printbuf *pb, unsigned int ucs)
+{
+  unsigned char utf_out[4];
+
+  /* Don't permit surrogates or undefined characters */
+  if ((0xd800 <= ucs && ucs <= 0xdfff) || (ucs & 0xfffe) == 0xfffe)
+    ucs = 0xfffd;
+
+  if (ucs < 0x80) {
+    utf_out[0] = ucs;
+    printbuf_memappend(pb, (char*)utf_out, 1);
+  } else if (ucs < 0x800) {
+    utf_out[0] = 0xc0 | (ucs >> 6);
+    utf_out[1] = 0x80 | (ucs & 0x3f);
+    printbuf_memappend(pb, (char*)utf_out, 2);
+  } else if (ucs < 0x10000) {
+    utf_out[0] = 0xe0 | (ucs >> 12);
+    utf_out[1] = 0x80 | ((ucs >> 6) & 0x3f);
+    utf_out[2] = 0x80 | (ucs & 0x3f);
+    printbuf_memappend(pb, (char*)utf_out, 3);
+  } else if (ucs < 0x110000) {
+    utf_out[0] = 0xf0 | (ucs >> 18);
+    utf_out[1] = 0x80 | ((ucs >> 12) & 0x3f);
+    utf_out[2] = 0x80 | ((ucs >> 6) & 0x3f);
+    utf_out[3] = 0x80 | (ucs & 0x3f);
+    printbuf_memappend(pb, (char*)utf_out, 4);  
+  } else {
+    utf_out[0] = 0xef;
+    utf_out[1] = 0xbf;
+    utf_out[2] = 0xbd;
+    printbuf_memappend(pb, (char*)utf_out, 3);
+  }
+}
 
 #define state  tok->stack[tok->depth].state
 #define saved_state  tok->stack[tok->depth].saved_state
@@ -316,6 +349,7 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
 	break;
       case 'u':
 	tok->ucs_char = 0;
+	tok->ucs_surrogate = 0;
 	tok->st_pos = 0;
 	state = json_tokener_state_escape_unicode;
 	break;
@@ -329,21 +363,15 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
       if(strchr(json_hex_chars, c)) {
 	tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));
 	if(tok->st_pos == 4) {
-	  unsigned char utf_out[3];
-	  if (tok->ucs_char < 0x80) {
-	    utf_out[0] = tok->ucs_char;
-	    printbuf_memappend(tok->pb, (char*)utf_out, 1);
-	  } else if (tok->ucs_char < 0x800) {
-	    utf_out[0] = 0xc0 | (tok->ucs_char >> 6);
-	    utf_out[1] = 0x80 | (tok->ucs_char & 0x3f);
-	    printbuf_memappend(tok->pb, (char*)utf_out, 2);
+	  if (0xD800 <= tok->ucs_char && tok->ucs_char <= 0xDBFF) {
+	    tok->ucs_surrogate = tok->ucs_char;
+	    tok->ucs_char = 0;
+	    tok->st_pos = 0;
+	    state = json_tokener_state_escape_unicode_surrogate;
 	  } else {
-	    utf_out[0] = 0xe0 | (tok->ucs_char >> 12);
-	    utf_out[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
-	    utf_out[2] = 0x80 | (tok->ucs_char & 0x3f);
-	    printbuf_memappend(tok->pb, (char*)utf_out, 3);
+	    json_tokener_output_ucs(tok->pb, tok->ucs_char);
+	    state = saved_state;
 	  }
-	  state = saved_state;
 	}
       } else {
 	tok->err = json_tokener_error_parse_string;
@@ -351,6 +379,30 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
       }
       break;
 
+    case json_tokener_state_escape_unicode_surrogate:
+      if (tok->st_pos == 0 && c == '\\') {
+        tok->st_pos++;
+      } else if (tok->st_pos == 1 && c == 'u') {
+        tok->st_pos++;
+      } else if (tok->st_pos > 1 && strchr(json_hex_chars, c)) {
+        tok->ucs_char += ((unsigned int)hexdigit(c) << ((5-tok->st_pos++)*4));
+	if (tok->st_pos == 6) {
+	  if (0xDC00 <= tok->ucs_char && tok->ucs_char <= 0xDFFF) {
+            tok->ucs_char = (tok->ucs_surrogate << 10) + tok->ucs_char + 
+	                    (0x10000 - (0xd800 << 10) - 0xdc00);
+	    json_tokener_output_ucs(tok->pb, tok->ucs_char);
+            state = saved_state;
+          } else {
+            tok->err = json_tokener_error_parse_string;
+	    goto out;
+	  }
+	}
+      } else {
+        tok->err = json_tokener_error_parse_string;
+	goto out;
+      }
+      break;
+
     case json_tokener_state_boolean:
       printbuf_memappend(tok->pb, &c, 1);
       if(strncasecmp(json_true_str, tok->pb->buf,
diff --git a/json-c/json_tokener.h b/json-c/json_tokener.h
index 117d6ef..00f3349 100644
--- a/json-c/json_tokener.h
+++ b/json-c/json_tokener.h
@@ -44,6 +44,7 @@ enum json_tokener_state {
   json_tokener_state_string,
   json_tokener_state_string_escape,
   json_tokener_state_escape_unicode,
+  json_tokener_state_escape_unicode_surrogate,
   json_tokener_state_boolean,
   json_tokener_state_number,
   json_tokener_state_array,
@@ -73,7 +74,7 @@ struct json_tokener
   struct printbuf *pb;
   int depth, is_double, st_pos, char_offset;
   ptrdiff_t err;
-  unsigned int ucs_char;
+  unsigned int ucs_char, ucs_surrogate;
   char quote_char;
   struct json_tokener_srec stack[JSON_TOKENER_MAX_DEPTH];
 };
diff --git a/json-c/test1.c b/json-c/test1.c
index a64a255..2f4e8dc 100644
--- a/json-c/test1.c
+++ b/json-c/test1.c
@@ -144,6 +144,18 @@ int main(int argc, char **argv)
   new_obj = json_tokener_parse("{ \"foo");
   if(is_error(new_obj)) printf("got error as expected\n");
 
+  new_obj = json_tokener_parse("\"\\ud800\\udc00\"");
+  printf("new_obj.to_string()=%s\n", json_object_to_json_string(new_obj));
+
+  new_obj = json_tokener_parse("\"\\udbff\\udfff\"");
+  printf("new_obj.to_string()=%s\n", json_object_to_json_string(new_obj));
+
+  new_obj = json_tokener_parse("\"\\ud800foo\"");
+  if(is_error(new_obj)) printf("got error as expected\n");
+
+  new_obj = json_tokener_parse("\"\\ud800\\ufffd\"");
+  if(is_error(new_obj)) printf("got error as expected\n");
+
   /* test incremental parsing */
   tok = json_tokener_new();
   new_obj = json_tokener_parse_ex(tok, "{ \"foo", 6);
-- 
cgit v1.2.3