4 files changed, 92 insertions, 56 deletions
diff --git a/NEWS.md b/NEWS.md
index a45e9ac..37f3a89 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -28,6 +28,10 @@
 
 - Most `#defined` constants are now `enums`.
 
+- New preprocessor constants `UTF8PROC_VERSION_MAJOR`,
+  `UTF8PROC_VERSION_MINOR`, and `UTF8PROC_VERSION_PATCH` for compile-time
+  detection of the API version.
+
 - Doxygen-formatted documentation.
 
 ## Version 1.1.6 ##
diff --git a/test/printproperty.c b/test/printproperty.c
index 6be8cb1..b876f0c 100644
--- a/test/printproperty.c
+++ b/test/printproperty.c
@@ -8,6 +8,10 @@ int main(int argc, char **argv)
 
      for (i = 1; i < argc; ++i) {
           int c;
+          if (!strcmp(argv[i], "-V")) {
+               printf("utf8proc version %s\n", utf8proc_version());
+               continue;
+          }
           check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
           const utf8proc_property_t *p = utf8proc_get_property(c);
           printf("U+%s:\n"
diff --git a/utf8proc.c b/utf8proc.c
index b0c68e7..133685d 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -1,3 +1,4 @@
+/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
 /*
  *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
  *
@@ -84,8 +85,10 @@ DLLEXPORT const int8_t utf8proc_utf8class[256] = {
 /* Should follow semantic-versioning rules (semver.org) based on API
    compatibility.  (Note that the shared-library version number will
    be different, being based on ABI compatibility.): */
+#define STRINGIZEx(x) #x
+#define STRINGIZE(x) STRINGIZEx(x)
 DLLEXPORT const char *utf8proc_version(void) {
-  return "1.2-dev";
+  return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "-dev";
 }
 
 DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode) {
@@ -193,54 +196,54 @@ static const utf8proc_property_t *get_property(int32_t uc) {
 }
 
 DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
-     return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc);
+  return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc);
 }
 
 /* return whether there is a grapheme break between boundclasses lbc and tbc */
 static bool grapheme_break(int lbc, int tbc) {
-     return 
-          (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
-          (lbc == UTF8PROC_BOUNDCLASS_CR &&
-           tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
-          (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-          (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-          (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
-          (lbc == UTF8PROC_BOUNDCLASS_L &&
-           (tbc == UTF8PROC_BOUNDCLASS_L ||
-            tbc == UTF8PROC_BOUNDCLASS_V ||
-            tbc == UTF8PROC_BOUNDCLASS_LV ||
-            tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
-          ((lbc == UTF8PROC_BOUNDCLASS_LV ||
-            lbc == UTF8PROC_BOUNDCLASS_V) &&
-           (tbc == UTF8PROC_BOUNDCLASS_V ||
-            tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
-          ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
-            lbc == UTF8PROC_BOUNDCLASS_T) &&
-           tbc == UTF8PROC_BOUNDCLASS_T) ? false :
-          (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
-           tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
-          (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
+  return 
+    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
+    (lbc == UTF8PROC_BOUNDCLASS_CR &&
+     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
+    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
+    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
+    (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
+    (lbc == UTF8PROC_BOUNDCLASS_L &&
+     (tbc == UTF8PROC_BOUNDCLASS_L ||
+      tbc == UTF8PROC_BOUNDCLASS_V ||
+      tbc == UTF8PROC_BOUNDCLASS_LV ||
+      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
+    ((lbc == UTF8PROC_BOUNDCLASS_LV ||
+      lbc == UTF8PROC_BOUNDCLASS_V) &&
+     (tbc == UTF8PROC_BOUNDCLASS_V ||
+      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
+    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
+      lbc == UTF8PROC_BOUNDCLASS_T) &&
+     tbc == UTF8PROC_BOUNDCLASS_T) ? false :
+    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
+     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
+    (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
 }
 
 /* return whether there is a grapheme break between codepoints c1 and c2 */
 DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
-     return grapheme_break(utf8proc_get_property(c1)->boundclass,
-                           utf8proc_get_property(c2)->boundclass);
+  return grapheme_break(utf8proc_get_property(c1)->boundclass,
+                        utf8proc_get_property(c2)->boundclass);
 }
 
 /* return a character width analogous to wcwidth (except portable and
    hopefully less buggy than most system wcwidth functions). */
 DLLEXPORT int utf8proc_charwidth(int32_t c) {
-     return utf8proc_get_property(c)->charwidth;
+  return utf8proc_get_property(c)->charwidth;
 }
 
 DLLEXPORT utf8proc_category_t utf8proc_category(int32_t c) {
-     return utf8proc_get_property(c)->category;
+  return utf8proc_get_property(c)->category;
 }
 
 DLLEXPORT const char *utf8proc_category_string(int32_t c) {
-     static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
-     return s[utf8proc_category(c)];
+  static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
+  return s[utf8proc_category(c)];
 }
 
 #define utf8proc_decompose_lump(replacement_uc) \
diff --git a/utf8proc.h b/utf8proc.h
index dc97b85..37940f9 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -24,31 +24,27 @@
 /** 
  * @mainpage
  *
- * uf8proc is a tool for processing UTF-8 strings, with the following features:
+ * utf8proc is a free/open-source (MIT/expat licensed) C library
+ * providing Unicode normalization, case-folding, and other operations
+ * for strings in the UTF-8 encoding, supporting Unicode version
+ * 7.0.0.  See the utf8proc home page (http://julialang.org/utf8proc/)
+ * for downloads and other information, or the source code on GitHub
+ * (https://github.com/JuliaLang/utf8proc).
  *
- * - decomposing and composing of strings
- * - replacing compatibility characters with their equivalents
- * - grapheme segmentation
- * - stripping of "default ignorable characters"
- *   like SOFT-HYPHEN or ZERO-WIDTH-SPACE
- * - folding of certain characters for string comparison
- *   (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-")
- *   (see "LUMP" option)
- * - optional rejection of strings containing non-assigned code points
- * - stripping of control characters
- * - stripping of character marks (accents, etc.)
- * - transformation of LF, CRLF, CR and NEL to line-feed (LF)
- *   or to the unicode chararacters for paragraph separation (PS)
- *   or line separation (LS).
- * - unicode case folding (for case insensitive string comparisons)
- * - rejection of invalid UTF-8 data
- *   (i.e. UTF-8 encoded UTF-16 surrogates)
- * - support for korean hangul characters
- * - character widths
+ * For the utf8proc API documentation, see: @ref utf8proc.h
  *
- * Unicode Version 7.0.0 is supported.
+ * The features of utf8proc include:
  *
- * See @ref utf8proc.h for the API.
+ * - Transformation of strings (@ref utf8proc_map) to:
+ *    - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)
+ *    - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)
+ *    - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK)
+ *    - case-fold (@ref UTF8PROC_CASEFOLD)
+ * - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC
+ * - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)
+ * - Character-width computation: @ref utf8proc_charwidth
+ * - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string
+ * - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8.
  */
 
 /** @file */
@@ -56,6 +52,27 @@
 #ifndef UTF8PROC_H
 #define UTF8PROC_H
 
+/** @name API version
+ *  
+ * The utf8proc API version MAJOR.MINOR.PATCH, following
+ * semantic-versioning rules (http://semver.org) based on API
+ * compatibility.
+ *
+ * This is also returned at runtime by @ref utf8proc_version; however, the
+ * runtime version may append a string like "-dev" to the version number
+ * for prerelease versions.
+ *
+ * @note The shared-library version number in the Makefile may be different,
+ *       being based on ABI compatibility rather than API compatibility.
+ */
+/** @{ */
+/** The MAJOR version number (increased when backwards API compatibility is broken). */
+#define UTF8PROC_VERSION_MAJOR 1
+/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
+#define UTF8PROC_VERSION_MINOR 2
+/** The PATCH version (increased for fixes that do not change the API). */
+#define UTF8PROC_VERSION_PATCH 0
+/** @} */
 
 #include <stdlib.h>
 #include <sys/types.h>
@@ -114,7 +131,7 @@ typedef enum {
   UTF8PROC_COMPOSE   = (1<<3),
   /** Return a result with decomposed characters. */
   UTF8PROC_DECOMPOSE = (1<<4),
-  /** Strip "default ignorable characters". */
+  /** Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. */
   UTF8PROC_IGNORE    = (1<<5),
   /** Return an error, if the input contains unassigned code points. */
   UTF8PROC_REJECTNA  = (1<<6),
@@ -337,7 +354,9 @@ typedef enum {
 DLLEXPORT extern const int8_t utf8proc_utf8class[256];
 
 /**
- * Returns the version as a string.
+ * Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
+ * (http://semver.org format), possibly with a "-dev" suffix for
+ * development versions.
  */
 DLLEXPORT const char *utf8proc_version(void);
 
@@ -524,15 +543,21 @@ DLLEXPORT ssize_t utf8proc_map(
   const uint8_t *str, ssize_t strlen, uint8_t **dstptr, utf8proc_option_t options
 );
 
-/** @name Normalized versions.
+/** @name Unicode normalization
  *
  * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
- * normalized version of the null-terminated string 'str'.
+ * normalized version of the null-terminated string 'str'.  These
+ * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
+ * combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
  */
 /** @{ */
+/** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
 DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str);
+/** NFC normalization (@ref UTF8PROC_COMPOSE). */
 DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str);
+/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
 DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str);
+/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
 DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str);
 /** @} */