123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228 |
- /*******************************************************************************
- * Copyright (c) 2009, 2018 IBM Corp.
- *
- * All rights reserved. This program and the accompanying materials
- * are made available under the terms of the Eclipse Public License v1.0
- * and Eclipse Distribution License v1.0 which accompany this distribution.
- *
- * The Eclipse Public License is available at
- * http://www.eclipse.org/legal/epl-v10.html
- * and the Eclipse Distribution License is available at
- * http://www.eclipse.org/org/documents/edl-v10.php.
- *
- * Contributors:
- * Ian Craggs - initial API and implementation and/or initial documentation
- *******************************************************************************/
- /**
- * @file
- * \brief Functions for checking that strings contain UTF-8 characters only
- *
- * See page 104 of the Unicode Standard 5.0 for the list of well formed
- * UTF-8 byte sequences.
- *
- */
- #include "utf-8.h"
- #include <stdlib.h>
- #include <string.h>
- #include "StackTrace.h"
/**
 * Macro to determine the number of elements in a single-dimension array.
 * The argument must be a true array (not a pointer); it is parenthesized
 * before being indexed so that any array-valued expression expands safely.
 */
#if !defined(ARRAY_SIZE)
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#endif
/**
 * Table of the well-formed UTF-8 byte sequences: one row per lead-byte range,
 * giving the inclusive range that each byte of the sequence must fall within.
 * The limits are held as unsigned char so that values above 0x7F are stored
 * exactly, whatever the signedness of plain char on the target.
 */
static const struct
{
	int len; /**< number of elements used in the following array (1 to 4) */
	struct
	{
		unsigned char lower; /**< lower limit of valid range (inclusive) */
		unsigned char upper; /**< upper limit of valid range (inclusive) */
	} bytes[4]; /**< up to 4 bytes can be used per character */
}
valid_ranges[] =
{
	{1, { {0x00, 0x7F} } },
	{2, { {0xC2, 0xDF}, {0x80, 0xBF} } },
	{3, { {0xE0, 0xE0}, {0xA0, 0xBF}, {0x80, 0xBF} } },
	{3, { {0xE1, 0xEC}, {0x80, 0xBF}, {0x80, 0xBF} } },
	{3, { {0xED, 0xED}, {0x80, 0x9F}, {0x80, 0xBF} } },
	{3, { {0xEE, 0xEF}, {0x80, 0xBF}, {0x80, 0xBF} } },
	{4, { {0xF0, 0xF0}, {0x90, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
	{4, { {0xF1, 0xF3}, {0x80, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
	{4, { {0xF4, 0xF4}, {0x80, 0x8F}, {0x80, 0xBF}, {0x80, 0xBF} } },
};

/**
 * Validate a single UTF-8 character
 * @param len the number of bytes available in "data"
 * @param data the bytes to check for a valid UTF-8 char
 * @return pointer to the start of the next UTF-8 character in "data", or NULL
 * if "data" is NULL, "len" is not positive, the character needs more bytes
 * than "len" provides, or its bytes match no row of valid_ranges
 */
static const char* UTF8_char_validate(int len, const char* data)
{
	const size_t row_count = sizeof(valid_ranges) / sizeof(valid_ranges[0]);
	/* compare as unsigned bytes so the checks match the table's unsigned limits */
	const unsigned char* bytes = (const unsigned char*)data;
	int charlen = 2;
	size_t row;

	if (data == NULL || len <= 0)
		return NULL; /* there is no byte we are allowed to read */

	/* first work out how many bytes this char is encoded in */
	if ((bytes[0] & 0x80) == 0)
		charlen = 1;
	else if ((bytes[0] & 0xF0) == 0xF0)
		charlen = 4;
	else if ((bytes[0] & 0xE0) == 0xE0)
		charlen = 3;

	if (charlen > len)
		return NULL; /* not enough characters in the string we were given */

	for (row = 0; row < row_count; ++row)
	{ /* just has to match one of these rows */
		int pos;

		if (valid_ranges[row].len != charlen)
			continue;

		for (pos = 0; pos < charlen; ++pos)
		{
			if (bytes[pos] < valid_ranges[row].bytes[pos].lower ||
				bytes[pos] > valid_ranges[row].bytes[pos].upper)
				break; /* failed the check for this row */
		}

		if (pos == charlen)
			return data + charlen; /* every byte was inside its range */
	}

	return NULL;
}
- /**
- * Validate a length-delimited string has only UTF-8 characters
- * @param len the length of the string in "data"
- * @param data the bytes to check for valid UTF-8 characters
- * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise
- */
- int UTF8_validate(int len, const char* data)
- {
- const char* curdata = NULL;
- int rc = 0;
- FUNC_ENTRY;
- if (len == 0)
- {
- rc = 1;
- goto exit;
- }
- curdata = UTF8_char_validate(len, data);
- while (curdata && (curdata < data + len))
- curdata = UTF8_char_validate((int)(data + len - curdata), curdata);
- rc = curdata != NULL;
- exit:
- FUNC_EXIT_RC(rc);
- return rc;
- }
- /**
- * Validate a null-terminated string has only UTF-8 characters
- * @param string the string to check for valid UTF-8 characters
- * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise
- */
- int UTF8_validateString(const char* string)
- {
- int rc = 0;
- FUNC_ENTRY;
- rc = UTF8_validate((int)strlen(string), string);
- FUNC_EXIT_RC(rc);
- return rc;
- }
- #if defined(UNIT_TESTS)
- #include <stdio.h>
/** A length-delimited byte string used as a validation test vector */
typedef struct
{
	int len;       /* number of bytes of data that belong to the vector */
	char data[20]; /* the bytes themselves; unused tail is zero-filled */
} tests;

/** Byte strings that UTF8_validate is expected to accept */
tests valid_strings[] =
{
	{3, "hjk" },                                  /* plain ASCII */
	{7, "\x41\xE2\x89\xA2\xCE\x91\x2E" },         /* 'A', U+2262, U+0391, '.' */
	{3, "f\xC9\xB1" },                            /* 'f', U+0271 */
	{9, "\xED\x95\x9C\xEA\xB5\xAD\xEC\x96\xB4" }, /* three 3-byte characters */
	{9, "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E" }, /* three 3-byte characters */
	{4, "\x2F\x2E\x2E\x2F" },                     /* "/../" in ASCII */
	{7, "\xEF\xBB\xBF\xF0\xA3\x8E\xB4" },         /* U+FEFF then 4-byte U+233B4 */
};

/** Byte strings that UTF8_validate is expected to reject */
tests invalid_strings[] =
{
	{2, "\xC0\x80" },                 /* overlong two-byte encoding of U+0000 */
	{5, "\x2F\xC0\xAE\x2E\x2F" },     /* overlong encoding of '.' inside "/../" */
	{6, "\xED\xA1\x8C\xED\xBE\xB4" }, /* surrogate pair D84C/DFB4 encoded directly */
	{1, "\xF4" },                     /* 4-byte lead with no continuation bytes */
};
- int main (int argc, char *argv[])
- {
- int i, failed = 0;
- for (i = 0; i < ARRAY_SIZE(valid_strings); ++i)
- {
- if (!UTF8_validate(valid_strings[i].len, valid_strings[i].data))
- {
- printf("valid test %d failed\n", i);
- failed = 1;
- }
- else
- printf("valid test %d passed\n", i);
- }
- for (i = 0; i < ARRAY_SIZE(invalid_strings); ++i)
- {
- if (UTF8_validate(invalid_strings[i].len, invalid_strings[i].data))
- {
- printf("invalid test %d failed\n", i);
- failed = 1;
- }
- else
- printf("invalid test %d passed\n", i);
- }
- if (failed)
- printf("Failed\n");
- else
- printf("Passed\n");
- return 0;
- } /* End of main function*/
- #endif
|