/*
 * UCS-4 to UTF-8 converter
 * by Sami Farin
 * v1.0.0 (20020405)
 * released under GPL
 *
 * references: rfc2279
 *
 * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0400 0000-7FFF FFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 */

/*
 * NOTE(review): the original file listed seven #include directives whose
 * header names were lost; the six below are the ones this code requires.
 * Restore the seventh if anything else in the build depends on it.
 */
#include <errno.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Longest sequence in the table above is six octets. */
enum { UTF8_MAX_LEN = 6 };

/* Largest value representable by the six-octet form. */
#define UCS4_MAX 0x7FFFFFFFu

/*
 * Parse a hexadecimal UCS-4 value from `text`.
 *
 * On success stores the value in `*out` and returns 0.
 * Returns -1 when `text` is empty, is not entirely a hex number, carries a
 * minus sign, overflows, or exceeds UCS4_MAX; `*out` is then left untouched.
 */
static int parse_ucs4(const char *text, uint32_t *out)
{
    char *end = NULL;
    unsigned long long value;

    /* strtoull() accepts "-N" and negates it modulo 2^64; refuse that. */
    if (strchr(text, '-') != NULL) {
        return -1;
    }

    errno = 0;
    value = strtoull(text, &end, 16);

    /*
     * Range-check the full-width result: narrowing to uint32_t first would
     * let values such as 100000041 slip through as 0x41.
     */
    if (end == text || *end != '\0' || errno != 0 || value > UCS4_MAX) {
        return -1;
    }

    *out = (uint32_t)value;
    return 0;
}

/*
 * Encode `uni` (0 ... UCS4_MAX) into `out` following the table at the top
 * of the file.
 *
 * `out` must have room for UTF8_MAX_LEN + 1 bytes; it is zero-filled first,
 * so the result is always NUL-terminated. Returns the number of octets
 * written (1 ... 6).
 */
static size_t ucs4_to_utf8(uint32_t uni, uint8_t out[UTF8_MAX_LEN + 1])
{
    /* Marker bits of the leading octet, indexed by continuation count. */
    static const uint8_t lead_marker[UTF8_MAX_LEN] =
        {0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
    /* Payload bits available in the leading octet, same indexing. */
    static const uint8_t lead_mask[UTF8_MAX_LEN] =
        {0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01};
    size_t trail; /* number of 10xxxxxx continuation octets */
    size_t i;

    if (uni <= 0x0000007F) {
        trail = 0;
    } else if (uni <= 0x000007FF) {
        trail = 1;
    } else if (uni <= 0x0000FFFF) {
        trail = 2;
    } else if (uni <= 0x001FFFFF) {
        trail = 3;
    } else if (uni <= 0x03FFFFFF) {
        trail = 4;
    } else {
        trail = 5;
    }

    memset(out, 0, UTF8_MAX_LEN + 1);

    /* The leading octet carries the most significant payload bits. */
    out[0] = (uint8_t)(((uni >> (trail * 6)) & lead_mask[trail])
                       | lead_marker[trail]);

    /* Fill continuation octets from last to first, six bits at a time. */
    for (i = trail; i > 0; i--) {
        out[i] = (uint8_t)((uni & 0x3F) | 0x80);
        uni >>= 6;
    }

    return trail + 1;
}

/*
 * For every command-line argument (a hexadecimal UCS-4 value) print one
 * line: the value, its UTF-8 octets in hex, and the raw encoded bytes.
 *
 * Exits with EXIT_FAILURE on missing arguments or on the first invalid
 * value; lines for earlier valid arguments have already been printed.
 */
int main(int argc, char* argv[])
{
    int curr_arg;

    if (argc < 2) {
        /* argv[0] may be NULL when argc == 0, so fall back to a fixed name. */
        fprintf(stderr,
                "usage: %.100s value1 value2 ... (UCS-4, e.g. 308E)\n",
                argc > 0 ? argv[0] : "ucs4toutf8");
        return EXIT_FAILURE;
    }

    for (curr_arg = 1; curr_arg < argc; curr_arg++) {
        uint32_t uni;
        uint8_t utf8_out[UTF8_MAX_LEN + 1];
        size_t utf8_len;
        size_t i;

        if (parse_ucs4(argv[curr_arg], &uni) != 0) {
            fprintf(stderr, "invalid value, must be 0 ... 7FFFFFFF\n");
            return EXIT_FAILURE;
        }

        utf8_len = ucs4_to_utf8(uni, utf8_out);

        fprintf(stdout, "0x%08" PRIx32 ":", uni);
        for (i = 0; i < utf8_len; i++) {
            fprintf(stdout, " %" PRIx8, utf8_out[i]);
        }
        fprintf(stdout, ": [ ");
        /* The buffer is NUL-terminated; for value 0 this prints nothing. */
        fprintf(stdout, "%s", (const char *)utf8_out);
        fprintf(stdout, " ]\n");
    }

    return EXIT_SUCCESS;
}