unicode: cache the normalization tables in struct unicode_map (6ca99ce7) · Commits · EulixOS / Software / Kernel

fs/unicode/utf8-core.c

+19 −18

Original line number	Diff line number	Diff line
		@@ -5,16 +5,13 @@
		#include <linux/slab.h>
		#include <linux/parser.h>
		#include <linux/errno.h>
		#include <linux/unicode.h>
		#include <linux/stringhash.h>

		#include "utf8n.h"

		int utf8_validate(const struct unicode_map *um, const struct qstr *str)
		{
		const struct utf8data *data = utf8nfdi(um->version);

		if (utf8nlen(data, str->name, str->len) < 0)
		if (utf8nlen(um, UTF8_NFDI, str->name, str->len) < 0)
		return -1;
		return 0;
		}
		@@ -23,14 +20,13 @@ EXPORT_SYMBOL(utf8_validate);
		int utf8_strncmp(const struct unicode_map *um,
		const struct qstr *s1, const struct qstr *s2)
		{
		const struct utf8data *data = utf8nfdi(um->version);
		struct utf8cursor cur1, cur2;
		int c1, c2;

		if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
		if (utf8ncursor(&cur1, um, UTF8_NFDI, s1->name, s1->len) < 0)
		return -EINVAL;

		if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
		if (utf8ncursor(&cur2, um, UTF8_NFDI, s2->name, s2->len) < 0)
		return -EINVAL;

		do {
		@@ -50,14 +46,13 @@ EXPORT_SYMBOL(utf8_strncmp);
		int utf8_strncasecmp(const struct unicode_map *um,
		const struct qstr *s1, const struct qstr *s2)
		{
		const struct utf8data *data = utf8nfdicf(um->version);
		struct utf8cursor cur1, cur2;
		int c1, c2;

		if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
		if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0)
		return -EINVAL;

		if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
		if (utf8ncursor(&cur2, um, UTF8_NFDICF, s2->name, s2->len) < 0)
		return -EINVAL;

		do {
		@@ -81,12 +76,11 @@ int utf8_strncasecmp_folded(const struct unicode_map *um,
		const struct qstr *cf,
		const struct qstr *s1)
		{
		const struct utf8data *data = utf8nfdicf(um->version);
		struct utf8cursor cur1;
		int c1, c2;
		int i = 0;

		if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
		if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0)
		return -EINVAL;

		do {
		@@ -105,11 +99,10 @@ EXPORT_SYMBOL(utf8_strncasecmp_folded);
		int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
		unsigned char *dest, size_t dlen)
		{
		const struct utf8data *data = utf8nfdicf(um->version);
		struct utf8cursor cur;
		size_t nlen = 0;

		if (utf8ncursor(&cur, data, str->name, str->len) < 0)
		if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0)
		return -EINVAL;

		for (nlen = 0; nlen < dlen; nlen++) {
		@@ -128,12 +121,11 @@ EXPORT_SYMBOL(utf8_casefold);
		int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
		struct qstr *str)
		{
		const struct utf8data *data = utf8nfdicf(um->version);
		struct utf8cursor cur;
		int c;
		unsigned long hash = init_name_hash(salt);

		if (utf8ncursor(&cur, data, str->name, str->len) < 0)
		if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0)
		return -EINVAL;

		while ((c = utf8byte(&cur))) {
		@@ -149,11 +141,10 @@ EXPORT_SYMBOL(utf8_casefold_hash);
		int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
		unsigned char *dest, size_t dlen)
		{
		const struct utf8data *data = utf8nfdi(um->version);
		struct utf8cursor cur;
		ssize_t nlen = 0;

		if (utf8ncursor(&cur, data, str->name, str->len) < 0)
		if (utf8ncursor(&cur, um, UTF8_NFDI, str->name, str->len) < 0)
		return -EINVAL;

		for (nlen = 0; nlen < dlen; nlen++) {
		@@ -180,7 +171,17 @@ struct unicode_map *utf8_load(unsigned int version)
		if (!um)
		return ERR_PTR(-ENOMEM);
		um->version = version;
		um->ntab[UTF8_NFDI] = utf8nfdi(version);
		if (!um->ntab[UTF8_NFDI])
		goto out_free_um;
		um->ntab[UTF8_NFDICF] = utf8nfdicf(version);
		if (!um->ntab[UTF8_NFDICF])
		goto out_free_um;
		return um;

		out_free_um:
		kfree(um);
		return ERR_PTR(-EINVAL);
		}
		EXPORT_SYMBOL(utf8_load);

fs/unicode/utf8-norm.c

+20 −25

Original line number	Diff line number	Diff line
		@@ -309,21 +309,19 @@ utf8hangul(const char *str, unsigned char *hangul)
		* is well-formed and corresponds to a known unicode code point. The
		* shorthand for this will be "is valid UTF-8 unicode".
		*/
		static utf8leaf_t *utf8nlookup(const struct utf8data *data,
		unsigned char *hangul, const char *s, size_t len)
		static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
		enum utf8_normalization n, unsigned char *hangul, const char *s,
		size_t len)
		{
		utf8trie_t *trie = NULL;
		utf8trie_t *trie = utf8data + um->ntab[n]->offset;
		int offlen;
		int offset;
		int mask;
		int node;

		if (!data)
		return NULL;
		if (len == 0)
		return NULL;

		trie = utf8data + data->offset;
		node = 1;
		while (node) {
		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
		@@ -385,29 +383,28 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data,
		*
		* Forwards to utf8nlookup().
		*/
		static utf8leaf_t *utf8lookup(const struct utf8data *data,
		unsigned char *hangul, const char *s)
		static utf8leaf_t *utf8lookup(const struct unicode_map *um,
		enum utf8_normalization n, unsigned char *hangul, const char *s)
		{
		return utf8nlookup(data, hangul, s, (size_t)-1);
		return utf8nlookup(um, n, hangul, s, (size_t)-1);
		}

		/*
		* Length of the normalization of s, touch at most len bytes.
		* Return -1 if s is not valid UTF-8 unicode.
		*/
		ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
		ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
		const char *s, size_t len)
		{
		utf8leaf_t *leaf;
		size_t ret = 0;
		unsigned char hangul[UTF8HANGULLEAF];

		if (!data)
		return -1;
		while (len && *s) {
		leaf = utf8nlookup(data, hangul, s, len);
		leaf = utf8nlookup(um, n, hangul, s, len);
		if (!leaf)
		return -1;
		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
		if (utf8agetab[LEAF_GEN(leaf)] > um->ntab[n]->maxage)
		ret += utf8clen(s);
		else if (LEAF_CCC(leaf) == DECOMPOSE)
		ret += strlen(LEAF_STR(leaf));
		@@ -430,14 +427,13 @@ EXPORT_SYMBOL(utf8nlen);
		*
		* Returns -1 on error, 0 on success.
		*/
		int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
		const char *s, size_t len)
		int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
		enum utf8_normalization n, const char *s, size_t len)
		{
		if (!data)
		return -1;
		if (!s)
		return -1;
		u8c->data = data;
		u8c->um = um;
		u8c->n = n;
		u8c->s = s;
		u8c->p = NULL;
		u8c->ss = NULL;
		@@ -512,9 +508,9 @@ int utf8byte(struct utf8cursor *u8c)

		/* Look up the data for the current character. */
		if (u8c->p) {
		leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
		leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
		} else {
		leaf = utf8nlookup(u8c->data, u8c->hangul,
		leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul,
		u8c->s, u8c->len);
		}

		@@ -524,7 +520,8 @@ int utf8byte(struct utf8cursor *u8c)

		ccc = LEAF_CCC(leaf);
		/* Characters that are too new have CCC 0. */
		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
		if (utf8agetab[LEAF_GEN(leaf)] >
		u8c->um->ntab[u8c->n]->maxage) {
		ccc = STOPPER;
		} else if (ccc == DECOMPOSE) {
		u8c->len -= utf8clen(u8c->s);
		@@ -538,7 +535,7 @@ int utf8byte(struct utf8cursor *u8c)
		goto ccc_mismatch;
		}

		leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
		leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
		if (!leaf)
		return -1;
		ccc = LEAF_CCC(leaf);
		@@ -611,7 +608,6 @@ const struct utf8data *utf8nfdi(unsigned int maxage)
		return NULL;
		return &utf8nfdidata[i];
		}
		EXPORT_SYMBOL(utf8nfdi);

		const struct utf8data *utf8nfdicf(unsigned int maxage)
		{
		@@ -623,4 +619,3 @@ const struct utf8data *utf8nfdicf(unsigned int maxage)
		return NULL;
		return &utf8nfdicfdata[i];
		}
		EXPORT_SYMBOL(utf8nfdicf);

fs/unicode/utf8-selftest.c

+33 −47

Original line number	Diff line number	Diff line
		@@ -18,9 +18,7 @@ unsigned int failed_tests;
		unsigned int total_tests;

		/* Tests will be based on this version. */
		#define latest_maj 12
		#define latest_min 1
		#define latest_rev 0
		#define UTF8_LATEST UNICODE_AGE(12, 1, 0)

		#define _test(cond, func, line, fmt, ...) do { \
		total_tests++; \
		@@ -160,29 +158,22 @@ static const struct {
		}
		};

		static ssize_t utf8len(const struct utf8data *data, const char *s)
		static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n,
		const char *s)
		{
		return utf8nlen(data, s, (size_t)-1);
		return utf8nlen(um, n, s, (size_t)-1);
		}

		static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
		const char *s)
		static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
		enum utf8_normalization n, const char *s)
		{
		return utf8ncursor(u8c, data, s, (unsigned int)-1);
		return utf8ncursor(u8c, um, n, s, (unsigned int)-1);
		}

		static void check_utf8_nfdi(void)
		static void check_utf8_nfdi(struct unicode_map *um)
		{
		int i;
		struct utf8cursor u8c;
		const struct utf8data *data;

		data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev));
		if (!data) {
		pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
		__func__, latest_maj, latest_min, latest_rev);
		return;
		}

		for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
		int len = strlen(nfdi_test_data[i].str);
		@@ -190,10 +181,11 @@ static void check_utf8_nfdi(void)
		int j = 0;
		unsigned char c;

		test((utf8len(data, nfdi_test_data[i].str) == nlen));
		test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen));
		test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen));
		test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) ==
		nlen));

		if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0)
		if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0)
		pr_err("can't create cursor\n");

		while ((c = utf8byte(&u8c)) > 0) {
		@@ -207,18 +199,10 @@ static void check_utf8_nfdi(void)
		}
		}

		static void check_utf8_nfdicf(void)
		static void check_utf8_nfdicf(struct unicode_map *um)
		{
		int i;
		struct utf8cursor u8c;
		const struct utf8data *data;

		data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev));
		if (!data) {
		pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
		__func__, latest_maj, latest_min, latest_rev);
		return;
		}

		for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
		int len = strlen(nfdicf_test_data[i].str);
		@@ -226,10 +210,13 @@ static void check_utf8_nfdicf(void)
		int j = 0;
		unsigned char c;

		test((utf8len(data, nfdicf_test_data[i].str) == nlen));
		test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen));
		test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) ==
		nlen));
		test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) ==
		nlen));

		if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0)
		if (utf8cursor(&u8c, um, UTF8_NFDICF,
		nfdicf_test_data[i].str) < 0)
		pr_err("can't create cursor\n");

		while ((c = utf8byte(&u8c)) > 0) {
		@@ -243,16 +230,9 @@ static void check_utf8_nfdicf(void)
		}
		}

		static void check_utf8_comparisons(void)
		static void check_utf8_comparisons(struct unicode_map *table)
		{
		int i;
		struct unicode_map *table = utf8_load(UNICODE_AGE(12, 1, 0));

		if (IS_ERR(table)) {
		pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
		__func__, latest_maj, latest_min, latest_rev);
		return;
		}

		for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
		const struct qstr s1 = {.name = nfdi_test_data[i].str,
		@@ -273,8 +253,6 @@ static void check_utf8_comparisons(void)
		test_f(!utf8_strncasecmp(table, &s1, &s2),
		"%s %s comparison mismatch\n", s1.name, s2.name);
		}

		utf8_unload(table);
		}

		static void check_supported_versions(void)
		@@ -286,8 +264,7 @@ static void check_supported_versions(void)
		test(utf8version_is_supported(UNICODE_AGE(9, 0, 0)));

		/* Unicode 1x.0.0 (the latest version) should be supported. */
		test(utf8version_is_supported(
		UNICODE_AGE(latest_maj, latest_min, latest_rev)));
		test(utf8version_is_supported(UTF8_LATEST));

		/* Next versions don't exist. */
		test(!utf8version_is_supported(UNICODE_AGE(13, 0, 0)));
		@@ -297,19 +274,28 @@ static void check_supported_versions(void)

		static int __init init_test_ucd(void)
		{
		struct unicode_map *um;

		failed_tests = 0;
		total_tests = 0;

		um = utf8_load(UTF8_LATEST);
		if (IS_ERR(um)) {
		pr_err("%s: Unable to load utf8 table.\n", __func__);
		return PTR_ERR(um);
		}

		check_supported_versions();
		check_utf8_nfdi();
		check_utf8_nfdicf();
		check_utf8_comparisons();
		check_utf8_nfdi(um);
		check_utf8_nfdicf(um);
		check_utf8_comparisons(um);

		if (!failed_tests)
		pr_info("All %u tests passed\n", total_tests);
		else
		pr_err("%u out of %u tests failed\n", failed_tests,
		total_tests);
		utf8_unload(um);
		return 0;
		}

fs/unicode/utf8n.h

+6 −4

Original line number	Diff line number	Diff line
		@@ -39,7 +39,8 @@ extern const struct utf8data *utf8nfdicf(unsigned int maxage);
		* Returns 0 if only ignorable code points are present.
		* Returns -1 if the input is not valid UTF-8.
		*/
		extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
		ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
		const char *s, size_t len);

		/* Needed in struct utf8cursor below. */
		#define UTF8HANGULLEAF (12)
		@@ -48,7 +49,8 @@ extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
		* Cursor structure used by the normalizer.
		*/
		struct utf8cursor {
		const struct utf8data *data;
		const struct unicode_map *um;
		enum utf8_normalization n;
		const char *s;
		const char *p;
		const char *ss;
		@@ -65,8 +67,8 @@ struct utf8cursor {
		* Returns 0 on success.
		* Returns -1 on failure.
		*/
		extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
		const char *s, size_t len);
		int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
		enum utf8_normalization n, const char *s, size_t len);

		/*
		* Get the next byte in the normalization.

include/linux/unicode.h

+19 −0

Original line number	Diff line number	Diff line
		@@ -5,6 +5,8 @@
		#include <linux/init.h>
		#include <linux/dcache.h>

		struct utf8data;

		#define UNICODE_MAJ_SHIFT 16
		#define UNICODE_MIN_SHIFT 8

		@@ -28,8 +30,25 @@ static inline u8 unicode_rev(unsigned int age)
		return age & 0xff;
		}

		/*
		* Two normalization forms are supported:
		* 1) NFDI
		* - Apply unicode normalization form NFD.
		* - Remove any Default_Ignorable_Code_Point.
		* 2) NFDICF
		* - Apply unicode normalization form NFD.
		* - Remove any Default_Ignorable_Code_Point.
		* - Apply a full casefold (C + F).
		*/
		/* Selects a normalization form; the values index unicode_map.ntab[]. */
		enum utf8_normalization {
		UTF8_NFDI = 0,	/* NFD with Default_Ignorable_Code_Point removed */
		UTF8_NFDICF,	/* same as NFDI, plus a full casefold (C + F) */
		UTF8_NMAX,	/* number of forms above; sizes the ntab[] array */
		};

		/*
		* A unicode version as set up by utf8_load(), together with the
		* normalization tables looked up for that version, so that users of
		* the map do not have to search for the tables on every call.
		*/
		struct unicode_map {
		/* Version value that was passed to utf8_load(). */
		unsigned int version;
		/* One table per normalization form, indexed by enum utf8_normalization. */
		const struct utf8data *ntab[UTF8_NMAX];
		};

		int utf8_validate(const struct unicode_map *um, const struct qstr *str);