Commit 2b3d0478 authored by Christoph Hellwig, committed by Gabriel Krisman Bertazi
Browse files

unicode: Add utf8-data module



utf8data.h contains a large database table: an auto-generated decoding
trie used by the Unicode normalization functions.

Allow building it into a separate module.

Based on a patch from Shreeya Patel <shreeya.patel@collabora.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
parent 6ca99ce7
Loading
Loading
Loading
Loading
+11 −2
Original line number Diff line number Diff line
@@ -8,7 +8,16 @@ config UNICODE
	  Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
	  support.

config UNICODE_UTF8_DATA
	tristate "UTF-8 normalization and casefolding tables"
	depends on UNICODE
	default UNICODE
	help
	  This contains a large table of case foldings, which can be loaded as
	  a separate module if you say M here.  To be on the safe side, stick
	  to the default of Y.  Saying N here makes no sense; if you do not
	  want UTF-8 casefolding support, disable CONFIG_UNICODE instead.

config UNICODE_NORMALIZATION_SELFTEST
	tristate "Test UTF-8 normalization support"
	depends on UNICODE
	default n
	depends on UNICODE_UTF8_DATA
+7 −6
Original line number Diff line number Diff line
@@ -2,14 +2,15 @@

obj-$(CONFIG_UNICODE) += unicode.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o

unicode-y := utf8-norm.o utf8-core.o

$(obj)/utf8-norm.o: $(obj)/utf8data.h
$(obj)/utf8-data.o: $(obj)/utf8data.c

# In the normal build, the checked-in utf8data.h is just shipped.
# In the normal build, the checked-in utf8data.c is just shipped.
#
# To generate utf8data.h from UCD, put *.txt files in this directory
# To generate utf8data.c from UCD, put *.txt files in this directory
# and pass REGENERATE_UTF8DATA=1 from the command line.
ifdef REGENERATE_UTF8DATA

@@ -24,15 +25,15 @@ quiet_cmd_utf8data = GEN $@
		-t $(srctree)/$(src)/NormalizationTest.txt \
		-o $@

$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
$(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
	$(call if_changed,utf8data)

else

$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE
$(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE
	$(call if_changed,shipped)

endif

targets += utf8data.h
targets += utf8data.c
hostprogs += mkutf8data
+19 −5
Original line number Diff line number Diff line
@@ -3287,12 +3287,10 @@ static void write_file(void)
		open_fail(utf8_name, errno);

	fprintf(file, "/* This file is generated code, do not edit. */\n");
	fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
	fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
	fprintf(file, "#endif\n");
	fprintf(file, "\n");
	fprintf(file, "static const unsigned int utf8vers = %#x;\n",
		unicode_maxage);
	fprintf(file, "#include <linux/module.h>\n");
	fprintf(file, "#include <linux/kernel.h>\n");
	fprintf(file, "#include \"utf8n.h\"\n");
	fprintf(file, "\n");
	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
	for (i = 0; i != ages_count; i++)
@@ -3339,6 +3337,22 @@ static void write_file(void)
		fprintf(file, "\n");
	}
	fprintf(file, "};\n");
	fprintf(file, "\n");
	fprintf(file, "struct utf8data_table utf8_data_table = {\n");
	fprintf(file, "\t.utf8agetab = utf8agetab,\n");
	fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
	fprintf(file, "\n");
	fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
	fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
	fprintf(file, "\n");
	fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
	fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
	fprintf(file, "\n");
	fprintf(file, "\t.utf8data = utf8data,\n");
	fprintf(file, "};\n");
	fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
	fprintf(file, "\n");
	fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
	fclose(file);
}

+31 −8
Original line number Diff line number Diff line
@@ -160,25 +160,45 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
}
EXPORT_SYMBOL(utf8_normalize);

/*
 * Pick the entry of @table whose maxage is exactly @version.
 *
 * The table is scanned from its last entry towards its first.  Entries whose
 * maxage is newer than @version are skipped; the first entry that is not
 * newer ends the scan and is returned only if its maxage equals @version.
 * (Presumably the tables are emitted in ascending maxage order, which is what
 * makes a backwards scan meaningful - confirm against the generator.)
 *
 * @table:      array of per-version entries
 * @nr_entries: number of elements in @table
 * @version:    unicode version to look up
 *
 * Returns a pointer into @table, or NULL when there is no exact match.  NULL
 * is also returned for an empty table and when every entry is newer than
 * @version, rather than letting the index wrap below zero and reading
 * outside the array.
 */
static const struct utf8data *find_table_version(const struct utf8data *table,
		size_t nr_entries, unsigned int version)
{
	size_t i = nr_entries;

	/* "i-- > 0" visits nr_entries - 1 down to 0 without wrapping. */
	while (i-- > 0) {
		if (version < table[i].maxage)
			continue;
		if (version > table[i].maxage)
			return NULL;
		return &table[i];
	}
	return NULL;
}

struct unicode_map *utf8_load(unsigned int version)
{
	struct unicode_map *um;

	if (!utf8version_is_supported(version))
		return ERR_PTR(-EINVAL);

	um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
	if (!um)
		return ERR_PTR(-ENOMEM);
	um->version = version;
	um->ntab[UTF8_NFDI] = utf8nfdi(version);
	if (!um->ntab[UTF8_NFDI])

	um->tables = symbol_request(utf8_data_table);
	if (!um->tables)
		goto out_free_um;
	um->ntab[UTF8_NFDICF] = utf8nfdicf(version);

	if (!utf8version_is_supported(um, version))
		goto out_symbol_put;
	um->ntab[UTF8_NFDI] = find_table_version(um->tables->utf8nfdidata,
			um->tables->utf8nfdidata_size, um->version);
	if (!um->ntab[UTF8_NFDI])
		goto out_symbol_put;
	um->ntab[UTF8_NFDICF] = find_table_version(um->tables->utf8nfdicfdata,
			um->tables->utf8nfdicfdata_size, um->version);
	if (!um->ntab[UTF8_NFDICF])
		goto out_free_um;
		goto out_symbol_put;
	return um;

out_symbol_put:
	symbol_put(um->tables);
out_free_um:
	kfree(um);
	return ERR_PTR(-EINVAL);
@@ -187,8 +207,11 @@ EXPORT_SYMBOL(utf8_load);

/*
 * Tear down a unicode_map: drop one reference on the utf8_data_table symbol
 * with symbol_put() and free @um.  Passing NULL is a no-op.
 */
void utf8_unload(struct unicode_map *um)
{
	if (!um)
		return;

	symbol_put(utf8_data_table);
	kfree(um);
}
EXPORT_SYMBOL(utf8_unload);

MODULE_LICENSE("GPL v2");
+9 −39
Original line number Diff line number Diff line
@@ -6,21 +6,12 @@

#include "utf8n.h"

/* One entry of the per-version tables utf8nfdidata[] and utf8nfdicfdata[]. */
struct utf8data {
	/* Compared against the requested unicode version when picking an entry. */
	unsigned int maxage;
	/* Added to the start of utf8data[] to locate the trie (see utf8nlookup()). */
	unsigned int offset;
};

#define __INCLUDED_FROM_UTF8NORM_C__
#include "utf8data.h"
#undef __INCLUDED_FROM_UTF8NORM_C__

int utf8version_is_supported(unsigned int version)
int utf8version_is_supported(const struct unicode_map *um, unsigned int version)
{
	int i = ARRAY_SIZE(utf8agetab) - 1;
	int i = um->tables->utf8agetab_size - 1;

	while (i >= 0 && utf8agetab[i] != 0) {
		if (version == utf8agetab[i])
	while (i >= 0 && um->tables->utf8agetab[i] != 0) {
		if (version == um->tables->utf8agetab[i])
			return 1;
		i--;
	}
@@ -161,7 +152,7 @@ typedef const unsigned char utf8trie_t;
 * underlying datatype: unsigned char.
 *
 * leaf[0]: The unicode version, stored as a generation number that is
 *          an index into utf8agetab[].  With this we can filter code
 *          an index into ->utf8agetab[].  With this we can filter code
 *          points based on the unicode version in which they were
 *          defined.  The CCC of a non-defined code point is 0.
 * leaf[1]: Canonical Combining Class. During normalization, we need
@@ -313,7 +304,7 @@ static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
		enum utf8_normalization n, unsigned char *hangul, const char *s,
		size_t len)
{
	utf8trie_t	*trie = utf8data + um->ntab[n]->offset;
	utf8trie_t	*trie = um->tables->utf8data + um->ntab[n]->offset;
	int		offlen;
	int		offset;
	int		mask;
@@ -404,7 +395,8 @@ ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
		leaf = utf8nlookup(um, n, hangul, s, len);
		if (!leaf)
			return -1;
		if (utf8agetab[LEAF_GEN(leaf)] > um->ntab[n]->maxage)
		if (um->tables->utf8agetab[LEAF_GEN(leaf)] >
		    um->ntab[n]->maxage)
			ret += utf8clen(s);
		else if (LEAF_CCC(leaf) == DECOMPOSE)
			ret += strlen(LEAF_STR(leaf));
@@ -520,7 +512,7 @@ int utf8byte(struct utf8cursor *u8c)

		ccc = LEAF_CCC(leaf);
		/* Characters that are too new have CCC 0. */
		if (utf8agetab[LEAF_GEN(leaf)] >
		if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] >
		    u8c->um->ntab[u8c->n]->maxage) {
			ccc = STOPPER;
		} else if (ccc == DECOMPOSE) {
@@ -597,25 +589,3 @@ int utf8byte(struct utf8cursor *u8c)
	}
}
EXPORT_SYMBOL(utf8byte);

const struct utf8data *utf8nfdi(unsigned int maxage)
{
	int i = ARRAY_SIZE(utf8nfdidata) - 1;

	while (maxage < utf8nfdidata[i].maxage)
		i--;
	if (maxage > utf8nfdidata[i].maxage)
		return NULL;
	return &utf8nfdidata[i];
}

const struct utf8data *utf8nfdicf(unsigned int maxage)
{
	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;

	while (maxage < utf8nfdicfdata[i].maxage)
		i--;
	if (maxage > utf8nfdicfdata[i].maxage)
		return NULL;
	return &utf8nfdicfdata[i];
}
Loading