CREATE TABLE BidiClasses (id SERIAL PRIMARY KEY, name TEXT NOT NULL UNIQUE, description TEXT); CREATE TABLE Categories (id SERIAL PRIMARY KEY, name TEXT NOT NULL UNIQUE, description TEXT); CREATE TABLE Properties (id SERIAL PRIMARY KEY, shorthand TEXT NOT NULL UNIQUE, description TEXT NOT NULL UNIQUE); CREATE TABLE Blocks (id SERIAL PRIMARY KEY, name TEXT NOT NULL UNIQUE); CREATE TABLE Han_Properties (id SERIAL PRIMARY KEY, codepoint INTEGER, Definition TEXT, TotalStrokes INTEGER); -- TODO: other properties of Unihan CREATE TABLE Characters (id SERIAL PRIMARY KEY, codepoint INTEGER NOT NULL UNIQUE, name TEXT NOT NULL, -- Warning: unlike what some people say, Unicode names -- are not UNIQUE ( characters, for instance, have no unique name) version TEXT, -- Version of Unicode where it was added category TEXT REFERENCES Categories(name), bidiclass TEXT REFERENCES BidiCLasses(name)); CREATE TABLE Characters_Properties (id SERIAL PRIMARY KEY, codepoint INTEGER, -- TODO: This schema works only for boolean properties property TEXT REFERENCES Properties(description)); -- Displays a codepoint in standard Unicode notation -- TODO: pad with leading zeroes CREATE FUNCTION To_U(INTEGER) RETURNS TEXT AS 'SELECT ''U+'' || Upper(To_hex($1))' LANGUAGE 'SQL'; -- Complicated, yes. See http://www.varlena.com/GeneralBits/104.php -- That's why it's in comments. Better to use something like: -- SELECT * FROM Characters WHERE codepoint = x'0DC7'::INTEGER; --CREATE FUNCTION To_D(TEXT) RETURNS INTEGER AS -- $$ -- DECLARE -- r RECORD; -- BEGIN -- FOR r IN EXECUTE 'SELECT x'''||$1||'''::integer AS hex' LOOP -- RETURN r.hex; -- END LOOP; -- END -- $$ -- LANGUAGE 'PLPGSQL' IMMUTABLE STRICT;