From 52bc2f64f44ebf8ef938b37abc9cbfe53eaa9220 Mon Sep 17 00:00:00 2001
From: Echo
Date: Wed, 2 Jul 2025 10:58:39 +0200
Subject: [PATCH] Import Emojibase data (#35229)

---
 .../mastodon/features/emoji/locale.test.ts    |  64 ++++++++
 .../mastodon/features/emoji/locale.ts         |  61 +++++++
 .../mastodon/features/emoji/normalize.test.ts |  72 +++++++++
 .../mastodon/features/emoji/normalize.ts      | 151 ++++++++++++++++++
 package.json                                  |   2 +
 yarn.lock                                     |  18 +++
 6 files changed, 368 insertions(+)
 create mode 100644 app/javascript/mastodon/features/emoji/locale.test.ts
 create mode 100644 app/javascript/mastodon/features/emoji/locale.ts
 create mode 100644 app/javascript/mastodon/features/emoji/normalize.test.ts
 create mode 100644 app/javascript/mastodon/features/emoji/normalize.ts

diff --git a/app/javascript/mastodon/features/emoji/locale.test.ts b/app/javascript/mastodon/features/emoji/locale.test.ts
new file mode 100644
index 000000000..0e098b2d4
--- /dev/null
+++ b/app/javascript/mastodon/features/emoji/locale.test.ts
@@ -0,0 +1,64 @@
+import { flattenEmojiData, SUPPORTED_LOCALES } from 'emojibase';
+import emojiEnData from 'emojibase-data/en/compact.json';
+import emojiFrData from 'emojibase-data/fr/compact.json';
+
+import { toSupportedLocale, unicodeToLocaleLabel } from './locale';
+
+describe('unicodeToLocaleLabel', () => {
+  const emojiTestCases = [
+    '1F3CB-1F3FF-200D-2640-FE0F', // 🏋🏿‍♀️ Woman weightlifter, dark skin
+    '1F468-1F3FB', // 👨🏻 Man, light skin
+    '1F469-1F3FB-200D-2695-FE0F', // 👩🏻‍⚕️ Woman health worker, light skin
+    '1F468-1F3FD-200D-1F692', // 👨🏽‍🚒 Man firefighter, medium skin
+    '1F469-1F3FE', // 👩🏾 Woman, medium-dark skin
+    '1F469-1F3FF-200D-1F4BB', // 👩🏿‍💻 Woman technologist, dark skin
+    '1F478-1F3FF', // 👸🏿 Princess with dark skin tone
+    '1F935-1F3FC-200D-2640-FE0F', // 🤵🏼‍♀️ Woman in tuxedo, medium-light skin
+    '1F9D1-1F3FC', // 🧑🏼 Person, medium-light skin
+    '1F9D4-1F3FE', // 🧔🏾 Person with beard, medium-dark skin
+  ];
+
+  const flattenedEnData = flattenEmojiData(emojiEnData);
+  const flattenedFrData = flattenEmojiData(emojiFrData);
+
+  const emojiTestEnLabels = new Map(
+    emojiTestCases.map((code) => [
+      code,
+      flattenedEnData.find((emoji) => emoji.hexcode === code)?.label,
+    ]),
+  );
+  const emojiTestFrLabels = new Map(
+    emojiTestCases.map((code) => [
+      code,
+      flattenedFrData.find((emoji) => emoji.hexcode === code)?.label,
+    ]),
+  );
+
+  test.for(
+    emojiTestCases.flatMap((code) => [
+      [code, 'en', emojiTestEnLabels.get(code)],
+      [code, 'fr', emojiTestFrLabels.get(code)],
+    ]) satisfies [string, string, string | undefined][],
+  )(
+    'returns correct label for %s for %s locale',
+    async ([unicodeHex, locale, expectedLabel]) => {
+      const label = await unicodeToLocaleLabel(unicodeHex, locale);
+      expect(label).toBe(expectedLabel);
+    },
+  );
+});
+
+describe('toSupportedLocale', () => {
+  test('returns the same locale if it is supported', () => {
+    for (const locale of SUPPORTED_LOCALES) {
+      expect(toSupportedLocale(locale)).toBe(locale);
+    }
+  });
+
+  test('returns "en" for unsupported locales', () => {
+    const unsupportedLocales = ['xx', 'fr-CA'];
+    for (const locale of unsupportedLocales) {
+      expect(toSupportedLocale(locale)).toBe('en');
+    }
+  });
+});
diff --git a/app/javascript/mastodon/features/emoji/locale.ts b/app/javascript/mastodon/features/emoji/locale.ts
new file mode 100644
index 000000000..aac6c376b
--- /dev/null
+++ b/app/javascript/mastodon/features/emoji/locale.ts
@@ -0,0 +1,61 @@
+import type { CompactEmoji, Locale } from 'emojibase';
+import { flattenEmojiData, SUPPORTED_LOCALES } from 'emojibase';
+
+// Simple cache. This will be replaced with an IndexedDB cache in the future.
+const localeCache = new Map<Locale, Map<string, CompactEmoji>>();
+
+/**
+ * Looks up the localized label of an emoji.
+ * @param unicodeHex Emoji hexcode as used by Emojibase, e.g. `1F468-1F3FB`.
+ * @param localeString Requested locale; unsupported values fall back to `en`.
+ * @returns The emoji label in the resolved locale.
+ * @throws If the resolved locale has no label for the hexcode.
+ */
+export async function unicodeToLocaleLabel(
+  unicodeHex: string,
+  localeString: string,
+) {
+  const locale = toSupportedLocale(localeString);
+  let hexMap = localeCache.get(locale);
+  if (!hexMap) {
+    hexMap = await loadLocaleLabels(locale);
+    localeCache.set(locale, hexMap);
+  }
+
+  const label = hexMap.get(unicodeHex)?.label;
+  if (!label) {
+    throw new Error(
+      `Label for unicode hex ${unicodeHex} not found in locale ${locale}`,
+    );
+  }
+  return label;
+}
+
+// Loads the compact Emojibase data for a locale, keyed by emoji hexcode.
+async function loadLocaleLabels(
+  locale: Locale,
+): Promise<Map<string, CompactEmoji>> {
+  const { default: localeEmoji } = ((await import(
+    `emojibase-data/${locale}/compact.json`
+  )) ?? { default: [] }) as { default: CompactEmoji[] };
+  if (!Array.isArray(localeEmoji)) {
+    throw new Error(`Locale data for ${locale} not found`);
+  }
+  const hexMapEntries = flattenEmojiData(localeEmoji).map(
+    (emoji) => [emoji.hexcode, emoji] satisfies [string, CompactEmoji],
+  );
+  return new Map(hexMapEntries);
+}
+
+/** Returns the locale when Emojibase supports it, otherwise `en`. */
+export function toSupportedLocale(locale: string): Locale {
+  if (isSupportedLocale(locale)) {
+    return locale;
+  }
+  return 'en'; // Default to English if unsupported
+}
+
+// Type guard for the locales listed in Emojibase's SUPPORTED_LOCALES.
+function isSupportedLocale(locale: string): locale is Locale {
+  return SUPPORTED_LOCALES.includes(locale as Locale);
+}
diff --git a/app/javascript/mastodon/features/emoji/normalize.test.ts b/app/javascript/mastodon/features/emoji/normalize.test.ts
new file mode 100644
index 000000000..29255d529
--- /dev/null
+++ b/app/javascript/mastodon/features/emoji/normalize.test.ts
@@ -0,0 +1,72 @@
+import { readdir } from 'fs/promises';
+import { basename, resolve } from 'path';
+
+import unicodeEmojis from 'emojibase-data/en/data.json';
+
+import { twemojiToUnicodeInfo, unicodeToTwemojiHex } from './normalize';
+
+const emojiSVGFiles = await readdir(
+  // This assumes tests are run from project root
+  resolve(process.cwd(), 'public/emoji'),
+  {
+    withFileTypes: true,
+  },
+);
+const svgFileNames = emojiSVGFiles
+  .filter(
+    (file) =>
+      file.isFile() &&
+      file.name.endsWith('.svg') &&
+      !file.name.endsWith('_border.svg'),
+  )
+  .map((file) => basename(file.name, '.svg').toUpperCase());
+
+describe('normalizeEmoji', () => {
+  describe('unicodeToSVGName', () => {
+    test.concurrent.for(
+      unicodeEmojis
+        // Our version of Twemoji only supports up to version 15.1
+        .filter((emoji) => emoji.version < 16)
+        .map((emoji) => [emoji.hexcode, emoji.label] as [string, string]),
+    )('verifying an emoji exists for %s (%s)', ([hexcode], { expect }) => {
+      const result = unicodeToTwemojiHex(hexcode);
+      expect(svgFileNames).toContain(result);
+    });
+  });
+
+  describe('twemojiToUnicodeInfo', () => {
+    const unicodeMap = new Map(
+      unicodeEmojis.flatMap((emoji) => {
+        const base: [string, string][] = [[emoji.hexcode, emoji.label]];
+        if (emoji.skins) {
+          base.push(
+            ...emoji.skins.map(
+              (skin) => [skin.hexcode, skin.label] as [string, string],
+            ),
+          );
+        }
+        return base;
+      }),
+    );
+
+    test.concurrent.for(svgFileNames)(
+      'verifying SVG file %s maps to Unicode emoji',
+      (svgFileName, { expect }) => {
+        assert(!!svgFileName);
+        const result = twemojiToUnicodeInfo(svgFileName);
+        const hexcode =
+          typeof result === 'string' ? result : result.unqualified;
+        if (!hexcode) {
+          // No hexcode means this is a special case like the Shibuya 109 emoji
+          expect(result).toHaveProperty('label');
+          return;
+        }
+        assert(!!hexcode);
+        expect(
+          unicodeMap.has(hexcode),
+          `${hexcode} (${svgFileName}) not found`,
+        ).toBeTruthy();
+      },
+    );
+  });
+});
diff --git a/app/javascript/mastodon/features/emoji/normalize.ts b/app/javascript/mastodon/features/emoji/normalize.ts
new file mode 100644
index 000000000..024cd5362
--- /dev/null
+++ b/app/javascript/mastodon/features/emoji/normalize.ts
@@ -0,0 +1,151 @@
+// Utility codes
+const VARIATION_SELECTOR_CODE = 0xfe0f;
+const KEYCAP_CODE = 0x20e3;
+
+// Gender codes
+const GENDER_FEMALE_CODE = 0x2640;
+const GENDER_MALE_CODE = 0x2642;
+
+// Skin tone codes
+const SKIN_TONE_CODES = [
+  0x1f3fb, // Light skin tone
+  0x1f3fc, // Medium-light skin tone
+  0x1f3fd, // Medium skin tone
+  0x1f3fe, // Medium-dark skin tone
+  0x1f3ff, // Dark skin tone
+] as const;
+
+// Misc codes that have special handling
+const SKIER_CODE = 0x26f7;
+const CHRISTMAS_TREE_CODE = 0x1f384;
+const MR_CLAUS_CODE = 0x1f385;
+const EYE_CODE = 0x1f441;
+const LEVITATING_PERSON_CODE = 0x1f574;
+const SPEECH_BUBBLE_CODE = 0x1f5e8;
+const MS_CLAUS_CODE = 0x1f936;
+
+/**
+ * Converts an Emojibase hexcode into the hex name Twemoji uses for its SVG
+ * files, dropping the variation selectors that Twemoji omits.
+ * @param unicodeHex Dash-separated hex code points, e.g. `0023-FE0F-20E3`.
+ * @returns Upper-case, dash-separated hex code points without zero padding.
+ */
+export function unicodeToTwemojiHex(unicodeHex: string): string {
+  const codes = hexStringToNumbers(unicodeHex);
+  const normalizedCodes: number[] = [];
+  for (let i = 0; i < codes.length; i++) {
+    const code = codes[i];
+    if (!code) {
+      continue;
+    }
+    // Some emoji have their variation selector removed
+    if (code === VARIATION_SELECTOR_CODE) {
+      // Key emoji
+      if (i === 1 && codes.at(-1) === KEYCAP_CODE) {
+        continue;
+      }
+      // Eye in speech bubble
+      if (codes.at(0) === EYE_CODE && codes.at(-2) === SPEECH_BUBBLE_CODE) {
+        continue;
+      }
+    }
+    normalizedCodes.push(code);
+  }
+
+  // This removes zero padding to correctly match the SVG filenames
+  return hexNumbersToString(normalizedCodes, 0);
+}
+
+interface TwemojiSpecificEmoji {
+  unqualified?: string;
+  gender?: number;
+  skin?: number;
+  label?: string;
+}
+
+// Normalize man/woman to male/female
+const GENDER_CODES_MAP: Record<number, number> = {
+  [GENDER_FEMALE_CODE]: GENDER_FEMALE_CODE,
+  [GENDER_MALE_CODE]: GENDER_MALE_CODE,
+  // These are man/woman markers, but are used for gender sometimes.
+  [0x1f468]: GENDER_MALE_CODE,
+  [0x1f469]: GENDER_FEMALE_CODE,
+};
+
+const TWEMOJI_SPECIAL_CASES: Record<string, string | TwemojiSpecificEmoji> = {
+  '1F441-200D-1F5E8': '1F441-FE0F-200D-1F5E8-FE0F', // Eye in speech bubble
+  // An emoji that was never ported to the Unicode standard.
+  // See: https://emojipedia.org/shibuya
+  E50A: { label: 'Shibuya 109' },
+};
+
+/**
+ * Maps a Twemoji SVG hex name back to Unicode emoji information.
+ * @param twemojiHex Dash-separated hex code points (case-insensitive).
+ * @returns The Unicode hexcode as a string, or an object describing an emoji
+ * that has no exact Unicode equivalent (special cases, skier, levitating
+ * person).
+ */
+export function twemojiToUnicodeInfo(
+  twemojiHex: string,
+): TwemojiSpecificEmoji | string {
+  const specialCase = TWEMOJI_SPECIAL_CASES[twemojiHex.toUpperCase()];
+  if (specialCase) {
+    return specialCase;
+  }
+  const codes = hexStringToNumbers(twemojiHex);
+  let gender: undefined | number;
+  let skin: undefined | number;
+  // SKIN_TONE_CODES is an array: match by value, `in` would test its indices
+  for (const code of codes) {
+    if (code in GENDER_CODES_MAP) {
+      gender = GENDER_CODES_MAP[code];
+    } else if ((SKIN_TONE_CODES as readonly number[]).includes(code)) {
+      skin = code;
+    }
+  }
+
+  let mappedCodes: unknown[] = codes;
+
+  if (codes.at(-1) === CHRISTMAS_TREE_CODE && codes.length >= 3 && gender) {
+    // Twemoji uses the christmas tree with a ZWJ for Mr. and Mrs. Claus,
+    // but in Unicode that only works for Mx. Claus.
+    const START_CODE =
+      gender === GENDER_FEMALE_CODE ? MS_CLAUS_CODE : MR_CLAUS_CODE;
+    mappedCodes = [START_CODE, skin];
+  } else if (codes.at(-1) === KEYCAP_CODE && codes.length === 2) {
+    // For key emoji, insert the variation selector
+    mappedCodes = [codes[0], VARIATION_SELECTOR_CODE, KEYCAP_CODE];
+  } else if (
+    codes.at(0) === SKIER_CODE ||
+    codes.at(0) === LEVITATING_PERSON_CODE
+  ) {
+    // Twemoji offers more gender and skin options for the skier and levitating person emoji.
+    return {
+      unqualified: hexNumbersToString([codes.at(0)]),
+      skin,
+      gender,
+    };
+  }
+
+  return hexNumbersToString(mappedCodes);
+}
+
+// Parses dash-separated hex code points, discarding unparsable segments.
+function hexStringToNumbers(hexString: string): number[] {
+  return hexString
+    .split('-')
+    .map((code) => Number.parseInt(code, 16))
+    .filter((code) => !Number.isNaN(code));
+}
+
+// Joins positive numeric codes as upper-case hex, ignoring anything else.
+function hexNumbersToString(codes: unknown[], padding = 4): string {
+  return codes
+    .filter(
+      (code): code is number =>
+        typeof code === 'number' && code > 0 && !Number.isNaN(code),
+    )
+    .map((code) => code.toString(16).padStart(padding, '0').toUpperCase())
+    .join('-');
+}
diff --git a/package.json b/package.json
index 5431e0ca3..267042233 100644
--- a/package.json
+++ b/package.json
@@ -66,6 +66,8 @@
     "cross-env": "^7.0.3",
     "detect-passive-events": "^2.0.3",
     "emoji-mart": "npm:emoji-mart-lazyload@latest",
+    "emojibase": "^16.0.0",
+    "emojibase-data": "^16.0.3",
     "escape-html": "^1.0.3",
     "fuzzysort": "^3.0.0",
     "history": "^4.10.1",
diff --git a/yarn.lock b/yarn.lock
index 7a06aa0b0..46fb5b98d 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -2667,6 +2667,8 @@ __metadata:
     cross-env: "npm:^7.0.3"
     detect-passive-events: "npm:^2.0.3"
     emoji-mart: "npm:emoji-mart-lazyload@latest"
+    emojibase: "npm:^16.0.0"
+    emojibase-data: "npm:^16.0.3"
     escape-html: "npm:^1.0.3"
     eslint: "npm:^9.23.0"
     eslint-import-resolver-typescript: "npm:^4.2.5"
@@ -6533,6 +6535,22 @@ __metadata:
   languageName: node
   linkType: hard
 
+"emojibase-data@npm:^16.0.3":
+  version: 16.0.3
+  resolution: "emojibase-data@npm:16.0.3"
+  peerDependencies:
+    emojibase: "*"
+  checksum: 10c0/d82520917c2ec326e737da9c5a57472e41a719777fa4770b52b75f0568791613fc94829898831c7b3fff1528134de01019cdf34e571d214fee19e40950d68b7f
+  languageName: node
+  linkType: hard
+
+"emojibase@npm:^16.0.0":
+  version: 16.0.0
+  resolution: "emojibase@npm:16.0.0"
+  checksum: 10c0/ec49ca2e131d349fa1f1dbe6ee8a6bf12da6225ce2de99d488e67a3cb80ac282f27aa480f0a7062c0c069c24508684ba524418be56b475cbd937877663686c47
+  languageName: node
+  linkType: hard
+
 "encodeurl@npm:~1.0.2":
   version: 1.0.2
   resolution: "encodeurl@npm:1.0.2"