2
0

Emoji Indexing and Search (#35253)

This commit is contained in:
Echo
2025-07-09 11:55:41 +02:00
committed by GitHub
parent 76c1446416
commit a1e8813522
12 changed files with 632 additions and 171 deletions

View File

@@ -0,0 +1,110 @@
// Utility codes
export const VARIATION_SELECTOR_CODE = 0xfe0f;
export const KEYCAP_CODE = 0x20e3;
// Gender codes
export const GENDER_FEMALE_CODE = 0x2640;
export const GENDER_MALE_CODE = 0x2642;
// Skin tone codes
export const SKIN_TONE_CODES = [
0x1f3fb, // Light skin tone
0x1f3fc, // Medium-light skin tone
0x1f3fd, // Medium skin tone
0x1f3fe, // Medium-dark skin tone
0x1f3ff, // Dark skin tone
] as const;
export const EMOJIS_WITH_DARK_BORDER = [
'🎱', // 1F3B1
'🐜', // 1F41C
'⚫', // 26AB
'🖤', // 1F5A4
'⬛', // 2B1B
'◼️', // 25FC-FE0F
'◾', // 25FE
'◼️', // 25FC-FE0F
'✒️', // 2712-FE0F
'▪️', // 25AA-FE0F
'💣', // 1F4A3
'🎳', // 1F3B3
'📷', // 1F4F7
'📸', // 1F4F8
'♣️', // 2663-FE0F
'🕶️', // 1F576-FE0F
'✴️', // 2734-FE0F
'🔌', // 1F50C
'💂‍♀️', // 1F482-200D-2640-FE0F
'📽️', // 1F4FD-FE0F
'🍳', // 1F373
'🦍', // 1F98D
'💂', // 1F482
'🔪', // 1F52A
'🕳️', // 1F573-FE0F
'🕹️', // 1F579-FE0F
'🕋', // 1F54B
'🖊️', // 1F58A-FE0F
'🖋️', // 1F58B-FE0F
'💂‍♂️', // 1F482-200D-2642-FE0F
'🎤', // 1F3A4
'🎓', // 1F393
'🎥', // 1F3A5
'🎼', // 1F3BC
'♠️', // 2660-FE0F
'🎩', // 1F3A9
'🦃', // 1F983
'📼', // 1F4FC
'📹', // 1F4F9
'🎮', // 1F3AE
'🐃', // 1F403
'🏴', // 1F3F4
'🐞', // 1F41E
'🕺', // 1F57A
'📱', // 1F4F1
'📲', // 1F4F2
'🚲', // 1F6B2
'🪮', // 1FAA6
'🐦‍⬛', // 1F426-200D-2B1B
];
export const EMOJIS_WITH_LIGHT_BORDER = [
'👽', // 1F47D
'⚾', // 26BE
'🐔', // 1F414
'☁️', // 2601-FE0F
'💨', // 1F4A8
'🕊️', // 1F54A-FE0F
'👀', // 1F440
'🍥', // 1F365
'👻', // 1F47B
'🐐', // 1F410
'❕', // 2755
'❔', // 2754
'⛸️', // 26F8-FE0F
'🌩️', // 1F329-FE0F
'🔊', // 1F50A
'🔇', // 1F507
'📃', // 1F4C3
'🌧️', // 1F327-FE0F
'🐏', // 1F40F
'🍚', // 1F35A
'🍙', // 1F359
'🐓', // 1F413
'🐑', // 1F411
'💀', // 1F480
'☠️', // 2620-FE0F
'🌨️', // 1F328-FE0F
'🔉', // 1F509
'🔈', // 1F508
'💬', // 1F4AC
'💭', // 1F4AD
'🏐', // 1F3D0
'🏳️', // 1F3F3-FE0F
'⚪', // 26AA
'⬜', // 2B1C
'◽', // 25FD
'◻️', // 25FB-FE0F
'▫️', // 25AB-FE0F
'🪽', // 1FAE8
'🪿', // 1FABF
];

View File

@@ -0,0 +1,102 @@
import { SUPPORTED_LOCALES } from 'emojibase';
import type { FlatCompactEmoji, Locale } from 'emojibase';
import type { DBSchema } from 'idb';
import { openDB } from 'idb';
import type { ApiCustomEmojiJSON } from '@/mastodon/api_types/custom_emoji';
import type { LocaleOrCustom } from './locale';
import { toSupportedLocale, toSupportedLocaleOrCustom } from './locale';
interface EmojiDB extends LocaleTables, DBSchema {
custom: {
key: string;
value: ApiCustomEmojiJSON;
indexes: {
category: string;
};
};
etags: {
key: LocaleOrCustom;
value: string;
};
}
interface LocaleTable {
key: string;
value: FlatCompactEmoji;
indexes: {
group: number;
label: string;
order: number;
tags: string[];
};
}
type LocaleTables = Record<Locale, LocaleTable>;
const SCHEMA_VERSION = 1;
const db = await openDB<EmojiDB>('mastodon-emoji', SCHEMA_VERSION, {
upgrade(database) {
const customTable = database.createObjectStore('custom', {
keyPath: 'shortcode',
autoIncrement: false,
});
customTable.createIndex('category', 'category');
database.createObjectStore('etags');
for (const locale of SUPPORTED_LOCALES) {
const localeTable = database.createObjectStore(locale, {
keyPath: 'hexcode',
autoIncrement: false,
});
localeTable.createIndex('group', 'group');
localeTable.createIndex('label', 'label');
localeTable.createIndex('order', 'order');
localeTable.createIndex('tags', 'tags', { multiEntry: true });
}
},
});
export async function putEmojiData(emojis: FlatCompactEmoji[], locale: Locale) {
const trx = db.transaction(locale, 'readwrite');
await Promise.all(emojis.map((emoji) => trx.store.put(emoji)));
await trx.done;
}
export async function putCustomEmojiData(emojis: ApiCustomEmojiJSON[]) {
const trx = db.transaction('custom', 'readwrite');
await Promise.all(emojis.map((emoji) => trx.store.put(emoji)));
await trx.done;
}
export function putLatestEtag(etag: string, localeString: string) {
const locale = toSupportedLocaleOrCustom(localeString);
return db.put('etags', etag, locale);
}
export function searchEmojiByHexcode(hexcode: string, localeString: string) {
const locale = toSupportedLocale(localeString);
return db.get(locale, hexcode);
}
export function searchEmojiByTag(tag: string, localeString: string) {
const locale = toSupportedLocale(localeString);
const range = IDBKeyRange.only(tag.toLowerCase());
return db.getAllFromIndex(locale, 'tags', range);
}
export function searchCustomEmojiByShortcode(shortcode: string) {
return db.get('custom', shortcode);
}
export async function loadLatestEtag(localeString: string) {
const locale = toSupportedLocaleOrCustom(localeString);
const rowCount = await db.count(locale);
if (!rowCount) {
return null; // No data for this locale, return null even if there is an etag.
}
const etag = await db.get('etags', locale);
return etag ?? null;
}

View File

@@ -0,0 +1,38 @@
import initialState from '@/mastodon/initial_state';
import { toSupportedLocale } from './locale';
const serverLocale = toSupportedLocale(initialState?.meta.locale ?? 'en');
const worker =
'Worker' in window
? new Worker(new URL('./worker', import.meta.url), {
type: 'module',
})
: null;
export async function initializeEmoji() {
if (worker) {
worker.addEventListener('message', (event: MessageEvent<string>) => {
const { data: message } = event;
if (message === 'ready') {
worker.postMessage(serverLocale);
worker.postMessage('custom');
}
});
} else {
const { importCustomEmojiData, importEmojiData } = await import('./loader');
await Promise.all([importCustomEmojiData(), importEmojiData(serverLocale)]);
}
}
export async function loadEmojiLocale(localeString: string) {
const locale = toSupportedLocale(localeString);
if (worker) {
worker.postMessage(locale);
} else {
const { importEmojiData } = await import('./loader');
await importEmojiData(locale);
}
}

View File

@@ -0,0 +1,77 @@
import { flattenEmojiData } from 'emojibase';
import type { CompactEmoji, FlatCompactEmoji } from 'emojibase';
import type { ApiCustomEmojiJSON } from '@/mastodon/api_types/custom_emoji';
import { isDevelopment } from '@/mastodon/utils/environment';
import {
putEmojiData,
putCustomEmojiData,
loadLatestEtag,
putLatestEtag,
} from './database';
import { toSupportedLocale, toSupportedLocaleOrCustom } from './locale';
import type { LocaleOrCustom } from './locale';
export async function importEmojiData(localeString: string) {
const locale = toSupportedLocale(localeString);
const emojis = await fetchAndCheckEtag<CompactEmoji[]>(locale);
if (!emojis) {
return;
}
const flattenedEmojis: FlatCompactEmoji[] = flattenEmojiData(emojis);
await putEmojiData(flattenedEmojis, locale);
}
export async function importCustomEmojiData() {
const emojis = await fetchAndCheckEtag<ApiCustomEmojiJSON[]>('custom');
if (!emojis) {
return;
}
await putCustomEmojiData(emojis);
}
async function fetchAndCheckEtag<ResultType extends object[]>(
localeOrCustom: LocaleOrCustom,
): Promise<ResultType | null> {
const locale = toSupportedLocaleOrCustom(localeOrCustom);
let uri: string;
if (locale === 'custom') {
uri = '/api/v1/custom_emojis';
} else {
uri = `/packs${isDevelopment() ? '-dev' : ''}/emoji/${locale}.json`;
}
const oldEtag = await loadLatestEtag(locale);
const response = await fetch(uri, {
headers: {
'Content-Type': 'application/json',
'If-None-Match': oldEtag ?? '', // Send the old ETag to check for modifications
},
});
// If not modified, return null
if (response.status === 304) {
return null;
}
if (!response.ok) {
throw new Error(
`Failed to fetch emoji data for ${localeOrCustom}: ${response.statusText}`,
);
}
const data = (await response.json()) as ResultType;
if (!Array.isArray(data)) {
throw new Error(
`Unexpected data format for ${localeOrCustom}: expected an array`,
);
}
// Store the ETag for future requests
const etag = response.headers.get('ETag');
if (etag) {
await putLatestEtag(etag, localeOrCustom);
}
return data;
}

View File

@@ -1,52 +1,6 @@
import { flattenEmojiData, SUPPORTED_LOCALES } from 'emojibase';
import emojiEnData from 'emojibase-data/en/compact.json';
import emojiFrData from 'emojibase-data/fr/compact.json';
import { SUPPORTED_LOCALES } from 'emojibase';
import { toSupportedLocale, unicodeToLocaleLabel } from './locale';
describe('unicodeToLocaleLabel', () => {
const emojiTestCases = [
'1F3CB-1F3FF-200D-2640-FE0F', // 🏋🏿‍♀️ Woman weightlifter, dark skin
'1F468-1F3FB', // 👨🏻 Man, light skin
'1F469-1F3FB-200D-2695-FE0F', // 👩🏻‍⚕️ Woman health worker, light skin
'1F468-1F3FD-200D-1F692', // 👨🏽‍🚒 Man firefighter, medium skin
'1F469-1F3FE', // 👩🏾 Woman, medium-dark skin
'1F469-1F3FF-200D-1F4BB', // 👩🏿‍💻 Woman technologist, dark skin
'1F478-1F3FF', // 👸🏿 Princess with dark skin tone
'1F935-1F3FC-200D-2640-FE0F', // 🤵🏼‍♀️ Woman in tuxedo, medium-light skin
'1F9D1-1F3FC', // 🧑🏼 Person, medium-light skin
'1F9D4-1F3FE', // 🧔🏾 Person with beard, medium-dark skin
];
const flattenedEnData = flattenEmojiData(emojiEnData);
const flattenedFrData = flattenEmojiData(emojiFrData);
const emojiTestEnLabels = new Map(
emojiTestCases.map((code) => [
code,
flattenedEnData.find((emoji) => emoji.hexcode === code)?.label,
]),
);
const emojiTestFrLabels = new Map(
emojiTestCases.map((code) => [
code,
flattenedFrData.find((emoji) => emoji.hexcode === code)?.label,
]),
);
test.for(
emojiTestCases.flatMap((code) => [
[code, 'en', emojiTestEnLabels.get(code)],
[code, 'fr', emojiTestFrLabels.get(code)],
]) satisfies [string, string, string | undefined][],
)(
'returns correct label for %s for %s locale',
async ([unicodeHex, locale, expectedLabel]) => {
const label = await unicodeToLocaleLabel(unicodeHex, locale);
expect(label).toBe(expectedLabel);
},
);
});
import { toSupportedLocale, toSupportedLocaleOrCustom } from './locale';
describe('toSupportedLocale', () => {
test('returns the same locale if it is supported', () => {
@@ -62,3 +16,14 @@ describe('toSupportedLocale', () => {
}
});
});
describe('toSupportedLocaleOrCustom', () => {
test('returns custom for "custom" locale', () => {
expect(toSupportedLocaleOrCustom('custom')).toBe('custom');
});
test('returns supported locale for valid locales', () => {
for (const locale of SUPPORTED_LOCALES) {
expect(toSupportedLocaleOrCustom(locale)).toBe(locale);
}
});
});

View File

@@ -1,51 +1,23 @@
import type { CompactEmoji, Locale } from 'emojibase';
import { flattenEmojiData, SUPPORTED_LOCALES } from 'emojibase';
import type { Locale } from 'emojibase';
import { SUPPORTED_LOCALES } from 'emojibase';
// Simple cache. This will be replaced with an IndexedDB cache in the future.
const localeCache = new Map<Locale, Map<string, CompactEmoji>>();
export type LocaleOrCustom = Locale | 'custom';
export async function unicodeToLocaleLabel(
unicodeHex: string,
localeString: string,
) {
const locale = toSupportedLocale(localeString);
let hexMap = localeCache.get(locale);
if (!hexMap) {
hexMap = await loadLocaleLabels(locale);
localeCache.set(locale, hexMap);
}
const label = hexMap.get(unicodeHex)?.label;
if (!label) {
throw new Error(
`Label for unicode hex ${unicodeHex} not found in locale ${locale}`,
);
}
return label;
}
async function loadLocaleLabels(
locale: Locale,
): Promise<Map<string, CompactEmoji>> {
const { default: localeEmoji } = ((await import(
`emojibase-data/${locale}/compact.json`
)) ?? { default: [] }) as { default: CompactEmoji[] };
if (!Array.isArray(localeEmoji)) {
throw new Error(`Locale data for ${locale} not found`);
}
const hexMapEntries = flattenEmojiData(localeEmoji).map(
(emoji) => [emoji.hexcode, emoji] satisfies [string, CompactEmoji],
);
return new Map(hexMapEntries);
}
export function toSupportedLocale(locale: string): Locale {
export function toSupportedLocale(localeBase: string): Locale {
const locale = localeBase.toLowerCase();
if (isSupportedLocale(locale)) {
return locale;
}
return 'en'; // Default to English if unsupported
}
function isSupportedLocale(locale: string): locale is Locale {
return SUPPORTED_LOCALES.includes(locale as Locale);
export function toSupportedLocaleOrCustom(locale: string): LocaleOrCustom {
if (locale.toLowerCase() === 'custom') {
return 'custom';
}
return toSupportedLocale(locale);
}
function isSupportedLocale(locale: string): locale is Locale {
return SUPPORTED_LOCALES.includes(locale.toLowerCase() as Locale);
}

View File

@@ -1,9 +1,17 @@
import { readdir } from 'fs/promises';
import { basename, resolve } from 'path';
import unicodeEmojis from 'emojibase-data/en/data.json';
import { flattenEmojiData } from 'emojibase';
import unicodeRawEmojis from 'emojibase-data/en/data.json';
import { twemojiToUnicodeInfo, unicodeToTwemojiHex } from './normalize';
import {
twemojiHasBorder,
twemojiToUnicodeInfo,
unicodeToTwemojiHex,
CODES_WITH_DARK_BORDER,
CODES_WITH_LIGHT_BORDER,
emojiToUnicodeHex,
} from './normalize';
const emojiSVGFiles = await readdir(
// This assumes tests are run from project root
@@ -13,60 +21,81 @@ const emojiSVGFiles = await readdir(
},
);
const svgFileNames = emojiSVGFiles
.filter(
(file) =>
file.isFile() &&
file.name.endsWith('.svg') &&
!file.name.endsWith('_border.svg'),
)
.filter((file) => file.isFile() && file.name.endsWith('.svg'))
.map((file) => basename(file.name, '.svg').toUpperCase());
const svgFileNamesWithoutBorder = svgFileNames.filter(
(fileName) => !fileName.endsWith('_BORDER'),
);
describe('normalizeEmoji', () => {
describe('unicodeToSVGName', () => {
test.concurrent.for(
unicodeEmojis
// Our version of Twemoji only supports up to version 15.1
.filter((emoji) => emoji.version < 16)
.map((emoji) => [emoji.hexcode, emoji.label] as [string, string]),
)('verifying an emoji exists for %s (%s)', ([hexcode], { expect }) => {
const result = unicodeToTwemojiHex(hexcode);
expect(svgFileNames).toContain(result);
});
});
const unicodeEmojis = flattenEmojiData(unicodeRawEmojis);
describe('twemojiToUnicodeInfo', () => {
const unicodeMap = new Map(
unicodeEmojis.flatMap((emoji) => {
const base: [string, string][] = [[emoji.hexcode, emoji.label]];
if (emoji.skins) {
base.push(
...emoji.skins.map(
(skin) => [skin.hexcode, skin.label] as [string, string],
),
);
}
return base;
}),
);
describe('emojiToUnicodeHex', () => {
test.concurrent.for([
['🎱', '1F3B1'],
['🐜', '1F41C'],
['⚫', '26AB'],
['🖤', '1F5A4'],
['💀', '1F480'],
['💂‍♂️', '1F482-200D-2642-FE0F'],
] as const)(
'emojiToUnicodeHex converts %s to %s',
([emoji, hexcode], { expect }) => {
expect(emojiToUnicodeHex(emoji)).toBe(hexcode);
},
);
});
test.concurrent.for(svgFileNames)(
'verifying SVG file %s maps to Unicode emoji',
(svgFileName, { expect }) => {
assert(!!svgFileName);
const result = twemojiToUnicodeInfo(svgFileName);
const hexcode =
typeof result === 'string' ? result : result.unqualified;
if (!hexcode) {
// No hexcode means this is a special case like the Shibuya 109 emoji
expect(result).toHaveProperty('label');
return;
}
assert(!!hexcode);
expect(
unicodeMap.has(hexcode),
`${hexcode} (${svgFileName}) not found`,
).toBeTruthy();
},
);
describe('unicodeToTwemojiHex', () => {
test.concurrent.for(
unicodeEmojis
// Our version of Twemoji only supports up to version 15.1
.filter((emoji) => emoji.version < 16)
.map((emoji) => [emoji.hexcode, emoji.label] as [string, string]),
)('verifying an emoji exists for %s (%s)', ([hexcode], { expect }) => {
const result = unicodeToTwemojiHex(hexcode);
expect(svgFileNamesWithoutBorder).toContain(result);
});
});
describe('twemojiHasBorder', () => {
test.concurrent.for(
svgFileNames
.filter((file) => file.endsWith('_BORDER'))
.map((file) => {
const hexCode = file.replace('_BORDER', '');
return [
hexCode,
CODES_WITH_LIGHT_BORDER.includes(hexCode),
CODES_WITH_DARK_BORDER.includes(hexCode),
] as const;
}),
)('twemojiHasBorder for %s', ([hexCode, isLight, isDark], { expect }) => {
const result = twemojiHasBorder(hexCode);
expect(result).toHaveProperty('hexCode', hexCode);
expect(result).toHaveProperty('hasLightBorder', isLight);
expect(result).toHaveProperty('hasDarkBorder', isDark);
});
});
describe('twemojiToUnicodeInfo', () => {
const unicodeCodeSet = new Set(unicodeEmojis.map((emoji) => emoji.hexcode));
test.concurrent.for(svgFileNamesWithoutBorder)(
'verifying SVG file %s maps to Unicode emoji',
(svgFileName, { expect }) => {
assert(!!svgFileName);
const result = twemojiToUnicodeInfo(svgFileName);
const hexcode = typeof result === 'string' ? result : result.unqualified;
if (!hexcode) {
// No hexcode means this is a special case like the Shibuya 109 emoji
expect(result).toHaveProperty('label');
return;
}
assert(!!hexcode);
expect(
unicodeCodeSet.has(hexcode),
`${hexcode} (${svgFileName}) not found`,
).toBeTruthy();
},
);
});

View File

@@ -1,19 +1,12 @@
// Utility codes
const VARIATION_SELECTOR_CODE = 0xfe0f;
const KEYCAP_CODE = 0x20e3;
// Gender codes
const GENDER_FEMALE_CODE = 0x2640;
const GENDER_MALE_CODE = 0x2642;
// Skin tone codes
const SKIN_TONE_CODES = [
0x1f3fb, // Light skin tone
0x1f3fc, // Medium-light skin tone
0x1f3fd, // Medium skin tone
0x1f3fe, // Medium-dark skin tone
0x1f3ff, // Dark skin tone
] as const;
import {
VARIATION_SELECTOR_CODE,
KEYCAP_CODE,
GENDER_FEMALE_CODE,
GENDER_MALE_CODE,
SKIN_TONE_CODES,
EMOJIS_WITH_DARK_BORDER,
EMOJIS_WITH_LIGHT_BORDER,
} from './constants';
// Misc codes that have special handling
const SKIER_CODE = 0x26f7;
@@ -24,6 +17,17 @@ const LEVITATING_PERSON_CODE = 0x1f574;
const SPEECH_BUBBLE_CODE = 0x1f5e8;
const MS_CLAUS_CODE = 0x1f936;
export function emojiToUnicodeHex(emoji: string): string {
const codes: number[] = [];
for (const char of emoji) {
const code = char.codePointAt(0);
if (code !== undefined) {
codes.push(code);
}
}
return hexNumbersToString(codes);
}
export function unicodeToTwemojiHex(unicodeHex: string): string {
const codes = hexStringToNumbers(unicodeHex);
const normalizedCodes: number[] = [];
@@ -50,6 +54,35 @@ export function unicodeToTwemojiHex(unicodeHex: string): string {
return hexNumbersToString(normalizedCodes, 0);
}
interface TwemojiBorderInfo {
hexCode: string;
hasLightBorder: boolean;
hasDarkBorder: boolean;
}
export const CODES_WITH_DARK_BORDER =
EMOJIS_WITH_DARK_BORDER.map(emojiToUnicodeHex);
export const CODES_WITH_LIGHT_BORDER =
EMOJIS_WITH_LIGHT_BORDER.map(emojiToUnicodeHex);
export function twemojiHasBorder(twemojiHex: string): TwemojiBorderInfo {
const normalizedHex = twemojiHex.toUpperCase();
let hasLightBorder = false;
let hasDarkBorder = false;
if (CODES_WITH_LIGHT_BORDER.includes(normalizedHex)) {
hasLightBorder = true;
}
if (CODES_WITH_DARK_BORDER.includes(normalizedHex)) {
hasDarkBorder = true;
}
return {
hexCode: normalizedHex,
hasLightBorder,
hasDarkBorder,
};
}
interface TwemojiSpecificEmoji {
unqualified?: string;
gender?: number;
@@ -84,11 +117,16 @@ export function twemojiToUnicodeInfo(
let gender: undefined | number;
let skin: undefined | number;
for (const code of codes) {
if (code in GENDER_CODES_MAP) {
if (!gender && code in GENDER_CODES_MAP) {
gender = GENDER_CODES_MAP[code];
} else if (code in SKIN_TONE_CODES) {
} else if (!skin && code in SKIN_TONE_CODES) {
skin = code;
}
// Exit if we have both skin and gender
if (skin && gender) {
break;
}
}
let mappedCodes: unknown[] = codes;
@@ -103,8 +141,8 @@ export function twemojiToUnicodeInfo(
// For key emoji, insert the variation selector
mappedCodes = [codes[0], VARIATION_SELECTOR_CODE, KEYCAP_CODE];
} else if (
codes.at(0) === SKIER_CODE ||
codes.at(0) === LEVITATING_PERSON_CODE
(codes.at(0) === SKIER_CODE || codes.at(0) === LEVITATING_PERSON_CODE) &&
codes.length > 1
) {
// Twemoji offers more gender and skin options for the skier and levitating person emoji.
return {

View File

@@ -0,0 +1,13 @@
import { importEmojiData, importCustomEmojiData } from './loader';
addEventListener('message', handleMessage);
self.postMessage('ready'); // After the worker is ready, notify the main thread
function handleMessage(event: MessageEvent<string>) {
const { data: locale } = event;
if (locale !== 'custom') {
void importEmojiData(locale);
} else {
void importCustomEmojiData();
}
}