diff --git a/src/routes/_thirdparty/unescape/unescape.js b/src/routes/_thirdparty/unescape/unescape.js index 5f403363..96b5dc40 100644 --- a/src/routes/_thirdparty/unescape/unescape.js +++ b/src/routes/_thirdparty/unescape/unescape.js @@ -1,43 +1,50 @@ -// via https://github.com/jonschlinkert/unescape/blob/98d1e52/index.js +// +// Originally via https://github.com/jonschlinkert/unescape/blob/98d1e52/index.js +// +import { thunk } from '../../_utils/thunk' +// via https://www.htmlhelp.com/reference/html40/entities/special.html +// plus some more known entities like pound, nbsp, etc const chars = { - '"': '"', - '"': '"', - - ''': '\'', - ''': '\'', - '&': '&', - '&': '&', - - '>': '>', - '>': '>', - - '<': '<', - '<': '<', - + ''': '\'', + '„': '„', '¢': '¢', - '¢': '¢', - + 'ˆ': 'ˆ', '©': '©', - '©': '©', - + '†': '†', + '‡': '‡', + ' ': ' ', + ' ': ' ', '€': '€', - '€': '€', - + '>': '>', + '“': '“', + '‎': '', + '‹': '‹', + '‘': '‘', + '<': '<', + '—': '—', + ' ': ' ', + '–': '–', + 'œ': 'œ', + 'Œ': 'Œ', + '‰': '‰', '£': '£', - '£': '£', - + '"': '"', + '”': '”', '®': '®', - '®': '®', - + '›': '›', + '’': '’', + '‚': '‚', + 'š': 'š', + 'Š': 'Š', + ' ': ' ', + '˜': '˜', '¥': '¥', - '¥': '¥', - - ' ': ' ' + 'Ÿ': 'Ÿ' } -let regex +const getRegex = thunk(() => toRegex(chars)) /** * Convert HTML entities to HTML characters. @@ -45,15 +52,35 @@ let regex * @param {String} `str` String with HTML entities to un-escape. * @return {String} */ - function unescape (str) { - regex = regex || toRegex(chars) - return str.replace(regex, m => chars[m]) + return str.replace(getRegex(), replace) +} + +function replace (match) { + const knownValue = chars[match] + if (knownValue) { + return knownValue + } + let codePoint + try { + if (match.startsWith('&#x')) { // hex + codePoint = parseInt(match.substring(3, match.length - 1), 16) + } else { // decimal + codePoint = parseInt(match.substring(2, match.length - 1), 10) + } + return String.fromCodePoint(codePoint) + } catch (e) { + return match // bad code point, bail out + } } function toRegex (chars) { - var keys = Object.keys(chars).join('|') - return new RegExp('(' + keys + ')', 'g') + const patterns = Object.keys(chars).concat([ + '&#[0-9]{1,6};', // decimal code points + '&#x[a-fA-F0-9]{1,6};' // hex code points + ]) + + return new RegExp('(' + patterns.join('|') + ')', 'g') } /** diff --git a/tests/unit/test-unescape.js b/tests/unit/test-unescape.js new file mode 100644 index 00000000..21cdce3e --- /dev/null +++ b/tests/unit/test-unescape.js @@ -0,0 +1,23 @@ +/* global describe, it */ +import assert from 'assert' +import { unescape } from '../../src/routes/_thirdparty/unescape/unescape' + +describe('test-unescape.js', () => { + it('unescapes html correctly', () => { + assert.deepStrictEqual(unescape('What I’ve learned'), 'What I’ve learned') + assert.deepStrictEqual(unescape('Hello "world"'), 'Hello "world"') + assert.deepStrictEqual(unescape('That costs 3£ or 4€'), 'That costs 3£ or 4€') + assert.deepStrictEqual(unescape('That costs 3&POUND; or 4&EURO;'), 'That costs 3&POUND; or 4&EURO;') // must be lc + assert.deepStrictEqual(unescape('Foo & bar & baz'), 'Foo & bar & baz') + assert.deepStrictEqual(unescape('Winking tongue: 😜'), 'Winking tongue: 😜') + assert.deepStrictEqual(unescape('Winking tongue as hex: 😜'), 'Winking tongue as hex: 😜') + assert.deepStrictEqual(unescape('Winking tongue as hex: 😜'), 'Winking tongue as hex: 😜') + assert.deepStrictEqual(unescape('All's fair'), 'All\'s fair') + assert.deepStrictEqual(unescape('All's fair'), 'All\'s fair') + assert.deepStrictEqual(unescape('foo bar'), 'foo bar') + }) + + it('handles fake html code points', () => { + assert.deepStrictEqual(unescape('Hello �'), 'Hello �') + }) +})