From 264c9bc043ba2ed3089b4c281bbeedb177cac3c2 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Mon, 2 Jan 2012 16:12:51 +0900 Subject: [PATCH] HTML IDs can contain anything except whitespace. Introduced TY_(IsHTMLSpace)(uint c), which checks to see if c is one of the chars that the HTML spec (and browsers) treat as a space in attribute values: 0x020 (space), 0x009 (tab), 0x00a (LF), 0x00c (FF), or 0x00d (CR). Can't use ANSI C isspace(int c) here because, like the standard functions in many other languages, it also treats 0x00b as a space. --- src/attrs.c | 5 +---- src/lexer.c | 5 +++++ src/lexer.h | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/attrs.c b/src/attrs.c index 85d0751..84f9060 100644 --- a/src/attrs.c +++ b/src/attrs.c @@ -1367,11 +1367,8 @@ Bool TY_(IsValidHTMLID)(ctmbstr id) if (!s) return no; - if (!TY_(IsLetter)(*s++)) - return no; - while (*s) - if (!TY_(IsNamechar)(*s++)) + if (TY_(IsHTMLSpace)(*s++)) return no; return yes; diff --git a/src/lexer.c b/src/lexer.c index 522fcdc..04a139c 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -230,6 +230,11 @@ Bool TY_(IsLetter)(uint c) return (map & letter)!=0; } +Bool TY_(IsHTMLSpace)(uint c) +{ + return c == 0x020 || c == 0x009 || c == 0x00a || c == 0x00c || c == 0x00d; +} + Bool TY_(IsNamechar)(uint c) { uint map = MAP(c); diff --git a/src/lexer.h b/src/lexer.h index 43b9b4a..d4d2958 100644 --- a/src/lexer.h +++ b/src/lexer.h @@ -416,6 +416,7 @@ void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers ); Bool TY_(IsWhite)(uint c); Bool TY_(IsDigit)(uint c); Bool TY_(IsLetter)(uint c); +Bool TY_(IsHTMLSpace)(uint c); Bool TY_(IsNewline)(uint c); Bool TY_(IsNamechar)(uint c); Bool TY_(IsXMLLetter)(uint c);