From 11a86488182fefffadb4c4632eb3d9c8c45890be Mon Sep 17 00:00:00 2001
From: Peter Kelly
Date: Mon, 20 Aug 2012 00:29:16 +0700
Subject: [PATCH 1/2] Use a hash table for anchors

---
 src/attrs.c | 64 ++++++++++++++++++++++++++++++++++++-----------------
 src/attrs.h |  7 +++++-
 2 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/src/attrs.c b/src/attrs.c
index 2b93dc7..20ebeac 100644
--- a/src/attrs.c
+++ b/src/attrs.c
@@ -904,20 +904,24 @@ static void FreeAnchor(TidyDocImpl* doc, Anchor *a)
 void TY_(RemoveAnchorByNode)( TidyDocImpl* doc, Node *node )
 {
     TidyAttribImpl* attribs = &doc->attribs;
-    Anchor *delme = NULL, *curr, *prev = NULL;
+    Anchor *delme = NULL, *curr, *prev;
+    uint h;
 
-    for ( curr=attribs->anchor_list; curr!=NULL; curr=curr->next )
-    {
-        if ( curr->node == node )
+    for (h = 0; h < ANCHOR_HASH_SIZE; h++) {
+        prev = NULL;
+        for ( curr=attribs->anchor_hash[h]; curr!=NULL; curr=curr->next )
         {
-            if ( prev )
-                prev->next = curr->next;
-            else
-                attribs->anchor_list = curr->next;
-            delme = curr;
-            break;
+            if ( curr->node == node )
+            {
+                if ( prev )
+                    prev->next = curr->next;
+                else
+                    attribs->anchor_hash[h] = curr->next;
+                delme = curr;
+                break;
+            }
+            prev = curr;
         }
-        prev = curr;
     }
     FreeAnchor( doc, delme );
 }
@@ -935,34 +939,51 @@ static Anchor* NewAnchor( TidyDocImpl* doc, ctmbstr name, Node* node )
     return a;
 }
 
+static uint anchorNameHash(ctmbstr s)
+{
+    uint hashval;
+
+    for (hashval = 0; *s != '\0'; s++)
+        hashval = *s + 31*hashval;
+
+    return hashval % ANCHOR_HASH_SIZE;
+}
+
 /* add new anchor to namespace */
 static Anchor* AddAnchor( TidyDocImpl* doc, ctmbstr name, Node *node )
 {
+    uint h;
     TidyAttribImpl* attribs = &doc->attribs;
     Anchor *a = NewAnchor( doc, name, node );
+    tmbstr lname = TY_(tmbstrdup)(doc->allocator, name);
+    lname = TY_(tmbstrtolower)(lname);
+    h = anchorNameHash(lname);
 
-    if ( attribs->anchor_list == NULL)
-        attribs->anchor_list = a;
+    if ( attribs->anchor_hash[h] == NULL)
+        attribs->anchor_hash[h] = a;
     else
     {
-        Anchor *here = attribs->anchor_list;
+        Anchor *here = attribs->anchor_hash[h];
         while (here->next)
             here = here->next;
         here->next = a;
     }
 
-    return attribs->anchor_list;
+    TidyDocFree(doc, lname);
+    return attribs->anchor_hash[h];
 }
 
 /* return node associated with anchor */
 static Node* GetNodeByAnchor( TidyDocImpl* doc, ctmbstr name )
 {
+    uint h;
     TidyAttribImpl* attribs = &doc->attribs;
     Anchor *found;
     tmbstr lname = TY_(tmbstrdup)(doc->allocator, name);
     lname = TY_(tmbstrtolower)(lname);
+    h = anchorNameHash(lname);
 
-    for ( found = attribs->anchor_list; found != NULL; found = found->next )
+    for ( found = attribs->anchor_hash[h]; found != NULL; found = found->next )
     {
         if ( TY_(tmbstrcmp)(found->name, lname) == 0 )
             break;
@@ -979,10 +1000,13 @@ void TY_(FreeAnchors)( TidyDocImpl* doc )
 {
     TidyAttribImpl* attribs = &doc->attribs;
     Anchor* a;
-    while (NULL != (a = attribs->anchor_list) )
-    {
-        attribs->anchor_list = a->next;
-        FreeAnchor(doc, a);
+    uint h;
+    for (h = 0; h < ANCHOR_HASH_SIZE; h++) {
+        while (NULL != (a = attribs->anchor_hash[h]) )
+        {
+            attribs->anchor_hash[h] = a->next;
+            FreeAnchor(doc, a);
+        }
     }
 }
 
diff --git a/src/attrs.h b/src/attrs.h
index 7b554a4..0410695 100644
--- a/src/attrs.h
+++ b/src/attrs.h
@@ -55,10 +55,15 @@ struct _AttrHash
 typedef struct _AttrHash AttrHash;
 #endif
 
+enum
+{
+    ANCHOR_HASH_SIZE=1021u
+};
+
 struct _TidyAttribImpl
 {
     /* anchor/node lookup */
-    Anchor* anchor_list;
+    Anchor* anchor_hash[ANCHOR_HASH_SIZE];
 
     /* Declared literal attributes */
     Attribute* declared_attr_list;

From 7fc32555428add36e44e1b1ea9ee810a317f86eb Mon Sep 17 00:00:00 2001
From: Peter Kelly
Date: Mon, 20 Aug 2012 10:06:30 +0700
Subject: [PATCH 2/2] Applied hash table optimisation to RemoveAnchorByNode.

This function now takes the anchor name as a parameter, so it can look in the
correct bin. In the case of FreeAttrs, we have the name already (since we found
a name or id attribute). In the case of FixAnchors, the anchor name could come
from either the name or id attribute, so we call the function separately for
each case, passing the appropriate attribute value.
---
 src/attrs.c | 60 ++++++++++++++++++++++++-----------------------------
 src/attrs.h |  2 +-
 src/clean.c | 14 +++++++------
 src/lexer.c |  2 +-
 4 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/src/attrs.c b/src/attrs.c
index 20ebeac..e1c0956 100644
--- a/src/attrs.c
+++ b/src/attrs.c
@@ -900,28 +900,37 @@ static void FreeAnchor(TidyDocImpl* doc, Anchor *a)
     TidyDocFree( doc, a );
 }
 
+static uint anchorNameHash(ctmbstr s)
+{
+    uint hashval;
+
+    for (hashval = 0; s && *s != '\0'; s++) { /* NULL-safe: callers pass attribute values unchecked */
+        tmbchar c = TY_(ToLower)( *s );
+        hashval = c + 31*hashval;
+    }
+
+    return hashval % ANCHOR_HASH_SIZE;
+}
+
 /* removes anchor for specific node */
-void TY_(RemoveAnchorByNode)( TidyDocImpl* doc, Node *node )
+void TY_(RemoveAnchorByNode)( TidyDocImpl* doc, ctmbstr name, Node *node )
 {
     TidyAttribImpl* attribs = &doc->attribs;
-    Anchor *delme = NULL, *curr, *prev;
-    uint h;
+    Anchor *delme = NULL, *curr, *prev = NULL;
+    uint h = anchorNameHash(name);
 
-    for (h = 0; h < ANCHOR_HASH_SIZE; h++) {
-        prev = NULL;
-        for ( curr=attribs->anchor_hash[h]; curr!=NULL; curr=curr->next )
+    for ( curr=attribs->anchor_hash[h]; curr!=NULL; curr=curr->next )
+    {
+        if ( curr->node == node )
         {
-            if ( curr->node == node )
-            {
-                if ( prev )
-                    prev->next = curr->next;
-                else
-                    attribs->anchor_hash[h] = curr->next;
-                delme = curr;
-                break;
-            }
-            prev = curr;
+            if ( prev )
+                prev->next = curr->next;
+            else
+                attribs->anchor_hash[h] = curr->next;
+            delme = curr;
+            break;
         }
+        prev = curr;
     }
     FreeAnchor( doc, delme );
 }
@@ -939,25 +948,12 @@ static Anchor* NewAnchor( TidyDocImpl* doc, ctmbstr name, Node* node )
     return a;
 }
 
-static uint anchorNameHash(ctmbstr s)
-{
-    uint hashval;
-
-    for (hashval = 0; *s != '\0'; s++)
-        hashval = *s + 31*hashval;
-
-    return hashval % ANCHOR_HASH_SIZE;
-}
-
 /* add new anchor to namespace */
 static Anchor* AddAnchor( TidyDocImpl* doc, ctmbstr name, Node *node )
 {
-    uint h;
     TidyAttribImpl* attribs = &doc->attribs;
     Anchor *a = NewAnchor( doc, name, node );
-    tmbstr lname = TY_(tmbstrdup)(doc->allocator, name);
-    lname = TY_(tmbstrtolower)(lname);
-    h = anchorNameHash(lname);
+    uint h = anchorNameHash(name);
 
     if ( attribs->anchor_hash[h] == NULL)
         attribs->anchor_hash[h] = a;
@@ -969,19 +965,17 @@ static Anchor* AddAnchor( TidyDocImpl* doc, ctmbstr name, Node *node )
         here->next = a;
     }
 
-    TidyDocFree(doc, lname);
     return attribs->anchor_hash[h];
 }
 
 /* return node associated with anchor */
 static Node* GetNodeByAnchor( TidyDocImpl* doc, ctmbstr name )
 {
-    uint h;
     TidyAttribImpl* attribs = &doc->attribs;
     Anchor *found;
+    uint h = anchorNameHash(name);
     tmbstr lname = TY_(tmbstrdup)(doc->allocator, name);
     lname = TY_(tmbstrtolower)(lname);
-    h = anchorNameHash(lname);
 
     for ( found = attribs->anchor_hash[h]; found != NULL; found = found->next )
     {
diff --git a/src/attrs.h b/src/attrs.h
index 0410695..1edec87 100644
--- a/src/attrs.h
+++ b/src/attrs.h
@@ -122,7 +122,7 @@ Bool TY_(IsValidHTMLID)(ctmbstr id);
 Bool TY_(IsValidXMLID)(ctmbstr id);
 
 /* removes anchor for specific node */
-void TY_(RemoveAnchorByNode)( TidyDocImpl* doc, Node *node );
+void TY_(RemoveAnchorByNode)( TidyDocImpl* doc, ctmbstr name, Node *node );
 
 /* free all anchors */
 void TY_(FreeAnchors)( TidyDocImpl* doc );
diff --git a/src/clean.c b/src/clean.c
index 7a14a0f..0602ce8 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -2638,17 +2638,19 @@ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
 
         if (id && !wantId
             /* make sure that Name has been emitted if requested */
-            && (hadName || !wantName || NameEmitted) )
+            && (hadName || !wantName || NameEmitted) ) {
+            if (!wantId && !wantName)
+                TY_(RemoveAnchorByNode)(doc, id->value, node);
             TY_(RemoveAttribute)(doc, node, id);
+        }
 
         if (name && !wantName
             /* make sure that Id has been emitted if requested */
-            && (hadId || !wantId || IdEmitted) )
+            && (hadId || !wantId || IdEmitted) ) {
+            if (!wantId && !wantName)
+                TY_(RemoveAnchorByNode)(doc, name->value, node);
             TY_(RemoveAttribute)(doc, node, name);
-
-        if (TY_(AttrGetById)(node, TidyAttr_NAME) == NULL &&
-            TY_(AttrGetById)(node, TidyAttr_ID) == NULL)
-            TY_(RemoveAnchorByNode)(doc, node);
+        }
     }
 
     if (node->content)
diff --git a/src/lexer.c b/src/lexer.c
index 708c393..9e5ed1e 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1077,7 +1077,7 @@ void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node )
             if ( (attrIsID(av) || attrIsNAME(av)) &&
                  TY_(IsAnchorElement)(doc, node) )
             {
-                TY_(RemoveAnchorByNode)( doc, node );
+                TY_(RemoveAnchorByNode)( doc, av->value, node );
             }
         }
 