Mercurial > dnsbl
diff src/scanner.cpp @ 28:33e1e3910506
add configurable list of tlds
author | carl |
---|---|
date | Thu, 27 May 2004 10:08:51 -0700 |
parents | 43a4f6b3e668 |
children | 4dfdf33f1db0 |
line wrap: on
line diff
--- a/src/scanner.cpp Sat May 22 22:30:45 2004 -0700 +++ b/src/scanner.cpp Thu May 27 10:08:51 2004 -0700 @@ -15,18 +15,20 @@ struct recorder { string_set *html_tags; // valid tags + string_set *tlds; // valid tlds string_set hosts; int bad_html_tags; int binary_tags; - recorder(string_set *html_tags_); + recorder(string_set *html_tags_, string_set *tlds_); ~recorder(); void empty(); void new_url(char *host); void new_tag(char *tag); void binary(); }; -recorder::recorder(string_set *html_tags_) { +recorder::recorder(string_set *html_tags_, string_set *tlds_) { html_tags = html_tags_; + tlds = tlds_; bad_html_tags = 0; binary_tags = 0; } @@ -35,6 +37,7 @@ } void recorder::empty() { bad_html_tags = 0; + binary_tags = 0; discard(hosts); } void recorder::new_url(char *host) { @@ -47,7 +50,7 @@ string_set::iterator i = html_tags->find(tag); if (i == html_tags->end()) { bad_html_tags++; - if (debug_syslog && (bad_html_tags < 10)) { + if (debug_syslog && (bad_html_tags < 10) && (binary_tags < 10)) { // only log the first 10 bad tags char buf[200]; snprintf(buf, sizeof(buf), "bad html tag %s", tag); @@ -374,15 +377,6 @@ }; -char *tlds[] = { - ".com", - ".net", - ".org", - ".biz", - ".info", - NULL -}; - u_char hex_decode[256] = { 0, // 0x00 0, // 0x01 @@ -953,15 +947,14 @@ pending[--count] = '\0'; // null terminate host name by overwriting the terminator if (!strchr((const char *)pending, '@')) { // not an email address or message id - char *tld; - for (int i=0; (tld = tlds[i]); i++) { - int n = strlen(tld); - if (count > n) { - if (strncasecmp((const char *)(pending+count-n), tld, n) == 0) { - memory->new_url((char*)pending); - break; - } - } + char *p1 = strchr((const char *)pending, '.'); + char *p2 = strrchr((const char *)pending, '.'); + if (p1 && (p1 != p2)) { + // have two periods, so three components + for (int i=1; i<count; i++) pending[i] = tolower(pending[i]); + // is last component a tld? + string_set::iterator i = memory->tlds->find(p2); + if (i != memory->tlds->end()) memory->new_url((char*)pending); } } st = h_init;