Mercurial > dnsbl
diff src/scanner.cpp @ 117:aa07452e641b
uribl patch from Jeff Evans <jeffe@tricab.com>
author | carl |
---|---|
date | Sun, 12 Mar 2006 10:15:39 -0800 |
parents | c1280cd3e248 |
children | d9d2f8699621 |
line wrap: on
line diff
```diff
--- a/src/scanner.cpp Sun Jan 08 10:27:24 2006 -0800
+++ b/src/scanner.cpp Sun Mar 12 10:15:39 2006 -0800
@@ -92,6 +92,7 @@
     fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_);
     void push(u_char *buf, int len);
     void pusher();
+    void validhost();
     void error(char *err);
 };
 
@@ -1141,10 +1142,11 @@
 ////////////////////////////////////////////////
 //
 //
-recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_) {
+recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_) {
     priv = priv_;
     html_tags = &html_tags_;
     tlds = &tlds_;
+    cctlds = &cctlds_;
     bad_html_tags = 0;
     binary_tags = 0;
 }
@@ -1197,6 +1199,29 @@
     count = 0;
 }
 
+void fsa::validhost () {
+    // remove trailing dot
+    if (pending[count-1]== '.') pending[--count] = '\0';
+    if (!strchr((const char *)pending, '@')) {
+        // not an email address or message id
+        char *p1 = strchr((const char *)pending, '.');
+        char *p2 = strrchr((const char *)pending, '.');
+        char *p3 = strstr((const char *)pending, "..");
+        if (p1 && (p1 != p2) & !p3) {
+            // have two periods, so at least three components, and no empty components
+            in_addr ip;
+            if (inet_aton((const char*)pending, &ip))
+                memory->new_url((char*)pending);
+            else {
+                for (int i=0; i<count; i++) pending[i] = tolower(pending[i]);
+                // is last component a tld?
+                string_set::iterator i = memory->get_tlds()->find(p2+1);
+                if (i != memory->get_tlds()->end()) memory->new_url((char*)pending);
+            }
+        }
+    }
+}
+
 void fsa::push(u_char *buf, int len) {
     for (int i=0; i<len; i++) {
         if (count == (PENDING_LIMIT-1)) error(NULL);
@@ -1209,19 +1234,11 @@
             //////////////////////////////
             // host name recognizer
             case h_end: {
+                if (count > 5) {
+                    // need some minimal length host name
+                    //otherwise binary files likely to generate false positives
                 pending[--count] = '\0'; // null terminate host name by overwriting the terminator
-                if (!strchr((const char *)pending, '@')) {
-                    // not an email address or message id
-                    char *p1 = strchr((const char *)pending, '.');
-                    char *p2 = strrchr((const char *)pending, '.');
-                    char *p3 = strstr((const char *)pending, "..");
-                    if (p1 && (p1 != p2) & !p3) {
-                        // have two periods, so at least three components, and no empty components
-                        for (int i=0; i<count; i++) pending[i] = tolower(pending[i]);
-                        // is last component a tld?
-                        string_set::iterator i = memory->get_tlds()->find(p2+1);
-                        if (i != memory->get_tlds()->end()) memory->new_url((char*)pending);
-                    }
+                    validhost();
                 }
                 st = h_init;
             } // fall thru
@@ -1265,18 +1282,16 @@
             //////////////////////////////
             // url recognizer
             case u_reco: {
-                if (count > 13) { // need some minimal length host name after the protocol
+                if (count > 11) { // need some minimal length host name after the protocol
                     pending[--count] = '\0'; // null terminate host name by overwriting the terminator
+                    // must start with protocol
+                    if (strncasecmp((const char *)pending, "http", 4) == 0) {
                     char *p = strrchr((const char *)pending, '/');
-                    if (p && // have a leading /
-                        strchr(p, '.') && // require at least one . in a dns name
-                        !strstr(p, "..") && // no empty components in the dns name
-                        (strncasecmp((const char *)pending, "http", 4) == 0)) { // must start with protocol
-                        // we seem to have a host name
-                        p++; // skip the last /
-                        int c = strlen(p);
-                        for (int i=0; i<c; i++) p[i] = tolower(p[i]);
-                        memory->new_url(p); // record it
+                        if (p) {
+                            count = strlen(p+1);
+                            memmove(pending, p+1, count+1);
+                            validhost();
+                        }
                     }
                 }
                 st = u_init;
```