diff src/scanner.cpp @ 117:aa07452e641b

uribl patch from Jeff Evans <jeffe@tricab.com>
author carl
date Sun, 12 Mar 2006 10:15:39 -0800
parents c1280cd3e248
children d9d2f8699621
line wrap: on
line diff
--- a/src/scanner.cpp	Sun Jan 08 10:27:24 2006 -0800
+++ b/src/scanner.cpp	Sun Mar 12 10:15:39 2006 -0800
@@ -92,6 +92,7 @@
     fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_);
     void push(u_char *buf, int len);
     void pusher();
+	void validhost();
     void error(char *err);
 };
 
@@ -1141,10 +1142,11 @@
 ////////////////////////////////////////////////
 //
 //
-recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_) {
+recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_) {
     priv          = priv_;
     html_tags     = &html_tags_;
     tlds          = &tlds_;
+	cctlds		  = &cctlds_;
     bad_html_tags = 0;
     binary_tags   = 0;
 }
@@ -1197,6 +1199,29 @@
     count = 0;
 }
 
+// Record pending[0..count) through memory->new_url() when it is accepted by
+// inet_aton() or is a name of at least three components ending in a known tld.
+void fsa::validhost () {
+	// remove trailing dot; the count check keeps an empty host from reading pending[-1]
+	if (count > 0 && pending[count-1] == '.') pending[--count] = '\0';
+	if (!strchr((const char *)pending, '@')) {
+		// not an email address or message id
+		char *p1 = strchr((const char *)pending, '.');
+		char *p2 = strrchr((const char *)pending, '.');
+		char *p3 = strstr((const char *)pending, "..");
+		if (p1 && (p1 != p2) && !p3) {
+			// have two periods, so at least three components, and no empty components
+			in_addr ip;
+			if (inet_aton((const char*)pending, &ip)) memory->new_url((char*)pending);
+			else {
+				for (int i=0; i<count; i++) pending[i] = tolower(pending[i]);
+				// is last component a tld? (p2 still points at the last dot after lowercasing in place)
+				if (memory->get_tlds()->find(p2+1) != memory->get_tlds()->end()) memory->new_url((char*)pending);
+			}
+		}
+	}
+}
+
 void fsa::push(u_char *buf, int len) {
     for (int i=0; i<len; i++) {
         if (count == (PENDING_LIMIT-1)) error(NULL);
@@ -1209,19 +1234,11 @@
             //////////////////////////////
             //  host name recognizer
             case h_end: {
+				if (count > 5) {
+					// need some minimal length host name,
+					// otherwise binary files are likely to generate false positives
                 pending[--count] = '\0';  // null terminate host name by overwriting the terminator
-                if (!strchr((const char *)pending, '@')) {
-                    // not an email address or message id
-                    char *p1 = strchr((const char *)pending, '.');
-                    char *p2 = strrchr((const char *)pending, '.');
-                    char *p3 = strstr((const char *)pending, "..");
-                    if (p1 && (p1 != p2) & !p3) {
-                        // have two periods, so at least three components, and no empty components
-                        for (int i=0; i<count; i++) pending[i] = tolower(pending[i]);
-                        // is last component a tld?
-                        string_set::iterator i = memory->get_tlds()->find(p2+1);
-                        if (i != memory->get_tlds()->end()) memory->new_url((char*)pending);
-                    }
+					validhost();
                 }
                 st = h_init;
                 } // fall thru
@@ -1265,18 +1282,16 @@
             //////////////////////////////
             //  url recognizer
             case u_reco: {
-                if (count > 13) {   // need some minimal length host name after the protocol
+				if (count > 11) {	// need some minimal length host name after the protocol
                     pending[--count] = '\0';  // null terminate host name by overwriting the terminator
+					// must start with protocol
+					if (strncasecmp((const char *)pending, "http", 4) == 0) {
                     char *p = strrchr((const char *)pending, '/');
-                    if (p                &&                                     // have a leading /
-                        strchr(p, '.')   &&                                     // require at least one . in a dns name
-                        !strstr(p, "..") &&                                     // no empty components in the dns name
-                        (strncasecmp((const char *)pending, "http", 4) == 0)) { // must start with protocol
-                        // we seem to have a host name
-                        p++;                    // skip the last /
-                        int c = strlen(p);
-                        for (int i=0; i<c; i++) p[i] = tolower(p[i]);
-                        memory->new_url(p);     // record it
+						if (p) {
+							count = strlen(p+1);
+							memmove(pending, p+1, count+1);
+							validhost();
+						}
                     }
                 }
                 st = u_init;