changeset 117:aa07452e641b

uribl patch from Jeff Evans <jeffe@tricab.com>
author carl
date Sun, 12 Mar 2006 10:15:39 -0800
parents 0094678a16d0
children 13fcb0c66763
files AUTHORS ChangeLog Makefile.am NEWS dnsbl.conf dnsbl.spec.in package src/context.cpp src/context.h src/dnsbl.cpp src/scanner.cpp src/scanner.h
diffstat 12 files changed, 1532 insertions(+), 1397 deletions(-) [+]
line wrap: on
line diff
--- a/AUTHORS	Sun Jan 08 10:27:24 2006 -0800
+++ b/AUTHORS	Sun Mar 12 10:15:39 2006 -0800
@@ -5,3 +5,4 @@
     John Gunkel <jgunkel@palliser.ca>
     Nigel Horne <njh@bandsman.co.uk>
     Stephen Johnson <stephen.johnson@arkansas.gov>
+    Jeff Evans <jeffe@tricab.com>
--- a/ChangeLog	Sun Jan 08 10:27:24 2006 -0800
+++ b/ChangeLog	Sun Mar 12 10:15:39 2006 -0800
@@ -1,5 +1,10 @@
     $Id$
 
+5.13 2006-03-12
+    patch from Jeff Evans <jeffe@tricab.com>
+    add SURBL/URIBL lookups, remove trailing dots from hostnames,
+    allow ip address literals as hostnames.
+
 5.12 2006-01-08
     Use larger resolver buffer to accomodate spammers with many name
     servers.  A current example is life-all.com which needs to retry in
--- a/Makefile.am	Sun Jan 08 10:27:24 2006 -0800
+++ b/Makefile.am	Sun Mar 12 10:15:39 2006 -0800
@@ -1,7 +1,7 @@
 SUBDIRS = src man html info
 hackdir = $(sysconfdir)/dnsbl
 hack_SCRIPTS = dnsbl
-hack_DATA = dnsbl.conf hosts-ignore.conf html-tags.conf tld.conf
+hack_DATA = dnsbl.conf hosts-ignore.conf html-tags.conf tld.conf cctld.conf
 CLEANFILES = dnsbl xml/dnsbl xml/Makefile
 EXTRA_DIST = dnsbl.rc $(hack_DATA) dnsbl.spec $(wildcard xml/h*) $(wildcard xml/M*) $(wildcard xml/d*)
 
--- a/NEWS	Sun Jan 08 10:27:24 2006 -0800
+++ b/NEWS	Sun Mar 12 10:15:39 2006 -0800
@@ -1,5 +1,6 @@
     $Id$
 
+5.13 2006-03-12 add SURBL/URIBL lookups, patch from Jeff Evans <jeffe@tricab.com>
 5.12 2006-01-08 use larger resolver buffer to accomodate spammers with many name servers
 5.11 2005-12-20 switch to autoconf/automake/docbook
 5.10 2005-10-16 fix compile error on FC3
--- a/dnsbl.conf	Sun Jan 08 10:27:24 2006 -0800
+++ b/dnsbl.conf	Sun Mar 12 10:15:39 2006 -0800
@@ -9,7 +9,8 @@
         filter    sbl-xbl.spamhaus.org        "Mail containing %s rejected - sbl; see http://www.spamhaus.org/query/bl?ip=%s";
         ignore    { include "hosts-ignore.conf"; };
         tld       { include "tld.conf"; };
-        html_tags { include "html-tags.conf"; };
+        cctld       { include "cctld.conf"; };
+#        html_tags { include "html-tags.conf"; };
         html_limit off;
         host_limit soft 20;
     };
--- a/dnsbl.spec.in	Sun Jan 08 10:27:24 2006 -0800
+++ b/dnsbl.spec.in	Sun Mar 12 10:15:39 2006 -0800
@@ -97,16 +97,18 @@
 %docdir %{_datadir}/doc/@PACKAGE@-@VERSION@
 %{_datadir}/doc/@PACKAGE@-@VERSION@
 %config(noreplace) %{_sysconfdir}/@PACKAGE@
-%config(noreplace) %{_sysconfdir}/@PACKAGE@/*.conf
 /etc/rc.d/init.d/@PACKAGE@
 %dir %attr(0750,@PACKAGE@,root) /var/run/@PACKAGE@
 
 
 %changelog
-* Sun Dec 18 2005 Carl Byington 1.0
+* Fri Mar 10 2006 Carl Byington 5.13
+- remove redundant entry in files section
+
+* Sun Dec 18 2005 Carl Byington 5.11
 - use autoconf and http://www.fedora.us/docs/rpm-packaging-guidelines.html
 
-* Tue Jan 03 2005 Carl Byington 1.4
+* Tue Jan 03 2005 Carl Byington 4.0
 - added hosts-ignore conf file
 - see RELEASE_NOTES
 
--- a/package	Sun Jan 08 10:27:24 2006 -0800
+++ b/package	Sun Mar 12 10:15:39 2006 -0800
@@ -7,6 +7,7 @@
 web=/home/httpd/html/510sg/$NAME
 distlog=/tmp/distcheck
 
+mkdir -p $web
 chown --recursive root:root *
 make -f *cvs
 ./configure >/dev/null
@@ -43,6 +44,7 @@
         # add packages to the web site
         wp=$web/packages
         wp4=$wp/centos4
+        mkdir -p $wp4
         rp=/usr/src/redhat
         mv -f $BALL $wp
         scp $target:$rp/SRPMS/$NAME-$VER*rpm $wp
--- a/src/context.cpp	Sun Jan 08 10:27:24 2006 -0800
+++ b/src/context.cpp	Sun Mar 12 10:15:39 2006 -0800
@@ -50,6 +50,7 @@
 char *token_soft;
 char *token_substitute;
 char *token_tld;
+char *token_cctld;
 char *token_unknown;
 char *token_verify;
 char *token_white;
@@ -445,12 +446,12 @@
 
 CONTEXTP CONFIG::find_context(char *to) {
 	context_map::iterator i = env_to.find(to);
-	if (i != env_to.end()) return (*i).second;		// found user@domain.tld key
+	if (i != env_to.end()) return (*i).second;		// found user@domain key
 	char *x = strchr(to, '@');
 	if (x) {
 		x++;
 		i = env_to.find(x);
-		if (i != env_to.end()) return (*i).second;	// found domain.tld key
+		if (i != env_to.end()) return (*i).second;	// found domain key
 		char y = *x;
 		*x = '\0';
 		i = env_to.find(to);
@@ -553,13 +554,13 @@
 char *CONTEXT::find_from(char *from) {
 	char *rc = token_inherit;
 	string_map::iterator i = env_from.find(from);
-	if (i != env_from.end()) rc = (*i).second;	// found user@domain.tld key
+	if (i != env_from.end()) rc = (*i).second;	// found user@domain key
 	else {
 		char *x = strchr(from, '@');
 		if (x) {
 			x++;
 			i = env_from.find(x);
-			if (i != env_from.end()) rc = (*i).second;	// found domain.tld key
+			if (i != env_from.end()) rc = (*i).second;	// found domain key
 			else {
 				char y = *x;
 				*x = '\0';
@@ -577,12 +578,12 @@
 
 CONTEXTP CONTEXT::find_context(char *from) {
 	context_map::iterator i = env_from_context.find(from);
-	if (i != env_from_context.end()) return (*i).second;		// found user@domain.tld key
+	if (i != env_from_context.end()) return (*i).second;		// found user@domain key
 	char *x = strchr(from, '@');
 	if (x) {
 		x++;
 		i = env_from_context.find(x);
-		if (i != env_from_context.end()) return (*i).second;	// found domain.tld key
+		if (i != env_from_context.end()) return (*i).second;	// found domain key
 		char y = *x;
 		*x = '\0';
 		i = env_from_context.find(from);
@@ -626,6 +627,11 @@
 }
 
 
+string_set& CONTEXT::get_content_cctlds() {	// cctld set for this context; an empty set defers to the parent
+	if (content_cctlds.empty() && parent) return parent->get_content_cctlds();
+	return content_cctlds;	// own set, which may still be empty when there is no parent
+}
+
 string_set& CONTEXT::get_content_tlds() {
 	if (content_tlds.empty() && parent) return parent->get_content_tlds();
 	return content_tlds;
@@ -693,6 +699,14 @@
 			}
 			printf("%s         }; \n", indent);
 		}
+		if (!content_cctlds.empty()) {
+			printf("%s         cctld { \n", indent);
+			printf("%s             ", indent);
+			for (string_set::iterator i=content_cctlds.begin(); i!=content_cctlds.end(); i++) {
+				printf("%s; ", *i);
+			}
+			printf("\n%s         }; \n", indent);
+		}
 		if (!content_tlds.empty()) {
 			printf("%s         tld { \n", indent);
 			printf("%s             ", indent);
@@ -887,6 +901,16 @@
 			}
 			if (!tsa(tok, token_semi)) return false;
 		}
+		else if (have == token_cctld) {
+			if (!tsa(tok, token_lbrace)) return false;
+			while (true) {
+				char *have = tok.next();
+				if (!have) break;
+				if (have == token_rbrace) break;  // done
+				me.add_cctld(have);
+			}
+			if (!tsa(tok, token_semi)) return false;
+		}
 		else if (have == token_tld) {
 			if (!tsa(tok, token_lbrace)) return false;
 			while (true) {
@@ -1228,6 +1252,7 @@
 //
 void token_init() {
 	token_black 	 = register_string("black");
+	token_cctld 	 = register_string("cctld");
 	token_content	 = register_string("content");
 	token_context	 = register_string("context");
 	token_dccfrom	 = register_string("dcc_from");
--- a/src/context.h	Sun Jan 08 10:27:24 2006 -0800
+++ b/src/context.h	Sun Mar 12 10:15:39 2006 -0800
@@ -100,6 +100,7 @@
 	char *			content_message;	// ""
 	string_set		content_host_ignore;// hosts to ignore for content sbl checking
 	string_set		content_tlds;		//
+	string_set		content_cctlds; 	//
 	string_set		html_tags;			// set of valid html tags
 	int 			host_limit; 		// limit on host names
 	char *			host_limit_message; // error message for excessive host names
@@ -136,6 +137,7 @@
 	void		set_content_message(char *message)			{content_message   = message;};
 	void		add_ignore(char *host)						{content_host_ignore.insert(host);};
 	void		add_tld(char *tld)							{content_tlds.insert(tld);};
+	void		add_cctld(char *cctld)						{content_cctlds.insert(cctld);};
 
 	void		set_host_limit(int limit)					{host_limit 		= limit;};
 	void		set_host_message(char *message) 			{host_limit_message = message;};
@@ -155,6 +157,7 @@
 	char*			get_content_message();
 	string_set& 	get_content_host_ignore();
 	string_set& 	get_content_tlds();
+	string_set& 	get_content_cctlds();
 	string_set& 	get_html_tags();
 	dnsblp_list&	get_dnsbl_list();
 
@@ -186,6 +189,7 @@
 };
 
 extern char *token_black;
+extern char *token_cctld;
 extern char *token_content;
 extern char *token_context;
 extern char *token_dccfrom;
@@ -206,8 +210,8 @@
 extern char *token_mailhost;
 extern char *token_many;
 extern char *token_off;
+extern char *token_ok2;
 extern char *token_ok;
-extern char *token_ok2;
 extern char *token_on;
 extern char *token_rbrace;
 extern char *token_semi;
--- a/src/dnsbl.cpp	Sun Jan 08 10:27:24 2006 -0800
+++ b/src/dnsbl.cpp	Sun Mar 12 10:15:39 2006 -0800
@@ -365,7 +365,7 @@
 	if (!memory) {
 		// first recipient that needs content filtering sets all
 		// the content filtering parameters
-		memory		  = new recorder(this, con.get_html_tags(), con.get_content_tlds());
+		memory		  = new recorder(this, con.get_html_tags(), con.get_content_tlds(), con.get_content_cctlds());
 		scanner 	  = new url_scanner(memory);
 		content_suffix		= con.get_content_suffix();
 		content_message 	= con.get_content_message();
@@ -650,6 +650,83 @@
 
 
 ////////////////////////////////////////////////
+//	look up the domain name part of a hostname on the surbl and uribl lists;
+//	returns true at the first list for which dns_interface() returns true, else false
+bool uriblookup(mlfiPriv &priv ,char *hostname, char *top) ;
+bool uriblookup(mlfiPriv &priv, char *hostname, char *top) {
+	// top is pointer to '.' char at end of base domain, or null for ip address form
+	// so for hostname of www.fred.mydomain.co.uk
+	// top points to-----------------------^
+	// and we end up looking at only mydomain.co.uk, ignoring the www.fred stuff
+	char buf[maxlen];	// query name built as "<name>.<list>"
+	char buf2[maxlen];	// NOTE(review): never referenced in this function
+	const char *uriblname[2] = { "multi.surbl.org", "multi.uribl.com" };
+
+	if (top) {
+		// keep the label just before top plus everything after it
+		*top = '\0';						// temporarily cut the name at top
+		char *x = strrchr(hostname, '.');	// last '.' before top, if any
+		if (x) hostname = x+1;				// no earlier '.' means the whole name is used
+		*top = '.';							// restore the caller's string
+	}
+	for (int i=0; i<2; i++) {
+		snprintf(buf, sizeof(buf), "%s.%s", hostname, uriblname[i]);
+		if (debug_syslog > 2) {
+			char tmp[maxlen];
+			snprintf(tmp, sizeof(tmp), "Looking up %s on %s", hostname, uriblname[i]);
+			my_syslog(tmp);
+		}
+		if (dns_interface(priv, buf, false, NULL)) return true;	// stop at the first hit
+	}
+	return false;	// neither list produced a hit
+}
+
+
+////////////////////////////////////////////////
+// uribl checker: true if uriblookup() reports a hit for the ip literal or base domain of hostname
+// hostname MUST not have a trailing dot; it is cut in place at '.' chars and restored
+// ip address literal -> looked up in the dotted form produced by inet_ntop
+// last two labels found in the cctld set -> only the last three labels are looked up
+// else -> the last three labels (when there are four or more), then the last two
+bool check_uribl(mlfiPriv &priv, char *hostname) ;
+bool check_uribl(mlfiPriv &priv, char *hostname) {
+	in_addr ip;
+	if (inet_aton(hostname, &ip)) {
+		char adr[sizeof "255.255.255.255"];	// room for the longest dotted quad plus the null
+		adr[0] = '\0';						// stays empty if inet_ntop fails
+		inet_ntop(AF_INET, (const u_char *)&ip, adr, sizeof(adr));
+		return (uriblookup(priv, adr, NULL));	// null top selects the ip address form
+	}
+
+	char *top, *top2, *top3;
+	top = strrchr(hostname, '.');			// last '.'
+	if (top) {
+		*top = '\0';						// temporarily hide the last label
+		top2 = strrchr(hostname, '.');		// second to last '.', if any
+		*top = '.';
+
+		if (top2) {
+			string_set::iterator i = priv.memory->get_cctlds()->find(top2+1);	// key is the last two labels
+			string_set::iterator x = priv.memory->get_cctlds()->end();
+			// if we have a 2-level-cctld, just look at top three levels of the name
+			if (i != x) return uriblookup(priv, hostname, top2);
+
+			*top2 = '\0';					// temporarily hide the last two labels
+			top3 = strrchr(hostname, '.');	// third to last '.', if any
+			*top2 = '.';
+
+			// if we have more than 3 levels in the name, look at the top three levels of the name
+			if (top3 && uriblookup(priv, hostname, top2)) return true;
+			// if that was not found, fall thru to looking at the top two levels
+		}
+		// look at the top two levels of the name
+		return uriblookup(priv, hostname, top);
+	}
+	return false;	// no '.' at all and not an ip address, nothing to look up
+}
+
+
+////////////////////////////////////////////////
 //	check the hosts from the body against the content dnsbl
 //
 bool check_hosts(mlfiPriv &priv, bool random, int limit, char *&host, int &ip);
@@ -700,10 +777,11 @@
 		if (ip) {
 			int_set::iterator i = ips.find(ip);
 			if (i == ips.end()) {
+				// we haven't looked this up yet
 				ips.insert(ip);
-				if (check_single(priv, ip, suffix)) {
-					return true;
-				}
+				if (check_single(priv, ip, suffix)) return true;
+				// Check uribl & surbl
+				if (check_uribl(priv, host)) return true;
 			}
 		}
 	}
--- a/src/scanner.cpp	Sun Jan 08 10:27:24 2006 -0800
+++ b/src/scanner.cpp	Sun Mar 12 10:15:39 2006 -0800
@@ -92,6 +92,7 @@
     fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_);
     void push(u_char *buf, int len);
     void pusher();
+	void validhost();
     void error(char *err);
 };
 
@@ -1141,10 +1142,11 @@
 ////////////////////////////////////////////////
 //
 //
-recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_) {
+recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_) {
     priv          = priv_;
     html_tags     = &html_tags_;
     tlds          = &tlds_;
+	cctlds		  = &cctlds_;
     bad_html_tags = 0;
     binary_tags   = 0;
 }
@@ -1197,6 +1199,29 @@
     count = 0;
 }
 
+void fsa::validhost () {	// pass the null terminated name in pending[0..count) to memory->new_url() if it looks like a host
+	// remove one trailing dot; the count guard keeps an empty name from reading pending[-1]
+	if ((count > 0) && (pending[count-1] == '.')) pending[--count] = '\0';
+	if (!strchr((const char *)pending, '@')) {
+		// not an email address or message id
+		char *p1 = strchr((char *)pending, '.');	// first period
+		char *p2 = strrchr((char *)pending, '.');	// last period
+		char *p3 = strstr((char *)pending, "..");	// non null when there is an empty component
+		if (p1 && (p1 != p2) && !p3) {
+			// have two periods, so at least three components, and no empty components
+			in_addr ip;
+			if (inet_aton((const char*)pending, &ip))
+				memory->new_url((char*)pending);	// ip address literal, recorded unchanged
+			else {
+				for (int i=0; i<count; i++) pending[i] = tolower(pending[i]);
+				// is last component a tld?
+				string_set::iterator i = memory->get_tlds()->find(p2+1);
+				if (i != memory->get_tlds()->end()) memory->new_url((char*)pending);
+			}
+		}
+	}
+}
+
 void fsa::push(u_char *buf, int len) {
     for (int i=0; i<len; i++) {
         if (count == (PENDING_LIMIT-1)) error(NULL);
@@ -1209,19 +1234,11 @@
             //////////////////////////////
             //  host name recognizer
             case h_end: {
+				if (count > 5) {
+					// need some minimal length host name
+					//otherwise binary files likely to generate false positives
                 pending[--count] = '\0';  // null terminate host name by overwriting the terminator
-                if (!strchr((const char *)pending, '@')) {
-                    // not an email address or message id
-                    char *p1 = strchr((const char *)pending, '.');
-                    char *p2 = strrchr((const char *)pending, '.');
-                    char *p3 = strstr((const char *)pending, "..");
-                    if (p1 && (p1 != p2) & !p3) {
-                        // have two periods, so at least three components, and no empty components
-                        for (int i=0; i<count; i++) pending[i] = tolower(pending[i]);
-                        // is last component a tld?
-                        string_set::iterator i = memory->get_tlds()->find(p2+1);
-                        if (i != memory->get_tlds()->end()) memory->new_url((char*)pending);
-                    }
+					validhost();
                 }
                 st = h_init;
                 } // fall thru
@@ -1265,18 +1282,16 @@
             //////////////////////////////
             //  url recognizer
             case u_reco: {
-                if (count > 13) {   // need some minimal length host name after the protocol
+				if (count > 11) {	// need some minimal length host name after the protocol
                     pending[--count] = '\0';  // null terminate host name by overwriting the terminator
+					// must start with protocol
+					if (strncasecmp((const char *)pending, "http", 4) == 0) {
                     char *p = strrchr((const char *)pending, '/');
-                    if (p                &&                                     // have a leading /
-                        strchr(p, '.')   &&                                     // require at least one . in a dns name
-                        !strstr(p, "..") &&                                     // no empty components in the dns name
-                        (strncasecmp((const char *)pending, "http", 4) == 0)) { // must start with protocol
-                        // we seem to have a host name
-                        p++;                    // skip the last /
-                        int c = strlen(p);
-                        for (int i=0; i<c; i++) p[i] = tolower(p[i]);
-                        memory->new_url(p);     // record it
+						if (p) {
+							count = strlen(p+1);
+							memmove(pending, p+1, count+1);
+							validhost();
+						}
                     }
                 }
                 st = u_init;
--- a/src/scanner.h	Sun Jan 08 10:27:24 2006 -0800
+++ b/src/scanner.h	Sun Mar 12 10:15:39 2006 -0800
@@ -11,18 +11,20 @@
     mlfiPriv    *priv;      // needed for syslog
     string_set  *html_tags; // valid tags
     string_set  *tlds;      // valid tlds
+	string_set	*cctlds;	  // valid cctlds
     string_set  hosts;
     int         bad_html_tags;
     int         binary_tags;
 
 public:
-    recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_);
+	recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_);
     ~recorder()                                 { empty(); };
     void empty();
     void new_url(char *host);
     void new_tag(char *tag);
     void binary();
     mlfiPriv   *get_priv()                      { return priv;                                                                      };
+	string_set *get_cctlds()					{ return cctlds;																	};
     string_set *get_tlds()                      { return tlds;                                                                      };
     string_set &get_hosts()                     { return hosts;                                                                     };
     bool        excessive_bad_tags(int limit)   { return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); };