diff src/dnsbl.cpp @ 270:f92f24950bd3 stable-6-0-35

Use mozilla prefix list for tld checking, Enable surbl/uribl/dbl rhs lists
author Carl Byington <carl@five-ten-sg.com>
date Mon, 09 Sep 2013 15:15:53 -0700
parents f941563c2a95
children a99b6c1f5f67
line wrap: on
line diff
--- a/src/dnsbl.cpp	Wed May 22 11:34:37 2013 -0700
+++ b/src/dnsbl.cpp	Mon Sep 09 15:15:53 2013 -0700
@@ -1,6 +1,6 @@
 /*
 
-Copyright (c) 2009 Carl Byington - 510 Software Group, released under
+Copyright (c) 2013 Carl Byington - 510 Software Group, released under
 the GPL version 3 or any later version at your choice available at
 http://www.gnu.org/licenses/gpl-3.0.txt
 
@@ -389,27 +389,23 @@
 
 
 ////////////////////////////////////////////////
-//  lookup the domain name part of a hostname on the uribl
+//  lookup a hostname on the uribl
 //
-//  if we find part of the hostname on the uribl, return
-//  true and point found to the part of the hostname that we found
+//  if we find the hostname on the uribl, return true and point found to the hostname
 //  as a string registered in hosts.
 //  otherwise, return false and preserve the value of found.
 //
-bool uriblookup(mlfiPriv &priv, string_set &hosts, const char *hostname, const char *top, const char *&found) ;
-bool uriblookup(mlfiPriv &priv, string_set &hosts, const char *hostname, const char *top, const char *&found) {
-    // top is pointer to '.' char at end of base domain, or null for ip address form
-    // so for hostname of www.fred.mydomain.co.uk
-    // top points to-----------------------^
-    // and we end up looking at only mydomain.co.uk, ignoring the www.fred stuff
+bool uriblookup(mlfiPriv &priv, string_set &hosts, const char *hostname, const char *&found) ;
+bool uriblookup(mlfiPriv &priv, string_set &hosts, const char *hostname, const char *&found) {
+    if (debug_syslog > 4) {
+        char tmp[maxlen];
+        snprintf(tmp, sizeof(tmp), "looking for %s on %s", hostname, priv.uribl_suffix);
+        my_syslog(tmp);
+    }
     char buf[maxlen];
-    if (top) {
-        // add one more component
-        const char *x = (const char *)memrchr(hostname, '.', top-hostname);
-        if (x) hostname = x+1;
-    }
     snprintf(buf, sizeof(buf), "%s.%s.", hostname, priv.uribl_suffix);
-    if (dns_interface(priv, buf, false, NULL)) {
+    uint32_t ip = ntohl(dns_interface(priv, buf, false, NULL));
+    if (ip and (ip != 0x7f000000)) {
         if (debug_syslog > 2) {
             char tmp[maxlen];
             snprintf(tmp, sizeof(tmp), "found %s on %s", hostname, priv.uribl_suffix);
@@ -425,49 +421,60 @@
 ////////////////////////////////////////////////
 //  uribl checker
 //  -------------
-//  hostname MUST not have a trailing dot
-//  If tld, two level lookup.
-//  Else, look up three level domain.
-//
-//  if we find part of the hostname on the uribl, return
-//  true and point found to the part of the hostname that we found
-//  as a string registered in hosts.
-//  otherwise, return false and preserve the value of found.
+// hostname MUST NOT have a trailing dot. Find the tld part of
+// the hostname, and add one more level. If that is listed on
+// the uribl, return true and point found to the part of the
+// hostname that we found, as a string registered in hosts.
+// Otherwise, return false and preserve the value of found.
 //
 bool check_uribl(mlfiPriv &priv, string_set &hosts, const char *hostname, const char *&found) ;
 bool check_uribl(mlfiPriv &priv, string_set &hosts, const char *hostname, const char *&found) {
     in_addr ip;
-    if (inet_aton(hostname, &ip)) {
-        const u_char *src = (const u_char *)&ip.s_addr;
-        if (src[0] == 127) return false;    // don't do dns lookups on localhost
-        if (src[0] == 10)  return false;    // don't do dns lookups on rfc1918 space
-        if ((src[0] == 192) && (src[1] == 168)) return false;
-        if ((src[0] == 172) && (16 <= src[1]) && (src[1] <= 31)) return false;
-        char adr[sizeof "255.255.255.255   "];
-        snprintf(adr, sizeof(adr), "%u.%u.%u.%u", src[3], src[2], src[1], src[0]);
-        // cannot use inet_ntop here since we want the octets reversed.
-        return (uriblookup(priv, hosts, adr, NULL, found));
+    if (inet_aton(hostname, &ip)) return false; // don't check ip addresses in uribls
+    const char* components[maxlen];
+    int n = 0;  // number of components in the hostname
+    while (n < maxlen) {
+        components[n++] = hostname;
+        const char *c = strchr(hostname, '.');
+        if (!c) break;
+        hostname = c+1;
+    }
+    string_set *tlds     = priv.memory->get_tlds();
+    string_set *tldwilds = priv.memory->get_tldwilds();
+    string_set *tldnots  = priv.memory->get_tldnots();
+    string_set::iterator xtlds     = tlds->end();
+    string_set::iterator xtldwilds = tldwilds->end();
+    string_set::iterator xtldnots  = tldnots->end();
+    for (int i=max(0,n-4); i<n; i++) {
+        const char* name = components[i];
+        bool rc = false;
+        string_set::iterator tt = tldnots->find(name);
+        if (tt != xtldnots) {
+            rc = true;
     }
-
-    const char *top, *top2, *top3;
-    top = strrchr(hostname, '.');
-    if (top) {
-        top2 = (const char *)memrchr(hostname, '.', top-hostname);
-
-        if (top2) {
-            string_set::iterator i = priv.memory->get_cctlds()->find(top2+1);
-            string_set::iterator x = priv.memory->get_cctlds()->end();
-            // if we have a 2-level-cctld, just look at top three levels of the name
-            if (i != x) return uriblookup(priv, hosts, hostname, top2, found);
-
-            // if we have more than 3 levels in the name, look at the top three levels of the name
-            top3 = (const char *)memrchr(hostname, '.', top2-hostname);
-            if (top3 && uriblookup(priv, hosts, hostname, top2, found)) return true;
-
-            // if that was not found, fall thru to looking at the top two levels
+        else {
+            tt = tldwilds->find(name);
+            if (tt != xtldwilds) {
+                if (i > 1) {
+                    rc = true;
+                    name = components[i-2];
+                }
+                else return false;
         }
-        // look at the top two levels of the name
-        return uriblookup(priv, hosts, hostname, top, found);
+            else {
+                tt = tlds->find(name);
+                if (tt != xtlds) {
+                    if (i > 0) {
+                        rc = true;
+                        name = components[i-1];
+                    }
+                    else return false;
+                }
+            }
+        }
+        if (rc) {
+            return uriblookup(priv, hosts, name, found);
+        }
     }
     return false;
 }
@@ -674,7 +681,7 @@
     if (!memory) {
         // first recipient that needs content filtering sets
         // some of the content filtering parameters
-        memory        = new recorder(this, con.get_html_tags(), con.get_content_tlds(), con.get_content_cctlds());
+        memory        = new recorder(this, con.get_html_tags(), con.get_content_tlds(), con.get_content_tldwilds(), con.get_content_tldnots());
         scanner       = new url_scanner(memory);
         content_suffix      = con.get_content_suffix();
         content_message     = con.get_content_message();