diff src/scanner.cpp @ 8:dbe18921f741

integration work on url scanner
author carl
date Thu, 22 Apr 2004 11:25:45 -0700
parents 93ff6d1ef647
children 8c65411cd7ab
line wrap: on
line diff
--- a/src/scanner.cpp	Thu Apr 22 08:38:07 2004 -0700
+++ b/src/scanner.cpp	Thu Apr 22 11:25:45 2004 -0700
@@ -1,30 +1,7 @@
-// normal stuff
-#include <stdio.h>
-#include <stdlib.h>
-
-// needed for std c++ collections
-#include <set>
-#include <map>
-#include <list>
-
-// for the dns resolver
-#include <netinet/in.h>
-#include <arpa/nameser.h>
-#include <resolv.h>
-
-// misc stuff needed here
-#include <ctype.h>
-#include <fstream>
-
-static char* version="$Id$";
+static char* scanner_version="$Id$";  // "$Id$" looks like a revision-control keyword placeholder — TODO confirm it gets expanded; NOTE(review): string literal bound to non-const char*
 
 using namespace std;
 
-enum status {oksofar,   // not rejected yet
-             white,     // whitelisted by envelope from
-             black,     // blacklisted by envelope from or to
-             reject};   // rejected by a dns list
-
 enum state {// url  decoder states
             u_init,
             u_http,
@@ -841,23 +818,25 @@
     0,  // 0xff
 };
 
-#define PENDING_LIMIT 1000
+#define PENDING_LIMIT 100  // number of u_char slots in fsa::pending
 struct fsa {
     u_char  pending[PENDING_LIMIT];
     int     count;
     state   st;
     state   init;
     fsa*    next;
+    string_set  *urls;  // push() inserts strdup'd strings here in its u_reco case; NOTE(review): confirm who frees those copies
 
-    fsa(state init, fsa* next_);
+    fsa(state init, fsa* next_, string_set *urls_);  // urls_ is stored in urls; callers in this file pass NULL for every parser except the u_init one
     void push(u_char *buf, int len);
 };
 
-fsa::fsa(state init_, fsa* next_) {
+fsa::fsa(state init_, fsa *next_, string_set *urls_) {  // count starts at 0; both st and init start as init_
     count = 0;
     st    = init_;
     init  = init_;
     next  = next_;
+    urls  = urls_;  // pointer stored as given; the set itself is not copied
 }
 
 void fsa::push(u_char *buf, int len) {
@@ -884,7 +863,9 @@
 
             case u_reco: {
                 pending[count-1] = 0;
-                if (strncasecmp((const char *)pending, "http://", 7) == 0) fprintf(stdout, "%s\n", pending);
+                if (strncasecmp((const char *)pending, "http://", 7) == 0) {
+                    urls->insert(strdup((const char *)pending+7));
+                }
                 }   // fall thru
 
             case u_init: {
@@ -979,75 +960,32 @@
     }
 }
 
-
+struct url_scanner {  // bundles the four fsa parsers that scan() feeds
+    fsa *urls_parser;  // created with u_init, next == NULL, and the caller's string_set
+    fsa *html_parser;  // created with e_init; its next is urls_parser
+    fsa *mime_parser;  // created with m_init; its next is html_parser
+    fsa *b64_parser;  // created with b_init; its next is mime_parser; scan() pushes into this one
 
-////////////////////////////////////////////////
-//  ask a dns question and get an A record answer
-//
-static unsigned long dns_interface(char *question);
-static unsigned long dns_interface(char *question) {
-    u_char answer[NS_PACKETSZ];
-    int length = res_search(question, ns_c_in, ns_t_a, answer, sizeof(answer));
-    if (length < 0) return oksofar;     // error in getting answer
-    // parse the answer
-    ns_msg handle;
-    ns_rr  rr;
-    if (ns_initparse(answer, length, &handle) != 0) return oksofar;
-    int rrnum = 0;
-    while (ns_parserr(&handle, ns_s_an, rrnum++, &rr) == 0) {
-        if (ns_rr_type(rr) == ns_t_a) {
-            unsigned long address;
-            memcpy(&address, ns_rr_rdata(rr), sizeof(address));
-            return reject;
-        }
-    }
-    return 0;
+    url_scanner(string_set *urls);  // allocates the four parsers; urls is handed to urls_parser only
+    ~url_scanner();  // deletes the four parsers; NOTE(review): no copy constructor/assignment is declared, so a copied url_scanner would delete the same parsers twice
+    void scan(u_char *buffer, size_t length);  // pushes buffer into b64_parser
+};
+
+url_scanner::url_scanner(string_set *urls) {  // each parser is built before the one whose next points at it
+    urls_parser = new fsa(u_init, NULL,        urls);  // no next; the only parser that receives the urls set
+    html_parser = new fsa(e_init, urls_parser, NULL);  // presumably forwards its output to urls_parser — confirm in fsa::push
+    mime_parser = new fsa(m_init, html_parser, NULL);
+    b64_parser  = new fsa(b_init, mime_parser, NULL);  // the parser that scan() feeds
 }
 
-////////////////////////////////////////////////
-//  check a single dnsbl - we don't try very hard, just
-//  using the default resolver retry settings. If we cannot
-//  get an answer, we just accept the mail. The caller
-//  must ensure thread safety.
-//
-static status check_single(int ip, char *suffix);
-static status check_single(int ip, char *suffix) {
-    // make a dns question
-    const u_char *src = (const u_char *)&ip;
-    if (src[0] == 127) return oksofar;  // don't do dns lookups on localhost
-    char question[NS_MAXDNAME];
-    snprintf(question, sizeof(question), "%u.%u.%u.%u.%s.", src[3], src[2], src[1], src[0], suffix);
-    // ask the question, if we get an A record it implies a blacklisted ip address
-    unsigned long ans = dns_interface(question);
-    return (ans) ? reject : oksofar;
+url_scanner::~url_scanner() {  // frees the four parsers allocated by the constructor; the string_set passed to the constructor is not freed here
+    delete urls_parser;
+    delete html_parser;
+    delete mime_parser;
+    delete b64_parser;
 }
 
-
-////////////////////////////////////////////////
-//  scan a file for URLs
-//
-static void scan_file(char *fn, fsa& parser);
-static void scan_file(char *fn, fsa& parser) {
-    const int LINE_SIZE = 2000;
-    char line[LINE_SIZE];
-    ifstream is(fn);
-    while (!is.eof()) {
-        is.getline(line, LINE_SIZE-1);
-        int n = strlen(line);
-        line[n++] = '\n';
-        parser.push((u_char*)line, n);
-    }
-    is.close();
+void url_scanner::scan(u_char *buffer, size_t length) {  // hands the whole buffer to b64_parser
+    b64_parser->push(buffer, length);  // NOTE(review): fsa::push takes int len, so length is converted from size_t here
 }
 
-
-int main(int argc, char**argv)
-{
-    char *fn = argv[1];
-    fsa *urls_parser = new fsa(u_init, NULL);
-    fsa *html_parser = new fsa(e_init, urls_parser);
-    fsa *mime_parser = new fsa(m_init, html_parser);
-    fsa *b64_parser  = new fsa(b_init, mime_parser);
-    if (fn) scan_file(fn, *b64_parser);
-    return 0;
-}