diff src/scanner.h @ 74:b7449114ebb0

start coding on new config syntax
author carl
date Sun, 10 Jul 2005 14:19:00 -0700
parents
children 1142e46be550
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/scanner.h	Sun Jul 10 14:19:00 2005 -0700
@@ -0,0 +1,138 @@
+#ifndef scanner_include
+#define scanner_include
+
+#include "dnsbl.h"
+
+////////////////////////////////////////////////
+// memory for the content scanner
+//
+class recorder  // holds the set of hosts plus the bad-tag and binary-tag counters declared below
+{
+    mlfiPriv    *priv;      // needed for syslog
+    string_set  *html_tags; // valid tags
+    string_set  *tlds;      // valid tlds
+    string_set  hosts;      // host set returned by get_hosts() and measured by excessive_hosts()
+    int         bad_html_tags;  // count tested against the limit in excessive_bad_tags()
+    int         binary_tags;    // count that bad_html_tags must exceed threefold in excessive_bad_tags()
+
+public:
+    recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_);   // the two sets are passed by reference; the matching members are pointers
+    ~recorder();
+    void empty();               // presumably resets the recorded state; body is not in this header - confirm
+    void new_url(char *host);   // presumably records a host name taken from a url - confirm in the implementation
+    void new_tag(char *tag);    // presumably checks tag against html_tags and counts bad ones - confirm
+    void binary();              // presumably bumps binary_tags - confirm
+    mlfiPriv   *get_priv()                      {return priv;                                                                      };
+    string_set *get_tlds()                      {return tlds;                                                                      };
+    string_set &get_hosts()                     {return hosts;                                                                     };
+    bool        excessive_bad_tags(int limit)   {return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); };  // false whenever limit <= 0; otherwise needs bad_html_tags above both limit and 3*binary_tags
+    bool        excessive_hosts(int limit)      {return (limit > 0) && (hosts.size() > limit);                                     };  // false whenever limit <= 0; otherwise true when hosts holds more than limit entries
+};
+
+
+////////////////////////////////////////////////
+// finite state machine
+//
+enum state {// host name recognizer states
+            h_init,     // first enumerator, so its value is 0
+            h_host,
+
+            // html tag discarder states
+            t_init,
+            t_tag1,     // seen opening <
+            t_tag2,     // not comment
+            t_com1,     // seen !
+            t_com2,     // seen first -
+            t_com3,     // seen second -, looking for -->
+            t_com4,     // seen first - (presumably of the closing -->)
+            t_com5,     // seen second - (presumably of the closing -->)
+            t_disc,     // looking for closing >
+
+            // url recognizer states
+            u_init,
+            u_http,
+            u_sla,
+            u_url,
+
+            // url decoder states  %xx
+            d_init,
+            d_pcnt,
+            d_1,
+
+            // html entity decoder states &#nnn;
+            e_init,
+            e_amp,
+            e_num,
+
+            // mime decoder states =xx
+            m_init,
+            m_eq,
+            m_1,
+
+            // base64 decoder states
+            b_init,
+            b_lf,
+            b_lf2,
+            b_64,
+
+            // uuencoding decoder states
+            uu_init,
+            uu_lf,
+            uu_lf2,
+            uu_64,
+
+            // counter for number of columns in the table
+            end_state,  // not a state to be in: equals the number of enumerators above (32)
+
+            // temporary states; their values are above end_state, so they are outside the column count
+            h_end,
+            t_bin,
+            t_end,
+            u_reco,
+            d_2,
+            e_semi,
+            m_2,
+            m_cr,
+            m_nl,
+            b_cr,
+            uu_cr
+           };
+
+#define PENDING_LIMIT 100  // number of u_char elements in fsa::pending
+class fsa {    // one finite state machine instance, linked to two other fsa objects and a recorder
+    u_char      pending[PENDING_LIMIT];    // fixed-size byte buffer
+    int         count;      // presumably the number of bytes currently held in pending - confirm
+    state       st;         // presumably the current state - confirm
+    state       init;       // presumably the state this machine starts from - confirm
+    fsa         *next1;     // another fsa given to the constructor; role not visible in this header
+    fsa         *next2;     // second linked fsa given to the constructor; role not visible in this header
+    recorder    *memory;    // recorder given to the constructor
+
+public:
+    fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_);  // NOTE(review): parameter 'init' has the same name as the member 'init', unlike the other parameters
+    void push(u_char *buf, int len);   // presumably feeds len bytes from buf into this machine - confirm
+    void pusher();                     // body not in this header; presumably forwards pending data - confirm
+    void error(char *err);             // presumably reports err, a message string - confirm
+};
+
+
+////////////////////////////////////////////////
+// the content scanner
+//
+class url_scanner {    // holds eight fsa pointers and exposes a single scan() entry point
+    fsa *host_parser;   // presumably runs the h_* host name recognizer states - confirm
+    fsa *tags_parser;   // presumably runs the t_* html tag discarder states - confirm
+    fsa *urls_parser;   // presumably runs the u_* url recognizer states - confirm
+    fsa *urld_parser;   // presumably runs the d_* url decoder (%xx) states - confirm
+    fsa *html_parser;   // presumably runs the e_* html entity decoder states - confirm
+    fsa *mime_parser;   // presumably runs the m_* mime decoder (=xx) states - confirm
+    fsa *b64_parser;    // presumably runs the b_* base64 decoder states - confirm
+    fsa *uu_parser;     // presumably runs the uu_* uuencoding decoder states - confirm
+
+public:
+    url_scanner(recorder *memory);     // memory is a recorder; presumably handed to each fsa - confirm
+    ~url_scanner();                    // presumably releases the fsa objects above - confirm
+    void scan(u_char *buffer, size_t length);  // buffer with an explicit length; presumably the content to scan - confirm
+};
+
+#endif