view src/scanner.h @ 117:aa07452e641b

uribl patch from Jeff Evans <jeffe@tricab.com>
author carl
date Sun, 12 Mar 2006 10:15:39 -0800
parents 81f1e400e8ab
children ecb40aa3eaa5
line wrap: on
line source

#ifndef scanner_include
#define scanner_include

#include "dnsbl.h"

////////////////////////////////////////////////
// memory for the content scanner
//
class recorder
{
	mlfiPriv	*priv;		// needed for syslog
	string_set	*html_tags; // valid tags
	string_set	*tlds;		// valid tlds
	string_set	*cctlds;	  // valid cctlds
	string_set	hosts;
	int 		bad_html_tags;
	int 		binary_tags;

public:
	recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_);
	~recorder() 								{ empty(); };
	void empty();
	void new_url(char *host);
	void new_tag(char *tag);
	void binary();
	mlfiPriv   *get_priv()						{ return priv;																		};
	string_set *get_cctlds()					{ return cctlds;																	};
	string_set *get_tlds()						{ return tlds;																		};
	string_set &get_hosts() 					{ return hosts; 																	};
	bool		excessive_bad_tags(int limit)	{ return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); };
	bool		excessive_hosts(int limit)		{ return (limit > 0) && (hosts.size() > limit); 									};
};


////////////////////////////////////////////////
// the content scanner
//
class fsa;		// only pointers to fsa are used here, so a forward declaration is enough

// Scanner front end: built around a recorder and fed raw buffers through
// scan(). It holds eight fsa objects by pointer; constructor, destructor
// and scan() are all implemented outside this header.
class url_scanner
{
private:
	// NOTE(review): judging by the names there is one parser per syntax
	// handled (host names, tags, urls, html, mime, base64, uuencode) -
	// confirm against the implementation. Declaration order is kept as-is
	// because it fixes member layout and initialisation order.
	fsa 	*host_parser;
	fsa 	*tags_parser;
	fsa 	*urls_parser;
	fsa 	*urld_parser;
	fsa 	*html_parser;
	fsa 	*mime_parser;
	fsa 	*b64_parser;
	fsa 	*uu_parser;

public:
	// memory: the recorder this scanner is constructed with.
	url_scanner(recorder *memory);
	~url_scanner();

	// Scan length bytes starting at buffer.
	void scan(u_char *buffer, size_t length);
};

#endif