/* $Id: organize-helpers.h,v 1.8 2002/09/30 11:49:09 arto Exp $
 * 
 * Copyright Arto Ters <arto.teras@hip.fi> 2002
 */

#ifndef _ORGANIZE_HELPERS_H
#define _ORGANIZE_HELPERS_H

#include "dbmailtypes.h"
#include <stdio.h>
#include <bow/libbow.h>

/* Upper bound on a file name length.
   NOTE(review): presumably applies to the names built by
   open_file_for_writing(); whether the terminating NUL is included
   cannot be told from this header -- confirm at the use sites. */
#define MAX_FILENAME_LENGTH 255

/* Sentinel equal to the largest 64-bit unsigned value.
   The ULL suffix gives the constant type unsigned long long.  Without a
   suffix a decimal constant this large fits none of int, long or
   long long, so compilers warn and its type is implementation-specific
   (which is unsafe e.g. when passed through varargs).
   NOTE(review): presumably marks an unused doc_msg_map entry -- confirm. */
#define DOC_MSG_EMPTY 18446744073709551615ULL /* Biggest u64_t value */
/* Sentinel message uid equal to the largest 64-bit unsigned value.
   The ULL suffix gives the constant type unsigned long long.  Without a
   suffix a decimal constant this large fits none of int, long or
   long long, so compilers warn and its type is implementation-specific.
   NOTE(review): presumably returned/stored where no valid message uid
   exists -- confirm at the use sites. */
#define MSGUID_INVALID 18446744073709551615ULL /* Biggest u64_t value */

/* Encodings distinguished for a mail part.
   NOTE(review): the names suggest the MIME Content-Transfer-Encoding
   values 7bit, 8bit, quoted-printable and base64 -- confirm against the
   code that assigns them. */
enum MAIL_ENCODINGS {
  M_ENC_7BIT,
  M_ENC_8BIT,
  M_ENC_QP,
  M_ENC_BASE64,
  M_ENC_UNKNOWN
};

/* Several charsets are handled together (for instance the different
   ISO_8859 variants), because they are all processed similarly. */
enum MAIL_CHARSETS {
  M_CSET_US_ASCII,
  M_CSET_ISO_8859,      /* One value for a whole charset family */
  M_CSET_WINDOWS_125X,  /* Likewise a family -- presumably Windows-125x */
  M_CSET_UNKNOWN
};

/* Content types distinguished for a mail part.
   NOTE(review): which of these are actually analysed is decided in the
   implementation, not in this header. */
enum MAIL_CONTENT_TYPES {
  M_CT_TEXT,
  M_CT_HTML,
  M_CT_MSWORD,
  M_CT_PDF,
  M_CT_BINARY,
  M_CT_MULTIPART,
  M_CT_UNKNOWN
};


/* Table relating Bow documents to dbmail message uids.
   NOTE(review): presumably indexed by Bow document index, with unused
   slots holding DOC_MSG_EMPTY -- confirm in the implementation. */
typedef struct {
  int size;        /* Number of entry slots allocated */
  int num_docs;    /* Number of non-empty mappings */
  u64_t *entries;  /* Array of entries */
} doc_msg_map;

/* Classification of one message (part) along three axes.
   NOTE(review): the fields are plain ints; presumably they carry values
   of MAIL_CONTENT_TYPES, MAIL_ENCODINGS and MAIL_CHARSETS respectively
   -- confirm in check_content_type(). */
typedef struct {
  int content_type;
  int encoding;
  int charset;
} message_data;

/* One (message uid, value) pair.
   NOTE(review): value is presumably the similarity score computed for
   the message identified by uid -- confirm scale and direction. */
typedef struct {
  u64_t uid;
  double value;
} similarity_data;
  
/* Variable-length list of similarity_data records.
   `entry` is a C99 flexible array member (previously the GNU-only
   zero-length array `entry[0]`).  The struct itself has no room for any
   entry; storage must be allocated as
   sizeof(similar_msgs) + n * sizeof(similarity_data).
   NOTE(review): count is presumably the number of valid elements in
   entry -- confirm in get_similar_messages_bruteforce(). */
typedef struct
{
  int count;
  similarity_data entry[];
} similar_msgs;

/* The following is needed to keep track of counts in how many docs a
   particular word appears because the Bow library doesn't provide
   this functionality. */
/* Sentinel equal to the largest 64-bit unsigned value.  The ULL suffix
   gives the constant type unsigned long long; unsuffixed, a decimal
   constant this large fits no standard signed type, so compilers warn
   and its type is implementation-specific.
   NOTE(review): presumably marks a word_map_last_doc slot that has no
   document recorded yet -- confirm. */
#define DOCS_PER_WORD_EMPTY 18446744073709551615ULL /* Biggest u64_t value */
/* Per-word bookkeeping: two arrays plus their size.
   NOTE(review): both arrays are presumably indexed by Bow word index and
   hold `size` elements each -- confirm in docs_per_word_new(). */
typedef struct {
  int size;
  int *word_map_doc_counts;  /* presumably: docs containing the word */
  u64_t *word_map_last_doc;  /* presumably: uid of last doc counted */
} docs_per_word;



/* Run statistics.  Not every counter is necessarily maintained yet;
   see the implementation for which ones are filled in. */
typedef struct {
  /* Mail and part counters */
  int total_mails;
  int total_parts;
  int mails_valid_for_analysis;
  int parts_valid_for_analysis;
  int parts_unknown_encoding;
  int parts_unknown_charset;
  int parts_unknown_contenttype;
  int parts_unsupported_encoding;
  int parts_unsupported_charset;
  int parts_unsupported_contenttype;

  /* Byte and word volumes */
  u64_t total_content_bytes;
  u64_t bytes_valid_for_analysis;
  u64_t total_words;
  int total_different_words;
  int total_different_words_pruned;

  /* Timings; the per-phase fields come in user/elapsed pairs
     (presumably seconds as well -- confirm) */
  float total_time;  /* Seconds, user time */
  float total_elapsed_time;
  float time_building_vocabulary_user;
  float time_building_vocabulary_elapsed;
  float time_pruning_vocabulary_user;
  float time_pruning_vocabulary_elapsed;
  float time_building_wordvectors_user;
  float time_building_wordvectors_elapsed;
  float time_weighting_wordvectors_user;
  float time_weighting_wordvectors_elapsed;
  float time_storing_database_user;
  float time_storing_database_elapsed;
} organize_statistics;

/* Function prototypes */

/* NOTE(review): glibc declares strcasestr(const char *, const char *) in
   <string.h> when _GNU_SOURCE is defined; this non-const prototype may
   clash with it -- confirm how the implementation file is built. */
char *strcasestr(char *haystack, char *needle);

/* Message inspection.  What the return values mean is not visible from
   this header; see the implementation. */
int check_content_type(mime_message_t *msg,
                       message_data *contents,
                       organize_statistics *stats);
u64_t convert_qp_8bit(unsigned char *str);
char *get_text_contents(u64_t msguid,
                        mime_message_t *msgdata,
                        organize_statistics *stats);
void dump_message_data(u64_t msguid, mime_message_t *msgdata, int vlevel);

/* Dump helpers.  The first two take the destination as a FILE *; the
   third takes a log level instead. */
void dump_wordcounts(FILE *f);
void dump_wordvectors(bow_di2wv *di2wv,
                      doc_msg_map *map,
                      int numwords,
                      u64_t userid,
                      FILE *f);
void dump_similaritydata(bow_di2wv *di2wv,
                         doc_msg_map *map,
                         u64_t msguid1,
                         u64_t msguid2,
                         int loglevel);

/* File and mailbox setup.
   NOTE(review): open_file_for_writing() presumably composes the path
   from dirname/type/user/ending and stores it in fullname -- confirm,
   including the buffer size expected for fullname. */
FILE *open_file_for_writing(char *fullname,
                            char *dirname,
                            char *type,
                            char *user,
                            char *ending);
int get_and_init_mailbox(u64_t mbnum,
                         mailbox_t *mailbox,
                         u64_t userid,
                         int verbose);
int init_msgdata(mime_message_t *msgdata, u64_t msgnum);

/* Bookkeeping of per-word document counts while the vocabulary is
   being built.
   NOTE(review): the object returned by docs_per_word_new() is presumably
   released with docs_per_word_free() -- confirm ownership. */
docs_per_word *docs_per_word_new(int capacity);
int docs_per_word_add_ifnotlast(docs_per_word *doccounts,
                                int wordindex,
                                u64_t msguid);
void docs_per_word_free(docs_per_word *doccounts);

/* Mapping between Bow documents and dbmail message uids.
   NOTE(review): the object returned by doc_msg_map_new() is presumably
   released with doc_msg_map_free(); failure values of the lookups are
   not visible here -- confirm in the implementation. */
doc_msg_map *doc_msg_map_new(int capacity);
int docindex_add_mapping(doc_msg_map *map, u64_t msguid, int di);
int docindex_clear_mappings(doc_msg_map *map);
u64_t docindex_get_msguid(doc_msg_map *map, int docindex);
int docindex_get_di(doc_msg_map *map, u64_t msguid);
void doc_msg_map_free(doc_msg_map *map);

/* Similarity queries.
   NOTE(review): who frees the similar_msgs returned by
   get_similar_messages_bruteforce(), and whether `num` caps the number
   of entries, is not visible from this header -- confirm. */
int is_duplicate(u64_t uid1, u64_t uid2);
similar_msgs *get_similar_messages_bruteforce(bow_di2wv *di2wv,
                                              doc_msg_map *map,
                                              u64_t msguid,
                                              int num);
#endif
