/* $Id: organize.c,v 1.9 2002/09/30 11:47:55 arto Exp $
 *
 * Copyright Arto Ters <arto.teras@hip.fi> 2002
 *
 * A program to automatically organize messages based on their
 * contents.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 */

#include "organize.h"
#include "organize-helpers.h"
#include "db.h"
#include "debug.h"
#include "auth.h"
#include "list.h"
#include "dbmailtypes.h"
#include "rfcmsg.h"
#include "dbmsgbuf.h"

/* (More or less) standard C stuff */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/times.h>
#include <dirent.h>
#include <locale.h>
#include <errno.h>
extern int errno;

/* The bag of words library (a patched one...) */
#include <bow/libbow.h>
#include <bow/tfidf.h>

static int verbose = 0;


/* Tokenize MSG with the default bow lexer and record every token in
   the global wordmap, bumping its occurrence count.  Each accepted
   token is also reported to DOC_COUNTS together with MSGUID through
   docs_per_word_add_ifnotlast(), which is expected to raise the
   per-word document count only once per message.

   Returns the number of tokens accepted into the wordmap, or -1 if
   the lexer could not be opened. */
int vocabulary_add(char *msg, docs_per_word *doc_counts, u64_t msguid)
{
  char token[BOW_MAX_WORD_LENGTH];
  int accepted = 0;
  /* The lexer keeps its own copy of MSG in an internal buffer. */
  bow_lex *lexer_state = bow_lexer_simple_open_str(bow_default_lexer, msg);

  if (lexer_state == NULL)
    {
      printf("vocabulary_add: Error initializing lexer\n");
      return -1;
    }

  /* One iteration per lexical token of the message. */
  for (;;)
    {
      int word_index;

      if (!bow_default_lexer->get_word (bow_default_lexer,
                                        lexer_state, token,
                                        BOW_MAX_WORD_LENGTH))
        break;

      /* Registers the token (if new) and increments its global count;
         a negative index means the token was rejected. */
      word_index = bow_word2int_add_occurrence (token);
      if (word_index >= 0)
        {
          docs_per_word_add_ifnotlast(doc_counts, word_index, msguid);
          accepted++;
        }
    }

  bow_default_lexer->close (bow_default_lexer, lexer_state);

  return accepted;
}


/* Reduce the size of the vocabulary by dropping words that occur in
   too few or too many documents.

   PARAMS supplies the thresholds: an absolute document count takes
   precedence over a percentage of TOTAL_DOCS; a zero value disables
   the corresponding limit.  DOC_COUNTS holds, per word index of the
   current global word map, the number of documents containing that
   word.

   Returns 0 on success (including the case where no pruning was
   requested) and -1 if the global word map does not exist.

   WARNING: This totally changes the word/int mapping; any WV's,
   WI2DVF's or BARREL's you build with the old mapping will have bogus
   WI's afterward. */
int vocabulary_prune(organize_parameters *params, docs_per_word *doc_counts, int total_docs)
{
  /* Structure taken from bow_words_remove_occurrences_less_than() */
  bow_int4str *new_map;
  int *new_word_map_counts;
  int new_wordcount;
  int new_wi;
  int wi;
  int max_wi;
  int mindocs, maxdocs;

  if (word_map == NULL)
    {
      printf("vocabulary_prune: Word map empty\n");
      return -1;
    }

  if (params->prune_words_in_less_than_n_docs != 0)
    mindocs = params->prune_words_in_less_than_n_docs;
  else if (params->prune_words_in_less_than_n_percent != 0)
    mindocs = params->prune_words_in_less_than_n_percent * total_docs / 100;
  else
    mindocs = 0;

  if (params->prune_words_in_more_than_n_docs != 0)
    maxdocs = params->prune_words_in_more_than_n_docs;
  else if (params->prune_words_in_more_than_n_percent != 0)
    maxdocs = params->prune_words_in_more_than_n_percent * total_docs / 100;
  else
    maxdocs = INT_MAX;

  if ((mindocs == 0) && (maxdocs == INT_MAX))
    {
      /* No pruning, can return immediately without error. */
      return 0;
    }

  max_wi = word_map->str_array_length;
  new_map = bow_int4str_new (0);

  /* The new map can hold every old word plus the unknown-word
     placeholder, so reserve one extra slot.  Counts start at zero so
     that a placeholder absent from the old map gets a defined count. */
  new_word_map_counts = bow_malloc((max_wi + 1) * sizeof (int));
  for (wi = 0; wi <= max_wi; wi++)
    new_word_map_counts[wi] = 0;

  if (bow_word2int_use_unknown_word)
    bow_str2int (new_map, BOW_UNKNOWN_WORD);
  for (wi = 0; wi < max_wi; wi++)
    {
      /* If the counts pass the pruning criteria, add to the new map */
      if ((doc_counts->word_map_doc_counts[wi] >= mindocs) &&
          (doc_counts->word_map_doc_counts[wi] <= maxdocs))
        {
          /* Store the count at the index the new map actually assigned.
             A private running counter drifts out of step with the map
             as soon as the unknown-word placeholder occupies a slot. */
          new_wi = bow_str2int (new_map, bow_int2str (word_map, wi));
          if ((new_wi >= 0) && (new_wi <= max_wi))
            new_word_map_counts[new_wi] = word_map_counts[wi];
        }
    }
  /* Read the final size before the map is handed over. */
  new_wordcount = new_map->str_array_length;

  /* Replace the old map with the new map. */
  bow_words_set_map (new_map, 1);

  /* Setting the map as in the previous function resets the word
     counts so we have to now use the new array we have built. We can
     also reduce the amount of memory it takes now that we know the
     final number of words.  A zero-byte realloc is skipped because its
     result is not portable. */
  if (new_wordcount > 0)
    new_word_map_counts = bow_realloc(new_word_map_counts, new_wordcount*sizeof(int));
  bow_free(word_map_counts);
  word_map_counts = new_word_map_counts;
  word_map_counts_size = new_wordcount;

  /* Alternatives in libbow that were considered but are not used:
       bow_words_remove_occurrences_less_than(0)   in int4word.c
       bow_wi2dvf_hide_words_by_occur_count(...)   in wi2dvf.c
       bow_wi2dvf_hide_words_by_doc_count(...)
       bow_words_keep_top_by_infogain(...)
         Needs the number of classes, should check more closely what
         this actually does before using it */

  return 0;
}


/* qsort() comparator that orders two ints ascending.

   Returns a negative value, zero or a positive value when *INT1 is
   less than, equal to or greater than *INT2.  The result is built from
   two comparisons instead of a subtraction, because subtracting ints
   of opposite sign can overflow and produce the wrong ordering. */
static int
compare_ints (const void *int1, const void *int2)
{
  const int lhs = *(const int *)int1;
  const int rhs = *(const int *)int2;

  return (lhs > rhs) - (lhs < rhs);
}

/* There are actually two types of vectors, document vectors (one per
   each word) and word vectors (one per each document).
   This function computes both for one message and stores the document
   vectors in BARREL and the word vector in the store behind DI2PTR.

   BARREL  receives a new cdoc entry for the message and the per-word
           document vector updates.
   DI2PTR  points to the word vector store; the store may be
           reallocated, in which case *DI2PTR is updated.
   MSG     is the text of the message.
   MSGUID  identifies the message; it is recorded as the cdoc
           "filename" and mapped to the bow document index in MAP.

   Only words already present in the global wordmap are used; the
   wordmap itself is not modified.

   Returns 0 on success and -1 on error (mapping failure, lexer
   failure, no known words in the message, or out of memory).  Note
   that the cdoc entry stays in BARREL even when -1 is returned. */
int calculate_word_vectors(bow_barrel *barrel, bow_di2wv **di2ptr, char *msg, u64_t msguid, doc_msg_map *map)
{
  char word[BOW_MAX_WORD_LENGTH];
  bow_lex *lex;
  int i, j, wi, result;
  int *wi_array;		/* all words in document, including repeats */
  int *grown_wi_array;
  int wi_array_length = 0;
  int wi_array_size = 1024;
  int num_unique_wi;		/* the number of different words in document */
  int prev_wi;			/* used when counting number of diff words */
  bow_wv *wv = NULL;		/* the word vector to be added to the docstats */
  int old_size, new_size;
  char msguid_string[21];       /* 20 digits of a 64bit integer plus NUL */
  int docindex;
  int class;
  char classname[] = "Bogus class";
  bow_cdoc cdoc;
  bow_cdoc *cdocp;
  int numwords;

  bow_di2wv *docstats = *di2ptr;
  bow_di2wv *grown_docstats;

  /* We need to fill something bogus for the bow class
     name, otherwise some functions will fail. */
  if (!(barrel->classnames))
    barrel->classnames = bow_int4str_new (0);
  class = bow_str2int (barrel->classnames, classname);

  cdoc.type = bow_doc_train;
  cdoc.class = class;
  /* Set to one so bow_infogain_per_wi_new() works correctly
     by default. */
  cdoc.prior = 1.0f;
  assert (cdoc.class >= 0);
  /* The uid is unsigned, so format it as such; snprintf bounds the
     write to the buffer. */
  snprintf(msguid_string, sizeof msguid_string, "%llu",
           (unsigned long long)msguid);
  cdoc.filename = strdup(msguid_string);
  assert (cdoc.filename);
  cdoc.class_probs = NULL;
  /* Add the CDOC to CDOCS, and determine the "index" of this
     document. */
  docindex = bow_array_append (barrel->cdocs, &cdoc);

  result = docindex_add_mapping(map, msguid, docindex);
  if (result == -1)
    {
      printf("calculate_word_vectors: Error adding docindex / msguid mapping at message %llu.\n",
             (unsigned long long)msguid);
      return -1;
    }

  /* Initialize the lexer (the contents of msg is copied in the
     internal buffer of the lexer) */
  lex = bow_lexer_simple_open_str(bow_default_lexer, msg);
  if (lex == NULL)
    {
      printf("calculate_word_vectors: Error initializing lexer\n");
      return -1;
    }

  /* Allocated only now so that the early returns above have nothing
     to release. */
  wi_array = bow_malloc (wi_array_size * sizeof (int));

  /* Loop once for each lexical token in this document. */
  numwords = 0;
  while (bow_default_lexer->get_word (bow_default_lexer,
                                      lex, word,
                                      BOW_MAX_WORD_LENGTH))
    {
      numwords++;
      /* Get the word index from the global wordmap. */
      wi = bow_word2int_no_add(word);
      if (wi < 0)
        continue;
      bow_wi2dvf_add_wi_di_count_weight (&(barrel->wi2dvf), wi, docindex, 1, 1);

      if (wi_array_length == wi_array_size-1)
        {
          /* wi_array needs to grow in order to hold more word indices.
             Keep the old pointer until the new one is known good. */
          grown_wi_array = bow_realloc (wi_array, wi_array_size * 2 * sizeof (int));
          if (grown_wi_array == NULL)
            {
              printf("calculate_word_vectors: Out of memory when growing word index array\n");
              bow_default_lexer->close (bow_default_lexer, lex);
              bow_free (wi_array);
              return -1;
            }
          wi_array = grown_wi_array;
          wi_array_size *= 2;
        }
      wi_array[wi_array_length++] = wi;
    }
  bow_default_lexer->close (bow_default_lexer, lex);

  /* Fill in the new CDOC's idea of WORD_COUNT (all tokens seen,
     including those missing from the wordmap). */
  cdocp = bow_array_entry_at_index (barrel->cdocs, docindex);
  cdocp->word_count = numwords;

  /* The word vector code below mostly copied from bow_wv_new_from_lex
     (we cannot use it directly because it would add word occurrences
     to the global wordmap). */

  /* If we didn't get any words from the file, we cannot construct the
     word vector. Return -1. */
  if (wi_array_length == 0)
    {
      printf("calculate_word_vectors: No valid words found in message %llu.\n",
             (unsigned long long)msguid);
      bow_free (wi_array);
      return -1;
    }

  /* Sort the array of word indices. */
  qsort (wi_array, wi_array_length, sizeof (int), compare_ints);

  /* Find out how many of them are unique, (i.e. determine the correct
     value for NUM_UNIQUE_WI) so that we know how much space to allocate
     for the wv->entries. */
  for (num_unique_wi = 0, prev_wi = -1, i = 0; i < wi_array_length; i++)
    {
      if (wi_array[i] != prev_wi)
        {
          num_unique_wi++;
          prev_wi = wi_array[i];
        }
    }

  /* Allocate memory for the word vector we're creating. */
  wv = bow_malloc (sizeof (bow_wv) + sizeof (bow_we) * num_unique_wi);

  /* Fill in the word vector entries from WI_ARRAY: one entry per
     distinct index, with the run length as its count. */
  wv->num_entries = num_unique_wi;
  for (i = 0, j = -1, prev_wi = -1; i < wi_array_length; i++)
    {
      if (wi_array[i] != prev_wi)
        {
          j++;
          wv->entry[j].wi = wi_array[i];
          prev_wi = wi_array[i];
          wv->entry[j].count = 1;
        }
      else
        {
          (wv->entry[j].count)++;
        }
    }
  assert (j+1 == num_unique_wi);
  bow_free (wi_array);

  /* Initialize to a standard value. */
  wv->normalizer = 1;

  /* Add the word vector to the docstats storage. */
  if (docindex >= docstats->size)
    {
      /* Document index bigger than currently space allocated in
         docstats, need to grow the array.  The old store is left
         untouched (size included) until the reallocation succeeded. */
      old_size = docstats->size;
      new_size = MAX (docindex+1, old_size * 2);
      grown_docstats = bow_realloc(docstats, sizeof(bow_di2wv) +
                                   sizeof (bow_wv*) * new_size);
      if (grown_docstats == NULL)
        {
          printf("calculate_word_vectors: Out of memory when growing docstats\n");
          bow_wv_free(wv);
          return -1;
        }
      docstats = grown_docstats;
      docstats->size = new_size;
      *di2ptr = docstats;

      /* Initialize the new part of the realloc'ed space. */
      for (i=old_size; i < docstats->size; i++)
        docstats->entry[i] = NULL;
    }
  if (docstats->entry[docindex] != NULL)
    {
      printf("calculate_word_vectors: Warning, replacing an existing word vector at %d.\n", docindex);
      bow_wv_free(docstats->entry[docindex]);
      docstats->entry[docindex] = wv;
    }
  else
    {
      docstats->entry[docindex] = wv;
      docstats->num_docs++;
    }

  return 0;
}


/* Copy the inverse document frequency of every word into the word
   vectors held in DOCSTATS.  The idf values are read from the document
   vectors of BARREL, so the barrel weights must already have been
   computed (bow has functions for that, which is why the calculation
   is not repeated here).  Empty slots in DOCSTATS are skipped.

   Returns 0 on success, -1 as soon as a word has no document vector
   in the barrel. */
int docstats_set_idf_weights_by_barrel(bow_di2wv *docstats, bow_barrel *barrel)
{
  int doc;

  for (doc = 0; doc < docstats->size; doc++)
    {
      bow_wv *vector = docstats->entry[doc];
      int slot;

      if (vector == NULL)
        continue;

      /* Visit every word entry of this document's vector. */
      for (slot = 0; slot < vector->num_entries; slot++)
        {
          int word_index = vector->entry[slot].wi;
          /* Document vector belonging to this word index */
          bow_dv *word_docs = bow_wi2dvf_dv (barrel->wi2dvf, word_index);

          if (word_docs == NULL)
            {
              printf("docstats_set_idf_weights_by_barrel: Couldn't get the vector\n");
              printf("for word %d, aborting.\n", word_index);
              return -1;
            }
          vector->entry[slot].weight = word_docs->idf;
        }
    }

  return 0;
}

/* Run bow_wv_normalize_weights_by_vector_length() on every word
   vector stored in DOCSTATS, skipping empty slots.  Always returns 0. */
int docstats_normalize_weights_by_length(bow_di2wv *docstats)
{
  int doc = 0;

  while (doc < docstats->size)
    {
      bow_wv *vector = docstats->entry[doc++];

      if (vector == NULL)
        continue;
      bow_wv_normalize_weights_by_vector_length(vector);
    }

  return 0;
}

/* Write the command line synopsis, the syntax of the pruning option
   and a worked example to standard output. */
void print_usage()
{
  /* Adjacent literals are joined by the compiler, so one call emits
     the whole help text. */
  printf("Usage: dbmail-organize [-u user] [-d directory] [-P amount] [-v]\n"
         "In pruning (excluding words, option -P), the syntax is as follows:\n"
         "  * First letter l for \"less than\" or m for \"more than\".\n"
         "  * Number of messages, optionally followed by %% to use percentages.\n\n"
         "Example: dbmail-organize -u joe -d debugdata -P m5%% -P l2 -v\n"
         "  * Process the emails of user joe, exclude words that are present in more\n"
         "    than 5 percent of all messages or in less than 2 messages, write some\n"
         "    debug files to directory debugdata, be verbose.\n");
}

/* User CPU seconds consumed between two times() samples. */
static float organize_user_seconds(const struct tms *from, const struct tms *to)
{
  return (float)(to->tms_utime - from->tms_utime) / sysconf(_SC_CLK_TCK);
}

/* Wall-clock seconds between two gettimeofday() samples.  Seconds and
   microseconds are subtracted separately: converting an absolute
   microsecond timestamp to float first would round it to a granularity
   of a minute or more and make the difference meaningless. */
static float organize_elapsed_seconds(const struct timeval *from, const struct timeval *to)
{
  double whole = (double)(to->tv_sec - from->tv_sec);
  double fraction = (double)(to->tv_usec - from->tv_usec) / 1000000.0;

  return (float)(whole + fraction);
}

/* Parse the positive decimal number at the start of TEXT into *VALUE
   and leave *ENDPTR at the first unparsed character.  Returns 0 on
   success, -1 if no non-zero number could be read and -2 if the number
   does not fit in an int. */
static int organize_parse_count(const char *text, char **endptr, int *value)
{
  unsigned long parsed = strtoul(text, endptr, 10);

  if (parsed == 0)
    return -1;
  if ((parsed == ULONG_MAX) || (parsed > INT_MAX))
    return -2;
  *value = (int)parsed;
  return 0;
}

/* Entry point of dbmail-organize.

   Parses the options (-d directory, -I count, -m method, -P pruning,
   -u user, -v), connects to the databases, and then:
     1. builds the vocabulary from all messages of the user,
     2. prunes it according to the -P options,
     3. computes word vectors for all messages against the pruned
        vocabulary,
     4. weights and normalizes the vectors,
     5. stores vocabulary, barrel and word vectors in the database.
   With -v, statistics and timings are printed at the end.

   Returns 0 on success and -1 on any error.  Every exit after the
   database connection has been opened goes through the cleanup label
   so that connections and bow structures are released. */
int main(int argc, char *argv[])
{
  char *username = NULL;
  char *dirname = NULL;
  DIR *d_tmp; /* Just for temporary checking if the directory exists */
  u64_t userid;
  mailbox_t mailbox;        /* currently selected mailbox */
  u64_t *mb_children = NULL;
  unsigned mb_nchildren;
  mime_message_t msgdata;
  char *contents;

  struct tms start, end, totalstart, totalend;
  struct timeval elapsedstart, elapsedend, totalelapsedstart, totalelapsedend;
  struct timezone z; /* Not actually used but obligatory as argument */

  /* Keep track of doc counts during vocabulary build, not stored */
  docs_per_word *vocab_docs_per_word = NULL;

  /* The mapping between Bow documents and message uids, stored in the database */
  doc_msg_map *map = NULL;

  /* Document and word statistics, stored in the database */
  bow_barrel *wordstats = NULL;
  bow_di2wv *docstats = NULL;

  organize_parameters params;
  organize_statistics stats = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  FILE *vocabulary_file = NULL;
  FILE *wordvectors_file = NULL;

  int i,j;
  int opt;
  int result;
  int count;
  char *endptr;
  char *tempstr;

  /* Bookkeeping for the shared cleanup path */
  int db_connected = 0;
  int auth_connected = 0;
  int exit_status = -1;

  int dump_wordvectordata = 0; /* If 1, dumps word vectors in ascii mode in a file. */
  int dump_vocabulary = 1;     /* If 1, dumps vocabulary in ascii mode in a file. */

  /* Default values for the parameters */
  params.prune_words_in_more_than_n_docs = 0;
  params.prune_words_in_more_than_n_percent = 0;
  params.prune_words_in_less_than_n_docs = 0;
  params.prune_words_in_less_than_n_percent = 0;
  params.prune_by_infogain = 0;
  params.sorting_method = SORTING_BRUTEFORCE;
  params.smoothing_method = bow_smoothing_wittenbell;
  params.weighting_method = WEIGHTING_IDF;
  params.max_word_vector_length = 0;
  params.verbosity = 0;

  times(&totalstart);
  gettimeofday(&totalelapsedstart, &z);

  /* No resources are held while parsing options, so errors here can
     return directly. */
  while (1)
    {
      opt = getopt (argc, argv, "d:I:m:P:u:v");
      if (opt == -1)
        break; /* No more options */

      switch (opt)
        {
        case 'd':
          /* Directory for logs and statistics; created if missing */
          dirname = optarg;
          d_tmp = opendir(dirname);
          if (d_tmp == NULL)
            {
              if (errno == ENOENT) /* Directory doesn't exist */
                {
                  if (mkdir(dirname, S_IRWXU) != 0)
                    {
                      printf("organize: Couldn't create directory %s.\n", dirname);
                      return -1;
                    }
                }
              else
                {
                  printf("organize: Couldn't access directory %s.\n", dirname);
                  return -1;
                }
            }
          else
            closedir(d_tmp);
          break;
        case 'I':
          /* Prune by infogain */
          result = organize_parse_count(optarg, &endptr, &count);
          if (result == -1)
            {
              printf("organize: Couldn't read the infogain pruning count.\n");
              return -1;
            }
          if (result == -2)
            {
              printf("organize: Infogain pruning count value too big.\n");
              return -1;
            }
          params.prune_by_infogain = count;
          break;
        case 'm':
          /* Method of indexing. Currently there are no choices. */
          printf ("organize: sorry, no alternative methods implemented\n");
          break;
        case 'P':
          /* Pruning the vocabulary: l<n>[%] or m<n>[%] */
          if ((optarg[0] != 'l') && (optarg[0] != 'm'))
            {
              printf("organize: Please start the pruning parameter by l (less) or m (more).\n");
              return -1;
            }
          result = organize_parse_count(&optarg[1], &endptr, &count);
          if (result == -1)
            {
              printf("organize: Couldn't read the vocabulary pruning parameters\n");
              return -1;
            }
          if (result == -2)
            {
              printf("organize: Pruning parameter value too big.\n");
              return -1;
            }
          /* An absolute count and a percentage are mutually exclusive,
             the one given last wins. */
          if (optarg[0] == 'l')
            {
              if (*endptr == '%')
                {
                  params.prune_words_in_less_than_n_percent = count;
                  params.prune_words_in_less_than_n_docs = 0;
                }
              else
                {
                  params.prune_words_in_less_than_n_docs = count;
                  params.prune_words_in_less_than_n_percent = 0;
                }
            }
          else
            {
              if (*endptr == '%')
                {
                  params.prune_words_in_more_than_n_percent = count;
                  params.prune_words_in_more_than_n_docs = 0;
                }
              else
                {
                  params.prune_words_in_more_than_n_docs = count;
                  params.prune_words_in_more_than_n_percent = 0;
                }
            }
          break;
        case 'u':
          /* User whose mail to index */
          username = optarg;
          break;
        case 'v':
          /* Verbose */
          verbose++;
          break;
        default:
          /* Unknown option or missing argument: do not start a run
             with a half-understood command line. */
          print_usage();
          return -1;
        }
    }

  if (! username)
    {
      /* We should do all users in this case */
      printf("Currently the username option is obligatory.\n");
      print_usage();
      return -1;
    }

  /* Set locales according to environment variables. This affects for
     instance what characters are classified as alphanumeric. */
  tempstr = setlocale(LC_ALL, "");
  if (verbose)
    printf("Current locale set is %s\n", tempstr != NULL ? tempstr : "(null)");

  if (verbose)
    printf ("Opening connection to the database...\n");
  if (db_connect()==-1)
    {
      printf ("Could not connect to database (check log).\n");
      return -1;
    }
  db_connected = 1;

  if (verbose)
    printf ("Opening connection to the user database...\n");
  if (auth_connect()==-1)
    {
      printf ("Could not connect to user database (check log).\n");
      goto cleanup;
    }
  auth_connected = 1;

  if (verbose)
    printf ("Ok. Connected\n");

  userid = auth_user_exists(username);
  if (userid == 0)
    {
      printf("User [%s] does not exist.\n", username);
      goto cleanup;
    }
  if (userid == (u64_t)-1)
    {
      printf("Error verifying existence of user [%s]. Please check the log.\n", username);
      goto cleanup;
    }

  if (verbose)
    printf("Organizing mails of user %s, id %ld\n", username, (long int)userid);


  /* Get a list of all mailboxes belonging to the user.
     NOTE(review): mb_children is never released; confirm which
     allocator db_findmailbox_by_regex() uses before adding a free. */
  if (db_findmailbox_by_regex(userid, ".*", &mb_children, &mb_nchildren, 0))
    {
      printf("Couldn't get the mailbox list for user %s.\n", username);
      goto cleanup;
    }

  /* Initialize the msguid / bow document index mapping */
  map = doc_msg_map_new(0);
  if (map == NULL)
    {
      printf("organize: Out of memory.\n");
      goto cleanup;
    }
  vocab_docs_per_word = docs_per_word_new(0);
  if (vocab_docs_per_word == NULL)
    {
      printf("organize: Out of memory.\n");
      goto cleanup;
    }

  /* Do not use the bow stoplist because the implementation is broken
     and may segfault if there are special characters. */
  bow_lexer_stoplist_func = NULL;

  /* Process all the mailboxes and build the vocabulary. */
  times(&start);
  gettimeofday(&elapsedstart, &z);
  for (i=0; i<mb_nchildren; i++)
    {
      if (get_and_init_mailbox(mb_children[i], &mailbox, userid, verbose) != 0)
        goto cleanup;

      /* The mailbox.exists field contains the number of existing
         messages (not deleted ones) and the mailbox.seq_list array
         the corresponding msgid:s. */
      stats.total_mails += mailbox.exists;

      /* Go through all the messages and add words to the vocabulary */
      for (j=0; j<mailbox.exists; j++)
        {
          if (init_msgdata(&msgdata, mailbox.seq_list[j]) != 0)
            goto cleanup;

          if (verbose > 1)
            {
              printf("Dumping message %lld...\n", mailbox.seq_list[j]);
              dump_message_data(mailbox.seq_list[j], &msgdata, verbose-2);
            }

          stats.total_content_bytes += msgdata.bodysize;

          contents = get_text_contents(mailbox.seq_list[j], &msgdata, &stats);
          if (contents != NULL)
            {
              result = vocabulary_add(contents, vocab_docs_per_word, mailbox.seq_list[j]);
              if (result == -1)
                {
                  printf("organize: Error adding vocabulary for message %lld\n",
                         mailbox.seq_list[j]);
                  my_free(contents);
                  db_free_msg(&msgdata);
                  goto cleanup;
                }
              stats.mails_valid_for_analysis++;
              stats.total_words += result;
              my_free(contents);
            }
          db_free_msg(&msgdata);
        } /* All files in the mailbox */
    } /* All mailboxes of the user */
  times(&end);
  gettimeofday(&elapsedend, &z);
  stats.time_building_vocabulary_user = organize_user_seconds(&start, &end);
  stats.time_building_vocabulary_elapsed = organize_elapsed_seconds(&elapsedstart, &elapsedend);


  /* Now the vocabulary is complete, check the number of different
     words and (optionally) dump the vocabulary in a file before
     pruning. */
  stats.total_different_words = bow_num_words();
  if (dirname != NULL)
    {
      if ((vocabulary_file = open_file_for_writing(NULL, dirname, "vocabulary_raw",
                                                   username, "txt")) != NULL)
        {
          dump_wordcounts(vocabulary_file);
          fclose(vocabulary_file);
        }
    }

  times(&start);
  gettimeofday(&elapsedstart, &z);
  result = vocabulary_prune(&params, vocab_docs_per_word, stats.mails_valid_for_analysis);
  times(&end);
  gettimeofday(&elapsedend, &z);
  if (result == -1)
    {
      /* vocabulary_prune() has already reported that there is no word
         map; without a vocabulary there is nothing left to do. */
      goto cleanup;
    }
  stats.time_pruning_vocabulary_user = organize_user_seconds(&start, &end);
  stats.time_pruning_vocabulary_elapsed = organize_elapsed_seconds(&elapsedstart, &elapsedend);

  stats.total_different_words_pruned = bow_num_words();

  /* We don't need these temporary doc counts any more */
  docs_per_word_free(vocab_docs_per_word);
  vocab_docs_per_word = NULL;

  if (dump_vocabulary)
    {
      if (dirname != NULL)
        {
          if ((vocabulary_file = open_file_for_writing(NULL, dirname, "vocabulary_pruned",
                                                       username, "txt")) != NULL)
            {
              dump_wordcounts(vocabulary_file);
              fclose(vocabulary_file);
            }
        }
    }

  /* More initializations */
  docstats = bow_di2wv_new(0);
  wordstats = bow_barrel_new (0, 0, sizeof (bow_cdoc), 0);
  if (params.weighting_method == WEIGHTING_IDF)
    wordstats->method = &bow_method_tfidf;
  else
    {
      printf("dbmail-organize: Unknown weighting method, aborting\n");
      goto cleanup;
    }

  /* Go through all the messages again, now calculating the word
     vectors against the final vocabulary. */

  times(&start);
  gettimeofday(&elapsedstart, &z);
  for (i=0; i<mb_nchildren; i++)
    {
      if (get_and_init_mailbox(mb_children[i], &mailbox, userid, verbose) != 0)
        goto cleanup;
      for (j=0; j<mailbox.exists; j++)
        {
          if (init_msgdata(&msgdata, mailbox.seq_list[j]) != 0)
            goto cleanup;

          /* The stats parameter is NULL for not to modify message
             counts during this second pass. */
          contents = get_text_contents(mailbox.seq_list[j], &msgdata, NULL);
          if (contents != NULL)
            {
              result = calculate_word_vectors(wordstats, &docstats, contents, mailbox.seq_list[j], map);
              if (result == -1)
                {
                  /* Deliberately not fatal: a message without usable
                     words must not stop the whole run. */
                  printf("organize: Warning: error calculating word vector for message %lld\n",
                         mailbox.seq_list[j]);
                }
              my_free(contents);
            }
          db_free_msg(&msgdata);
        } /* All files in the mailbox */
    } /* All mailboxes of the user */
  times(&end);
  gettimeofday(&elapsedend, &z);
  stats.time_building_wordvectors_user = organize_user_seconds(&start, &end);
  stats.time_building_wordvectors_elapsed = organize_elapsed_seconds(&elapsedstart, &elapsedend);


  /* Set weights and normalize */
  times(&start);
  gettimeofday(&elapsedstart, &z);
  bow_barrel_set_weights(wordstats);
  bow_barrel_normalize_weights(wordstats);

  if (params.weighting_method == WEIGHTING_IDF)
    {
      /* A failure leaves the vectors only partly weighted; do not
         store them in that state. */
      if (docstats_set_idf_weights_by_barrel(docstats, wordstats) == -1)
        goto cleanup;
    }

  docstats_normalize_weights_by_length(docstats);
  times(&end);
  gettimeofday(&elapsedend, &z);
  stats.time_weighting_wordvectors_user = organize_user_seconds(&start, &end);
  stats.time_weighting_wordvectors_elapsed = organize_elapsed_seconds(&elapsedstart, &elapsedend);


  /* Store the vocabulary and word vectors in the database */
  times(&start);
  gettimeofday(&elapsedstart, &z);
  db_insert_bow_vocabulary(word_map, word_map_counts, userid);
  db_insert_bow_barrel(wordstats, userid);
  db_insert_bow_di2wv(docstats, map, userid);
  times(&end);
  gettimeofday(&elapsedend, &z);
  stats.time_storing_database_user = organize_user_seconds(&start, &end);
  stats.time_storing_database_elapsed = organize_elapsed_seconds(&elapsedstart, &elapsedend);

  /* This is for debugging or if one wants to analyze the wordvectors
     using an external program.  */
  if (dump_wordvectordata)
    {
      if ((wordvectors_file = open_file_for_writing(NULL, dirname, "wordvectors",
                                                    username, "txt")) != NULL)
        {
          dump_wordvectors(docstats, map, bow_num_words(), userid, wordvectors_file);
          fclose(wordvectors_file);
        }
      else
        {
          printf("Couldn't open wordvectors file\n");
          goto cleanup;
        }
    }

  exit_status = 0;

 cleanup:
  /* Release whatever has been acquired so far; shared by the success
     path and all error paths after the database connection. */
  if (vocab_docs_per_word != NULL)
    docs_per_word_free(vocab_docs_per_word);
  if (wordstats != NULL)
    bow_barrel_free(wordstats);
  if (docstats != NULL)
    bow_di2wv_free(docstats);
  if (map != NULL)
    doc_msg_map_free(map);

  if (db_connected)
    db_disconnect();
  if (auth_connected)
    auth_disconnect();

  if (exit_status != 0)
    return exit_status;

  times(&totalend);
  gettimeofday(&totalelapsedend, &z);
  stats.total_time = organize_user_seconds(&totalstart, &totalend);
  stats.total_elapsed_time = organize_elapsed_seconds(&totalelapsedstart, &totalelapsedend);

  if (verbose)
    {
      printf("\n---------- Some final statistics ----------\n");
      printf("Total number of mails: %d\n", stats.total_mails);
      printf("Total number of parts: %d\n", stats.total_parts);
      printf("Parts with unknown encoding: %d\n", stats.parts_unknown_encoding);
      printf("Parts with known but unsupported encoding: %d\n", stats.parts_unsupported_encoding);
      printf("Parts with unknown content type: %d\n", stats.parts_unknown_contenttype);
      printf("Parts with known but unsupported content type: %d\n", stats.parts_unsupported_contenttype);
      printf("Parts with unknown charset: %d\n", stats.parts_unknown_charset);
      printf("Parts with known but unsupported charset: %d\n", stats.parts_unsupported_charset);
      printf("Parts valid for processing: %d\n", stats.parts_valid_for_analysis);
      printf("\n");
      printf("Total bytes of contents (headers excluded): %lld\n", stats.total_content_bytes);
      printf("Total bytes valid for processing: %lld\n", stats.bytes_valid_for_analysis);
      printf("Total mails valid for processing: %d\n", stats.mails_valid_for_analysis);
      printf("Total number of words: %lld\n", stats.total_words);
      printf("Total number of different words: %d\n", stats.total_different_words);
      printf("Total number of different words after pruning: %d\n", stats.total_different_words_pruned);
      printf("\n");
      printf("Time building the vocabulary (user/elapsed): %.3f / %.3f\n",
             stats.time_building_vocabulary_user, stats.time_building_vocabulary_elapsed);
      printf("Time pruning the vocabulary (user/elapsed): %.3f / %.3f\n",
             stats.time_pruning_vocabulary_user, stats.time_pruning_vocabulary_elapsed);
      printf("Time building the word vectors (user/elapsed): %.3f / %.3f\n",
             stats.time_building_wordvectors_user, stats.time_building_wordvectors_elapsed);
      printf("Time weighting and normalizing the word vectors (user/elapsed): %.3f / %.3f\n",
             stats.time_weighting_wordvectors_user, stats.time_weighting_wordvectors_elapsed);
      printf("Time storing the information in the database (user/elapsed): %.3f / %.3f\n",
             stats.time_storing_database_user, stats.time_storing_database_elapsed);
      printf("Total user time (seconds): %.3f\n", stats.total_time);
      printf("Total elapsed time (seconds): %.3f\n", stats.total_elapsed_time);
    }

  return 0;
}
