/* $Id: organize-helpers.c,v 1.9 2002/09/30 11:49:09 arto Exp $
 *
 * Copyright Arto Ters <arto.teras@hip.fi> 2002
 *
 * Various helper functions for the dbmail-organize program.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 */

#include "organize-helpers.h"
#include "db.h"
#include "debug.h"
#include "auth.h"
#include "list.h"
#include "dbmailtypes.h"
#include "rfcmsg.h"
#include "dbmsgbuf.h"

/* (More or less) standard C stuff */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

/* The bag of words library (a patched one...) */
#include <bow/libbow.h>

/* Case-insensitive substring search, similar to strstr. Returns a
   pointer to the first position in haystack where needle occurs when
   case is ignored, or NULL if there is no such position. An empty
   needle matches at the start of any haystack, including an empty
   one, as with strstr. Both arguments must be valid null-terminated
   strings. Code adapted from
   http://www2.ics.hawaii.edu/~esb/2002spring.ics451/strcasestr.html */
char * strcasestr(char* haystack, char* needle) {
  size_t i;
  size_t nlength = strlen (needle);
  size_t hlength = strlen (haystack);

  /* An empty needle is found right at the beginning. */
  if (nlength == 0) return haystack;
  if (nlength > hlength) return NULL;
  /* 0 < nlength <= hlength, so the unsigned subtraction cannot wrap */
  for (i = 0; i <= (hlength - nlength); i++) {
    if (strncasecmp (haystack + i, needle, nlength) == 0) {
      return haystack + i;
    }
  }
  /* substring not found */
  return NULL;
}


/* Extract the content type, charset and transfer encoding of a message
   or a part of a message and store them in contents_data.

   msg            message (part) to examine; the RFC header list is
                  used if it has any entries, otherwise the MIME
                  header list
   contents_data  receives content_type, charset and encoding; all
                  three fields are always written
   stats          statistics counters to update, may be NULL

   The charset is only looked up for text/plain and text/html. For all
   other content types it is stored as M_CSET_UNKNOWN and does not
   influence the return value, so for instance a PDF part is reported
   as unsuitable (1) instead of depending on whatever happened to be
   in contents_data->charset before the call.

   Return 0 if the contents is ok for processing, 1 if it was detected
   to be unsuitable (binary etc.) and -1 if the contents is unknown. */
int check_content_type(mime_message_t *msg, message_data *contents_data, 
		       organize_statistics *stats)
{
  int check_charset = 0;
  int charset_unknown;
  int unsupported_type;
  struct mime_record *mr = NULL;

  /* Defined default, overwritten below whenever a charset applies. */
  contents_data->charset = M_CSET_UNKNOWN;

  if (list_totalnodes(&(msg->rfcheader)) > 0)
    mime_findfield("content-type", &(msg->rfcheader), &mr);
  else if (list_totalnodes(&(msg->mimeheader)) > 0)
    mime_findfield("content-type", &(msg->mimeheader), &mr);
  
  if (mr != NULL)
    {    
      if (strncasecmp(mr->value, "text/plain", strlen("text/plain")) == 0)
        {
          contents_data->content_type = M_CT_TEXT;
          check_charset = 1;
        }
      else if (strncasecmp(mr->value, "text/html", strlen("text/html")) == 0)
        {
          contents_data->content_type = M_CT_HTML;
          check_charset = 1;
        }
      else if (strncasecmp(mr->value, "application/msword", strlen("application/msword")) == 0)
        contents_data->content_type = M_CT_MSWORD;
      else if (strncasecmp(mr->value, "application/pdf", strlen("application/pdf")) == 0)
        contents_data->content_type = M_CT_PDF;
      else if (strncasecmp(mr->value, "application/x-pdf", strlen("application/x-pdf")) == 0)
        contents_data->content_type = M_CT_PDF;
      else if (strncasecmp(mr->value, "application/octet-stream", strlen("application/octet-stream")) == 0)
        contents_data->content_type = M_CT_BINARY;
      else if (strncasecmp(mr->value, "multipart", strlen("multipart")) == 0)
        contents_data->content_type = M_CT_MULTIPART;
      else
        contents_data->content_type = M_CT_UNKNOWN;
      
      if (check_charset)
        {
          /* The charset is in the same field as the content-type, so
             we can use the information retrieved already. The charset
             value may be enclosed in double quotes. */
          if (strcasestr(mr->value, "charset=us-ascii") != NULL)
            contents_data->charset = M_CSET_US_ASCII;
          else if (strcasestr(mr->value, "charset=\"us-ascii") != NULL)
            contents_data->charset = M_CSET_US_ASCII;
          else if (strcasestr(mr->value, "charset=iso-8859") != NULL)
            contents_data->charset = M_CSET_ISO_8859;
          else if (strcasestr(mr->value, "charset=\"iso-8859") != NULL)
            contents_data->charset = M_CSET_ISO_8859;
          else if (strcasestr(mr->value, "charset=windows-125") != NULL)
            contents_data->charset = M_CSET_WINDOWS_125X; 
          else if (strcasestr(mr->value, "charset=\"windows-125") != NULL)
            contents_data->charset = M_CSET_WINDOWS_125X;
          else
            contents_data->charset = M_CSET_UNKNOWN;
        }
    }
  else
    {
      /* Some mailers seem to leave the content type field out
         altogether if the messages are standard text and US_ASCII.
         So technically the content type is unknown but assuming it is
         text gives better results in sorting.*/
      contents_data->content_type = M_CT_TEXT;
      contents_data->charset = M_CSET_US_ASCII;
    }

  /* An unknown charset only counts when a charset was looked up. */
  charset_unknown = check_charset 
    && (contents_data->charset == M_CSET_UNKNOWN);
  
  /* Check the encoding, this is a separate field. Forget the
     content-type record first so that a failed lookup cannot be
     mistaken for a hit. */
  mr = NULL;
  if (list_totalnodes(&(msg->rfcheader)) > 0)
    mime_findfield("content-transfer-encoding", &(msg->rfcheader), &mr);
  else if (list_totalnodes(&(msg->mimeheader)) > 0)
    mime_findfield("content-transfer-encoding", &(msg->mimeheader), &mr);
  
  if (mr != NULL) 
    {
      if (strncasecmp(mr->value, "7bit", strlen("7bit")) == 0)
        contents_data->encoding = M_ENC_7BIT;
      else if (strncasecmp(mr->value, "8bit", strlen("8bit")) == 0)
        contents_data->encoding = M_ENC_8BIT;
      /* Some mailers seem to use "binary" instead of "8bit". */
      else if (strncasecmp(mr->value, "binary", strlen("binary")) == 0)
        contents_data->encoding = M_ENC_8BIT;
      else if (strncasecmp(mr->value, "quoted-printable", strlen("quoted-printable")) == 0)
        contents_data->encoding = M_ENC_QP;
      else if (strncasecmp(mr->value, "base64", strlen("base64")) == 0)
        contents_data->encoding = M_ENC_BASE64;
      else
        contents_data->encoding = M_ENC_UNKNOWN;
    }
  else
    {
      /* Some mailers seem to leave the encoding field out if the
         message contains just ascii text. */
      if ((contents_data->content_type == M_CT_TEXT) 
          && (contents_data->charset == M_CSET_US_ASCII))
        contents_data->encoding = M_ENC_7BIT;
      else
        contents_data->encoding = M_ENC_UNKNOWN;
    }

  unsupported_type = (contents_data->content_type == M_CT_MSWORD)
    || (contents_data->content_type == M_CT_PDF)
    || (contents_data->content_type == M_CT_BINARY)
    || (contents_data->content_type == M_CT_MULTIPART);
    
  /* Update statistics */
  if (stats != NULL) 
    {
      /* Only the first unknown property of a part is counted. */
      if (contents_data->content_type == M_CT_UNKNOWN)
        stats->parts_unknown_contenttype++;
      else if (charset_unknown)
        stats->parts_unknown_charset++;
      else if (contents_data->encoding == M_ENC_UNKNOWN)
        stats->parts_unknown_encoding++;
      
      if (unsupported_type)
        stats->parts_unsupported_contenttype++;
      if (contents_data->encoding == M_ENC_BASE64)
        stats->parts_unsupported_encoding++;
    }

  /* The return value should be generalized somehow, for instance
     passing the supported formats as one argument. Now we just expect
     that the caller of the function has a similar idea on what is
     suitable for further processing. */
  if ((contents_data->content_type == M_CT_UNKNOWN) || charset_unknown
      || (contents_data->encoding == M_ENC_UNKNOWN))
    return -1;
  if (unsupported_type || (contents_data->encoding == M_ENC_BASE64))
    return 1;

  if (stats != NULL)
    stats->parts_valid_for_analysis++;
  return 0;
}


/* Convert a string from quoted printable to 8bit representation. The
   original string is replaced with the new contents, which is always
   equal to or less than the original in length.

   Return values:
   0:  conversion was successful and the resulting string doesn't 
       contain null characters 
   n:  positive integer indicating the position of the last chararcter, 
       if the string contains null characters in the middle. 
   -1: invalid quoted printable input was encountered 

   Note that the function is not a validator, a -1 return value
   indicates that there was an error in encoding, but other return
   values don't guarantee that the encoding was correct. */
u64_t convert_qp_8bit(unsigned char *str)
{
  u64_t nchars = 0;
  unsigned char *currptr;
  unsigned char *newptr;
  unsigned char currentchar;
  unsigned int charvalue;
  int istext = 1;

  currptr = str;
  newptr = str;
  currentchar = *currptr;
  while (currentchar != '\0') 
    {
      currentchar = *currptr;
      if (currentchar == '=') 
	{
	  /* Next two characters usually form the quoted printable
	     entity, but it is possible that there is a single '=' at
	     the end of a line to signify a soft linebreak. See RFC
	     2045, section 6.7. */
	  if (*(currptr+1) == '\r' && *(currptr+2) == '\n')
	    {
	      /* Soft line break, this is simply removed from the
		 resulting string. */
	      currptr = currptr + 3;
	    }
	  else if (*(currptr+1) == '\n')
	    {
	      /* This is actually invalid quoted printable data,
		 but can be present if a file is transferred from
		 Windows to Unix, so we treat it flexibly as a
		 soft newline too. */
	      currptr = currptr + 2;
	    }
	  /* Now it should be two numbers or uppercase letters A-F */
	  else 
	    {
	      if (*(currptr+1) < 48 || *(currptr+1) > 70 
		  || (*(currptr+1) > 57 && *(currptr+1) < 65))
		{
		  printf("convert_qp_8bit: Encountered an invalid quoted printable entity %c%c\n", 
			 *(currptr+1), *(currptr+2));
		  return -1;
		}
	      if (*(currptr+2) < 48 || *(currptr+2) > 70 
		  || (*(currptr+2) > 57 && *(currptr+2) < 65))
		{
		  printf("convert_qp_8bit: Encountered an invalid quoted printable entity %c%c\n", 
			 *(currptr+1), *(currptr+2));
		  return -1;
		}
	      
	      if (! sscanf((currptr+1), "%2x", &charvalue))
		{
		  printf("convert_qp_8bit: Unknown error in sscanf, entity %c%c\n",
			 *(currptr+1), *(currptr+2));
		  return -1;
		}
	      else if (charvalue > 255)
		{
		  /* This should never happen because we have checked that
		     the two characters are numbers or between A-F */
		  printf("convert_qp_8bit: Unknown error in sscanf, entity entity %c%c\n",
			 *(currptr+1), *(currptr+2));
		  return -1;
		}
	      else
		{
		  if (charvalue == 0)
		    {
		      /* This is perfectly legal in quoted printable, but a
			 null character in the resulting string is likely to
			 cause problems in further treatment. */
		      istext = 0;
		    }
		  currentchar = (unsigned char) charvalue;
		  *newptr = currentchar;
		  newptr++;
		  nchars++;
		  currptr= currptr + 3;
		}
	    }
	}
      else
	{
	  /* No conversion in this character. */
	  *newptr = *currptr;
	  newptr++;
	  nchars++;
	  currptr++;
	}
    }
  
  /* Terminate the resulting string. */
  *newptr = '\0';

  if (istext)
    return 0;
  else
    return nchars;
}


/* Extract the textual contents of the message for analysis. All the
   parts containing normal text are attached after each other, each
   followed by a newline. Headers and parts for which
   check_content_type doesn't return 0 are ignored. Quoted printable
   is converted to 8bit text.

   msguid   id of the message, used to fetch the body and in messages
   msgdata  parsed message structure
   stats    may be null, it just means that statistics won't be
            updated

   Returns NULL on error or if no part contained text, otherwise a
   character pointer to the null-terminated contents. The caller is
   responsible of calling my_free later to free the memory occupied by
   the character buffer.  */
char* get_text_contents(u64_t msguid, mime_message_t *msgdata, 
			organize_statistics *stats)
{
  struct element *e = NULL;
  mime_message_t *currentpart = NULL;
  message_data contentsdata;
  long parts;
  long i;
  int suitable;
  u64_t converted;
  u64_t capacity;

  char *contents = NULL;
  char *currentpos = NULL;
  MEM *msgcache = NULL;

  if (msgdata->message_has_errors) 
    {
      printf("get_text_contents: The message %llu has errors (check content-type)!\n", msguid);
      return NULL;
    }

  parts = list_totalnodes(&(msgdata->children));
  /* If there are zero children, this is a one part message. On the
     other hand, if there are children the first part contains all the
     other parts (multipart, MIME) in the body so we'll skip that and
     treat the other parts individually. */
  if (parts < 1)
    {
      parts = 1;
      currentpart = msgdata;
    }
  else
    {
      e = list_getstart(&(msgdata->children));
      if (e == NULL)
        {
          /* Nothing has been allocated yet, so no cleanup needed. */
          printf("get_text_contents: Error retrieving first part of a multipart message %llu\n", msguid);
          return NULL;
        }
      currentpart = (mime_message_t *)e->data;
    }

  /* In multipart MIME messages the msgdata struct of the first part
     contains size information for the whole message (all parts
     together). Also converting from QP to 8bit only reduces the need
     of space, so this is a safe amount of memory to allocate. In most
     cases it's actually too much, this could be optimized later. 

     (One extra byte to terminate each part in a newline to avoid last
     and first words of two parts to merge plus one byte to terminate
     the whole string.)*/
  capacity = msgdata->bodysize + parts + 1;
  contents = my_malloc(capacity); 
  if (contents == NULL)
    {
      printf("get_text_contents: Out of memory while processing message %llu\n", msguid);
      goto fail;
    }
  currentpos = contents;
  msgcache = mopen();
  if (msgcache == NULL)
    {
      printf("get_text_contents: Out of memory while processing message %llu\n", msguid);
      goto fail;
    }
  
  for (i = 0; i < parts; i++)
    {
      if (stats != NULL)
        stats->total_parts++;
      if (currentpart == NULL)
        {
          printf("get_text_contents: Error, part %ld doesn't exist in message %llu!\n", i+1, msguid);
          goto fail;
        }
      
      suitable = check_content_type(currentpart, &contentsdata, stats);

      if (suitable == 0)
        {
          /* Part contains text */

          /* The body, its newline and the final terminator must fit
             in what is left of the buffer. This always holds if the
             size information of the parts is consistent with that of
             the whole message. */
          if ((u64_t)(currentpos - contents) + currentpart->bodysize + 2 > capacity)
            {
              printf("get_text_contents: Part %ld of message %llu does not fit in the text buffer\n", 
                     i+1, msguid);
              goto fail;
            }

          /* The cache structure is actually completely useless because we
             want the whole message body in one string, but I'll use it to
             avoid modifying the db_dump_range function. */
          mreset(msgcache);
          if (db_dump_range(msgcache, currentpart->bodystart, 
                            currentpart->bodyend, msguid) == -1)
            {
              printf("get_text_contents: Error getting body of message %llu\n", msguid);
              goto fail;
            }
          mrewind(msgcache);
          mread(currentpos, currentpart->bodysize, msgcache);
          currentpos[currentpart->bodysize] = '\0';
          
          if (contentsdata.encoding == M_ENC_QP)
            {
              converted = convert_qp_8bit((unsigned char*)currentpos);
              if (converted != 0)
                {
                  /* We are actually not very interested whether the
                     conversion failed or the resulting string contains
                     null characters, in both cases the data cannot be
                     treated as text. However, we can print a proper 
                     error message. */
                  if (converted == (u64_t)-1)
                    printf("get_text_contents: Quoted printable conversion failed in message %llu\n", msguid);
                  else
                    printf("get_text_contents: Message %llu contains null characters, "
                           "cannot be treated as text\n", msguid);
                  goto fail;
                }
              /* We don't know the length of the converted string so we'll
                 have to use strlen to advance currentpos. */
              currentpos += strlen(currentpos);
            }
          else
            {  
              /* Set the position ready for next part */
              currentpos += currentpart->bodysize;
            }
          /* Terminate the part in a newline to avoid merging the last
             word of this part and the first word of the next part. */
          *currentpos = '\n';
          currentpos++;
        }
      /* If the check result is non-zero, the part was not detected to
         contain text. This is not fatal, we just won't update
         currentpos and thus skip this part of the message. */
      
      /* Update the part for multipart messages */
      if (e != NULL)
        {
          e = e->nextnode;      
          if (e != NULL)
            currentpart = (mime_message_t *)e->data;
          else
            break;
        }
    }

  if (currentpos == contents) 
    {
      /* None of the parts contained text, no contents to return */
      goto fail;
    }
  
  *currentpos = '\0';

  /* Free memory, the contents pointer is NOT freed, it is the
     responsibility of the caller. */
  mclose(&msgcache);
  
  /* Update statistics */
  if (stats != NULL)
    stats->bytes_valid_for_analysis += currentpos - contents;

  return contents;

 fail:
  /* In the case of an error, also contents is freed because the
     function returns NULL and caller is thus unable to free it. */
  if (contents != NULL)
    my_free(contents);
  if (msgcache != NULL)
    mclose(&msgcache);
  return NULL;
}

/* Print a description of a message and its parts to stdout: the
   number of parts and header lines, and for each part its size and
   content-type header (if it has one). The first part printed is the
   message itself, followed by its direct children.

   msguid   id of the message, used to fetch the bodies
   msgdata  parsed message structure
   vlevel   if greater than 0, the body of each part is fetched from
            the database and printed as well */
void dump_message_data(u64_t msguid, mime_message_t *msgdata, int vlevel)
{
  struct element *e;
  struct mime_record *mr;
  mime_message_t *currentpart;
  long parts;
  long i;

  char *msgbody;
  MEM *msgcache;

  if (msgdata->message_has_errors) 
    {
      printf("dump_message_data: The message %llu has errors (check content-type)!\n", msguid);
    }
  
  parts = list_totalnodes(&(msgdata->children)) + 1;
  printf("Total number of parts: %ld\n", parts);
  printf("Number of header lines: %lld MIME, %lld RFC\n", 
         msgdata->mimerfclines, msgdata->rfcheaderlines);

  currentpart = msgdata;
  e = list_getstart(&(currentpart->children));
  for (i = 0; i < parts; i++)
    {
      if (currentpart == NULL)
        {
          printf("dump_message_data: Error, message part %ld doesn't exist!\n", i+1);
          return;
        }
      printf("Part %ld :\n", i+1);
      printf("Size: %lld lines, %lld bytes\n", 
             currentpart->bodylines, currentpart->bodysize);
      
      /* Start from a clean state for every part: without this a part
         with no headers at all would use an uninitialized pointer or
         the record of the previous part. */
      mr = NULL;
      if (list_totalnodes(&(currentpart->rfcheader)) > 0)
        mime_findfield("content-type", &(currentpart->rfcheader), &mr);
      else if (list_totalnodes(&(currentpart->mimeheader)) > 0)
        mime_findfield("content-type", &(currentpart->mimeheader), &mr);

      if (mr != NULL)
        printf("Content-type: %s\n", mr->value);
      
      /* Placeholder, the character set is not extracted here. */
      printf("Character set: %s\n", "foo");
      
      if (vlevel > 0) 
        {
          printf("Part body:\n");
          msgcache = mopen();
          if (msgcache == NULL)
            {
              printf("dump_message_data: Out of memory while processing message %llu\n", msguid);
              return;
            }
          if (db_dump_range(msgcache, currentpart->bodystart, 
                            currentpart->bodyend, msguid) == -1)
            {
              printf("dump_message_data: Error getting body of message %llu\n", msguid);
              mclose(&msgcache);
              return;
            }
          msgbody = my_malloc(currentpart->bodysize+1);
          if (msgbody == NULL)
            {
              printf("dump_message_data: Out of memory while processing message %llu\n", msguid);
              mclose(&msgcache);
              return;
            }
          mrewind(msgcache);
          mread(msgbody, currentpart->bodysize, msgcache);
          msgbody[currentpart->bodysize] = '\0';
          printf("%s", msgbody);
          my_free(msgbody);
          mclose(&msgcache);
        }

      /* Move on to the next child. After the last child e is NULL,
         which coincides with the end of the loop. */
      if (e != NULL)
        {
          currentpart = (mime_message_t *)e->data;
          e = e->nextnode;
        }
    }
  printf("\n");

}

/* Write the bow word counts to f in plain text, one line per word
   index in the form "<word> <occurrences>".

   The words are looked up with bow_int2word(); according to the
   original author word_map->str_array[i] of the global word_map
   (type bow_int4str, see libbow.h) seems to give the same result.

   Per-document statistics are not written. They would be available
   through the arrow_barrel variable (type bow_barrel, see libbow.h),
   e.g. with bow_barrel_print_word_count (arrow_barrel, word), which
   prints for each document the number of occurrences of the word,
   the total number of different words in the document, the ratio of
   the two and the filename. */
void
dump_wordcounts(FILE *f)
{
  int wi = 0;

  while (wi < bow_num_words())
    {
      fprintf(f, "%s %d\n", bow_int2word(wi), bow_words_occurrences_for_wi(wi));
      wi++;
    }  
}

/* Dump bow word vectors in a file in plain text. One line is written
   for every non-NULL entry of di2wv, holding numwords values: the
   normalized weight of each word index 0..numwords-1 that is present
   in the word vector and 0 for the others. The entries of a word
   vector are expected to be sorted by ascending word index.

   map is used to verify that every document has a message uid; if
   one is missing the error is logged and the dump is aborted. userid
   is currently unused. */
void
dump_wordvectors(bow_di2wv *di2wv, doc_msg_map *map, int numwords, u64_t userid, FILE *f)
{
  int i, j, k;
  u64_t msguid;
  bow_wv *wv;

  for (i = 0; i < di2wv->size; i++)
    {
      /* Some of the entries may perfectly well be NULL, don't insert
         those. */
      if (di2wv->entry[i] == NULL)
        continue;

      msguid = docindex_get_msguid(map, i);
      if (msguid == DOC_MSG_EMPTY) 
        {
          trace (TRACE_ERROR, "dump_wordvectors(): Couldn't get msguid for doc %d", i);
          return;
        }
	  
      /* fprintf(f, "%llu", msguid); */
      wv = di2wv->entry[i];
      k = 0;
      for (j = 0; j < numwords; j++)
        {
          /* Once all entries of the vector are consumed the rest of
             the words are absent; never read past the last entry. */
          if ((k < wv->num_entries) && (wv->entry[k].wi == j))
            {
              fprintf(f, " %f", wv->normalizer * wv->entry[k].weight);
              k++;
            }
          else
            {
              fprintf(f, " 0");
            }
        }
	  
      fprintf(f, "\n");
    }
  return; 
}


/* Dump in the log, which words match in two messages, including
   weight. For every word present in the word vectors of both
   messages one line is traced at the given loglevel, holding the word
   and the product of its normalized weights in the two vectors. The
   entries of the word vectors are expected to be sorted by ascending
   word index.

   If either message has no document index in map or no word vector
   in di2wv, an error is traced and nothing else is done. */
void
dump_similaritydata(bow_di2wv *di2wv, doc_msg_map *map, u64_t msguid1, 
		    u64_t msguid2, int loglevel)
{
  int id1, id2;
  bow_wv *wv1, *wv2;
  int i1 = 0;
  int i2 = 0;

  id1 = docindex_get_di(map, msguid1);
  id2 = docindex_get_di(map, msguid2);
  /* docindex_get_di returns -1 if no mapping was found, which must
     not be used as an array index. */
  if ((id1 < 0) || (id1 >= di2wv->size) || (id2 < 0) || (id2 >= di2wv->size))
    {
      trace (TRACE_ERROR, "dump_similaritydata(): Couldn't get document index for message %llu or %llu", 
             msguid1, msguid2);
      return;
    }

  wv1 = di2wv->entry[id1];
  wv2 = di2wv->entry[id2];
  if ((wv1 == NULL) || (wv2 == NULL))
    {
      trace (TRACE_ERROR, "dump_similaritydata(): No word vector for doc %d or %d", id1, id2);
      return;
    }

  /* Walk both sorted vectors in parallel. */
  while ((i1 < wv1->num_entries) && (i2 < wv2->num_entries))
    {
      if (wv1->entry[i1].wi < wv2->entry[i2].wi)
        i1++;
      else if (wv2->entry[i2].wi < wv1->entry[i1].wi)
        i2++;
      else 
        {
          /* The same word in both */
          trace(loglevel, "%s (%f)", bow_int2word(wv1->entry[i1].wi), 
                ((wv1->entry[i1].weight * wv1->normalizer) 
                 * (wv2->entry[i2].weight * wv2->normalizer)));
          i1++;
          i2++;
        }
    }
}



/* Open a file for writing. If fullname is not NULL, it is considered
   as the requested filename, and other parameters are ignored. If
   fullname is NULL, the name of file is composed of other provided
   components as "dirname/type.user.ending".

   In both cases the name may be at most MAX_FILENAME_LENGTH
   characters long; a longer name is reported on stdout and NULL is
   returned without opening anything.

   Return values as in fopen(). The caller is responsible for closing
   the file with fclose(). */
FILE*
open_file_for_writing(char *fullname, char *dirname, char *type,
		      char *user, char *ending)
{
  FILE *f;
  char filename[MAX_FILENAME_LENGTH+1];
  const char *target;
  int result;

  if (fullname != NULL)
    {
      if (strlen(fullname) > MAX_FILENAME_LENGTH)
        {
          printf("open_file_for_writing: File name too long (maximum length %d): %s \n", 
                 MAX_FILENAME_LENGTH, fullname);
          return NULL;
        }
      target = fullname;
    }
  else
    {
      filename[0] = '\0';
      result = snprintf(filename, MAX_FILENAME_LENGTH+1, "%s/%s.%s.%s",
                        dirname, type, user, ending);
      /* A C99 snprintf reports truncation by returning the length
         the complete string would have had, only very old
         implementations return -1. Catch both. */
      if ((result < 0) || (result > MAX_FILENAME_LENGTH))
        {
          /* Make sure the buffer is printable whatever happened. */
          filename[MAX_FILENAME_LENGTH] = '\0';
          printf("open_file_for_writing: File name too long (maximum length %d): %s \n", 
                 MAX_FILENAME_LENGTH, filename);
          return NULL;
        }
      target = filename;
    }

  f = fopen(target, "w");
  if (f == NULL)
    printf("open_file_for_writing: Error opening file %s\n", target);
  return f;
}


/* Look up the mailbox with the number mbnum for the user userid and
   fill in the mailbox structure. The structure is zeroed first, then
   its uid is set from db_findmailbox() and the rest is filled in by
   db_getmailbox(). If verbose is non-zero the name of the mailbox is
   printed.

   Returns 0 on success and -1 on any failure; a message describing
   the failure is printed to stdout. */
int get_and_init_mailbox(u64_t mbnum, mailbox_t *mailbox, u64_t userid, int verbose)
{
  char mbname[IMAP_MAX_MAILBOX_NAMELEN];
  int status = -1;

  if (db_getmailboxname(mbnum, mbname) != 0)
    {
      printf("Internal database error.\n");
      return status;
    }

  if (verbose)
    printf("Processing mailbox %s...\n", mbname);
  
  /* Start from an all-zero structure and find the mailbox. */
  memset(mailbox, 0, sizeof(mailbox_t));
  mailbox->uid = db_findmailbox(mbname, userid);

  if (mailbox->uid == (u64_t)(-1))
    printf("Internal database error.\n");
  else if (mailbox->uid == 0)
    {
      /* No such mailbox. We are walking a list of existing
         mailboxes so this is unexpected, but the user may always
         reorganize his mail by hand while we're sorting it. */
      printf("Specified mailbox does not exist.\n");
    }
  else if (db_getmailbox(mailbox, userid) != 0)
    printf("Couldn't retrieve data from mailbox %s.\n", mbname);
  else
    status = 0;

  return status;
}


/* Zero msgdata and fill it with the header data of the message
   msgnum using db_fetch_headers().

   Returns 0 on success and also when db_fetch_headers() reports -1
   (a parse error, which is not fatal but may produce unexpected word
   vectors; a warning is printed). Any other non-zero result is
   treated as a failure to fetch the message and -1 is returned. */
int init_msgdata(mime_message_t *msgdata, u64_t msgnum)
{
  int result; 
  
  memset(msgdata, 0, sizeof(mime_message_t));
  result = db_fetch_headers(msgnum, msgdata);
  if (result == 0)
    return 0;

  if (result == -1)
    {
      /* Error in parsing, this is not fatal but may produce
         unexpected word vectors */
      printf("Warning: error parsing data for message number %llu\n", 
             msgnum);
      return 0;
    }

  /* msgnum is unsigned, so it is printed with %llu like elsewhere
     in this file. */
  printf("Error fetching data for message number %llu\n", 
         msgnum);
  return -1;
}

/* The following is needed to keep track of counts in how many docs a
   particular word appears because the Bow library doesn't provide
   this functionality. 
   
   Could do this using a generic resizable array...

   Count the document msguid for the word wordindex, unless msguid
   was already the last document counted for that word. Consecutive
   calls for the same word and document thus count only once. The
   arrays in doccounts are grown if wordindex is beyond their current
   size.

   Returns 0 on success and -1 if wordindex is negative or the arrays
   could not be grown; doccounts stays usable in both error cases. */
int docs_per_word_add_ifnotlast(docs_per_word *doccounts, int wordindex, u64_t msguid)
{
  int i, new_size;
  int *counts;
  u64_t *last_docs;

  if (wordindex < 0)
    return -1;

  if (wordindex >= doccounts->size)
    {     
      new_size = MAX(wordindex+1, doccounts->size * 2);

      /* Each new pointer is stored as soon as it is known, because
         the old one is no longer valid after a successful realloc.
         The size is only updated when both arrays have grown. */
      counts = bow_realloc (doccounts->word_map_doc_counts, 
                            sizeof(int) * new_size);
      if (counts == NULL)
        return -1;
      doccounts->word_map_doc_counts = counts;

      last_docs = bow_realloc (doccounts->word_map_last_doc, 
                               sizeof(u64_t) * new_size);
      if (last_docs == NULL)
        return -1;
      doccounts->word_map_last_doc = last_docs;
      
      /* Initialize the new part of the realloc'ed space. */
      for (i = doccounts->size; i < new_size; i++)
        {
          counts[i] = 0;
          last_docs[i] = DOCS_PER_WORD_EMPTY;
        }
      doccounts->size = new_size;
    }

  if ((doccounts->word_map_last_doc[wordindex] != msguid) ||
      (doccounts->word_map_last_doc[wordindex] == DOCS_PER_WORD_EMPTY))
    {
      doccounts->word_map_doc_counts[wordindex]++;
      doccounts->word_map_last_doc[wordindex] = msguid;
    }
  /* If the last doc is equal to the given doc, do nothing */

  return 0;
}

/* Create a docs_per_word structure with room for capacity words. A
   capacity of 0 selects the default capacity of 1024. All counts
   start at zero and all last documents at DOCS_PER_WORD_EMPTY.

   Returns the new structure, or NULL if one of the allocations
   returned NULL. Release it with docs_per_word_free(). */
docs_per_word *docs_per_word_new(int capacity)
{
  const int fallback_capacity = 1024;
  int slots = (capacity == 0) ? fallback_capacity : capacity;
  int n = 0;
  docs_per_word *fresh;

  fresh = bow_malloc(sizeof(docs_per_word));  
  if (!fresh) 
    return NULL;

  fresh->word_map_doc_counts = bow_malloc(sizeof(int) * slots);
  if (!fresh->word_map_doc_counts)
    {
      bow_free(fresh);
      return NULL;
    }

  fresh->word_map_last_doc = bow_malloc(sizeof(u64_t) * slots);
  if (!fresh->word_map_last_doc)
    {
      /* Undo the allocations made so far, innermost first. */
      bow_free(fresh->word_map_doc_counts);
      bow_free(fresh);
      return NULL;
    }
  
  fresh->size = slots; 
  while (n < slots)
    {
      fresh->word_map_doc_counts[n] = 0;
      fresh->word_map_last_doc[n] = DOCS_PER_WORD_EMPTY;
      n++;
    }
  return fresh;
}

/* Free a docs_per_word structure together with its two arrays.
   Passing NULL is allowed and does nothing. */
void docs_per_word_free(docs_per_word *d)
{
  if (d == NULL)
    return;

  bow_free(d->word_map_doc_counts);
  bow_free(d->word_map_last_doc);
  bow_free(d);
}
  


/* The document index used by bow needs to be separate for each
   message across mailboxes. We trust here that the msguid numbers
   in the database don't change during the organization process,
   because the resulting word vectors are then stored based on the
   message id:s. 
   
   Normally this is the case, but the msguids may change, for
   instance if the administrator runs a utility which purges
   deleted messages from the database and compresses the tables. 

   An additional problem is that bow uses normal 32bit integers for
   document indexes, and dbmail uses 64bit integers. One user normally
   doesn't have more than 2^31 mails, but the database might do. So we
   should really maintain a proper mapping with quick access (hash)
   between these two, but for initial testing purposes we just hope
   that the msguid:s are small enough and check it in this quick and
   dirty way. */

/* Record that the bow document index di belongs to the message
   msguid. The array of the map is grown if di is beyond its current
   size.

   Returns 0 on success, -1 on error. Errors are a negative di, a di
   that already has a mapping and a failure to grow the array. The
   map is left unchanged in all error cases. */
int docindex_add_mapping(doc_msg_map *map, u64_t msguid, int di)
{
  int i, new_size;
  u64_t *grown;

  /* There is only one msguid for each docindex, so we just use di as
     the index to the array and make sure that it isn't used
     already. Array size is increased if necessary. */
  if (di < 0) 
    return -1;

  if (di >= map->size)
    {
      new_size = MAX(di+1, map->size * 2);
      /* Keep the old pointer and size until the reallocation is
         known to have worked. */
      grown = bow_realloc (map->entries, sizeof (u64_t) * new_size);
      if (grown == NULL)
        return -1;
      
      /* Initialize the new part of the realloc'ed space. */
      for (i = map->size; i < new_size; i++)
        grown[i] = (u64_t)DOC_MSG_EMPTY;

      map->entries = grown;
      map->size = new_size;
    }

  if (map->entries[di] != DOC_MSG_EMPTY) /* mapping exists already! */
    return -1;

  map->entries[di] = msguid;
  map->num_docs++;
  return 0;
}

/* Clear all existing mappings: every entry is set to DOC_MSG_EMPTY
   and the number of documents is reset to zero, as docindex_add_mapping
   counts one document per mapping. The capacity of the map is kept.
   Always returns 0. */
int docindex_clear_mappings(doc_msg_map *map)
{
  int i;
  for (i = 0; i < map->size; i++)
    {
      map->entries[i] = (u64_t)DOC_MSG_EMPTY;
    }
  map->num_docs = 0;
  return 0;
}

/* Return the message uid mapped to the bow document index docindex.
   Return value DOC_MSG_EMPTY indicates that no mapping was found
   (usually an error); this includes indexes outside of the map. */
u64_t docindex_get_msguid(doc_msg_map *map, int docindex)
{
  if ((docindex < 0) || (docindex >= map->size))
    return (u64_t)DOC_MSG_EMPTY;

  return map->entries[docindex];
}

/* Return the bow document index mapped to the message msguid, i.e.
   the lowest index whose entry equals msguid. Return value -1
   indicates that no mapping was found (usually an error).

   This is a linear scan over the whole map, so it is inefficient:
   don't use it in time critical situations or add a hash in the map
   implementation. */
int docindex_get_di(doc_msg_map *map, u64_t msguid)
{
  int slot;
  
  for (slot = 0; slot < map->size; slot++)
    {
      if (map->entries[slot] != msguid)
        continue;
      return slot;
    }
  return -1;
}


/* Create an empty document index to message uid map with room for
   capacity entries. A capacity of 0 selects the default capacity of
   1024. All entries start as DOC_MSG_EMPTY and the document count as
   zero.

   Returns the new map, or NULL if one of the allocations returned
   NULL. Release it with doc_msg_map_free(). */
doc_msg_map *doc_msg_map_new(int capacity)
{
  const int fallback_capacity = 1024;
  int slots = (capacity == 0) ? fallback_capacity : capacity;
  int n = 0;
  doc_msg_map *fresh;

  fresh = bow_malloc(sizeof(doc_msg_map));
  if (!fresh) 
    return NULL;
  
  fresh->entries = bow_malloc(slots * sizeof(u64_t));
  if (!fresh->entries)
    {
      bow_free(fresh);
      return NULL;
    }
  
  fresh->size = slots; 
  fresh->num_docs = 0;
  while (n < slots)
    {
      fresh->entries[n] = DOC_MSG_EMPTY;
      n++;
    }
  return fresh;
}

/* Free a map and its entry array. This is just a wrapper, the
   entries in the array are plain integers so we don't need to free
   them separately. Passing NULL is allowed and does nothing. */
void doc_msg_map_free(doc_msg_map *map) 
{
  if (map == NULL)
    return;

  bow_free(map->entries);
  bow_free(map);
}


/* Check based on the message id if the two given messages are
   duplicates. The headers of both messages are fetched and the
   values of their message-id fields are compared; the messages are
   duplicates only if the two values are exactly equal.

   Returns 0 if not duplicate, 1 if yes, -1 in error (message data
   could not be fetched or one of the messages has no message-id). */
int is_duplicate(u64_t uid1, u64_t uid2)
{
  int duplicate;
  mime_message_t msgdata1, msgdata2;
  struct mime_record *mr1 = NULL;
  struct mime_record *mr2 = NULL;
  
  if (init_msgdata(&msgdata1, uid1) != 0)
    {
      printf("is_duplicate: Error fetching message data for uid %llu\n", uid1);
      return -1;
    }
  if (init_msgdata(&msgdata2, uid2) != 0)
    {
      printf("is_duplicate: Error fetching message data for uid %llu\n", uid2);
      db_free_msg(&msgdata1);
      return -1;
    }

  if (list_totalnodes(&(msgdata1.rfcheader)) > 0)
    mime_findfield("message-id", &(msgdata1.rfcheader), &mr1);
  else if (list_totalnodes(&(msgdata1.mimeheader)) > 0)
    mime_findfield("message-id", &(msgdata1.mimeheader), &mr1);
  if (list_totalnodes(&(msgdata2.rfcheader)) > 0)
    mime_findfield("message-id", &(msgdata2.rfcheader), &mr2);
  else if (list_totalnodes(&(msgdata2.mimeheader)) > 0)
    mime_findfield("message-id", &(msgdata2.mimeheader), &mr2);

  if ((mr1 == NULL) || (mr2 == NULL))
    {
      printf("is_duplicate: Error fetching message-id value for message %llu or %llu\n", uid1, uid2);
      duplicate = -1;
    }
  else
    {
      /* Compare the complete values. Limiting the comparison to the
         length of the first value would also accept a second value
         that merely starts with the first one. */
      duplicate = (strcmp(mr1->value, mr2->value) == 0) ? 1 : 0;
    }
			  
  db_free_msg(&msgdata1);
  db_free_msg(&msgdata2);
  return duplicate;
}

  /* Get an array of message uid's similar to a given message based on
   the scalar product of their word vectors. The similarity values are
   returned with the message uid:s in the similar_msgs structure. The
   arrays are organized by similarity (most similar is the first one
   in the array etc.). If less than num messages can be returned, some
   of the array elements have the value MSGUID_INVALID. Only messages
   with a similarity greater than zero are considered.

   Messages with exactly the same similarity as a message already on
   the list are checked with is_duplicate() and dropped if they are
   duplicates, so that a message saved in several folders is listed
   only once.
   
   In case of an error returns NULL. Errors are a negative num, a
   message without a document index or word vector, running out of
   memory and a failure in is_duplicate(). Otherwise the caller owns
   the returned structure, which was allocated with my_malloc. */
similar_msgs* get_similar_messages_bruteforce(bow_di2wv *di2wv, doc_msg_map *map,
					       u64_t msguid, int num)
{
  similar_msgs *msgs;
  int orig;
  int i, j, k;
  double result = 0;
  /* Lower bound for the similarity of listed messages. It is never
     raised, so it only filters out non-positive similarities. */
  double last_on_list = 0;
  u64_t currentuid;

  /* We could make this a configurable option but I cannot think about
     any other situation than testing where we would not want to drop
     duplicates. (When duplicates are dropped messages are shown only
     once in the list even if they are saved in several folders). */
  int drop_duplicates = 1;
  int duplicate;

  if (num < 0)
    return NULL;

  /* Get the bow document index of the original message */
  orig = docindex_get_di(map, msguid);
  if (orig == -1) /* No bow docindex found */
    return NULL; 
  if ((orig >= di2wv->size) || (di2wv->entry[orig] == NULL))
    return NULL; /* No word vector to compare against */

  msgs = my_malloc(sizeof(similar_msgs) + sizeof(similarity_data) * num);
  if (msgs == NULL)
    return NULL;
  msgs->count = 0;
  for (i=0; i<num; i++)
    {
      msgs->entry[i].uid = MSGUID_INVALID;
      msgs->entry[i].value = 0;
    }

  for (i = 0; i < di2wv->size; i++)
    {
      if ((di2wv->entry[i] == NULL) || (i == orig))
        continue;

      result = bow_wv_scalar_product(di2wv->entry[i], di2wv->entry[orig]);
      currentuid = docindex_get_msguid(map, i);
      if (result <= last_on_list)
        continue;

      /* Scan the array from the beginning and insert in the
         appropriate slot. A more efficient data structure could be a
         good idea here, especially if count is sometimes big. */
      for (j = 0; j < num; j++)
        {
          if (msgs->entry[j].uid == MSGUID_INVALID)
            {
              /* First free slot: everything before it is more
                 similar. The value must be stored as well, otherwise
                 later comparisons against this entry are wrong. */
              msgs->entry[j].uid = currentuid;
              msgs->entry[j].value = result;
              msgs->count++;
              break;
            }

          if (msgs->entry[j].value > result)
            continue;

          /* If the result is exactly the same, it is possible that
             the message is duplicate. */
          if (drop_duplicates && (msgs->entry[j].value == result))
            {
              duplicate = is_duplicate(msgs->entry[j].uid, currentuid);
              if (duplicate == -1)
                {
                  /* The caller only gets NULL, so free the list. */
                  my_free(msgs);
                  return NULL;
                }
              else if (duplicate == 1)
                break;
            }
		      
          /* The last message in the array is dumped, but we must
             check if there already is a message to keep proper track
             of the count. */
          if (msgs->entry[num-1].uid == MSGUID_INVALID)
            msgs->count++;
          for (k = (num-2); k >= j; k--)
            msgs->entry[k+1] = msgs->entry[k];

          msgs->entry[j].uid = currentuid;
          msgs->entry[j].value = result;
          break;
        }
    }
  return msgs;

}

