/* A lexer with special features for handling quoted printable text */

/* Copyright (C) 2002 Arto Ters

   Written by:  Arto Ters <arto.teras@hip.fi>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

#include <bow/libbow.h>
#include <ctype.h>		/* for tolower() */

/* Shorthand for bow_default_lexer_parameters, which is declared
   outside this file (presumably by <bow/libbow.h>).
   NOTE(review): nothing in the visible part of this file references
   PARAMS -- confirm whether it is still needed. */
#define PARAMS (bow_default_lexer_parameters)

/* Return the numeric value (0-15) of the hexadecimal digit C, or -1
   if C is not a hexadecimal digit.  Both upper- and lowercase digits
   are accepted. */
static int
bow_lexer_qp_hex_value (unsigned char c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

/* Mostly like bow_lexer_simple_get_raw_word, but converts quoted
   printable characters to 8-bit.

   Reads the next raw word from LEX's document, starting at
   LEX->document_position, and stores it lowercased and
   NUL-terminated in BUF.  BUFLEN is the total size of BUF including
   the terminator, so at most BUFLEN-1 characters are stored; a longer
   word is split and its remainder is returned by the next call.
   SELF is not used.

   A word starts at the first alphabetic character or '=' and
   continues while alphabetic characters, "=XX" entities (exactly two
   hexadecimal digits) or soft line breaks ("=\r\n", or the lenient
   "=\n") follow.  Decoded entities are appended to the word whatever
   their value is.

   Returns the number of characters stored in BUF, or 0 if the end of
   the document was reached before a word started.
   NOTE(review): 0 is also returned when a word candidate consisted
   only of soft line breaks; verify that callers do not mistake this
   for the end of the document.

   NOTE: The behavior is quite strict, in the case of invalid data the
   problem is reported through bow_error and the scan stops at the
   offending '='.  Thus this function is not suitable for text blocks
   containing (also) data that is not quoted printable encoded. */
int
bow_lexer_qp_get_raw_word (bow_lexer *self, bow_lex *lex,
			   char *buf, int buflen)
{
  int wordlen;			/* number of characters in the word so far */
  unsigned char current;
  const char *docptr;

  docptr = lex->document + lex->document_position;

  /* Ignore characters until we get a beginning character. Alphabetic
     characters and the equal sign which begins a quoted printable
     entity are valid. */
  while (*docptr)
    {
      current = (unsigned char) *docptr;
      if (isalpha (current) || current == '=')
	break;
      docptr++;
    }
  if (*docptr == '\0')
    return 0;

  /* Add alphabetic characters to the word, at the same time
     converting quoted printable entities.  One byte of BUF is kept
     in reserve for the terminating NUL. */
  wordlen = 0;
  while (wordlen + 1 < buflen)
    {
      current = (unsigned char) *docptr;
      if (isalpha (current))
	{
	  *buf++ = tolower (current);
	  wordlen++;
	  docptr++;
	}
      else if (current == '=')
	{
	  /* Next two characters usually form the quoted printable
	     entity, but it is possible that there is a single '=' at
	     the end of a line to signify a soft linebreak. See RFC
	     2045, section 6.7.  The second character is only examined
	     when the first one is a hex digit (and hence not the
	     terminating NUL), so we never read past the end of the
	     document. */
	  int high;
	  int low;

	  high = bow_lexer_qp_hex_value ((unsigned char) docptr[1]);
	  low = (high >= 0
		 ? bow_lexer_qp_hex_value ((unsigned char) docptr[2])
		 : -1);

	  if (low >= 0)
	    {
	      /* A complete "=XX" entity: append the decoded byte. */
	      current = (unsigned char) (high * 16 + low);
	      *buf++ = tolower (current);
	      wordlen++;
	      docptr += 3;
	    }
	  else if (docptr[1] == '\r' && docptr[2] == '\n')
	    {
	      /* Soft line break, word may continue on the next
		 line. We must skip the following linebreak and
		 continue. No characters added to the word. */
	      docptr += 3;
	    }
	  else if (docptr[1] == '\n')
	    {
	      /* This is actually invalid quoted printable data,
		 but can be present if a file is transferred from
		 Windows to Unix, so we treat it as a soft newline
		 too. */
	      docptr += 2;
	    }
	  else
	    {
	      /* Do not look at docptr[2] when docptr[1] is already
		 the end of the document. */
	      bow_error ("Encountered an invalid quoted printable entity %c%c", 
			 docptr[1], docptr[1] ? docptr[2] : '\0');
	      break;
	    }
	}
      else
	{
	  /* Character not part of a word */
	  break;
	}
    }

  /* Now DOCPTR is pointing to the first character that was not
     consumed: normally the non-alpha immediately after the word. */

  /* Adjust the LEX's pointer into the document for the next word */
  lex->document_position = docptr - lex->document;

  /* Terminate the word buffer, provided it has any room at all. */
  if (buflen > 0)
    *buf = '\0';

  return wordlen;
}


/* A lexer that converts all quoted printable entities (as defined in
   RFC 2045) to 8-bit characters.  Only the raw-word reader,
   bow_lexer_qp_get_raw_word, is specific to this lexer; every other
   function in the table is the bow_lexer_simple_* implementation.
   NOTE(review): the initializer is positional, so the entries must
   stay in the order of the members of bow_lexer, whose declaration
   is not visible in this file -- presumably the first entry is the
   size of the per-document state (a plain bow_lex). */
const bow_lexer _bow_qp_lexer =
{
  sizeof (bow_lex),
  NULL,
  bow_lexer_simple_open_text_fp,
  bow_lexer_simple_open_str,
  bow_lexer_simple_get_word,
  bow_lexer_qp_get_raw_word,
  bow_lexer_simple_postprocess_word,
  bow_lexer_simple_close
};
/* Pointer to the quoted printable lexer table defined above. */
const bow_lexer *bow_qp_lexer = &_bow_qp_lexer;
