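/*
 * Static Dictionary
 * The dictionary lives in a file, which is mmap()ed by this module.
 * Lookup results are cached by the caller, so what has to be fast here
 * is the lookup of words that do not exist in the dictionary.
 *
 * anthy_sdic_fill_seq_ent_by_xstr() is the central function:
 *  it looks up the entry indexed by the given string in the given sdic,
 *  appends the given tail to each word found there and adds the
 *  results to the seq_ent.
 *
 * a) the dictionary format, b) the handling of conjugation and
 * c) the speed-up of dictionary access are all dealt with in this one
 *  source file, so it has become fairly complicated.
 *
 * Copyright (C) 2000-2003 TABATA Yusuke
 * Copyright (C) 2001-2002 TAKAI Kosuke
 *
 * $Id: sdic.c,v 1.17 2002/09/14 11:35:06 yusuke Exp $
 */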

#include <sys/types.h>
#include <sys/stat.h>
#include <netinet/in.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <alloc.h>
#include <dic.h>
#include <xchar.h>
#include <logger.h>
#include "dic_main.h"
#include "dic_ent.h"

/*
 * Geometry of the word-existence bitmap: HASH_ARRAY ints, of which
 * HASH_BIT bits each are used.  The low HASH_SHIFT bits of a hash value
 * select the bit, the bits above them select the int.
 */
#define HASH_SHIFT 5
#define HASH_ARRAY 16384
#define HASH_BIT (1<<HASH_SHIFT)

/*
 * Combine two bytes into one character code: second * 256 + first.
 * Both arguments are parenthesized so that expressions may be passed.
 */
#define form_euc_char(first, second) (((second) * 256) + (first))
/* non-zero if the byte has its high bit set (half of a 2-byte EUC char) */
#define is_euc_fragment(c) ((c) & 0x80)
/* non-zero for values above 31; meant to be combined with is_euc_fragment */
#define is_printable_ascii(c) ((c) > 31)
/* non-zero if the byte is part of a word; note that c is evaluated twice */
#define is_printable(c) (is_euc_fragment(c) || is_printable_ascii(c))

/* Per-page information built by extract_page() when the file is opened */
struct sdic_page {
  /** First word of this page; the key compared when searching for a page */
  xstr str;
  /** Bit set of the word lengths found in this page (bit n <-> length n) */
  int len_hist;
};

/** Handle of one static dictionary (sdic) */
struct sdic_handle {
  /* Pointers into the mmap()ed dictionary file */
  /** Start of the mapped dictionary file */
  char *dic_file;
  /** Index of the dictionary entries (values in network byte order) */
  int *entry_index;
  /** Dictionary entries */
  char *entry;
  /** Offsets of the pages inside `page' (values in network byte order) */
  int *page_index;
  /** Page data holding the index words of the dictionary */
  char *page;

  /** Size in bytes of the mapping, as given to mmap() and munmap() */
  int file_size;

  /** Number of elements in `pages' */
  int nr_pages;
  /** malloc()ed array with one element per page */
  struct sdic_page *pages;
  /** Union of the len_hist of every page */
  int len_hist;
  /** malloc()ed bitmap of HASH_ARRAY ints, marked with every word's hash */
  int *hash_ent;
};

/* Allocator of struct sdic_handle objects; set up by anthy_init_sdic() */
static allocator sdic_ator;

/*
 * Destructor handed to the sdic allocator in anthy_init_sdic().
 * Unmaps the dictionary file and frees the key of every page, the page
 * array itself and the hash bitmap.
 */
static void
sdic_dtor(void *p)
{
  sdic_t sd = p;
  int page = 0;

  munmap(sd->dic_file, sd->file_size);

  while (page < sd->nr_pages) {
    free(sd->pages[page].str.str);
    page++;
  }
  free(sd->pages);
  free(sd->hash_ent);
}

/*
 * Hash the characters of x into the range [0, HASH_ARRAY * HASH_BIT).
 *
 * The accumulation is done in unsigned arithmetic: with a signed int the
 * repeated multiplication by 23 overflows for longer strings, which is
 * undefined behavior.  Unsigned wrap-around keeps the same low bits, so
 * the returned value is unchanged.
 */
static int
hash(xstr *x)
{
  unsigned int h = 0;
  int i;

  for (i = 0; i < x->len; i++) {
    h *= 23u;
    h -= (unsigned int)x->str[i];
  }
  return (int)(h & (unsigned int)(HASH_ARRAY * HASH_BIT - 1));
}

/*
 * Mark the hash of xs in the existence bitmap of sd.
 * The low HASH_SHIFT bits of the hash pick the bit, the following bits
 * pick the element of sd->hash_ent.
 */
static void
mark_hash_ent(struct sdic_handle *sd, xstr *xs)
{
  int val = hash(xs);
  int idx = (val >> HASH_SHIFT) & (HASH_ARRAY - 1);
  int bit = val & (HASH_BIT - 1);

  /* build the mask as unsigned: (1 << 31) on a signed int is undefined */
  sd->hash_ent[idx] = (int)((unsigned int)sd->hash_ent[idx] | (1u << bit));
}

/*
 * Test whether the hash of xs has been marked in the bitmap of sd.
 * Returns 1 if the bit is set and 0 if it is clear; a clear bit means
 * that no word with this hash was marked by mark_hash_ent().
 */
static int
check_hash_ent(struct sdic_handle *sd, xstr *xs)
{
  int val = hash(xs);
  int idx = (val >> HASH_SHIFT) & (HASH_ARRAY - 1);
  int bit = val & (HASH_BIT - 1);

  /* test through unsigned: (1 << 31) on a signed int is undefined */
  return (int)(((unsigned int)sd->hash_ent[idx] >> bit) & 1u);
}

/*
 * Variant of push_back_dic_ent() used when the word type has CC_KV.
 * The first character of tail is dropped and the rest is appended to a
 * copy of x before the entry is added to s.
 * tail is dereferenced unconditionally, so it must not be NULL.
 */
static void
push_back_dic_ent_kv(struct seq_ent *s, wtype_t w, int freq,
                     xstr x, xstr *tail)
{
  /* shallow copy of the tail, adjusted to skip its first character */
  xstr rest = *tail;
  xstr *word;

  rest.str += 1;
  rest.len -= 1;

  word = anthy_xstrcat(anthy_xstr_dup(&x), &rest);
  anthy_ddic_push_back_dic_ent(s, word, w, freq, 0);
  anthy_free_xstr(word);
}

/*
 * Add the word x, with tail appended, to the seq_ent s.
 *
 * w is a private copy of the word type; ct and cst are stored into it
 * and it is adjusted as described below before the entry is added.
 * tail may be NULL except when the word type ends up as CC_KV, in which
 * case push_back_dic_ent_kv() dereferences it.
 */
static void
push_back_dic_ent(struct seq_ent *s, wtype_t w, int freq,
                  xstr x, xstr *tail, int ct, int cst)
{
  xstr *xs;
  anthy_wtype_set_ct(&w, ct);
  anthy_wtype_set_cst(&w, cst);

  /* a word type with the ajv flag becomes POS_AJV unless ct is CT_HEAD */
  if (anthy_wtype_get_ajv(w) && ct != CT_HEAD) {
    anthy_wtype_set_pos(&w, POS_AJV);
  }

  if (cst == CST_V_KANOU) {
    /* CST_V_KANOU is expressed as class CC_KS1 ... */
    anthy_wtype_set_cc(&w, CC_KS1);
    /* ... and the cst stored in the entry goes back to CST_DEFAULT */
    anthy_wtype_set_cst(&w, CST_DEFAULT);
  }

  /* CC_KV words build their string differently */
  if (anthy_wtype_get_cc(w) == CC_KV) {
    push_back_dic_ent_kv(s, w, freq, x, tail);
    return ;
  }
  xs = anthy_xstr_dup(&x);
  if (tail) {
    /* keep the returned pointer, as the other anthy_xstrcat() callers do */
    xs = anthy_xstrcat(xs, tail);
  }

  anthy_ddic_push_back_dic_ent(s, xs, w, freq, 0);

  if (anthy_wtype_get_meisi(w) &&
      anthy_wtype_get_ct(w) == CT_RENYOU &&
      (!(anthy_wtype_get_5(w) &&
         anthy_wtype_get_cst(w) != CST_V_RENYOU_5))) {
    /* the same string is registered a second time as POS_NOUN */
    anthy_wtype_set_pos(&w,POS_NOUN);
    anthy_ddic_push_back_dic_ent(s, xs, w, freq, 0);
  }
  anthy_free_xstr(xs);
}

/*
 * Parse a word-type token in the Cannadic notation "#XX*123".
 *
 * The characters up to the first '*' (including the leading '#') form
 * the type name, everything after that '*' is the frequency.  The token
 * ends at a space or at the end of the string.
 *
 * s: start of the token
 * t: receives the word type; its pos is set to POS_INVAL when the name
 *    is unknown to anthy_type_to_wtype() or too long to be valid
 * f: receives the frequency times FREQ_RATIO, or 1 if there is no '*'
 *
 * Returns the number of characters read.
 *
 * Both parts are copied into fixed-size buffers; characters that do not
 * fit are skipped instead of being written past the end of the buffer.
 */
static int
parse_wtype(char *s, wtype_t *t, int *f)
{
  char type_name[10];
  char freq_digits[10];
  size_t type_len = 0;
  size_t freq_len = 0;
  int seen_star = 0;
  int name_truncated = 0;
  int i;

  for (i = 0; s[i] && s[i] != ' '; i++) {
    if (seen_star) {
      /* excess digits are dropped; 9 digits cannot overflow atoi() */
      if (freq_len < sizeof(freq_digits) - 1) {
        freq_digits[freq_len++] = s[i];
      }
    } else if (s[i] == '*') {
      seen_star = 1;
    } else if (type_len < sizeof(type_name) - 1) {
      type_name[type_len++] = s[i];
    } else {
      name_truncated = 1;
    }
  }
  type_name[type_len] = 0;

  if (seen_star) {
    freq_digits[freq_len] = 0;
    *f = atoi(freq_digits) * FREQ_RATIO;
  } else {
    *f = 1;
  }

  /* a truncated name must not be accepted even if its prefix is known */
  if (anthy_type_to_wtype(type_name, t) == -1 || name_truncated) {
    anthy_wtype_set_pos(t, POS_INVAL);
  }
  return i;
}

/**
 * Add the word at the head of s to se if the word type *wt matches.
 *
 * The word ends at a space, at a '#' or at the end of the string.
 * It is added only when anthy_wtype_get_cc(*wt) equals cc, the pos of
 * *wt is not POS_INVAL, and the ct of *wt is either the requested ct or
 * CT_NONE.  tail, ct and cst are forwarded to push_back_dic_ent().
 *
 * Returns the length of the word in bytes, whether it was added or not.
 */
static int
add_dic_ent(struct seq_ent *se, wtype_t *wt, int freq, char *s,
            xstr *tail, int cc, int ct, int cst)
{
  int word_len = 0;
  char *word;
  xstr *xs;

  /* measure the word */
  while (s[word_len] && s[word_len] != ' ' && s[word_len] != '#') {
    word_len++;
  }

  /* reject word types that do not match the request */
  if (cc != anthy_wtype_get_cc(*wt)) {
    return word_len;
  }
  if (anthy_wtype_get_pos(*wt) == POS_INVAL) {
    return word_len;
  }
  if (ct != anthy_wtype_get_ct(*wt) &&
      CT_NONE != anthy_wtype_get_ct(*wt)) {
    return word_len;
  }

  /* make a NUL-terminated copy of the word on the stack */
  word = alloca(word_len + 1);
  memcpy(word, s, word_len);
  word[word_len] = 0;

  xs = anthy_cstr_to_xstr(word);
  push_back_dic_ent(se, *wt, freq, *xs, tail, ct, cst);
  anthy_free_xstr(xs);

  return word_len;
}

/**
 * Fill the seq_ent from the dictionary entry that starts at dic[idx].
 *
 * The entry is a NUL-terminated sequence of tokens separated by single
 * spaces.  A token starting with '#' is a word type (see parse_wtype())
 * that applies to the words following it; every other token is a word
 * that is offered to add_dic_ent() with that type.
 *
 * The first word after a type token gets a bonus of FREQ_RATIO - 1 on
 * its frequency and the bonus shrinks by one for each following word,
 * so words listed earlier rank higher.
 *
 * Words that appear before any type token are skipped: there is no word
 * type or frequency for them yet, and using the uninitialized values
 * would be undefined behavior.
 */
static void
fill_dic_ent(char *dic ,int idx, struct seq_ent *ent,
             xstr *tail, int cc, int ct, int cst)
{
  wtype_t wt;            /* type of the words being read */
  int freq = 0;
  int has_wtype = 0;     /* set once wt and freq have been parsed */
  int c = 0;             /* offset into the entry */
  int order_bonus = 0;   /* bonus derived from the order in the entry */
  char *e = &dic[idx];

  while (e[c]) {
    if (e[c] == '#') {
      c += parse_wtype(&e[c], &wt, &freq);
      has_wtype = 1;
      order_bonus = FREQ_RATIO - 1;
    } else if (has_wtype) {
      c += add_dic_ent(ent, &wt, freq + order_bonus, &e[c],
                       tail, cc, ct, cst);
      if (order_bonus > 0) {
        order_bonus --;
      }
    } else {
      /* malformed entry: skip the word, using add_dic_ent()'s delimiters */
      while (e[c] && e[c] != ' ' && e[c] != '#') {
        c++;
      }
    }
    if (e[c] == ' ') {
      c++;
    }
  }
}

/*
 * Update x according to the encoded word at s.
 *
 * s[0] - 1 is the number of characters to remove from the end of x;
 * the bytes after s[0] are then decoded and appended, a byte with the
 * high bit set being combined with the byte after it into one xchar.
 * Decoding stops at the first byte that is not is_printable().
 * x->str must have room for the resulting word.
 *
 * Returns the number of bytes consumed from s.
 *
 * Two cases of damaged data are tolerated: a removal count that is
 * negative or larger than x->len is clamped, and a 2-byte character cut
 * off by the terminating NUL is dropped without reading past the NUL.
 */
static int
mkxstr(char *s, xstr *x)
{
  int i;
  int drop = s[0] - 1;

  if (drop < 0) {
    drop = 0;
  }
  if (drop > x->len) {
    drop = x->len;
  }
  x->len -= drop;

  for (i = 1; is_printable(s[i]); i ++) {
    if (is_euc_fragment(s[i])) {
      unsigned char hi = s[i];
      unsigned char lo = s[i+1];
      if (lo == 0) {
        /* lone first byte right before the NUL: consume it and stop */
        i++;
        break;
      }
      x->str[x->len] = form_euc_char(lo, hi);
      x->len++;
      i++;
    } else {
      x->str[x->len] = s[i];
      x->len ++;
    }
  }
  return i;
}

/*
 * Comparison of two xstr that defines a strict ordering, so that it can
 * drive the binary search over the pages.
 *
 * Note the sign convention, which is the reverse of strcmp():
 *   1  if x1 sorts before x2 (smaller character at the first difference,
 *      or x1 is a proper prefix of x2)
 *  -1  if x1 sorts after x2
 *   0  if both are equal
 */
static int
ordered_xstrcmp(xstr *x1, xstr *x2)
{
  int common = (x1->len < x2->len) ? x1->len : x2->len;
  int pos;

  for (pos = 0; pos < common; pos++) {
    if (x1->str[pos] != x2->str[pos]) {
      return (x1->str[pos] < x2->str[pos]) ? 1 : -1;
    }
  }

  /* identical up to the shorter length: the shorter one sorts first */
  if (x1->len == x2->len) {
    return 0;
  }
  return (x1->len < x2->len) ? 1 : -1;
}

/**
 * Find the position of the word x inside the page data at s.
 *
 * The words of the page are decoded one after another with mkxstr()
 * and compared with x.  Returns the zero-based position of the matching
 * word within the page, or -1 if the page does not contain x.
 */
static int
search_word_in_page(xstr *x, char *s)
{
  int o = 0;
  xstr xs;
  /*
   * Scratch buffer for the word being decoded.  A character takes at
   * least one byte of the page, so strlen(s) characters are enough for
   * any word; half of that, as used before, is only enough when every
   * character takes two bytes.
   */
  xs.str = alloca(sizeof(xchar) * (strlen(s) + 1));
  xs.len = 0;
  while (*s) {
    s += mkxstr(s, &xs);
    if (!ordered_xstrcmp(&xs, x)) {
      return o;
    }
    o ++;
  }
  return -1;
}

/*
 * Recursive binary search over sd->pages[f .. t-1], looking for the
 * point where ordered_xstrcmp(x, page key) stops returning -1.
 *
 * The middle page is compared with x:
 *   0  -> that page is the answer
 *  -1  -> search again with the middle page as the new lower bound
 *   1  -> if the lower bound sits right below the middle and its key
 *         does not compare as 1 against x, the lower bound is the
 *         answer; otherwise search again with the middle page kept
 *         inside the range as its last element
 */
static int
get_page_index_search(struct sdic_handle *sd, xstr *x, int f, int t)
{
  int mid = (f + t) / 2;
  int order = ordered_xstrcmp(x, &sd->pages[mid].str);

  if (order == -1) {
    return get_page_index_search(sd, x, mid, t);
  }
  if (order != 1) {
    return mid;
  }

  if (f == mid - 1 &&
      ordered_xstrcmp(x, &sd->pages[mid - 1].str) < 1) {
    return mid - 1;
  }
  return get_page_index_search(sd, x, f, mid + 1);
}

/*
 * Select the page in which x has to be looked up.
 * Returns -1 when x compares as 1 (see ordered_xstrcmp()) against the
 * key of the first page, the last page when x does not compare as 1
 * against the key of the last page, and otherwise the result of the
 * binary search over all pages.
 */
static int
get_page_index(struct sdic_handle *sd, xstr *x)
{
  int last = sd->nr_pages - 1;

  if (ordered_xstrcmp(x, &sd->pages[0].str) == 1) {
    return -1;
  }
  if (ordered_xstrcmp(x, &sd->pages[last].str) <= 0) {
    return last;
  }
  return get_page_index_search(sd, x, 0, sd->nr_pages);
}

/*
 * Build the in-memory information of one page.
 *
 * h: dictionary handle; the hash of every word of the page is marked
 *    in its bitmap
 * p: page record to fill; p->str receives a malloc()ed copy of the first
 *    word of the page and p->len_hist the set of word lengths
 * s: start of the page data
 *
 * The length histogram has one bit per length modulo 32.  Reducing the
 * length keeps the shift defined for words of 31 characters and more;
 * lengths below 31 are recorded exactly as before.
 */
static void
extract_page(struct sdic_handle *h, struct sdic_page *p, char *s)
{
  int i, j, nr_chars = 0;
  xstr *x = &p->str;
  xstr y;
  p->len_hist = 0;

  /* count the characters of the first word */
  s++; /* the first byte is the number of characters to remove */
  for (i = 0; is_printable(s[i]); i++) {
    if (is_euc_fragment(s[i])) {
      if (!s[i+1]) {
        /* 2-byte character cut off by the NUL: do not step over the NUL */
        break;
      }
      i++;
    }
    nr_chars ++;
  }

  /* copy the first word */
  x->len = nr_chars;
  x->str = malloc(sizeof(xchar) * nr_chars);
  if (!x->str) {
    /* keep the key empty rather than writing through a NULL pointer */
    if (nr_chars > 0) {
      anthy_log(0, "Failed to allocate a page key.\n");
    }
    x->len = 0;
  }
  for (i = 0, j = 0; i < x->len; i++) {
    if (is_euc_fragment(s[j])) {
      unsigned char hi = s[j];
      unsigned char lo = s[j+1];
      x->str[i] = form_euc_char(lo, hi);
      j += 2;
    } else {
      x->str[i] = s[j];
      j ++;
    }
  }

  /* walk over every word of the page: mark its hash and its length */
  s--;
  y.str = alloca(sizeof(xchar)*strlen(s));
  y.len = 0;
  while (*s) {
    s += mkxstr(s, &y);
    mark_hash_ent(h, &y);
    p->len_hist = (int)((unsigned int)p->len_hist |
                        (1u << ((unsigned int)y.len & 31u)));
  }
}

/*
 * Count the pages of the dictionary.
 * Starting at element 1, the page index is scanned until an element
 * that is zero; the position of that element is the number of pages.
 * Element 0 is never tested, so the result is at least 1.
 */
static int
get_nr_page(struct sdic_handle *h)
{
  int count = 1;

  while (ntohl(h->page_index[count])) {
    count++;
  }
  return count;
}

static void
make_dic_index(struct sdic_handle *h)
{
  int i;
  h->nr_pages = get_nr_page(h);
  h->pages = malloc(sizeof(struct sdic_page)*h->nr_pages);
  h->len_hist = 0;
  for (i = 0; i < h->nr_pages; i++) {
    int p = ntohl(h->page_index[i]);
    extract_page(h, &h->pages[i], &h->page[p]);
    h->len_hist |= h->pages[i].len_hist;
  }
}

/**
 * mmap() the dictionary file fn and set the section pointers of h.
 *
 * Words 2 to 5 of the file header hold, in network byte order, the
 * offsets of the entry index, the entries, the page data and the page
 * index.
 *
 * Returns 0 on success.  On failure a message is logged, h is released
 * with free() and -1 is returned, so the caller must not use h again.
 * NOTE(review): h comes from anthy_smalloc() in anthy_create_sdic();
 * confirm that free() is the right way to release it.
 *
 * Compared with the earlier version the descriptor is also closed when
 * fstat() fails, and the file size, the result of mmap() and the header
 * offsets are checked before the mapping is used.
 */
static int
map_sdic(struct sdic_handle *h, const char *fn)
{
  enum { HEADER_WORDS = 6 };
  int fd, k;
  struct stat st;
  int *p;
  void *map;

  fd = open(fn, O_RDONLY);
  if (fd == -1) {
    anthy_log(0, "Failed to open (%s).\n", fn);
    free(h);
    return -1;
  }
  if (fstat(fd, &st) == -1) {
    anthy_log(0, "Failed to stat() (%s).\n", fn);
    close(fd);
    free(h);
    return -1;
  }

  /* the size must fit into file_size and cover the header read below */
  h->file_size = st.st_size;
  if ((off_t)h->file_size != st.st_size ||
      st.st_size < (off_t)(HEADER_WORDS * sizeof(int))) {
    anthy_log(0, "Invalid dictionary size (%s).\n", fn);
    close(fd);
    free(h);
    return -1;
  }

  map = mmap(NULL, h->file_size, PROT_READ, MAP_SHARED, fd, 0);
  /* the mapping, if any, stays valid after the descriptor is closed */
  close(fd);
  if (map == MAP_FAILED) {
    anthy_log(0, "Failed to mmap() (%s).\n", fn);
    free(h);
    return -1;
  }
  h->dic_file = map;

  p = (int *)h->dic_file;
  for (k = 2; k < HEADER_WORDS; k++) {
    /* an offset beyond the end of the file cannot be valid */
    if ((unsigned long)ntohl(p[k]) > (unsigned long)h->file_size) {
      anthy_log(0, "Corrupt dictionary header (%s).\n", fn);
      munmap(h->dic_file, h->file_size);
      free(h);
      return -1;
    }
  }
  h->entry_index = (int *)&h->dic_file[ntohl(p[2])];
  h->entry = (char *)&h->dic_file[ntohl(p[3])];
  h->page = (char *)&h->dic_file[ntohl(p[4])];
  h->page_index = (int *)&h->dic_file[ntohl(p[5])];

  return 0;
}

/**
 * Look up the index of the word x in the dictionary e.
 *
 * The page is selected with get_page_index(), rejected early if its
 * length histogram has no word of this length, and then scanned with
 * search_word_in_page().
 *
 * Returns the index of the word, usable with e->entry_index, or -1 if
 * the word is not in the dictionary.
 */
static int
search_word(struct sdic_handle *e, xstr *x)
{
  /* must agree with WORDS_PER_PAGE in mkdic.c */
  enum { SDIC_WORDS_PER_PAGE = 64 };
  int p, o;
  int page_number;
  unsigned int len_bit;

  p = get_page_index(e, x);
  if (p == -1) {
    return -1;
  }
  /*
   * The histogram holds one bit per length modulo 32.  Reducing the
   * length keeps the shift defined for strings of 31 characters and
   * more; shorter strings are tested exactly as before.
   */
  len_bit = 1u << ((unsigned int)x->len & 31u);
  if (!(len_bit & (unsigned int)e->pages[p].len_hist)) {
    return -1;
  }
  page_number = ntohl(e->page_index[p]);
  o = search_word_in_page(x, &e->page[page_number]);
  if (o == -1) {
    return -1;
  }
  return o + p * SDIC_WORDS_PER_PAGE;
}

/*
 * Lookup for the conjugation class CC_KV, which is handled as a special
 * case: the dictionary is searched for x with the character HK_KU
 * appended, and the words of the entry found are added to e with
 * cc fixed to CC_KV.
 */
static void
fill_seq_ent_by_xstr_kv(sdic_t sd, xstr *x, struct seq_ent *e,
                        xstr *tail, int ct, int cst)
{
  xchar ku_char[1] = {HK_KU};
  xstr ku;
  xstr *key;
  int word_idx;

  /* key = x followed by HK_KU */
  ku.str = ku_char;
  ku.len = 1;
  key = anthy_xstr_dup(x);
  key = anthy_xstrcat(key, &ku);

  word_idx = search_word(sd, key);
  if (word_idx >= 0) {
    int entry_offset = ntohl(sd->entry_index[word_idx]);

    e->node_type |= ST_WORD;
    fill_dic_ent(sd->entry, entry_offset, e, tail, CC_KV, ct, cst);
  }
  anthy_free_xstr(key);
}

/** Look a word up in the sdic.
 * Called from the dictionary cache.
 * Searches sd for the entry indexed by x and, for every word of that
 * entry whose type matches cc and ct, appends tail and adds the result
 * to e (through anthy_ddic_push_back_dic_ent()).  e->node_type gets
 * ST_WORD whenever x is found as an index.
 *
 * Nothing is added when x is not in the dictionary.  Two cheap filters,
 * the word-length histogram and the hash bitmap, reject most unknown
 * strings before the pages are searched.
 */
void
anthy_sdic_fill_seq_ent_by_xstr(sdic_t sd, xstr *x,
                                struct seq_ent *e, xstr *tail,
                                int cc, int ct, int cst)
{
  int i;
  unsigned int len_bit;

  if (cc == CC_KV) {
    /* this conjugation class is handled as a special case */
    fill_seq_ent_by_xstr_kv(sd, x, e, tail, ct, cst);
    return ;
  }

  /*
   * No word of this length in the dictionary?
   * The histogram holds one bit per length modulo 32.  Reducing the
   * length keeps the shift defined for inputs of 31 characters and
   * more; shorter inputs are tested exactly as before.
   */
  len_bit = 1u << ((unsigned int)x->len & 31u);
  if (!(len_bit & (unsigned int)sd->len_hist)) {
    return;
  }
  /* not marked in the hash bitmap means not in the dictionary */
  if (!check_hash_ent(sd, x)) {
    return ;
  }
  i = search_word(sd, x);
  if (i >= 0) {
    int entry_index = ntohl(sd->entry_index[i]);
    e->node_type |= ST_WORD;
    fill_dic_ent(sd->entry,
                 entry_index,
                 e, tail, cc, ct, cst);
  }
}

/*
 * Open the dictionary file fn and build the in-memory index for it.
 * Returns the new handle, or NULL if memory could not be allocated or
 * the file could not be mapped.  anthy_init_sdic() must have been
 * called before, because the handle comes from sdic_ator.
 */
struct sdic_handle *
anthy_create_sdic(const char *fn)
{
  struct sdic_handle *h;
  int *hash_ent;

  /* zero-filled bitmap; allocated first so a failure needs no cleanup */
  hash_ent = calloc(HASH_ARRAY, sizeof(*hash_ent));
  if (!hash_ent) {
    return NULL;
  }
  h = anthy_smalloc(sdic_ator);
  if (!h) {
    free(hash_ent);
    return NULL;
  }
  h->hash_ent = hash_ent;
  if (map_sdic(h, fn) == -1) {
    /*
     * map_sdic() releases h itself when it fails but knows nothing
     * about the bitmap, so free it through the local pointer.
     */
    free(hash_ent);
    return NULL;
  }
  make_dic_index(h);
  return h;
}

/*
 * Release a handle obtained from anthy_create_sdic() by returning it to
 * sdic_ator.  sdic_dtor() is the destructor registered for that
 * allocator; presumably anthy_sfree() runs it - confirm in alloc.h.
 */
void
anthy_release_sdic(sdic_t s)
{
  anthy_sfree(sdic_ator, s);
}

/*
 * Module initialization: create the allocator for struct sdic_handle
 * objects, with sdic_dtor() registered as their destructor.
 * Has to run before anthy_create_sdic(), which allocates from sdic_ator.
 */
void
anthy_init_sdic(void)
{
  sdic_ator = anthy_create_allocator(sizeof(struct sdic_handle),
				     sdic_dtor);
}
