/**************************************************************************
 *                                                                        *
 * libfsearch 1.1                                                         *
 * part of libindexer package                                             *
 * Copyright (C) 2012 Fabien Menemenlis (nihilist@dead-inside.org)        *
 *                                                                        *
 * This library is free software; you can redistribute it and/or          *
 * modify it under the terms of the GNU Lesser General Public             *
 * License as published by the Free Software Foundation; either           *
 * version 2.1 of the License, or (at your option) any later version.     *
 *                                                                        *
 * This library is distributed in the hope that it will be useful,        *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU      *
 * Lesser General Public License for more details.                        *
 *                                                                        *
 * You should have received a copy of the GNU Lesser General Public       *
 * License along with this library; if not, write to the Free Software    *
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston,                  *
 * MA  02111-1307  USA                                                    *
 *                                                                        *
 **************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <db.h>
#include <fcntl.h>
#include <limits.h>
#include <sys/mman.h>
#include <unistd.h>

#include "libfsearch.h"
#include "latintable.h"
#include "dhash.h"

#define FILENAMESZ 1024 /* capacity used when formatting index file names */
#define MAXWORD 20      /* number of s_word slots available while parsing one query */
/* one word database record: a 5-byte file offset followed by two ints */
#define RECSIZE (5 + sizeof(int) + sizeof(int))

/* bits stored in s_word.flag */
#define FLAG_QUOTE 1     /* word belongs to a quoted string */
#define FLAG_LASTQUOTE 2 /* word closes a quoted string */
#define FLAG_NOT 4       /* word was prefixed with '~' (excluded) */

/* bits stored in s_query.flag */
#define FLAG_QUERYQUOTE 1 /* a quoted string was opened in the query */
#define FLAG_HASNOT 2     /* the query contains at least one '~' word */

/*
 * Absolute value. The argument is parenthesized everywhere it is expanded so
 * that compound expressions such as ABS(x - y) negate the whole expression.
 * The argument is evaluated twice: do not pass expressions with side effects.
 */
#define ABS(a) ((a) < 0 ? -(a) : (a))


/*
 * One query word together with the posting data mapped for it from the
 * index file.
 */
typedef struct {
    /*
     * Word text. Quoted strings are searched as "first\tsecond" pairs, so the
     * buffer is sized like the pair buffers used elsewhere in this file:
     * two words, one tab and the terminating NUL.
     */
    char word[2 * WORDSZ + 1 + 1];
    void *initzone;  /* base address of the mapping; fsearch_seek() passes it to munmap() */
    void *zone;  /* start of real data in mapped memory */
    char *ptr;   /* pointer to current position in mapped memory */
    int size;    /* number of entries (bits, for a bitfield) read from the word database */
    int mapsize; /* size of mapped region */
    s_pos wpos;  /* last entry read: word position + doc id */
    int pos;     /* index of the next entry to read, 0 .. size */
    int flag;    /* FLAG_QUOTE | FLAG_LASTQUOTE | FLAG_NOT bits */
    int isbitfield; /* non-zero when the entries are stored as one bit per doc id */
    s_ff *ff;    /* "fast forward" structures, so we can move faster to docids when merging */
    int ffc;     /* number of "fast forward" structures */
    int ffi;     /* ff structure we're currently in, -1 before the first jump */
}
s_word;


/*
 * State of one search request: the raw query string and, once
 * fsearch_parsequery() has succeeded, the parsed words.
 */
typedef struct {
    char *query;   /* query string to parse (not copied) */
    int start;     /* not referenced by the code visible in this file */
    int count;     /* not referenced by the code visible in this file */
    int wcount;    /* number of entries in word[] */
    int flag;      /* FLAG_QUERYQUOTE | FLAG_HASNOT bits set while parsing */
    s_word *word;  /* heap array allocated by fsearch_parsequery() */
    s_fsearch *fs; /* index handle used to look the words up */
}
s_query;


/*
 * Open the index files that share the prefix basename and return a handle.
 *
 * Opens "<basename>.idx" as a file descriptor and "<basename>.off",
 * "<basename>.dboff" and "<basename>.dbdata" as stdio streams. globalcount
 * and fsoffcnt are derived from the sizes of the .off and .dboff files.
 *
 * Returns a malloc()ed s_fsearch, or NULL after reporting the failure with
 * perror(). On failure every file opened so far is closed again.
 */
s_fsearch *fsearch_init(char *basename) {
    s_fsearch *fsearch, fs;
    char filename[FILENAMESZ + 1];
    off_t end;


    memset(&fs, 0, sizeof(fs));
    fs.fidx = -1; /* so the cleanup path can tell whether the descriptor was opened */

    snprintf(filename, FILENAMESZ, "%s.idx", basename);
    /* open() reports failure with -1; 0 is a valid descriptor */
    if ((fs.fidx = open(filename, O_RDONLY)) == -1) {
        perror(filename);
        goto fail;
    }

    snprintf(filename, FILENAMESZ, "%s.off", basename);
    if (!(fs.foff = fopen(filename, "r"))) {
        perror(filename);
        goto fail;
    }
    if (fseeko(fs.foff, 0, SEEK_END) != 0 || (end = ftello(fs.foff)) < 0) {
        perror(filename);
        goto fail;
    }
    fs.globalcount = end / sizeof(off_t);
    rewind(fs.foff);

    snprintf(filename, FILENAMESZ, "%s.dboff", basename);
    if (!(fs.foffdb = fopen(filename, "r"))) {
        perror(filename);
        goto fail;
    }
    if (fseeko(fs.foffdb, 0, SEEK_END) != 0 || (end = ftello(fs.foffdb)) < 0) {
        perror(filename);
        goto fail;
    }
    fs.fsoffcnt = end / sizeof(unsigned int);
    rewind(fs.foffdb);

    snprintf(filename, FILENAMESZ, "%s.dbdata", basename);
    if (!(fs.fdatadb = fopen(filename, "r"))) {
        perror(filename);
        goto fail;
    }

    fsearch = malloc(sizeof(*fsearch));
    if (fsearch == NULL) {
        perror("malloc");
        goto fail;
    }
    memcpy(fsearch, &fs, sizeof(*fsearch));
    return(fsearch);

fail:
    if (fs.fdatadb)
        fclose(fs.fdatadb);
    if (fs.foffdb)
        fclose(fs.foffdb);
    if (fs.foff)
        fclose(fs.foff);
    if (fs.fidx != -1)
        close(fs.fidx);
    return(NULL);
}


/*
 * Look key up in the on-disk word database.
 *
 * The .dboff file is addressed by dhash_hash2() of the key; each slot is
 * 6 bytes: a one-byte entry count followed by a 5-byte offset into the
 * .dbdata file. At that offset the entries follow each other as
 * [unsigned short key length][key bytes][RECSIZE bytes of data].
 * NOTE(review): the 5-byte offset is copied into the low-address bytes of an
 * off_t, which presumably assumes a little-endian host - confirm.
 *
 * keysize  length of key in bytes (key must also be NUL-terminated)
 * data     receives RECSIZE bytes when the key is found
 *
 * Returns 1 when the key was found and data was filled, 0 when the key is
 * absent or the files could not be read.
 */
int fsearch_db_get(s_fsearch *fs, unsigned short keysize, char *key, void *data) {
    char readkey[WORDSZ * 2 + 1 + 1];
    unsigned short keysz;
    unsigned char count = 0;
    off_t off;


    if (fseeko(fs->foffdb, (off_t)( ((off_t)(dhash_hash2(key, key + keysize))) * ((off_t)6)), SEEK_SET) != 0)
        return(0);
    if (fread(&count, sizeof(unsigned char), 1, fs->foffdb) != 1 || !count)
        return(0);

    off = 0;
    if (fread(&off, 5, 1, fs->foffdb) != 1)
        return(0);
    if (fseeko(fs->fdatadb, off, SEEK_SET) != 0)
        return(0);

    for (;;) {
        if (fread(&keysz, sizeof(unsigned short), 1, fs->fdatadb) != 1)
            return(0);
        if (keysz < sizeof(readkey)) {
            if (fread(readkey, 1, keysz, fs->fdatadb) != keysz)
                return(0);
            readkey[keysz] = '\0';
            if (!strcmp(readkey, key)) {
                return(fread(data, 1, RECSIZE, fs->fdatadb) == RECSIZE);
            }
        } else {
            /* a stored key that does not fit the buffer is skipped instead of overflowing it */
            if (fseeko(fs->fdatadb, (off_t)keysz, SEEK_CUR) != 0)
                return(0);
        }
        if (!--count)
            return(0);
        /* step over this entry's data to reach the next key */
        if (fseeko(fs->fdatadb, (off_t)RECSIZE, SEEK_CUR) != 0)
            return(0);
    }
}


/*
 * Find word->word in the word database and map its posting data.
 *
 * On success the following fields of word are filled: size, mapsize, ffc,
 * ff/ffi (ff is NULL when ffc is 0), zone, initzone and isbitfield.
 *
 * Returns 0 on success, -1 when the word is not in the database or its data
 * could not be loaded; in the latter case nothing stays allocated or mapped.
 */
int fsearch_lookupword(s_fsearch *fs, s_word *word) {
    off_t off;
    char data[RECSIZE];
    size_t ffbytes;

    /* callers free()/munmap() these unconditionally, so never leave them undefined */
    word->ff = NULL;
    word->zone = NULL;
    word->initzone = NULL;

    if (!fsearch_db_get(fs, strlen(word->word), word->word, data)) {
        return(-1);
    }
    /* record layout: 5-byte offset into the .idx file, entry count, mapped size */
    off = 0;
    memcpy(&off, (char *)data, 5);
    memcpy(&word->size, (char *)data + 5, sizeof(int));
    memcpy(&word->mapsize, (char *)data + 5 + sizeof(int), sizeof(int));

    /*
     * The fast-forward table sits in front of the posting data.
     * NOTE(review): ffc is derived from size before FLAG_BITFIELD is cleared
     * below - confirm this matches what the indexer writes.
     */
    word->ffc = word->size / FFGAP;
    if (word->ffc) {
        ffbytes = sizeof(s_ff) * word->ffc;
        word->ff = malloc(ffbytes);
        if (word->ff == NULL)
            return(-1);
        if (lseek(fs->fidx, off, SEEK_SET) == (off_t)-1
            || read(fs->fidx, (void *)word->ff, ffbytes) != (ssize_t)ffbytes) {
            free(word->ff);
            word->ff = NULL;
            return(-1);
        }
        word->ffi = -1;
    }

    /*
     * The file offset used here is not rounded to a page boundary, which only
     * some systems accept (the original note says FreeBSD). A portable version
     * has to map from the enclosing page boundary and keep that base address
     * for munmap(); until then a refused mapping is reported as a failure.
     */
    word->zone = mmap(NULL, word->mapsize, PROT_READ, MAP_FILE | MAP_PRIVATE, fs->fidx, off + word->ffc * sizeof(s_ff));
    if (word->zone == MAP_FAILED) {
        perror("mmap");
        word->zone = NULL;
        free(word->ff);
        word->ff = NULL;
        return(-1);
    }
    word->initzone = word->zone; /* mapping base and data start coincide here */
    madvise(word->zone, word->mapsize, MADV_SEQUENTIAL);

    if (word->size & FLAG_BITFIELD) {
        word->size = word->size ^ FLAG_BITFIELD;
        word->isbitfield = 1;
        word->wpos.wordpos = 0;
    } else {
        word->isbitfield = 0;
    }

    return(0);
}


/*
 * Split query->query into words, look each of them up and store the result
 * in query->word / query->wcount.
 *
 * Syntax handled here: words are separated by spaces; a token starting with
 * '"' opens a quoted string and a token ending with '"' closes it; a token of
 * the form "word" is an ordinary word; a leading '~' marks an excluded word.
 * Tokens that carry no word text (a lone '"' or '~', or "") are ignored.
 * Inside a quoted string each pair of neighbouring words is replaced by the
 * single key "first\tsecond".
 *
 * Returns 0 on success (query->word is then a malloc()ed array owned by the
 * caller). Returns the 1-based index of the first word that is not in the
 * index, or -1 when the query is empty, has more than MAXWORD words, contains
 * a word that does not fit the word buffer, or memory runs out. Nothing stays
 * mapped or allocated on failure.
 */
int fsearch_parsequery(s_query *query) {
    const char *ptr;
    const char *text;
    size_t len, textlen;
    int i, j, n;
    int wcount;
    int quote;
    int flag;
    int res;
    s_word word[MAXWORD];
    char bufword[2 * WORDSZ + 1 + 1];


    /* zeroed so that ff/zone are well defined for words never looked up */
    memset(word, 0, sizeof(word));
    wcount = 0;
    quote = 0;
    ptr = query->query;
    while (*ptr != '\0') {
        if (*ptr == ' ') {
            ptr++;
            continue;
        }
        len = 0;
        while (ptr[len] != '\0' && ptr[len] != ' ') {
            len++;
        }
        text = ptr;
        textlen = len;
        ptr += len; /* always continue right after the token */

        flag = 0;
        if (len >= 2 && text[0] == '"' && text[len - 1] == '"') {
            /* "word": a one-word string is just that word */
            text++;
            textlen -= 2;
        } else if (len == 1 && text[0] == '"') {
            continue;
        } else {
            if (text[0] == '"') { /* string start */
                text++;
                textlen--;
                quote = 1;
                query->flag |= FLAG_QUERYQUOTE;
            }
            if (quote)
                flag |= FLAG_QUOTE;
            if (text[textlen - 1] == '"') { /* end of string */
                flag |= FLAG_LASTQUOTE;
                quote = 0;
                textlen--;
            } else if (text[0] == '~') {
                text++;
                textlen--;
                flag |= FLAG_NOT;
            }
        }

        if (textlen == 0)
            continue;
        if (wcount == MAXWORD || textlen >= sizeof(word[0].word))
            return(-1);
        memcpy(word[wcount].word, text, textlen);
        word[wcount].word[textlen] = '\0';
        word[wcount].flag = flag;
        if (flag & FLAG_NOT)
            query->flag |= FLAG_HASNOT;
        wcount++;
    }

    /* "compress" words when looking for a string between quotes */
    for (i = 0, j = 0; i < wcount; i++) {
        if (word[i].flag & FLAG_QUOTE && i != wcount - 1 && !(word[i].flag & FLAG_LASTQUOTE)) {
            n = snprintf(bufword, sizeof(bufword), "%s\t%s", word[i].word, word[i + 1].word);
            if (n < 0 || (size_t)n >= sizeof(bufword) || (size_t)n >= sizeof(word[j].word))
                return(-1);
            memcpy(word[j].word, bufword, (size_t)n + 1);
            word[j].flag = word[i + 1].flag;
        } else {
            /* the closing word is already part of the previous pair */
            if (word[i].flag & FLAG_LASTQUOTE)
                continue;
            if (i != j) {
                strcpy(word[j].word, word[i].word);
                word[j].flag = word[i].flag;
            }
        }
        j++;
    }
    wcount = j;
    if (wcount == 0)
        return(-1);

    for (i = 0; i < wcount; i++) {
        if (fsearch_lookupword(query->fs, &word[i]) == -1) {
            res = i + 1;
            goto unmap;
        }
    }

    query->word = malloc(wcount * sizeof(s_word));
    if (query->word == NULL) {
        res = -1;
        goto unmap;
    }
    memcpy(query->word, word, wcount * sizeof(s_word));
    query->wcount = wcount;
    return(0);

unmap:
    /* release the i words that were looked up successfully */
    for (j = 0; j < i; j++) {
        munmap(word[j].zone, word[j].mapsize);
        if (word[j].ffc)
            free(word[j].ff);
    }
    return(res);
}


/*
 * Load the next entry of word's posting data into word->wpos.
 *
 * For a bitfield, bit n of the mapped data set means doc id n is present;
 * only wpos.docid is updated. Otherwise the data is an array of s_pos
 * records read one after the other.
 *
 * Returns 0 when an entry was loaded, 1 when the data is exhausted (wpos is
 * then left unchanged).
 */
int fsearch_getnextentry(s_word *word) {
    if (word->isbitfield) {
        while (word->pos < word->size) {
            if (((unsigned char)word->ptr[word->pos >> 3] >> (word->pos & 7)) & 1) {
                word->wpos.docid = word->pos;
                word->pos++;
                return(0);
            }
            word->pos++;
        }
        return(1);
    }
    if (word->pos == word->size)
        return(1);
    /* the mapped records are not guaranteed to be aligned for s_pos: copy, don't cast */
    memcpy(&word->wpos, word->ptr, sizeof(s_pos));
    word->ptr += sizeof(s_pos);
    word->pos++;
    return(0);
}

    
/*
 * Advance word until its current doc id is no longer below wpos->docid.
 *
 * When fast-forward structures are available, the read cursor first jumps to
 * the furthest one whose doc id is still below the target, then entries are
 * read one by one.
 *
 * Returns 0 once word->wpos.docid >= wpos->docid, 1 when the entries ran out
 * before that.
 */
int fsearch_adjusturlid(s_word *word, s_pos *wpos) {
    if (word->ffc != 0) {
        int idx = word->ffi;

        /* furthest fast-forward entry that does not overshoot the target */
        for (; idx + 1 < word->ffc; idx++) {
            if (!(word->ff[idx + 1].docid < wpos->docid))
                break;
        }
        if (idx != word->ffi && wpos->docid > word->ff[idx].docid) {
            word->ptr = (char *)(word->zone) + word->ff[idx].offset;
            word->pos = word->ff[idx].pos;
            word->ffi = idx;
        }
    }

    for (;;) {
        if (!(word->wpos.docid < wpos->docid))
            return(0);
        if (fsearch_getnextentry(word))
            return(1);
    }
}


/*
 * Check, for the document all query words currently point at, that the words
 * of every quoted string occur at consecutive word positions.
 *
 * A string runs from a word flagged FLAG_QUOTE up to the word flagged
 * FLAG_LASTQUOTE (or the last query word when the quote was never closed).
 * For each neighbouring pair, the word that lags behind is advanced until the
 * positions differ by exactly one or one of the two leaves the document.
 * Entries consumed here stay consumed for the caller.
 *
 * Returns 1 when every string matches, 0 otherwise.
 */
int fsearch_checkquote(s_query *query) {
    int start, end;
    int i;
    int diff;
    int docid;
    s_word *cur, *next;


    for (start = 0; start < query->wcount - 1; start++) {
        if (!(query->word[start].flag & FLAG_QUOTE))
            continue;

        /* the document under test is the one the string's first word points at */
        docid = query->word[start].wpos.docid;

        /* end of string; capped at the last word so that word[i + 1] below stays valid */
        for (end = start; end < query->wcount - 1 && !(query->word[end].flag & FLAG_LASTQUOTE); end++);

        for (i = start; i < end; i++) {
            cur = &query->word[i];
            next = &query->word[i + 1];
            diff = next->wpos.wordpos - cur->wpos.wordpos;
            while (diff != 1 && cur->wpos.docid == docid && next->wpos.docid == docid) {
                /* words are not following each other, read next entry */
                if (cur->wpos.wordpos < next->wpos.wordpos) {
                    if (fsearch_getnextentry(cur))
                        return(0);
                } else {
                    if (fsearch_getnextentry(next))
                        return(0);
                }
                diff = next->wpos.wordpos - cur->wpos.wordpos;
            }
            /* adjacent positions only count when both entries are still in this document */
            if (diff != 1 || cur->wpos.docid != docid || next->wpos.docid != docid)
                return(0);
        }

        /* the loop increment moves on to the word after this string */
        start = end;
    }
    return(1);
}


/*
 * Ordering callback used with qsort(): orders s_pos records by ascending
 * wordpos. Returns the difference of the two wordpos values.
 */
int fsearch_comparepos(s_pos *a, s_pos *b) {
    int delta = a->wordpos - b->wordpos;

    return(delta);
}


/*
 * Intersect the posting data of all query words (query without '~' words).
 *
 * query        parsed query; every word must have been looked up
 * docids       result array. With maxid == 0, *docids must be NULL (or a heap
 *              block) on entry and is grown with realloc() for every match.
 *              With maxid != 0 the caller provides room for maxid entries.
 * maxid        maximum number of results to store, 0 for no limit
 * globalcount  total document count, used to extrapolate the result count
 *
 * Each stored entry holds the doc id and, in wordpos, a value computed from
 * the positions of the two words selected as most pertinent; the results are
 * sorted on that value before returning.
 *
 * Returns the number of entries stored. When more than maxid documents
 * match, returns instead the estimate res * globalcount / docid, with docid
 * being the first document that did not fit. If growing the array fails, the
 * matches collected so far are returned.
 */
unsigned int fsearch_fastmerge(s_query *query, s_pos **docids, int maxid, int globalcount) {
    int i;
    int hi;
    int docid;
    int pos;
    int end;
    int pertid, spertid; /* word id with "higher" pertinence (lowest total count) and second higher pertinence */
    unsigned int res;
    s_pos *grown;


    end = 0;
    res = 0;
    if (query->wcount < 1)
        return(0);

    for (i = 0; i < query->wcount; i++) {
        query->word[i].pos = 0;
        query->word[i].ptr = (char *)query->word[i].zone;
        /* read first value */
        fsearch_getnextentry(&query->word[i]);
    }

    spertid = 1;
    pertid = 0;
    for (i = 1; i < query->wcount; i++) {
        if (query->word[i].size < query->word[pertid].size) {
            spertid = pertid;
            pertid = i;
        }
    }

    while (!end) {
        /* find highest urlid and adjust the others till they match this id */
        hi = 0;
        for (i = 1; i < query->wcount; i++) {
            if (query->word[i].wpos.docid > query->word[hi].wpos.docid)
                hi = i;
        }

        docid = query->word[hi].wpos.docid;
        for (i = 0; i < query->wcount; i++) {
            if (i == hi)
                continue;

            if (query->word[i].wpos.docid < query->word[hi].wpos.docid) {
                if (fsearch_adjusturlid(&query->word[i], &query->word[hi].wpos)) {
                    goto endreached;
                }
            }
        }

        /* check if all urlids match */
        for (i = 1; i < query->wcount; i++) {
            if (query->word[i].wpos.docid != query->word[0].wpos.docid) {
                /* no match */
                goto nextread;
            }
        }

        /* when looking for a string check if all word positions follow */
        if (query->flag & FLAG_QUERYQUOTE) {
            if (!fsearch_checkquote(query))
                goto nextread;
        }

        if (query->wcount > 1) {
            pos = query->word[pertid].wpos.wordpos + abs(query->word[pertid].wpos.wordpos - query->word[spertid].wpos.wordpos);
        } else {
            pos = query->word[pertid].wpos.wordpos;
        }

        /* they do */
        if (maxid == 0) {
            /* keep the old block reachable if realloc() fails */
            grown = realloc(*docids, sizeof(s_pos) * (res + 1)); /* TODO optimize allocation */
            if (grown == NULL)
                goto endreached;
            *docids = grown;
            (*docids)[res].docid = docid;
            (*docids)[res].wordpos = pos;
        } else if (res < (unsigned int)maxid) {
            (*docids)[res].docid = docid;
            (*docids)[res].wordpos = pos;
        } else {
            qsort(*docids, res, sizeof(s_pos), (int(*)(const void *, const void *))fsearch_comparepos);
            /* the product is computed in 64 bits: res * globalcount easily exceeds 32 bits */
            return((unsigned int)((unsigned long long)res * (unsigned long long)globalcount / (unsigned long long)docid));
        }
        res++;

nextread:
        /* values for this id have been read, move to next id */
        for (i = 0; !end && i < query->wcount; i++) {
            while (query->word[i].wpos.docid == docid) {
                if (query->word[i].pos == query->word[i].size) {
                    goto endreached;
                }
                end = fsearch_getnextentry(&query->word[i]);
            }
        }
    }

endreached:
    if (res > 1)
        qsort(*docids, res, sizeof(s_pos), (int(*)(const void *, const void *))fsearch_comparepos);
    return(res);
}


/*
 * Intersect the posting data of the included query words while rejecting
 * every document that contains a word flagged FLAG_NOT.
 *
 * query        parsed query; every word must have been looked up
 * docids       result array. With maxid == 0, *docids must be NULL (or a heap
 *              block) on entry and is grown with realloc() for every match.
 *              With maxid != 0 the caller provides room for maxid entries.
 * maxid        maximum number of results to store, 0 for no limit
 * globalcount  total document count, used to extrapolate the result count
 *
 * Returns the number of entries stored (0 when every word is excluded). When
 * more than maxid documents match, returns instead the estimate
 * res * globalcount / docid, with docid being the first document that did not
 * fit. If growing the array fails, the matches collected so far are returned.
 */
unsigned int fsearch_merge(s_query *query, s_pos **docids, int maxid, int globalcount) {
    int i;
    int hi;
    int docid;
    int pos;
    int end;
    int start;
    int pertid, spertid; /* word id with "higher" pertinence (lowest total count) and second higher pertinence */
    unsigned int res;
    s_pos *grown;


    end = 0;
    res = 0;
    for (i = 0; i < query->wcount; i++) {
        query->word[i].pos = 0;
        query->word[i].ptr = (char *)query->word[i].zone;
        /* read first value */
        fsearch_getnextentry(&query->word[i]);
    }

    /* look for first word to be included in query */
    for (start = 0; start < query->wcount && query->word[start].flag & FLAG_NOT; start++);

    if (start == query->wcount)
        return(0);

    /*
     * The second most pertinent word defaults to the next included word after
     * start; it stays -1 when start is the only included word, so that
     * neither an excluded word nor an index past the array is ever used.
     */
    pertid = start;
    spertid = -1;
    for (i = start + 1; i < query->wcount; i++) {
        if (!(query->word[i].flag & FLAG_NOT)) {
            spertid = i;
            break;
        }
    }
    for (i = start + 1; i < query->wcount; i++) {
        if (!(query->word[i].flag & FLAG_NOT) && query->word[i].size < query->word[pertid].size) {
            spertid = pertid;
            pertid = i;
        }
    }

    while (!end) {
        /* find highest urlid and adjust the others till they match this id */
        hi = start;
        for (i = start + 1; i < query->wcount; i++) {
            if (!(query->word[i].flag & FLAG_NOT) && query->word[i].wpos.docid > query->word[hi].wpos.docid)
                hi = i;
        }

        docid = query->word[hi].wpos.docid;

        for (i = 0; i < query->wcount; i++) {
            if (i == hi)
                continue;

            if (query->word[i].wpos.docid < query->word[hi].wpos.docid) {
                /* an exhausted excluded word does not end the search */
                if (fsearch_adjusturlid(&query->word[i], &query->word[hi].wpos) && !(query->word[i].flag & FLAG_NOT)) {
                    goto endreached;
                }
            }
        }

        /* check if the included docids match */
        for (i = start + 1; i < query->wcount; i++) {
            if (!(query->word[i].flag & FLAG_NOT) && query->word[i].wpos.docid != query->word[start].wpos.docid) {
                /* no match */
                goto nextread;
            }
        }

        /* and check if no excluded docids match */
        for (i = 0; i < query->wcount; i++) {
            if (query->word[i].flag & FLAG_NOT && query->word[i].wpos.docid == query->word[start].wpos.docid) {
                goto nextread;
            }
        }

        /* when looking for a string check if all word positions follow */
        if (query->flag & FLAG_QUERYQUOTE)
            if (!fsearch_checkquote(query))
                goto nextread;

        if (spertid >= 0) {
            pos = query->word[pertid].wpos.wordpos + abs(query->word[pertid].wpos.wordpos - query->word[spertid].wpos.wordpos);
        } else {
            pos = query->word[pertid].wpos.wordpos;
        }

        /* they do */
        if (maxid == 0) {
            /* keep the old block reachable if realloc() fails */
            grown = realloc(*docids, sizeof(s_pos) * (res + 1)); /* TODO optimize allocation */
            if (grown == NULL)
                goto endreached;
            *docids = grown;
            (*docids)[res].docid = docid;
            (*docids)[res].wordpos = pos;
        } else if (res < (unsigned int)maxid) {
            (*docids)[res].docid = docid;
            (*docids)[res].wordpos = pos;
        } else {
            qsort(*docids, res, sizeof(s_pos), (int(*)(const void *, const void *))fsearch_comparepos);
            /* the product is computed in 64 bits: res * globalcount easily exceeds 32 bits */
            return((unsigned int)((unsigned long long)res * (unsigned long long)globalcount / (unsigned long long)docid));
        }
        res++;

nextread:
        /* values for this id have been read, move to next id */
        for (i = 0; !end && i < query->wcount; i++) {
            while (!(query->word[i].flag & FLAG_NOT) && query->word[i].wpos.docid == docid) {
                if (query->word[i].pos == query->word[i].size) {
                    goto endreached;
                }
                end = fsearch_getnextentry(&query->word[i]);
            }
        }
    }

endreached:
    if (res > 1)
        qsort(*docids, res, sizeof(s_pos), (int(*)(const void *, const void *))fsearch_comparepos);
    return(res);
}


/*
 * Run a query against the index.
 *
 * fs      handle returned by fsearch_init()
 * query   query string, parsed by fsearch_parsequery()
 * docids  result array, see fsearch_merge() / fsearch_fastmerge()
 * maxid   maximum number of results to store, 0 for no limit
 *
 * Returns the value of the merge function (a result count or an estimate),
 * or (unsigned int)-1 when the query could not be parsed or one of its words
 * is not in the index.
 */
unsigned int fsearch_seek(s_fsearch *fs, char *query, s_pos **docids, int maxid) {
    unsigned int res;
    int i;
    s_query q;


    memset(&q, 0, sizeof(q));
    q.query = query;
    q.fs = fs;
    if (fsearch_parsequery(&q)) {
        return((unsigned int)-1);
    }
    if (q.flag & FLAG_HASNOT)
        res = fsearch_merge(&q, docids, maxid, fs->globalcount);
    else
        res = fsearch_fastmerge(&q, docids, maxid, fs->globalcount);
    for (i = 0; i < q.wcount; i++) {
        /* zone is the address mmap() returned in fsearch_lookupword() */
        munmap(q.word[i].zone, q.word[i].mapsize);
        /* ff is only allocated when there are fast-forward structures */
        if (q.word[i].ffc)
            free(q.word[i].ff);
    }
    free(q.word);
    return(res);
}


/*
 * Read entry number id from the .off file, which is an array of off_t values.
 *
 * Returns the stored offset, or (off_t)-1 when id is negative or the entry
 * cannot be read (for instance when id is past the end of the file).
 */
off_t fsearch_getoffset(s_fsearch *fs, int id) {
    off_t off;


    if (id < 0)
        return((off_t)-1);
    /* multiply in off_t so that large ids cannot wrap a narrower type */
    if (fseeko(fs->foff, (off_t)id * (off_t)sizeof(off_t), SEEK_SET) != 0)
        return((off_t)-1);
    if (fread(&off, sizeof(off_t), 1, fs->foff) != 1)
        return((off_t)-1);
    return(off);
}


/*
 * Return a normalized copy of orig built with the latinidx table.
 *
 * Every character is replaced by latinidx[character]; characters that map to
 * 32 (space) act as separators. Leading separators are dropped and each later
 * run of separators becomes a single space (a trailing run therefore leaves
 * one trailing space).
 *
 * Returns a malloc()ed string the caller must free(), or NULL when memory
 * cannot be allocated.
 */
char *fsearch_clearstring(char *orig) {
    char *out, *shrunk;
    char *src, *dst;


    out = malloc(strlen(orig) + 1);
    if (out == NULL)
        return(NULL);
    dst = out;
    src = orig;
    while (*src != '\0') {
        /* stop at the terminator whatever the table holds for it */
        while (*src != '\0' && latinidx[(unsigned char)*src] == 32)
            src++;
        if (dst != out) {
            *dst = ' ';
            dst++;
        }
        while (*src != '\0' && latinidx[(unsigned char)*src] != 32) {
            *dst = latinidx[(unsigned char)*src];
            src++;
            dst++;
        }
    }
    *dst = '\0';
    /* give back the unused tail; the full-size buffer is still valid if that fails */
    shrunk = realloc(out, strlen(out) + 1);
    return(shrunk != NULL ? shrunk : out);
}


/*
 * Return a normalized copy of orig built with the latinidx_lnk table.
 *
 * Every character is replaced by latinidx_lnk[character]; characters that map
 * to 32 (space) act as separators. Leading separators are dropped and each
 * later run of separators becomes a single space (a trailing run therefore
 * leaves one trailing space).
 *
 * Returns a malloc()ed string the caller must free(), or NULL when memory
 * cannot be allocated.
 */
char *fsearch_clearstringlnk(char *orig) {
    char *out, *shrunk;
    char *src, *dst;


    out = malloc(strlen(orig) + 1);
    if (out == NULL)
        return(NULL);
    dst = out;
    src = orig;
    while (*src != '\0') {
        /* stop at the terminator whatever the table holds for it */
        while (*src != '\0' && latinidx_lnk[(unsigned char)*src] == 32)
            src++;
        if (dst != out) {
            *dst = ' ';
            dst++;
        }
        while (*src != '\0' && latinidx_lnk[(unsigned char)*src] != 32) {
            *dst = latinidx_lnk[(unsigned char)*src];
            src++;
            dst++;
        }
    }
    *dst = '\0';
    /* give back the unused tail; the full-size buffer is still valid if that fails */
    shrunk = realloc(out, strlen(out) + 1);
    return(shrunk != NULL ? shrunk : out);
}


/*
 * Close the four files held by fs. A NULL handle is accepted and ignored.
 *
 * The s_fsearch structure itself is not released here.
 * NOTE(review): fsearch_init() allocates it with malloc(); confirm whether
 * callers are expected to free() it after this call.
 */
void fsearch_close(s_fsearch *fs) {
    if (fs == NULL)
        return;
    fclose(fs->foffdb);
    fclose(fs->fdatadb);
    close(fs->fidx);
    fclose(fs->foff);
}
