#include <sys/types.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#define MAXLINE 1024
typedef struct FieldNode
{
int field;
struct FieldNode* right;
}field_node;
typedef struct AttributeNode
{
int key;
struct FieldNode* right;
struct AttributeNode* down;
}attr;
/* buffer for first file */
static char buffer1[MAXLINE];
static int occupied1 = 0;
/* buffer for second file */
static char buffer2[MAXLINE];
static int occupied2 = 0;
static int read_line(char *s, int lim, FILE *fp);
static field_node* create_field_node(int field);
static attr* create_attribute_node(int key);
static attr* create_head_node();
static int read_key(char *s);
static attr* read_group(attr *head, FILE *fp);
static void free_group_memory(attr **headp);
static void join_lines(char *s, attr *group, FILE *fp);
static FILE* open_file(char *filename);
static int getline(char *s, int lim, FILE *fp);
static char* getword(char *s, const char *sep);
/* join to files according to matching keys assuming that both files are sorted */
int natural_join(char *inputfile1, char *inputfile2, char *output)
{
FILE *fp1, *fp2, *fp_out;
attr *head;
char line[MAXLINE];
/* open files */
if((fp1 = open_file(inputfile1)) == NULL)
return 0;
if((fp2 = open_file(inputfile2)) == NULL)
return 0;
if((fp_out = fopen(output, "w")) == NULL)
return 0;
head = create_head_node();
/* join files*/
for(;;)
{
int key;
char *bufp;
/* read a group of lines with equal keys from second file */
head->down = read_group(head, fp2);
if(head->down == NULL) /* EOF for second file */
break;
/* read line from first file until an equal key or greater key than the group is found */
for(;;)
{
/* read a line from first file */
if(occupied1)
{
bufp = &buffer1[0];
occupied1 = 0;
}
else
{
if(read_line(line, sizeof(line), fp1) == EOF) /* end of first file, join completed */
break;
bufp = &line[0];
}
/* read key */
key = read_key(bufp);
if(key > head->down->key)
{
/* free memory for the group of records with equal keys and
save the line from first file file in static buffer */
free_group_memory(&head);
strcpy(buffer1, bufp);
occupied1 = 1;
break;
}
if(key == head->down->key)
join_lines(bufp, head->down, fp_out);
}
}
/* free memory */
free_group_memory(&head);
/* free head node */
free(head);
/* close open files */
fclose(fp1);
fclose(fp2);
fclose(fp_out);
return 1;
}
static void join_lines(char *s, attr *group, FILE *fp)
{
char copy[MAXLINE];
char *sep = "\t ";
char *p;
while(group != NULL)
{
field_node *f;
strcpy(copy, s);
for(p = getword(copy, sep); p; p = getword(NULL, sep))
fprintf(fp, "%s ", p);
for(f = group->right; f != NULL; f = f->right)
fprintf(fp, "%d ", f->field);
fprintf(fp, "\n");
group = group->down;
}
}
static attr* read_group(attr *head, FILE *fp)
{
char line[MAXLINE];
attr *group;
line[0] = '\0';
group = head;
for(;;)
{
char *bufp, *sep, *p;
int new_key;
field_node *fieldp;
sep = "\t ";
/* read line */
if(occupied2) /* buffer is occupied, read line from buffer */
{
bufp = &buffer2[0];
occupied2 = 0;
}
else /* read line from file */
{
if(read_line(line, sizeof(line), fp) == EOF)
break;
bufp = &line[0];
}
/* read key */
new_key = read_key(bufp);
/* compare new key with previous key */
if(new_key > group->key) /* this condition will always be false for the first line as initially key is INT_MAX */
{
strcpy(buffer2, bufp);
occupied2 = 1;
break;
}
/* read key into memory */
group->down = create_attribute_node(atoi(getword(bufp, sep)));
if(group->down == NULL)
return NULL;
/* read fields into memory */
if((p = getword(NULL, sep)) != NULL)
{
fieldp = create_field_node(atoi(p));
if(fieldp == NULL)
return NULL;
group->down->right = fieldp;
}
while((p = getword(NULL, sep)) != NULL)
{
fieldp->right = create_field_node(atoi(p));
if(fieldp->right == NULL)
return NULL;
fieldp = fieldp->right;
}
/* move down one level */
group = group->down;
}
return head->down;
}
static void free_group_memory(attr **headp)
{
attr *d;
if((*headp)->down != NULL)
{
field_node *r;
for(d = (*headp)->down; d != NULL;)
{
attr *dd;
for(r = d->right; r != NULL;)
{
field_node *rr;
rr = r->right;
free(r);
r = rr;
}
dd = d->down;
free(d);
d = dd;
}
}
(*headp)->down = NULL;
}
static int read_key(char *s)
{
int i, j;
char key[MAXLINE];
/* by pass space and tab */
for(i = 0; s[i] == '\t' || s[i] == ' '; i++);
/* copy key until space or tab is encountered */
for(j = 0; s[i] != '\t' && s[i] != ' '; i++, j++)
key[j] = s[i];
key[j] = '\0';
return atoi(key);
}
static int read_line(char *s, int lim, FILE *fp)
{
int len;
while(1)
{
len = getline(s, lim, fp);
if(len == 0)
return EOF;
/* skip empty lines */
if(s[0] != '\n')
break;
}
/* remove newline from line */
s[len-1] = '\0';
return (len-1);
}
static attr* create_attribute_node(int key)
{
attr *p;
if((p = (attr*)malloc(sizeof(*p))) == NULL)
return NULL;
p->key = key;
p->down = NULL;
p->right = NULL;
return p;
}
static field_node* create_field_node(int field)
{
field_node *p;
if((p = (field_node*)malloc(sizeof(*p))) == NULL)
return NULL;
p->field = field;
p->right = NULL;
return p;
}
static attr* create_head_node()
{
attr *p;
if((p = (attr*)malloc(sizeof(*p))) == NULL)
return NULL;
p->key = INT_MAX;
p->down = NULL;
p->right = NULL;
return p;
}
FILE* open_file(char *filename)
{
struct _stat buffer;
int status;
FILE *fp;
/* check file status */
status = _stat(filename, &buffer);
/* file doesn't exist */
if(status != 0)
return NULL;
/* path specifies an ordinary file and the file has read permission */
if((buffer.st_mode & _S_IFREG) && (buffer.st_mode & _S_IREAD))
{
/* file is empty */
if(buffer.st_size == 0)
return NULL;
/* open file in read mode */
fp = fopen(filename, "r");
if(fp == NULL)
return NULL;
return fp;
}
return NULL;
}
int getline(char *s, int lim, FILE *fp)
{
int i;
char c;
for(i = 0; i < (lim-1) && (c = getc(fp)) != EOF && c != '\n'; i++)
s[i] = c;
if(c == '\n')
s[i++] = c;
s[i] = '\0';
return i;
}
char* getword(char *s, const char *sep)
{
char *p, *tokenptr;
/* static pointer pointing to the current position in s */
static char *currentp = NULL;
/* if s is not NULL, point currentp to starting of NULL */
if(s != NULL)
currentp = s;
/* when the function is called for the first time and s is NULL, return NULL */
if(s == NULL && currentp == NULL)
return NULL;
/* bypass the initial seperator characters */
for(p = currentp; *p != '\0'; p++)
{
const char *sep_p;
/* check if the character is seperator */
for(sep_p = sep; *sep_p != '\0'; sep_p++)
{
if(*sep_p == *p)
break;
}
if(*sep_p == '\0')
break; /* if character other then separator is found, break out of the loop */
}
/* end of string is reached */
if(*p == '\0')
{
currentp = NULL;
return NULL;
}
/* find the next seperator character */
for(tokenptr = p; *p != '\0'; p++)
{
const char *sep_p;
for(sep_p = sep; *sep_p != '\0'; sep_p++)
{
if(*sep_p == *p)
break;
}
if(*sep_p != '\0')
break; /* if the character is separator then break out of loop */
}
if(*p == '\0')
currentp = p;
else
{
currentp = p+1;
*p = '\0';
}
return tokenptr;
}