#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "stdint.h"
#include <time.h>
#include <math.h>
//#include <omp.h>
#include <float.h> // add nakai

#define NAKAI

#ifdef NAKAI
#define real double
#define LIMIT 50
#else
#define real float
#endif

#define BUFFSIZE 10000
#define TOKENLEN 1000
#define HASH_SIZE 131073
#define NUM_OF_THREADS 16

#define BORDER "<S>"

#define LAYER_NUM 3
#define VEC_DIM 20  //100

#define SGM_THRESHOLD 15.0
#define TANH_THRESHOLD 8.0
#define TABLE_SIZE 50001

#define LEARNING_RATE 0.002

unsigned int nNumOfWords;
char **sWords;
unsigned int nBorder;

real dLearningRate;

unsigned long /* long */ nNumOfDocs;
unsigned int *nDocLen;
unsigned int **nWordSeq;
unsigned int nMaxDocLen;

unsigned int nIteration;
unsigned long /* long */ nNumOfSamples;
real dTotalLength;
real dDiffTime;

real dSgmTableFactor;
real *dSgmTable;
real dTanhTableFactor;
real *dTanhTable;

FILE *fOut;

unsigned int nLayerNum;
unsigned int *nVecDim;
unsigned int nLen;
unsigned int *nWSizeI;
unsigned int *nWSizeP;
unsigned int *nWSizeE;
unsigned int *nWSizeO;
real **dI;
real **dP;
real **dE;
real **dO;
real **dH;
real **dAI;
real **dAP;
real **dAE;
real **dAO;
real *dY;
real *dYHat;
real ***dWI;
real ***dWP;
real ***dWE;
real ***dWO;
real **dWOut;
real *dBiasOut;

real **dGH;
real **dGE;
real **dGAI;
real **dGAP;
real **dGAE;
real **dGAO;
real *dGYHat;
real ***dGWI;
real ***dGWP;
real ***dGWE;
real ***dGWO;
real **dGWOut;
real *dGBiasOut;
/* add nakai */
char **makeTestData(int,int *);
int    freeTestData(char **,int);
int    forwardSub(unsigned int,unsigned int,int);
void   printYpd(FILE *fw);

/*
 * File-local finiteness test used to guard the weight updates.
 *
 * Returns 1 when val is an ordinary finite number (negative, zero or
 * positive) and 0 when val is +infinity, -infinity or NaN.
 *
 * The previous test (val < DBL_MIN) rejected zero and every negative
 * number, because DBL_MIN is the smallest *positive* normalized double,
 * and it accepted NaN because NaN fails every ordered comparison.
 *
 * <math.h> may already provide isfinite as a macro; it is removed here so
 * that this definition and its callers refer to the function below.
 */
#ifdef isfinite
#undef isfinite
#endif
int isfinite(double val)
{
  /* NaN fails both comparisons, infinities fail exactly one of them. */
  if (val >= -DBL_MAX && val <= DBL_MAX) return(1);
  return(0);
}
//----------------------------------------------------------------
time_t start_time;
/*
 * Prints the number of seconds elapsed since start_time to f, rounded to
 * a whole number, and flushes the stream.
 * Side effect: the elapsed time is also stored in the global dDiffTime.
 */
void put_time(FILE *f)
{
  time_t now = time(NULL);

  dDiffTime = difftime(now, start_time);
  fprintf(f, "%.0lf", dDiffTime);
  fflush(f);
}

real randn()
{ return ((real) (((double) rand()) / RAND_MAX)) * 2.0 - 1.0; }

//----------------------------------------------------------------
// hash function by Paul Hsieh
// http://www.azillionmonkeys.com/qed/hash.html
/*
 * get16bits(d): reads two bytes at d as a 16-bit value.
 * On the compilers listed below it is a direct uint16_t load through a
 * cast pointer; everywhere else it is assembled byte by byte with d[0] as
 * the low byte and d[1] as the high byte.
 */
#undef get16bits
#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__)	\
  || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
#define get16bits(d) (*((const uint16_t *) (d)))
#endif
#if !defined (get16bits)
#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)	\
		      +(uint32_t)(((const uint8_t *)(d))[0]) )
#endif

/*
 * SuperFastHash(data, len): bucket index for the word hash table.
 *
 * With NAKAI defined the function is a stub that ignores its arguments
 * and always returns 0, so every word lands in bucket 0 and lookups
 * become a linear scan of that single bucket.
 *
 * Without NAKAI it hashes exactly len bytes starting at data and reduces
 * the result modulo HASH_SIZE.  It returns 0 when len <= 0 or data is NULL.
 * The caller must make sure that len bytes are readable at data.
 */
#ifdef NAKAI  // stub variant (added by nakai)
unsigned int SuperFastHash (const char * data, int len) {
  return(0);
}
#else
unsigned int SuperFastHash (const char * data, int len) {
  uint32_t hash = len, tmp; int rem;
  if (len <= 0 || data == NULL) return 0;
  /* rem = trailing bytes (0..3); len becomes the number of 4-byte groups */
  rem = len & 3; len >>= 2;
  /* main loop: consume four bytes per iteration */
  for (;len > 0; len--) {
    hash  += get16bits (data);
    tmp    = (get16bits (data+2) << 11) ^ hash;
    hash   = (hash << 16) ^ tmp;
    data  += 2*sizeof (uint16_t);
    hash  += hash >> 11;
  }
  /* mix in the 1 to 3 bytes that did not fill a whole group */
  switch (rem) {
  case 3: hash += get16bits (data);
    hash ^= hash << 16;
    hash ^= ((signed char)data[sizeof (uint16_t)]) << 18;
    hash += hash >> 11;
    break;
  case 2: hash += get16bits (data);
    hash ^= hash << 11; hash += hash >> 17;
    break;
  case 1: hash += (signed char)*data;
    hash ^= hash << 10; hash += hash >> 1;
  }
  /* final mixing of the accumulated bits */
  hash ^= hash << 3; hash += hash >> 5; hash ^= hash << 4;
  hash += hash >> 17; hash ^= hash << 25; hash += hash >> 6;
  /* reduction to a table index; marked by the file's author as their own addition */
  return ((unsigned int) hash) % HASH_SIZE; // by me
}
#endif
//----------------------------------------------------------------
// hash table for words
/* Number of word IDs stored in each hash bucket. */
unsigned int *nWordNums;
/* Per-bucket arrays of 1-based word IDs. */
unsigned int **nWordIDs;

/*
 * Allocates the empty word hash table (HASH_SIZE buckets) and resets the
 * global word count to zero.
 *
 * Returns 0 on success and 1 when memory could not be obtained.  On
 * failure nothing stays allocated and both table pointers are NULL.
 *
 * calloc is used so that every bucket starts with a count of zero and a
 * zeroed bucket pointer instead of indeterminate contents.
 */
int initHash()
{
  nNumOfWords = 0;
  nWordNums = (unsigned int *) calloc(HASH_SIZE, sizeof(unsigned int));
  nWordIDs = (unsigned int **) calloc(HASH_SIZE, sizeof(unsigned int *));
  if ((nWordNums == NULL) || (nWordIDs == NULL)) {
    /* free(NULL) is a no-op, so this releases whichever half succeeded */
    free(nWordNums);
    free(nWordIDs);
    nWordNums = NULL;
    nWordIDs = NULL;
    return 1;
  }
  return 0;
}
unsigned int getWordID(const char *s)
{ 
  unsigned int n, nID, nHashKey;
  nID = 0;
  nHashKey = SuperFastHash(s, 100);
  for (n = 0; n < nWordNums[nHashKey]; n ++)
    if (strcmp(s, sWords[nWordIDs[nHashKey][n] - 1]) == 0) {
      nID = nWordIDs[nHashKey][n];
      break;
    }
  return nID;
}
unsigned int addWord(const char *s)
{ 
  unsigned int n, nID, nHashKey;

  nID = 0;
  nHashKey = SuperFastHash(s, 100);
  for (n = 0; n < nWordNums[nHashKey]; n ++)
    if (strcmp(s, sWords[nWordIDs[nHashKey][n] - 1]) == 0) {
      nID = nWordIDs[nHashKey][n];
      break;
    }
  if (nID == 0) {
    
    nNumOfWords ++;
    nWordNums[nHashKey] ++;
    if (nWordNums[nHashKey] == 1)
      nWordIDs[nHashKey] = (unsigned int *) malloc(sizeof(unsigned int));
    else
      nWordIDs[nHashKey]
	= (unsigned int *) realloc(nWordIDs[nHashKey],
				   sizeof(unsigned int) * nWordNums[nHashKey]);
    nWordIDs[nHashKey][nWordNums[nHashKey] - 1] = nNumOfWords;
    if (nNumOfWords == 1)
      sWords = (char **) malloc(sizeof(char *));
    else
      sWords = (char **) realloc(sWords, sizeof(char *) * nNumOfWords);
    sWords[nNumOfWords - 1] = (char *) malloc(sizeof(char) * (strlen(s) + 1));
    memset(sWords[nNumOfWords - 1], 0, strlen(s) + 1);
    strcpy(sWords[nNumOfWords - 1], s);
    nID = nNumOfWords;
    
  }
  return nID;
}
//----------------------------------------------------------------
/*
 * Table-driven logistic sigmoid.
 * Inputs at or beyond +/-SGM_THRESHOLD saturate to 1.0 / 0.0; anything in
 * between is read from dSgmTable at the slot obtained by truncating
 * (SGM_THRESHOLD + dA) * dSgmTableFactor.
 */
real sgm(real dA)
{
  int nSlot;

  if (dA <= - SGM_THRESHOLD) return 0.0;
  if (dA >= SGM_THRESHOLD) return 1.0;

  nSlot = (int) ((SGM_THRESHOLD + dA) * dSgmTableFactor);
  return dSgmTable[nSlot];
}
/*
 * Table-driven hyperbolic tangent.
 * Inputs at or beyond +/-TANH_THRESHOLD saturate to 1.0 / -1.0; anything
 * in between is read from dTanhTable at the slot obtained by truncating
 * (TANH_THRESHOLD + dA) * dTanhTableFactor.
 */
real _tanh(real dA)
{
  int nSlot;

  if (dA <= - TANH_THRESHOLD) return - 1.0;
  if (dA >= TANH_THRESHOLD) return 1.0;

  nSlot = (int) ((TANH_THRESHOLD + dA) * dTanhTableFactor);
  return dTanhTable[nSlot];
}

void initParam()
{
  unsigned int nCK, nCN, nCC, nC;
  for (nCN = 0; nCN < nLayerNum; nCN ++)
    for (nCC = 0; nCC < nNumOfWords * nVecDim[nCN]; nCC ++)
      dWOut[nCN][nCC] = 0.5 * randn();
  for (nCK = 0; nCK < nNumOfWords; nCK ++)
    dBiasOut[nCK] = 0.5 * randn();
  for (nCN = 0; nCN < nLayerNum; nCN ++)
    for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {

      for (nC = 0; nC < nWSizeI[nCN]; nC ++)
	dWI[nCN][nCC][nC] = 0.5 * randn();
      for (nC = 0; nC < nWSizeP[nCN]; nC ++)
	dWP[nCN][nCC][nC] = 0.5 * randn();
      dWP[nCN][nCC][nWSizeP[nCN] - 1] = 1.0e10;
      for (nC = 0; nC < nWSizeE[nCN]; nC ++)
	dWE[nCN][nCC][nC] = 0.5 * randn();
      for (nC = 0; nC < nWSizeO[nCN]; nC ++)
	dWO[nCN][nCC][nC] = 0.5 * randn();

    }
  return;
}
void printYpd(FILE *fw)
{
  unsigned nTemp,nLen,nCD,nT,nCK;

  for(nCD=0;nCD < nNumOfDocs;nCD++) {
    for(nCK = 0; nCK < nNumOfWords; nCK ++) {
      fprintf(fw,"%s",sWords[nCK]);
      if(nCK < nNumOfWords -1) fprintf(fw,",");
      else                     fprintf(fw,"\n");
    }
    nLen = nDocLen[nCD] + 1; 
    for(nT=0;nT < nLen;nT++) {
      nTemp = (nT) * nNumOfWords;
      for (nCK = 0; nCK < nNumOfWords; nCK ++) {
        fprintf(fw,"%lf",dY[nTemp+nCK]);
        if(nCK < nNumOfWords-1) fprintf(fw,",");
        else                    fprintf(fw,"\n");
      }
    }
  }
}
/*
 * Writes a textual dump to fw.
 *
 * Active branch (#if 1): for every document, only the last time step
 * (nT == nDocLen[doc]) is reported.  For each word it prints
 * "#w<step>,<wordIndex>,<word>,<dY value>", followed by "," for every
 * document except the last one and by a newline for the last document.
 * dY is indexed by time step and word only, so every document reads from
 * the same buffer.
 *
 * Disabled branch (#else): dumps the vocabulary (#w), the output weights
 * (#W), the output biases (#B) and, per layer, the I / P / E / O gate
 * weight matrices, one line per weight column.
 */
void printParam(FILE *fw)
{
  unsigned int nCK, nCN, nCC, nC;  /* nCN, nCC and nC are used by the #else branch only */
#if 1
  unsigned nTemp,nLen,nCD,nT;

  for(nCD=0;nCD < nNumOfDocs;nCD++) {
    nLen = nDocLen[nCD] + 1;  
    for(nT=0;nT < nLen;nT++) {
      /* skip every time step except the final one */
      if(nT < nLen-1) continue;

      /* offset of this time step's row in dY */
      nTemp = (nT) * nNumOfWords;
      for (nCK = 0; nCK < nNumOfWords; nCK ++) {
        fprintf(fw, "#w%d,%d,%s,", nT, nCK, sWords[nCK]);
        fprintf(fw,"%lf",dY[nTemp+nCK]);
        /* the separator depends on the document index, not on the word index */
        if(nCD < nNumOfDocs-1) fprintf(fw,",");
        else                   fprintf(fw,"\n");
      }
    }
  }
#else
  /* vocabulary: index and spelling */
  for (nCK = 0; nCK < nNumOfWords; nCK ++)
    fprintf(fw, "#w,%d,%s\n", nCK, sWords[nCK]);

  /* output weights: one line per (layer, word) */
  for (nCN = 0; nCN < nLayerNum; nCN ++)
    for (nCK = 0; nCK < nNumOfWords; nCK ++) {

      fprintf(fw, "#W,%d,%s,[,", nCN, sWords[nCK]);
      for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {

	fprintf(fw, "%f", dWOut[nCN][nCK * nVecDim[nCN] + nCC]);
	if (nCC < nVecDim[nCN] - 1) fprintf(fw, ",");

      }
      fprintf(fw, ",]\n");

    }
  /* output biases */
  for (nCK = 0; nCK < nNumOfWords; nCK ++)
    fprintf(fw, "#B,%s,%f\n", sWords[nCK], dBiasOut[nCK]);

  /* gate weights: for each layer, one line per input column of each matrix */
  for (nCN = 0; nCN < nLayerNum; nCN ++) {

    for (nC = 0; nC < nWSizeI[nCN]; nC ++) {

      fprintf(fw, "#I%d,%d,[", nCN, nC);
      for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {
	
	fprintf(fw, "%f", dWI[nCN][nCC][nC]);
	if (nCC < nVecDim[nCN] - 1) fprintf(fw, ",");
	
      }
      fprintf(fw, ",]\n");
      
    }
    for (nC = 0; nC < nWSizeP[nCN]; nC ++) {

      fprintf(fw, "#P,%d,%d,[", nCN, nC);
      for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {
	
	fprintf(fw, "%f", dWP[nCN][nCC][nC]);
	if (nCC < nVecDim[nCN] - 1) fprintf(fw, ",");

      }
      fprintf(fw, ",]\n");

    }
    for (nC = 0; nC < nWSizeE[nCN]; nC ++) {

      fprintf(fw, "#E,%d,%d,[", nCN, nC);
      for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {

	fprintf(fw, "%f", dWE[nCN][nCC][nC]);
	if (nCC < nVecDim[nCN] - 1) fprintf(fw, ",");

      }
      fprintf(fw, ",]\n");

    }
    for (nC = 0; nC < nWSizeO[nCN]; nC ++) {

      fprintf(fw, "#O,%d,%d,[", nCN, nC);
      for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {

	fprintf(fw, "%f", dWO[nCN][nCC][nC]);
	if (nCC < nVecDim[nCN] - 1) fprintf(fw, ",");
	
      }
      fprintf(fw, ",]\n");
      
    }
    
  }
#endif
  return;
}
real forward(unsigned long /* long */ nCD)
{
  real dLogLH;
  unsigned int nLen;
  unsigned int nCT, nCK, nCN, nCC, nTemp;
  real dTemp;

  dLogLH = 0.0;
  nLen = nDocLen[nCD] + 1;
  for (nCT = 0; nCT < nLen; nCT ++){  /* Time(Doc͕̓) */

    if (nCT > 0) nCK = nWordSeq[nCD][nCT - 1];
    else nCK = nBorder;

#ifdef NAKAI
    forwardSub(nCT,nCK,0);

#else
    for (nCN = 0; nCN < nLayerNum; nCN ++) {  /* Layer */

#pragma omp parallel
      {
	unsigned nCC, nC;
	unsigned int nTempW, nTemp;
	real dTempI, dTempP, dTempE, dTempO, dTemp;
#ifdef _OPENMP
	unsigned int nThreadID = omp_get_thread_num();
	unsigned int nNumOfThreads = omp_get_num_threads();
#else
	unsigned int nThreadID = 0;
	unsigned int nNumOfThreads = 1;
#endif
        for (nCC = nThreadID; nCC < nVecDim[nCN]; nCC += nNumOfThreads) { /* node */
	  
	  dTempI = dWI[nCN][nCC][nCK];   /* nLayer * nodeSize * wordAtDoc */ 
	  dTempP = dWP[nCN][nCC][nCK];
	  dTempE = dWE[nCN][nCC][nCK];
	  dTempO = dWO[nCN][nCC][nCK];
	  
	  nTempW = nNumOfWords;
	  if (nCN > 0) {
	    
	    for (nC = 0; nC < nVecDim[nCN - 1]; nC ++) {

	      dTemp = dH[nCN - 1][nVecDim[nCN - 1] * nCT + nC];
	      dTempI += dWI[nCN][nCC][nTempW + nC] * dTemp;
	      dTempP += dWP[nCN][nCC][nTempW + nC] * dTemp;
	      dTempE += dWE[nCN][nCC][nTempW + nC] * dTemp;
	      dTempO += dWO[nCN][nCC][nTempW + nC] * dTemp;

	    }
	    nTempW += nVecDim[nCN - 1];
	  
	  }
	  if (nCT > 0)
	    for (nC = 0; nC < nVecDim[nCN]; nC ++) {

	      dTemp = dH[nCN][nVecDim[nCN] * (nCT - 1) + nC];
	      dTempI += dWI[nCN][nCC][nTempW + nC] * dTemp;
	      dTempP += dWP[nCN][nCC][nTempW + nC] * dTemp;
	      dTempE += dWE[nCN][nCC][nTempW + nC] * dTemp;
	      dTempO += dWO[nCN][nCC][nTempW + nC] * dTemp;

	    }
	  nTempW += nVecDim[nCN];
	  if (nCT > 0) {

	    dTemp = dE[nCN][nVecDim[nCN] * (nCT - 1) + nCC];
	    dTempI += dWI[nCN][nCC][nTempW] * dTemp;
	    dTempP += dWP[nCN][nCC][nTempW] * dTemp;

	  }

	  dTempI += dWI[nCN][nCC][nTempW + 1];
	  dTempP += dWP[nCN][nCC][nTempW + 1];
	  dTempE += dWE[nCN][nCC][nTempW];
	  dTempO += dWO[nCN][nCC][nTempW + 1];

	  nTemp = nVecDim[nCN] * nCT + nCC;
	  dAI[nCN][nTemp] = dTempI;
	  dI[nCN][nTemp] = sgm(dTempI);
	  dAP[nCN][nTemp] = dTempP;
	  dP[nCN][nTemp] = sgm(dTempP);
	  dAE[nCN][nTemp] = dTempE;
	  dE[nCN][nTemp] = dI[nCN][nTemp] * _tanh(dAE[nCN][nTemp]);
	  if (nCT > 0)
	    dE[nCN][nTemp] += dP[nCN][nTemp] * dE[nCN][nVecDim[nCN] * (nCT - 1) + nCC];
	  dTempO += dWO[nCN][nCC][nTempW] * dE[nCN][nTemp];
	  dAO[nCN][nTemp] = dTempO;
	  dO[nCN][nTemp] = sgm(dTempO);
	  dH[nCN][nTemp] = dO[nCN][nTemp] * _tanh(dE[nCN][nTemp]);
	
        } /* node */
      } 

    } /* Layer */


    nTemp = nCT * nNumOfWords;
    for (nCK = 0; nCK < nNumOfWords; nCK ++) {

      dTemp = dBiasOut[nCK];
      for (nCN = 0; nCN < nLayerNum; nCN ++)
	for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
	  dTemp += dWOut[nCN][nVecDim[nCN] * nCK + nCC]
	    * dH[nCN][nVecDim[nCN] * nCT + nCC];  /* nd Wn,k,d * Hn,d */
      dYHat[nTemp + nCK] = dTemp; /* prob for Yk at T */
      
    }

    /* Negative Log Likelihood */
    dTemp = dYHat[nTemp];
    for (nCK = 1; nCK < nNumOfWords; nCK ++)
      if (dTemp < dYHat[nTemp + nCK]) dTemp = dYHat[nTemp + nCK];  /* Yt,k */
    for (nCK = 0; nCK < nNumOfWords; nCK ++) dY[nTemp + nCK] = dYHat[nTemp + nCK] - dTemp;
    for (nCK = 0; nCK < nNumOfWords; nCK ++) dY[nTemp + nCK] = exp(dY[nTemp + nCK]);
    dTemp = 0.0;
    for (nCK = 0; nCK < nNumOfWords; nCK ++) dTemp += dY[nTemp + nCK];
    for (nCK = 0; nCK < nNumOfWords; nCK ++) dY[nTemp + nCK] /= dTemp;
#endif  

    nTemp = nCT * nNumOfWords;

    if (nCT == nLen - 1) dLogLH += log(dY[nTemp + nBorder]);
    else dLogLH += log(dY[nTemp + nWordSeq[nCD][nCT]]);
                             /* Yt,Xt+1 */

  } /* Time(Doc͕̓) */
  return dLogLH;
}
/*
 * Computes one time step of the forward pass.
 *
 * nCT   time step; results are stored in row nCT of the activation
 *       buffers (dAI/dI, dAP/dP, dAE/dE, dAO/dO, dH, dYHat, dY).
 * inCK  0-based index of the input word for this step.
 * flag  when 1, a word index is additionally sampled from the softmax
 *       distribution dY of this step using rand().
 *
 * Returns the sampled 0-based word index when flag == 1, and -1 when no
 * sampling was requested.  (The result used to be an uninitialized value
 * when flag != 1, and also when rounding kept the cumulative probability
 * from ever exceeding the random draw; in the latter case the last word
 * is now returned.)
 *
 * Per-node weight vector layout, as used below:
 *   [0, nNumOfWords)            weights for the input word
 *   next nVecDim[nCN - 1]       weights for the lower layer's H (nCN > 0)
 *   next nVecDim[nCN]           weights for this layer's previous H
 *   slot nTempW                 cell (E) connection for I, P and O;
 *                               bias for E
 *   slot nTempW + 1             bias for I, P and O
 */
int forwardSub(unsigned int nCT,unsigned int inCK,int flag)
{
  unsigned int nCK, nCN, nCC, nTemp;
  real dTemp, rnd;
  int maxCK = -1;

  for (nCN = 0; nCN < nLayerNum; nCN ++) {  /* layer */

#pragma omp parallel
    {
      /* private per-thread working variables */
      unsigned nCC, nC;
      unsigned int nTempW, nTemp;
      real dTempI, dTempP, dTempE, dTempO, dTemp;
#ifdef _OPENMP
      unsigned int nThreadID = omp_get_thread_num();
      unsigned int nNumOfThreads = omp_get_num_threads();
#else
      unsigned int nThreadID = 0;
      unsigned int nNumOfThreads = 1;
#endif
      /* nodes are dealt out to the threads round-robin */
      for (nCC = nThreadID; nCC < nVecDim[nCN]; nCC += nNumOfThreads) { /* node */

        /* contribution of the input word */
        dTempI = dWI[nCN][nCC][inCK];
        dTempP = dWP[nCN][nCC][inCK];
        dTempE = dWE[nCN][nCC][inCK];
        dTempO = dWO[nCN][nCC][inCK];

        nTempW = nNumOfWords;
        if (nCN > 0) {

          /* contribution of the lower layer's H at the same time step */
          for (nC = 0; nC < nVecDim[nCN - 1]; nC ++) {

            dTemp = dH[nCN - 1][nVecDim[nCN - 1] * nCT + nC];
            dTempI += dWI[nCN][nCC][nTempW + nC] * dTemp;
            dTempP += dWP[nCN][nCC][nTempW + nC] * dTemp;
            dTempE += dWE[nCN][nCC][nTempW + nC] * dTemp;
            dTempO += dWO[nCN][nCC][nTempW + nC] * dTemp;

          }
          nTempW += nVecDim[nCN - 1];

        }
        if (nCT > 0)
          /* contribution of this layer's H at the previous time step */
          for (nC = 0; nC < nVecDim[nCN]; nC ++) {

            dTemp = dH[nCN][nVecDim[nCN] * (nCT - 1) + nC];
            dTempI += dWI[nCN][nCC][nTempW + nC] * dTemp;
            dTempP += dWP[nCN][nCC][nTempW + nC] * dTemp;
            dTempE += dWE[nCN][nCC][nTempW + nC] * dTemp;
            dTempO += dWO[nCN][nCC][nTempW + nC] * dTemp;

          }
        nTempW += nVecDim[nCN];
        if (nCT > 0) {

          /* this node's previous cell value feeds the I and P gates */
          dTemp = dE[nCN][nVecDim[nCN] * (nCT - 1) + nCC];
          dTempI += dWI[nCN][nCC][nTempW] * dTemp;
          dTempP += dWP[nCN][nCC][nTempW] * dTemp;

        }

        /* biases (E keeps its bias in slot nTempW, the others in nTempW + 1) */
        dTempI += dWI[nCN][nCC][nTempW + 1];
        dTempP += dWP[nCN][nCC][nTempW + 1];
        dTempE += dWE[nCN][nCC][nTempW];
        dTempO += dWO[nCN][nCC][nTempW + 1];

        nTemp = nVecDim[nCN] * nCT + nCC;
        dAI[nCN][nTemp] = dTempI;
        dI[nCN][nTemp] = sgm(dTempI);
        dAP[nCN][nTemp] = dTempP;
        dP[nCN][nTemp] = sgm(dTempP);
        dAE[nCN][nTemp] = dTempE;
        /* new cell value: gated input plus gated previous cell value */
        dE[nCN][nTemp] = dI[nCN][nTemp] * _tanh(dAE[nCN][nTemp]);
        if (nCT > 0)
          dE[nCN][nTemp] += dP[nCN][nTemp] * dE[nCN][nVecDim[nCN] * (nCT - 1) + nCC];
        /* the O gate sees the freshly computed cell value */
        dTempO += dWO[nCN][nCC][nTempW] * dE[nCN][nTemp];
        dAO[nCN][nTemp] = dTempO;
        dO[nCN][nTemp] = sgm(dTempO);
        dH[nCN][nTemp] = dO[nCN][nTemp] * _tanh(dE[nCN][nTemp]);

      } /* node */
    }

  } /* layer */

  /* output scores: bias plus the weighted H of every layer */
  nTemp = nCT * nNumOfWords;
  for (nCK = 0; nCK < nNumOfWords; nCK ++) {

    dTemp = dBiasOut[nCK];
    for (nCN = 0; nCN < nLayerNum; nCN ++)
      for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
        dTemp += dWOut[nCN][nVecDim[nCN] * nCK + nCC]
          * dH[nCN][nVecDim[nCN] * nCT + nCC];
    dYHat[nTemp + nCK] = dTemp;

  }

  /* softmax over the scores; the maximum is subtracted before exp() */
  dTemp = dYHat[nTemp];
  for (nCK = 1; nCK < nNumOfWords; nCK ++)
    if (dTemp < dYHat[nTemp + nCK]) dTemp = dYHat[nTemp + nCK];
  for (nCK = 0; nCK < nNumOfWords; nCK ++) dY[nTemp + nCK] = dYHat[nTemp + nCK] - dTemp;
  for (nCK = 0; nCK < nNumOfWords; nCK ++) dY[nTemp + nCK] = exp(dY[nTemp + nCK]);
  dTemp = 0.0;
  for (nCK = 0; nCK < nNumOfWords; nCK ++) dTemp += dY[nTemp + nCK];
  for (nCK = 0; nCK < nNumOfWords; nCK ++) dY[nTemp + nCK] /= dTemp;

  /* optional sampling: first word whose cumulative probability exceeds rnd */
  if (flag == 1) {
    rnd = (double)rand()/(double)RAND_MAX;
    dTemp = 0.0;
    /* fallback when the cumulative sum never exceeds rnd (e.g. rnd == 1.0) */
    maxCK = (int) nNumOfWords - 1;
    for (nCK = 0; nCK < nNumOfWords; nCK ++) {
      dTemp += dY[nTemp + nCK];
      if (dTemp > rnd) {
        maxCK = nCK;
        break;
      }
    }
  }
  return(maxCK);
}
/*
 * Back-propagation through time for document nCD.
 *
 * Must run after forward(nCD): it reads the activations stored by the
 * forward pass (dY, dH, dE, dI, dP, dO, dAE) and produces
 *   - dGYHat                      output error per time step
 *                                 (1 at the target word, minus dY),
 *   - dGBiasOut, dGWOut           gradients of the output layer,
 *   - dGH, dGE                    gradients w.r.t. hidden and cell state,
 *   - dGAI, dGAP, dGAE, dGAO      gradients w.r.t. the gate pre-activations,
 * which weight() subsequently turns into weight gradients.
 *
 * Time runs from the last step to the first and layers from the top one
 * to the bottom one, so that everything a node depends on (the layer
 * above at the same step, the same layer at the next step) is ready.
 *
 * Two defects of the earlier version are corrected here:
 *   1. the offset of the recurrent-H weights was incremented inside the
 *      loop over nC, so for nCN > 0 the weight index drifted further on
 *      every iteration and left the weight vector;
 *   2. dGAI was scaled by tanh(dE) although the forward pass computes
 *      dE = dI * tanh(dAE) (+ dP * previous dE), whose derivative with
 *      respect to dI is tanh(dAE).
 */
void backward(unsigned long /* long */ nCD)
{
  unsigned int nLen;
  unsigned int nCT, nCK, nCN, nCC;
  unsigned int nTemp;

  nLen = nDocLen[nCD] + 1;

  /* output error for every time step: indicator(target) - dY */
  nCT = nLen;
  do {

    nCT --;
    nTemp = nCT * nNumOfWords;
    for (nCK = 0; nCK < nNumOfWords; nCK ++)
      dGYHat[nTemp + nCK] = - dY[nTemp + nCK];
    if (nCT < nLen - 1) dGYHat[nTemp + nWordSeq[nCD][nCT]] += 1.0;
    else dGYHat[nTemp + nBorder] += 1.0;

  } while (nCT > 0);

  /* clear the output-layer gradient accumulators */
  for (nCK = 0; nCK < nNumOfWords; nCK ++)
    dGBiasOut[nCK] = 0.0;
  for (nCN = 0; nCN < nLayerNum; nCN ++)
    for (nCC = 0; nCC < nNumOfWords * nVecDim[nCN]; nCC ++)
      dGWOut[nCN][nCC] = 0.0;

  nCT = nLen;
  do {  /* time step, last to first */

    nCT --;
    nTemp = nCT * nNumOfWords;
    for (nCK = 0; nCK < nNumOfWords; nCK ++)
      dGBiasOut[nCK] += dGYHat[nTemp + nCK];

    nCN = nLayerNum;
    do { /* layer, top to bottom */

      nCN --;

#pragma omp parallel
      {
        /* private per-thread working variables */
        unsigned int nCK, nCNN, nCC, nC;
        unsigned int nTemp, nTemp2, nTemp3;
        real dTemp, dTemp2;
#ifdef _OPENMP
        unsigned int nThreadID = omp_get_thread_num();
        unsigned int nNumOfThreads = omp_get_num_threads();
#else
        unsigned int nThreadID = 0;
        unsigned int nNumOfThreads = 1;
#endif
        /* output weight gradient; words are dealt out to the threads */
        nTemp = nCT * nNumOfWords;
        for (nCK = nThreadID; nCK < nNumOfWords; nCK += nNumOfThreads)
          for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
            dGWOut[nCN][nVecDim[nCN] * nCK + nCC]
              += dH[nCN][nVecDim[nCN] * nCT + nCC] * dGYHat[nTemp + nCK];

        /* hidden/cell/gate gradients; nodes are dealt out to the threads */
        for (nCC = nThreadID; nCC < nVecDim[nCN]; nCC += nNumOfThreads) {

          nTemp = nVecDim[nCN] * nCT + nCC;

          /* (a) error arriving from the output layer */
          nTemp2 = nCT * nNumOfWords;
          dGH[nCN][nTemp] = 0.0;
          for (nCK = 0; nCK < nNumOfWords; nCK ++)
            dGH[nCN][nTemp] += dWOut[nCN][nVecDim[nCN] * nCK + nCC]
              * dGYHat[nTemp2 + nCK];

          /* (b) error arriving from the layer above at the same time step */
          if (nCN < nLayerNum - 1) {

            nCNN = nCN + 1;
            nTemp2 = nCT * nVecDim[nCNN];
            nTemp3 = nNumOfWords + nCC;
            for (nC = 0; nC < nVecDim[nCNN]; nC ++) {

              dGH[nCN][nTemp] += dWI[nCNN][nC][nTemp3] * dGAI[nCNN][nTemp2 + nC];
              dGH[nCN][nTemp] += dWP[nCNN][nC][nTemp3] * dGAP[nCNN][nTemp2 + nC];
              dGH[nCN][nTemp] += dWE[nCNN][nC][nTemp3] * dGAE[nCNN][nTemp2 + nC];
              dGH[nCN][nTemp] += dWO[nCNN][nC][nTemp3] * dGAO[nCNN][nTemp2 + nC];

            }

          }

          /* (c) error arriving from this layer at the next time step */
          if (nCT < nLen - 1) {

            nTemp2 = (nCT + 1) * nVecDim[nCN];
            /* index of the weight attached to this node's H in the
               recurrent section; it is the same for every receiving node,
               so it is computed once, outside the loop */
            nTemp3 = nNumOfWords + nCC;
            if (nCN > 0) nTemp3 += nVecDim[nCN - 1];
            for (nC = 0; nC < nVecDim[nCN]; nC ++) {

              dGH[nCN][nTemp] += dWI[nCN][nC][nTemp3] * dGAI[nCN][nTemp2 + nC];
              dGH[nCN][nTemp] += dWP[nCN][nC][nTemp3] * dGAP[nCN][nTemp2 + nC];
              dGH[nCN][nTemp] += dWE[nCN][nC][nTemp3] * dGAE[nCN][nTemp2 + nC];
              dGH[nCN][nTemp] += dWO[nCN][nC][nTemp3] * dGAO[nCN][nTemp2 + nC];

            }

          }

          /* dH = dO * tanh(dE) */
          dTemp = _tanh(dE[nCN][nTemp]);

          dGAO[nCN][nTemp] = dGH[nCN][nTemp]
            * dTemp * dO[nCN][nTemp] * (1.0 - dO[nCN][nTemp]);

          /* nTemp2: slot of the cell (E) connection in the weight vectors */
          nTemp2 = nNumOfWords + nVecDim[nCN];
          if (nCN > 0) nTemp2 += nVecDim[nCN - 1];
          /* cell gradient: via the O gate, via H, and from the next step */
          dGE[nCN][nTemp] = dWO[nCN][nCC][nTemp2] * dGAO[nCN][nTemp];
          dGE[nCN][nTemp] += dO[nCN][nTemp] * (1.0 - dTemp * dTemp) * dGH[nCN][nTemp];
          if (nCT < nLen - 1) {

            nTemp3 = nVecDim[nCN] * (nCT + 1) + nCC;
            dGE[nCN][nTemp] += dP[nCN][nTemp3] * dGE[nCN][nTemp3];
            dGE[nCN][nTemp] += dWP[nCN][nCC][nTemp2] * dGAP[nCN][nTemp3];
            dGE[nCN][nTemp] += dWI[nCN][nCC][nTemp2] * dGAI[nCN][nTemp3];

          }

          /* dE = dI * tanh(dAE) + dP * dE(previous step) */
          dTemp2 = _tanh(dAE[nCN][nTemp]);
          dGAE[nCN][nTemp] = dI[nCN][nTemp] * (1.0 - dTemp2 * dTemp2) * dGE[nCN][nTemp];
          dGAP[nCN][nTemp] = 0.0;
          if (nCT > 0)
            dGAP[nCN][nTemp] += dP[nCN][nTemp] * (1.0 - dP[nCN][nTemp])
              * dE[nCN][nVecDim[nCN] * (nCT - 1) + nCC] * dGE[nCN][nTemp];
          /* the I gate multiplies tanh(dAE), hence dTemp2 */
          dGAI[nCN][nTemp] = dI[nCN][nTemp] * (1.0 - dI[nCN][nTemp]) * dTemp2 * dGE[nCN][nTemp];

        }
      }

    } while (nCN > 0);

  } while (nCT > 0);
  return;
}

/*
 * Turns the gate gradients produced by backward(nCD) into weight
 * gradients and applies one update step to every parameter.
 *
 * Part 1 (per layer): clears dGWI/dGWP/dGWE/dGWO and accumulates, over all
 * time steps of document nCD, gate gradient times the corresponding input.
 * The per-node weight vector is addressed as follows:
 *   [0, nNumOfWords)          the input word of the step
 *   next nVecDim[nCN - 1]     H of the layer below (only when nCN > 0)
 *   next nVecDim[nCN]         H of this layer at the previous step
 *   slot nTempW               cell (E) connection for I, P, O; bias for E
 *   slot nTempW + 1           bias for I, P, O
 *
 * Part 2: every parameter becomes param + dLearningRate * gradient; the
 * new value is stored only when isfinite() accepts it, otherwise the old
 * value is kept.
 */
void weight(unsigned long /* long */ nCD)
{
  unsigned int nLen;
  unsigned int nCN, nCNN;

  nLen = nDocLen[nCD] + 1;
  for (nCN = 0; nCN < nLayerNum; nCN ++) {

    /* nCNN: index of the layer below (only read when nCN > 0) */
    nCNN = 0;
    if (nCN > 0) nCNN = nCN - 1;
    
#pragma omp parallel
    {
      unsigned int nCT, nCK, nCC, nC;
      unsigned int nTemp, nTemp2, nTempW;
      real dTempI, dTempP, dTempE, dTempO;
#ifdef _OPENMP
      unsigned int nThreadID = omp_get_thread_num();
      unsigned int nNumOfThreads = omp_get_num_threads();
#else
      unsigned int nThreadID = 0;
      unsigned int nNumOfThreads = 1;
#endif
      /* nodes are dealt out to the threads round-robin */
      for (nCC = nThreadID; nCC < nVecDim[nCN]; nCC += nNumOfThreads) {

	/* reset this node's gradient vectors */
	for (nC = 0; nC < nWSizeI[nCN]; nC ++)
	  dGWI[nCN][nCC][nC] = 0.0;
	for (nC = 0; nC < nWSizeP[nCN]; nC ++)
	  dGWP[nCN][nCC][nC] = 0.0;
	for (nC = 0; nC < nWSizeE[nCN]; nC ++)
	  dGWE[nCN][nCC][nC] = 0.0;
	for (nC = 0; nC < nWSizeO[nCN]; nC ++)
	  dGWO[nCN][nCC][nC] = 0.0;

	for (nCT = 0; nCT < nLen; nCT ++) {
	  
	  /* gate gradients of this node at this time step */
	  nTemp = nCT * nVecDim[nCN] + nCC;
	  dTempI = dGAI[nCN][nTemp];
	  dTempP = dGAP[nCN][nTemp];
	  dTempE = dGAE[nCN][nTemp];
	  dTempO = dGAO[nCN][nTemp];
	  
	  /* input word of the step: border symbol first, then word nCT - 1 */
	  if (nCT > 0) nCK = nWordSeq[nCD][nCT - 1];
	  else nCK = nBorder;
	  dGWI[nCN][nCC][nCK] += dTempI;
	  dGWP[nCN][nCC][nCK] += dTempP;
	  dGWE[nCN][nCC][nCK] += dTempE;
	  dGWO[nCN][nCC][nCK] += dTempO;
	  nTempW = nNumOfWords;
	  
	  if (nCN > 0) {
	    
	    /* weights fed by the lower layer's H at the same time step */
	    nTemp2 = nCT * nVecDim[nCNN];
	    for (nC = 0; nC < nVecDim[nCNN]; nC ++) {
	      
	      dGWI[nCN][nCC][nTempW + nC] += dTempI * dH[nCNN][nTemp2 + nC];
	      dGWP[nCN][nCC][nTempW + nC] += dTempP * dH[nCNN][nTemp2 + nC];
	      dGWE[nCN][nCC][nTempW + nC] += dTempE * dH[nCNN][nTemp2 + nC];
	      dGWO[nCN][nCC][nTempW + nC] += dTempO * dH[nCNN][nTemp2 + nC];
	      
	    }
	    nTempW += nVecDim[nCNN];
	    
	  }
	  if (nCT > 0) {
	    
	    /* weights fed by this layer's H at the previous time step */
	    nTemp2 = (nCT - 1) * nVecDim[nCN];
	    for (nC = 0; nC < nVecDim[nCN]; nC ++) {
	      
	      dGWI[nCN][nCC][nTempW + nC] += dTempI * dH[nCN][nTemp2 + nC];
	      dGWP[nCN][nCC][nTempW + nC] += dTempP * dH[nCN][nTemp2 + nC];
	      dGWE[nCN][nCC][nTempW + nC] += dTempE * dH[nCN][nTemp2 + nC];
	      dGWO[nCN][nCC][nTempW + nC] += dTempO * dH[nCN][nTemp2 + nC];
	      
	    }
	    
	  }
	  nTempW += nVecDim[nCN];
	  if (nCT > 0) {
	    
	    /* cell connection of I and P: fed by the previous cell value */
	    nTemp2 = (nCT - 1) * nVecDim[nCN];
	    dGWI[nCN][nCC][nTempW] += dTempI * dE[nCN][nTemp2 + nCC];
	    dGWP[nCN][nCC][nTempW] += dTempP * dE[nCN][nTemp2 + nCC];
	    
	  }
	  /* cell connection of O: fed by the current cell value */
	  dGWO[nCN][nCC][nTempW] += dTempO * dE[nCN][nCT * nVecDim[nCN] + nCC];

	  /* biases: E uses slot nTempW, the other gates slot nTempW + 1 */
	  dGWI[nCN][nCC][nTempW + 1] += dTempI;
	  dGWP[nCN][nCC][nTempW + 1] += dTempP;
	  dGWE[nCN][nCC][nTempW] += dTempE;
	  dGWO[nCN][nCC][nTempW + 1] += dTempO;
	  
	}
	
      }
    }

  }

  /* apply the update: param += dLearningRate * gradient */
#pragma omp parallel
  {
    unsigned int nCN, nCK, nCC, nC;
    real dTemp;
#ifdef _OPENMP
    unsigned int nThreadID = omp_get_thread_num();
    unsigned int nNumOfThreads = omp_get_num_threads();
#else
    unsigned int nThreadID = 0;
    unsigned int nNumOfThreads = 1;
#endif
    for (nCN = 0; nCN < nLayerNum; nCN ++) {
      
      /* gate weights, nodes dealt out to the threads */
      for (nCC = nThreadID; nCC < nVecDim[nCN]; nCC += nNumOfThreads) {

	for (nC = 0; nC < nWSizeI[nCN]; nC ++) {
	
	  dTemp = dWI[nCN][nCC][nC] + dLearningRate * dGWI[nCN][nCC][nC];
	  /* the old weight is kept when isfinite() rejects the candidate */
	  if (isfinite(dTemp)) dWI[nCN][nCC][nC] = dTemp;
	  
	}
	for (nC = 0; nC < nWSizeP[nCN]; nC ++) {
	  
	  dTemp = dWP[nCN][nCC][nC] + dLearningRate * dGWP[nCN][nCC][nC];
	  if (isfinite(dTemp)) dWP[nCN][nCC][nC] = dTemp;
	  
	}
	for (nC = 0; nC < nWSizeE[nCN]; nC ++) {
	  
	  dTemp = dWE[nCN][nCC][nC] + dLearningRate * dGWE[nCN][nCC][nC];
	  if (isfinite(dTemp)) dWE[nCN][nCC][nC] = dTemp;
	
	}
	for (nC = 0; nC < nWSizeO[nCN]; nC ++) {
	
	  dTemp = dWO[nCN][nCC][nC] + dLearningRate * dGWO[nCN][nCC][nC];
	  if (isfinite(dTemp)) dWO[nCN][nCC][nC] = dTemp;
	
	}

      }
      
      /* output weights of this layer, entries dealt out to the threads */
      for (nCC = nThreadID; nCC < nNumOfWords * nVecDim[nCN]; nCC += nNumOfThreads) {
	
	dTemp = dWOut[nCN][nCC] + dLearningRate * dGWOut[nCN][nCC];
	if (isfinite(dTemp)) dWOut[nCN][nCC] = dTemp;
	
      }
      
    }
    /* output biases */
    for (nCK = nThreadID; nCK < nNumOfWords; nCK += nNumOfThreads) {
      
      dTemp = dBiasOut[nCK] + dLearningRate * dGBiasOut[nCK];
      if (isfinite(dTemp)) dBiasOut[nCK] = dTemp;
      
    }
  }
  return;
}

//#define MDATA

//--------
int main(int argc, char **argv)
{
  char sBuff[BUFFSIZE];
  unsigned int nWordID;
  char sTempWord[TOKENLEN], sTempDocName[TOKENLEN], sPrevDocName[TOKENLEN];
  unsigned long /* long */ nCD;
  unsigned int nCN, nCC;
  real dLogLH;
  real dTemp;

  FILE *fp,*fw,*fdb;  //add nakai
  char *pc;      //add nakai
  int  i,j;      //add nakai
  int  size;
  char **cArray;

  unsigned int iT,nextCK,nextOld; //add nakai

#ifdef MDATA
  /* make test Data */
  if(argc != 5) {
    fprintf(stderr,"cannot Find arg4 file for generate testData\n");
    exit(-9);
  }
  if(!(fw=fopen(argv[4],"w"))) {
    fprintf(stderr,"connot write dataFile=[%s]\n",argv[4]);
    exit(-1);
  }
  for(j=0;j<1000;j++) {
    cArray=makeTestData(10,&size);
    for(i=0;i<size;i++) {
      //fprintf(fw,"%s ",cArray[i]);
      fprintf(fw,"%d,%s\n",j,cArray[i]);
    }
    //fprintf(fw,"\n");
    //fprintf(fw,"1,<S>\n");
    freeTestData(cArray,size);
  }
  fclose(fw);
  exit(1);
#endif

  fOut = stdout;



  if (initHash()) { fprintf(stderr, "%s : initHash()\n", argv[0]); exit(1); }

  nNumOfDocs = 0;
  nDocLen = (unsigned int *) malloc(sizeof(unsigned int));
  nWordSeq = (unsigned int **) malloc(sizeof(unsigned int *));

  memset(sPrevDocName, 0, TOKENLEN);
#ifdef NAKAI
  if(argc < 3) {
    fprintf(stderr,"Ussage LSTM readFile.csv oututFile.csv (dbgFile.csv)\n");
    exit(-9);
  }

  if(!(fp=fopen(argv[1],"r"))) {
    fprintf(stderr,"connot read dataFile=[%s]\n",argv[1]);
    exit(-1);
  }
  if(!(fw=fopen(argv[2],"w"))) {
    fprintf(stderr,"connot write dataFile=[%s]\n",argv[2]);
    exit(-2);
  }

  fdb = NULL;
  if(argc >= 4) {
    if(!(fdb=fopen(argv[3],"w"))) {
      fprintf(stderr,"connot write dbgFile=[%s]\n",argv[3]);
      exit(-2);
    }
  }

  //nBorder = addWord(BORDER);  /* I}[NwordIDݒ */

  while (! feof(fp)) {
    memset(sBuff, 0, BUFFSIZE);
    fgets(sBuff, BUFFSIZE - 1, fp);
#else
  while (! feof(stdin)) {
    memset(sBuff, 0, BUFFSIZE);
    fgets(sBuff, BUFFSIZE - 1, stdin);
#endif
    
    memset(sTempDocName, 0, TOKENLEN);
    memset(sTempWord, 0, TOKENLEN);

#ifdef NAKAI
    sBuff[strlen(sBuff)-1]='\0';
    pc=strtok(sBuff," ,\t");  
    j=1;
    while(pc) {
      if(j == 1) strcpy(sTempDocName,pc);
      if(j == 2) strcpy(sTempWord,pc);
      j++;
      pc=strtok(NULL," ,\t");
    }
    if(j >= 2) {
#else
    if (sscanf(sBuff, "%s,%s\n", sTempDocName, sTempWord) == 2) {
#endif
      if (strcmp(sPrevDocName, sTempDocName) != 0) {
	
	nNumOfDocs ++;  /* doc̐XV */
	nDocLen = (unsigned int *) realloc(nDocLen, sizeof(unsigned int) * nNumOfDocs); /* Sdocword̎[̈XV */
	nWordSeq = (unsigned int **) realloc(nWordSeq, sizeof(unsigned int *) * nNumOfDocs); /* SdocwordID̎[̈XV */
	nDocLen[nNumOfDocs - 1] = 0; /* Vdocword0NA */
        nWordSeq[nNumOfDocs - 1] = (unsigned int *) malloc(sizeof(unsigned int)); /* Vdocword̎[ */

      }
      
      nWordID = addWord(sTempWord);  /* wordID(͏) */
      if (nWordID > 0) {

	nDocLen[nNumOfDocs - 1] ++; /* ߂docword̐XV */
	nWordSeq[nNumOfDocs - 1]
	  = (unsigned int *) realloc(nWordSeq[nNumOfDocs - 1],
				     sizeof(unsigned int) * nDocLen[nNumOfDocs - 1]);
	nWordSeq[nNumOfDocs - 1][nDocLen[nNumOfDocs - 1] - 1] = nWordID - 1; /* ߂docword̏ꏊwordID[ */

      } else { fprintf(stderr, "Invalid data : %s\n", sTempWord); exit(1); }
      
    }
    memset(sPrevDocName, 0, TOKENLEN);
    strcpy(sPrevDocName, sTempDocName); /* ߂doc̍XV */

  }
  if (nNumOfDocs == 0) { fprintf(stderr, "no valid data.\n"); exit(1); }

#ifdef NAKAI
  fclose(fp);
#endif

  nBorder = addWord(BORDER);  /* I}[NwordIDݒ */
  nBorder --;

  nMaxDocLen = 0;
  for (nCD = 0; nCD < nNumOfDocs; nCD ++)
    if (nMaxDocLen < nDocLen[nCD]) nMaxDocLen = nDocLen[nCD];
  nMaxDocLen ++;  /* Sdocł̍őwordZo */

  //----------------

  dSgmTable = (real *) malloc(sizeof(real) * TABLE_SIZE);
  dSgmTableFactor = ((real) TABLE_SIZE - 1) / (SGM_THRESHOLD * 2);
  dTanhTable = (real *) malloc(sizeof(real) * TABLE_SIZE);
  dTanhTableFactor = ((real) TABLE_SIZE - 1) / (TANH_THRESHOLD * 2);
  for (nCC = 0; nCC < TABLE_SIZE; nCC ++) {

    dTemp = exp(((real) nCC) / dSgmTableFactor - SGM_THRESHOLD);
    dSgmTable[nCC] = dTemp / (1.0 + dTemp);
    dTemp = exp(((real) nCC) / dTanhTableFactor - TANH_THRESHOLD);
    dTanhTable[nCC] = (dTemp - 1.0 / dTemp) / (dTemp + 1.0 / dTemp);
    
  }

  //----------------

  nLayerNum = LAYER_NUM;

  nVecDim = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
  dWOut = (real **) malloc(sizeof(real *) * nLayerNum);
  dGWOut = (real **) malloc(sizeof(real *) * nLayerNum);
  for (nCN = 0; nCN < nLayerNum; nCN ++) {

    nVecDim[nCN] = VEC_DIM;
    dWOut[nCN] = (real *) malloc(sizeof(real) * nNumOfWords * nVecDim[nCN]);  /* nLayer * numInputWord * nodeSize */
    dGWOut[nCN] = (real *) malloc(sizeof(real) * nNumOfWords * nVecDim[nCN]);

  }
  dBiasOut = (real *) malloc(sizeof(real) * nNumOfWords);  /*  nNumOfWords  */
  dGBiasOut = (real *) malloc(sizeof(real) * nNumOfWords);

  dY = (real *) malloc(sizeof(real) * nMaxDocLen * nNumOfWords);    /* maxWordAllDoc * numInputWord */
  dYHat = (real *) malloc(sizeof(real) * nMaxDocLen * nNumOfWords);
  dGYHat = (real *) malloc(sizeof(real) * nMaxDocLen * nNumOfWords);
  
  dI = (real **) malloc(sizeof(real *) * nLayerNum);
  dP = (real **) malloc(sizeof(real *) * nLayerNum);
  dE = (real **) malloc(sizeof(real *) * nLayerNum);
  dO = (real **) malloc(sizeof(real *) * nLayerNum);
  dH = (real **) malloc(sizeof(real *) * nLayerNum);
  dAI = (real **) malloc(sizeof(real *) * nLayerNum);
  dAP = (real **) malloc(sizeof(real *) * nLayerNum);
  dAE = (real **) malloc(sizeof(real *) * nLayerNum);
  dAO = (real **) malloc(sizeof(real *) * nLayerNum);
  dGH = (real **) malloc(sizeof(real *) * nLayerNum);
  dGE = (real **) malloc(sizeof(real *) * nLayerNum);
  dGAI = (real **) malloc(sizeof(real *) * nLayerNum);
  dGAP = (real **) malloc(sizeof(real *) * nLayerNum);
  dGAE = (real **) malloc(sizeof(real *) * nLayerNum);
  dGAO = (real **) malloc(sizeof(real *) * nLayerNum);
  for (nCN = 0; nCN < nLayerNum; nCN ++) {

    dI[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);   /* nLayer * nodeSize * maxWordAllDoc */
    dP[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dE[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dO[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dH[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dAI[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dAP[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dAE[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dAO[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dGH[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dGE[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dGAI[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dGAP[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dGAE[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
    dGAO[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);

  }

  nWSizeI = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
  nWSizeP = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
  nWSizeE = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
  nWSizeO = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
  for (nCN = 0; nCN < nLayerNum; nCN ++) {
    
    nWSizeI[nCN] = nNumOfWords + nVecDim[nCN] + 2;   /* numInputWord + nodeSize + 2 */
    if (nCN > 0) nWSizeI[nCN] += nVecDim[nCN - 1];   /* numInputWord + nodeSize + 2 + prevVecSize */
    nWSizeP[nCN] = nNumOfWords + nVecDim[nCN] + 2;
    if (nCN > 0) nWSizeP[nCN] += nVecDim[nCN - 1];
    nWSizeE[nCN] = nNumOfWords + nVecDim[nCN] + 1;
    if (nCN > 0) nWSizeE[nCN] += nVecDim[nCN - 1];
    nWSizeO[nCN] = nNumOfWords + nVecDim[nCN] + 2;
    if (nCN > 0) nWSizeO[nCN] += nVecDim[nCN - 1];

  }

  dWI = (real ***) malloc(sizeof(real **) * nLayerNum);  /* inut Gate   */
  dWP = (real ***) malloc(sizeof(real **) * nLayerNum);  /* forget Gate */
  dWE = (real ***) malloc(sizeof(real **) * nLayerNum);  /* cells */
  dWO = (real ***) malloc(sizeof(real **) * nLayerNum);  /* output Gate */
  dGWI = (real ***) malloc(sizeof(real **) * nLayerNum);
  dGWP = (real ***) malloc(sizeof(real **) * nLayerNum);
  dGWE = (real ***) malloc(sizeof(real **) * nLayerNum);
  dGWO = (real ***) malloc(sizeof(real **) * nLayerNum);
  for (nCN = 0; nCN < nLayerNum; nCN ++) {

    dWI[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
    for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
      dWI[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeI[nCN]);  /* nLayer * nodeSize * wightSize(numInput+vecSize*2 + 2) */
    dWP[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
    for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
      dWP[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeP[nCN]);
    dWE[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
    for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
      dWE[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeE[nCN]);
    dWO[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
    for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
      dWO[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeO[nCN]);

    dGWI[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
    for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
      dGWI[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeI[nCN]);
    dGWP[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
    for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
      dGWP[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeP[nCN]);
    dGWE[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
    for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
      dGWE[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeE[nCN]);
    dGWO[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
    for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
      dGWO[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeO[nCN]);

  }

  // print statistics
  fprintf(fOut, "# %llu docs", nNumOfDocs);
  fprintf(fOut, ", %u words", nNumOfWords);
  for (nCN = 0; nCN < nLayerNum; nCN ++)
    fprintf(fOut, ", %u units (%u)", nVecDim[nCN], nCN);
  fprintf(fOut, "\n");
  fflush(fOut);

  initParam();

  nIteration = 1;
  dLearningRate = LEARNING_RATE;
  time(&start_time);
  while (1) {

    nNumOfSamples = 0;
    dLogLH = 0.0;
    dTotalLength = 0.0;
    for (nCD = 0; nCD < nNumOfDocs; nCD ++) {

      dLogLH += forward(nCD);
      backward(nCD);
      weight(nCD);

      nNumOfSamples ++;
      dTotalLength += (real) (nDocLen[nCD] + 1);
      if (nNumOfSamples % 100 == 0) {

	fprintf(fOut, "## %d %llu ", nIteration, nNumOfSamples);
	fprintf(fOut, "%.0f ", dTotalLength);
	fprintf(fOut, "%.3f ", exp(- dLogLH / dTotalLength));
	put_time(fOut);
	fprintf(fOut, "\n");
	fflush(fOut);

      }

    }
    if(nIteration >= LIMIT) break;
    nIteration ++;

    //printParam(fw);
    //fclose(fw);


  }
    /* add nakai */
  if(fdb) {
    printYpd(fdb);
    fclose(fdb);
  }

    iT = 0;
    nextCK=0;
    nextOld=0;
    j = 0;
    while(j <= 1000 && iT < nMaxDocLen) {
      nextCK = forwardSub(iT,nextOld,1);
      /**
      for(i=0;i<nNumOfWords;i++) {
        fprintf(stderr,"%2d %3s %lf\n",iT,sWords[i],dY[iT*nNumOfWords+i]);
      }
      **/
      if(iT == 0 && nextCK == nBorder) {
        nextCK = nextOld;
        continue;
      }
      fprintf(fw,"%d,%s\n",j,sWords[nextCK]);

      //if(iT > 0 && nextCK == nBorder) break;
      //fprintf(stderr,"%s ",sWords[nextCK]);
      if(!strcmp(sWords[nextCK],"<S>")) {
        nextOld=0;
        iT=0;
        //fprintf(stderr,"\n");
        j++;
      }
      nextOld = nextCK;
      iT++;
    } 
    fclose(fw);
    
  exit(0);
}
/**************
  Zero Malloc

  Allocate `size` bytes of heap memory with every byte set to '\0'.

  size    : number of bytes requested
  returns : pointer to the zero-filled buffer (release it with free()),
            or NULL when size is negative or the allocation fails.
**************/
char *comAlloc(int size)
{
    /* A negative size would convert to a huge size_t; refuse it up front. */
    if(size < 0) return(NULL);

    /* calloc zero-fills for us and reports failure as NULL, so a failed
       allocation is never written through. */
    return((char *)calloc((size_t)size,sizeof(char)));
}
/**************
  Make Test Data

  Build one pseudo-random symbol sequence using rand().

  The sequence starts with symbol index 1 ("B").  At every step one
  random number r in [0,1] is drawn:
    r < 0.1        : append the terminator (symbol index N) and stop
    0.1 <= r < 0.6 : append (previous + 1) % N
    otherwise      : look back (previous % 5 + 1) positions (falling back
                     to the first element when the history is too short)
                     and append (that element + 1) % N
  Each index is then converted to its string from the symbol table
  {"A".."J","<S>"}.

  N       : modulus for the generated indices and index of the terminator;
            must be in 1..10 because the symbol table has 11 entries
            (N == 10 makes the terminator "<S>").
  pSize   : receives the number of strings returned (0 on failure);
            may be NULL.
  returns : array of *pSize heap-allocated strings (release it with
            freeTestData()), or NULL when N is out of range or an
            allocation fails.
***************/
char  **makeTestData(int N,int *pSize)
{
    static const char *const moji[11]={"A","B","C","D","E","F","G","H","I","J","<S>"};
    const int chunk = 256;      /* growth step of the index buffer */
    double rands;
    int *iArray,*grown;
    char **cArray;
    int capacity,count,k,back,base;
    size_t keta;

    if(pSize) *pSize = 0;

    /* N <= 0 would divide by zero / index negatively; N > 10 would read
       past the end of moji[]. */
    if(N < 1 || N > 10) return(NULL);

    capacity = chunk;
    iArray = (int *)malloc(sizeof(int)*(size_t)capacity);
    if(!iArray) return(NULL);

    iArray[0]=1;
    count=1;
    while(1) {
      /* Grow only when the next write would not fit. */
      if(count >= capacity) {
        grown = (int *)realloc(iArray,sizeof(int)*(size_t)(capacity+chunk));
        if(!grown) {
          free(iArray);
          return(NULL);
        }
        iArray = grown;
        capacity += chunk;
      }

      rands = (double)rand()/(double)RAND_MAX;
      if(rands < 0.1) {
        /* terminator */
        iArray[count++] = N;
        break;
      }
      if(rands < 0.6) {
        base = iArray[count-1];
      }
      else {
        back = iArray[count-1] % 5 + 1;
        base = (back < count) ? iArray[count-back] : iArray[0];
      }
      iArray[count++] = (base + 1) % N;
    }

    /* Convert the index sequence into freshly allocated strings. */
    cArray = (char **)calloc((size_t)count,sizeof(char *));
    if(!cArray) {
      free(iArray);
      return(NULL);
    }
    for(k=0;k<count;k++) {
      keta = strlen(moji[iArray[k]])+1;
      cArray[k] = (char *)malloc(keta);
      if(!cArray[k]) {
        /* undo the strings built so far */
        while(k > 0) free(cArray[--k]);
        free(cArray);
        free(iArray);
        return(NULL);
      }
      memcpy(cArray[k],moji[iArray[k]],keta);
    }
    free(iArray);

    if(pSize) *pSize = count;
    return(cArray);
}
/**************
  Free Test data

  Release a string array: free the first `size` strings of cArray,
  then the array itself.

  cArray  : array of heap-allocated strings
  size    : number of strings stored in cArray
  returns : size, unchanged
***************/
int freeTestData(char **cArray,int size)
{
    char **cursor = cArray;
    int remaining = size;

    /* walk the array front to back, releasing one string per step */
    while(remaining > 0) {
      free(*cursor);
      cursor++;
      remaining--;
    }
    free(cArray);

    return(size);
}
