Sunday 12 May 2013

There is a big file of words which is dynamically changing. We are continuously adding some words into it. How would you keep track of top 10 trending words at each moment?


/*
      First Chose a data structure, we have to keep track of frequency of each word.

      *  HashMap : use words as keys and frequency as value.
      *  TRIE    : Read word from file and add each word to prefix Tree, keep a counter attach to each node and increment it everytime same word is found.
      *  BST     : BST identifies duplicates hence each word can be associated with a counter which is incremented every time it is encountered.
      *            But problem with BST is it takes O(logn) time for searching and inserting.
      *            And also if BST is not balanced and number of words are large it will effect search and insert time.
      *
      *
      *
      *  1. Maintain an array topWords of 10 elements that stores top 10 trending words in sorted order of their frequency of occurrence.
      *  2. Read file word by word.
      *  3. Insert each word in a trie/prefix Tree maintaining the frequency of occurrence of this word at the same time.
      *  4. At each insert frequency of that word is incremented and checked in top 10 trending word.
      *  5. If the word’s frequency is greater than any top10 treding word it’s entry is made to the list.
      *  6. If word is already in top 10 trending words it’s position is changed if necessary to maintain the sorted order of trending words.
      *  In code below I have included only structure of class and not complete class (excluded some functions)
*/


      class TRIENode
      {
            private:
                    int prefixCount;
                    int frequency;
                    bool isWord;
                    TRIENode **child;
            public:
                   TRIENode()
                   {
                            prefixCount = 0;
                            frequency = 0;
                            isWord = false;
                            child = new TRIENode*[26];
                            for(int i=0; i<26; i++)
                                    child[i] = NULL;
                   }
      };
   
      class prefixTree
      {
            private:
                    TRIENode *root;
                    string topWords[10];
                    int count[10];
            public:
                    prefixTree()
                    {
                                root = new TRIENode();
                                for(int i=0; i<10; i++)
                                {
                                        count[i] = 0;
                                        topWords[i] = "";
                                }
                    }
                 
                    void insertWord(string word)
                    {
                         int len = word.length();
                         TRIENode *ptr = root;
                         for(int i=0 ; i< len ; i++)
                         {
                                 int index = word[i] - 'a';
                                 if(ptr->getChild(index) == NULL)
                                 {
                                                       TRIENode *newNode = new TRIENode();
                                                       ptr->createChild(newNode,index);
                                                       ptr = newNode;
                                 }
                                 else
                                 {
                                     ptr->incrementPrefixcount();
                                     ptr = ptr->getChild(index);
                                 }
                         }
                         ptr->setIsWord();
                         ptr->incrementFrequency();
                         int frequency = ptr->getFrequency();
                         if(isTopTenWord(frequency))
                         {
                                  insertInTopTen(word,frequency);                          
                         }
                    }
                 
     
       //Read File Word by Word.
                    void readFile()
                    {
                         string words;
                         ifstream fin("C:\\Users\\varun\\Desktop\\test.txt");
                         while(fin>>words)
                         {
                                          insertWord(words);
                         }
                    }          
      };
   
      int main()
      {
          prefixTree tree;
          tree.readFile();
          tree.displayTopTenWords();
          system("pause");
      }


6 comments:

  1. I think Hash-table and Max-heap will do better.

    ReplyDelete
  2. I wouldn't like to build a hash for million of words.
    Best approach is to use TRIE to store all words and use HashMap + Heap for storing top 'K' trending words.
    since I had to check only top 10 trending words in the above question I ignored that part.

    ReplyDelete
  3. Max heap and trie will be better !!

    ReplyDelete
  4. @Crazy you might be asked to fetch 1 million trending words out of some billion words, in that case when a word is encountered you can only check if word is present in heap by comparing min element of heap with current word, but you cannot exactly locate the word and increment its frequency in heap in O(1). And searching heap of 1 million elements is not feasible, so a Hash + Heap will be better.
    overall, TRIE to keep track of all words and HashMap + Heap to keep track of top trending words.

    ReplyDelete
  5. trie + special balance bst ..
    time complexity -->string length for updation in trie and log n for special balance bst;

    special balance bst :

    Node CountNode{
    CountNode* left;
    CountNode* right;
    int count ; // to store the same occurence value of a particular group of string
    stringNode *string_node; // to store the strings have the same occurence value
    int no_of_string; // no of string in string node
    }
    strcut stringNodeincount {
    stringNodeincount * left;
    stringNodeincount * right;
    string string_value ;
    }


    }

    ReplyDelete
    Replies
    1. *Struct CountNode inplaceof Node CountNode

      Delete