Create a wordcloud from any page

On the ODT web site we’d like to generate word clouds for our local and linked content, so I went looking for a simple JavaScript word cloud generator. I found a project on Clusterify that provides a jQuery bookmarklet to do that on any page: http://clusterify.com/projects/list/fsavard/6/

Here’s the example: Generate Word Cloud

Note: I’ve also created a version that highlights the words on the page when you click them. Create a wordcloud from any page with search function

I made several modifications:

  1. New:
    var wordMatcher = /[A-Za-z-]{3,}/g;

    Old:

    var wordMatcher = /[A-Za-z0-9_-]+/g;

    I used {3,} so that only words of 3 or more characters are matched, and I removed digits and underscores from the character class.

  2. New:
    var fontMin = 10;

    Old:

    var fontMin = 20;

    I decreased the minimum font size.

  3. New:
    javascript:function%20loadScript(scriptURL)%20{%20var%20scriptElem%20=%20document.createElement('SCRIPT');%20scriptElem.setAttribute('language',%20'JavaScript');%20scriptElem.setAttribute('src',%20scriptURL);%20document.body.appendChild(scriptElem);}%20loadScript('http://odt.uoregon.edu/v_includes/wordle_jquery.js');

    Old:

    javascript:function%20loadScript(scriptURL)%20{%20var%20scriptElem%20=%20document.createElement('SCRIPT');%20scriptElem.setAttribute('language',%20'JavaScript');%20scriptElem.setAttribute('src',%20scriptURL);%20document.body.appendChild(scriptElem);}%20loadScript('http://fsavard.com/code/wordcloud.js');

    Here I updated the code to our local url and pointed the appendChild to the head and not the body

  4. New:
        // Check if jQuery's loaded
        function GM_wait() {
    		if(typeof window.jQuery == 'undefined') {
    			if(typeof GM_JQ == 'undefined') {
    				// Add jQuery
    				var GM_JQ = document.createElement('script');
    				GM_JQ.src = JQUERY_URL;
    				GM_JQ.type = 'text/javascript';
    				document.getElementsByTagName('head')[0].appendChild(GM_JQ);
    			}
    			window.setTimeout(GM_wait,100);
    		} else {
    			$ = window.jQuery; letsJQuery(); }
        }
        GM_wait();
    

    Old:

        // Add jQuery
        var GM_JQ = document.createElement('script');
        GM_JQ.src = JQUERY_URL;
    
        GM_JQ.type = 'text/javascript';
        document.getElementsByTagName('head')[0].appendChild(GM_JQ);
    
        // Check if jQuery's loaded
        function GM_wait() {
            if(typeof window.jQuery == 'undefined') { window.setTimeout(GM_wait,100); }
        else { $ = window.jQuery; letsJQuery(); }
        }
        GM_wait();
    

    Here I ensured that we don’t add jQuery a second time if it already exists on the page.

  5. Old:
    spans.push("<span style='font-size: "+freqs[i].fontSize+"px'>"+freqs[i].word+"</span> ");

    New:

    spans.push("<span style='margin: auto 4px;font-size: "+freqs[i].fontSize+"px;line-height: "+freqs[i].fontSize+"px'>"+freqs[i].word+"</span> ");

    Here I added line height and a margin to each word’s span

That’s my first round of changes. These guys did good work that was laid on the foundation of a lot of other great work out there.

Other changes I’d like to add:

  • I’m adding a var for minimum Frequency too, as I don’t want to waste space on single occurrences: minFreq = 2;
  • Adding random colors or colors from a scheme for each word
  • Change the background color
  • Make the background closeable
  • Potentially creating images of each word and adding rotation
  • Potentially making the background draggable

Here’s the full code in its current iteration:

// Based on impeachGod code at http://impeachgod.webs.com/wordle.js
// for the word frequency handling
(function(){
		var wordMatcher = /[A-Za-z-]{3,}/g; // a "word" is a run of 3+ letters/hyphens; digits and underscores never match

    // Very frequent English words that are dropped from the cloud.
    var commonWords = [
        "the", "of", "to", "and", "a", "in", "is", "it",
        "that", "was", "for", "on", "are", "with", "as", "be", "at",
        "one", "have", "this", "from", "or", "had", "by", "but", "some",
        "out", "were", "all", "when", "an", "each"
    ];

    // Rendering limits: font sizes are in px, maxWords caps how many words are shown.
    var fontMin = 10;
    var fontMax = 70;
    var maxWords = 100;

    // Frequency bookkeeping.
    var minFreq = 0; // minimum frequency threshold (0 = no filtering)
    var maxFreq = 0; // highest word count; written by countFrequencies, read by scale

    // External resources.
    var JQUERY_URL = 'http://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js';
    var CSS_URL = '';

    /* Bookmarklet code (change the url of the script to where you're hosting it):
javascript:function%20loadScript(scriptURL)%20{%20var%20scriptElem%20=%20document.createElement('SCRIPT');%20scriptElem.setAttribute('language',%20'JavaScript');%20scriptElem.setAttribute('src',%20scriptURL);%20document.body.appendChild(scriptElem);}%20loadScript('http://odt.uoregon.edu/v_includes/wordle_jquery.js');

Orig:
javascript:function%20loadScript(scriptURL)%20{%20var%20scriptElem%20=%20document.createElement('SCRIPT');%20scriptElem.setAttribute('language',%20'JavaScript');%20scriptElem.setAttribute('src',%20scriptURL);%20document.body.appendChild(scriptElem);}%20loadScript('http://fsavard.com/code/wordcloud.js');
    */

    //////////////////////////////////////////////////////////////////////////
    // Load CSS

    /*var headID = document.getElementsByTagName("head")[0];
    var cssNode = document.createElement('link');
    cssNode.type = 'text/css';
    cssNode.rel = 'stylesheet';
    cssNode.href = CSS_URL;
    cssNode.media = 'screen';
    headID.appendChild(cssNode);*/

    //////////////////////////////////////////////////////////////////////////
    // Jquery loading
    // Check if jQuery's loaded
    // jQuery handle private to this closure, so the host page's own `$`
    // (which may belong to another library) is never overwritten.
    var $;

    // Becomes true once our <script> tag has been injected. It must live
    // outside GM_wait: a `var` declared inside the function is hoisted and
    // is therefore `undefined` again on every poll, which would inject a
    // fresh <script> tag every 100 ms until jQuery finished loading.
    var jqueryRequested = false;

    /**
     * Waits for jQuery and then starts the word cloud.
     *
     * If window.jQuery is missing, a <script> tag pointing at JQUERY_URL is
     * injected (once) and the function re-schedules itself every 100 ms.
     * As soon as window.jQuery is defined, it is stored in the closure-local
     * `$` and letsJQuery() is called.
     */
    function GM_wait() {
        if (typeof window.jQuery == 'undefined') {
            if (!jqueryRequested) {
                jqueryRequested = true;
                // Add jQuery
                var GM_JQ = document.createElement('script');
                GM_JQ.src = JQUERY_URL;
                GM_JQ.type = 'text/javascript';
                // Fall back to the root element on pages without a <head>.
                (document.getElementsByTagName('head')[0] || document.documentElement).appendChild(GM_JQ);
            }
            window.setTimeout(GM_wait, 100);
        } else {
            $ = window.jQuery;
            letsJQuery();
        }
    }
    GM_wait();



    //////////////////////////////////////////////////////////////////////////
    // Actual functionality


    // Based on http://refactormycode.com/codes/341-jquery-all-descendent-text-nodes-within-a-node
    /**
     * Collects the text of every text node below document.body.
     *
     * The tree is walked depth-first through jQuery's .contents(); the
     * contents of <script> and <style> elements are skipped so that inline
     * code and CSS do not end up in the word cloud.
     *
     * @returns {string} all text node values joined by single spaces
     */
    function extractText() {
        // Local accumulator (previously an implicit global named `ret`).
        var ret = [];

        // Named visitor instead of arguments.callee, which is deprecated and
        // throws in strict mode. jQuery's .each() calls it with `this` set
        // to the current DOM node.
        function collect() {
            if (this.nodeType === 3) {
                // Text node.
                ret.push(this.nodeValue);
            } else if (!/^(script|style)$/i.test(this.nodeName)) {
                $(this).contents().each(collect);
            }
        }

        $(document.body).contents().each(collect);
        return ret.join(' ');
    }

    // Based on impeachGod code at http://impeachgod.webs.com/wordle.js


    /**
     * Splits text into lower-cased words using the wordMatcher pattern.
     *
     * @param {string} txt text to scan; null/undefined is treated as empty
     * @returns {string[]} matched words in lower case, in document order;
     *     an empty array when nothing matches
     */
    function parseWords(txt) {
        // String.prototype.match returns null (not an empty array) when a
        // global pattern finds nothing, e.g. on a page without any text.
        var words = (txt == null ? '' : String(txt)).match(wordMatcher) || [];
        var i;
        // convert to lowercase
        for (i = 0; i < words.length; i++) {
            words[i] = words[i].toLowerCase();
        }
        return words;
    }

    /**
     * Counts how often each word occurs and records the highest count.
     *
     * Words listed in commonWords are dropped, as are words occurring fewer
     * than minFreq times. As a side effect the shared maxFreq variable is
     * reset and then set to the highest count among the words that are
     * returned, so scale() maps the most frequent displayed word to fontMax.
     *
     * @param {string[]} words lower-cased words, duplicates included
     * @returns {Array<{word: string, freq: number}>} one entry per kept word
     */
    function countFrequencies(words) {
        var hasOwn = Object.prototype.hasOwnProperty;
        var freqs = {};
        var freqsArray = [];
        var word;
        var i;

        // count word occurence frequencies; the own-property test keeps
        // words such as "constructor" from colliding with inherited keys
        for (i = 0; i < words.length; i++) {
            word = words[i];
            if (hasOwn.call(freqs, word)) {
                freqs[word]++;
            } else {
                freqs[word] = 1;
            }
        }
        // remove common words
        for (i = 0; i < commonWords.length; i++) {
            delete freqs[commonWords[i]];
        }
        // convert to array, tracking the maximum over the kept words only
        maxFreq = 0;
        for (word in freqs) {
            if (hasOwn.call(freqs, word) && freqs[word] >= minFreq) {
                freqsArray.push({"word": word, "freq": freqs[word]});
                if (freqs[word] > maxFreq) {
                    maxFreq = freqs[word];
                }
            }
        }
        return freqsArray;
    }

    // Adapted to scale
    /**
     * Assigns a fontSize (px, rounded) to every entry, scaled linearly so
     * that the reference maximum count maps to fontMax.
     *
     * The reference maximum is the shared maxFreq, raised to the largest
     * freq actually present in the list when maxFreq is smaller (for
     * example still 0). This avoids dividing by zero and keeps every size
     * at or below fontMax.
     *
     * @param {Array<{word: string, freq: number}>} freqs entries to update in place
     */
    function scale(freqs) {
        var top = maxFreq > 0 ? maxFreq : 0;
        var i;
        for (i = 0; i < freqs.length; i++) {
            if (freqs[i].freq > top) {
                top = freqs[i].freq;
            }
        }
        for (i = 0; i < freqs.length; i++) {
            // top is 0 only when no entry has a positive count.
            freqs[i].fontSize = top > 0
                ? Math.round(freqs[i].freq / top * (fontMax - fontMin) + fontMin)
                : fontMin;
        }
    }


    /**
     * Renders the word cloud as a fixed-position overlay appended to <body>.
     *
     * Each word becomes a <span> whose font-size and line-height are the
     * entry's fontSize in px, with a small horizontal margin between words.
     *
     * @param {Array<{word: string, fontSize: number}>} freqs entries in display order
     */
    function makeCloud(freqs) {
        var div = $(document.createElement('div'));
        div.attr('style', 'position: fixed; left: 15px; top: 15px; z-index: 999; display: block; width: 90%; border: 1px solid black; padding: 30px; margin: 10px 10px; background-color: #74adcb;');

        // Local array (previously an implicit global).
        var spans = [];
        var size;
        var i;

        for (i = 0; i < freqs.length; i++) {
            size = freqs[i].fontSize;
            // Words come from wordMatcher (letters and hyphens only), so
            // they contain no HTML metacharacters.
            spans.push("<span style='margin: auto 4px;font-size: " + size + "px;line-height: " + size + "px'>" + freqs[i].word + "</span> ");
        }

        div.append(spans.join(" "));

        $(document.body).append(div);
    }

    //////////////////////////////////////////////////////////////////////////
    // Go

    // All your GM code must be inside this function
    /**
     * Entry point, run once jQuery is available: extracts the page text,
     * counts word frequencies, keeps the maxWords most frequent words,
     * orders them alphabetically, sizes them and renders the cloud.
     */
    function letsJQuery() {
        var txt = extractText();
        var words = parseWords(txt);
        var freqs = countFrequencies(words);

        // Most frequent first, so the cut below keeps the top words.
        freqs.sort(function(a, b) {
            return b.freq - a.freq;
        });

        // Keep only the first maxWords entries
        if (freqs.length > maxWords) {
            freqs.splice(maxWords, freqs.length - maxWords);
        }

        // Alphabetical display order. A comparator must return a negative,
        // zero or positive number; returning a boolean (never negative)
        // leaves the resulting order up to the engine's sort algorithm.
        freqs.sort(function(a, b) {
            if (a.word < b.word) { return -1; }
            if (a.word > b.word) { return 1; }
            return 0;
        });

        scale(freqs);

        makeCloud(freqs);
    }

})();
Leave a Comment