IDV Project II: Mapping the Twitterverse

To complete this project, we:

Collected data (tweets from the past 7 days containing the word 'wildlife') using Twitter's API (recent search endpoint) using the code below
Repeated the process for our keyword in six different languages
Converted the data from its output JSON file to an EXCEL file for formatting
Converted the formatted EXCEL file to a CSV file for upload
Uploaded the CSV file to Mapbox Studio and created a map style
Created a GitHub repository and page to host our content
Embedded the map into the HTML index file within our repository
Create a stacked bar chart using RStudio and Adobe Illustrator and added it to the HTML file
Added CSS rules for styling and cleaned up the HTML file

Here's a sample of the code we used to scrape Twitter based on Professor Pedro Cruz's code.

        
    const needle = require('needle');  // requires library to make HTTP requests
    const fs = require('fs');  // requires library to save files
    const token = "YOUR_BEARER_TOKEN";  // this is your bearer token
    
    const endpointUrl = "https://api.twitter.com/2/tweets/search/recent";  // End-point for recent search
    //const endpointUrl = "https://api.twitter.com/2/tweets/search/all";  // This would be the end-point for the full historical search, only available for academic licenses
    
    
    /** This function simply puts a request given a certain 'query'
    *  and a batch of paginated results indicated in 'nextToken' */
    
    async function getRequest(query, nextToken) {
    
    // builds a query object to send
    function buildQuery() {
    let q = {
        "query": query,  // the query
        "max_results": 100,  // 100 is the max results per requests for standard licenses in sandboxed environments
        "expansions": "geo.place_id,attachments.media_keys",  // whenever a tweet has geographical information in the form a of place_id, get more info on that place_id, and grab media
        "tweet.fields": "author_id,created_at,geo",  // by default the Tweet id and and the text are returned, but here we're also including the author_id, the time of publishing, and its geographical features
        "place.fields": "geo,name,full_name,place_type";  // the additional information associated with a place_id, namely its name and a geo object with geographical coordinates, either in the form of a point or a bounding box
        "media.fields": "preview_image_url,url" // grab media URLs
    };
    // the nextToken paramenter is optional (as there is none in the first request)
    // but if a nextToken was passed, then it inserts it into the query object
    if (nextToken !== undefined) q["next_token"] = nextToken;
    return q;
    }
    
    const response = await needle('get', endpointUrl, buildQuery(), {
    headers: {
        "User-Agent": "v2RecentSearchJS",  // Can be whatever you want
        "authorization": "Bearer " + token  // Attaches your Bearer token to the header of the request
    }
    })
    return response.body; // Returns the contents of the response
    }
    
    
    /** async funtions enable us to stop the program to wait on requests
    *  this function is the core of the program and where execution starts */
    
    (async function () {
    
    /** an anonymous function that gets a whole batch of tweet reponses
    *  and only adds the ones with geo information to 'array' */
    function filterTweets(array, batch) {
    batch.data.forEach(tweet => {
        if (tweet["geo"] !== undefined) {
            /* expands place_id */
            if (tweet.geo["place_id"] !== undefined) {
                /* associates the place_id with the expanded information on place_ids in the response */
                let expanded_geo = batch.includes.places.find(place => {
                    return place.id == tweet.geo.place_id;
                });
                // adds new variable to tweet object called 'place_info'
                tweet.place_info = expanded_geo;
            }
            if (tweet['place_info']['geo']['bbox'] == !undefined) {
                //average of the bbox
                /*
                bbox[0] -> x of bottom left corner
                bbox[1] -> y of bottom left corner
                bbox[2] -> x of top right corner
                bbox[3] -> y of top right corner
                */
    
                let xa = (tweet.place_info.geo.bbox[0] + tweet.place_info.geo.bbox[2]) / 2
                let ya = (tweet.place_info.geo.bbox[1] + tweet.place_info.geo.bbox[3]) / 2
    
                tweet.place_info.geo.center = [xa, ya];
            }
    
    
            if (tweet.attachments !== undefined) {
                // if (tweet.attachments['media_keys'] !== undefined) {
                let mediaKey = tweet.attachments.media_keys[0];
                /* associates the media_keys with the expanded information in the response */
                let relevantMedia = batch.includes.media.find(mediaEntry => {
                    return mediaEntry.media_key == mediaKey
                });
    
                // adds new variable to tweet object called 'media_url'
                if (relevantMedia.type == 'photo') {
                    tweet.media_url = relevantMedia.url;
                }
                else if (relevantMedia.type == 'video') {
                    tweet.media_url = relevantMedia.preview_image_url;
                }
            }
    
            array.push(tweet);
        }
    });
    
    }
    
    let filteredTweets = []; //array to keep all collected tweets
    
    const query = "wildlife -is:retweet"; // we are searching for the word 'wildlife' but only in tweets that are *not* retweets since retweets never have geo information
    let response = await getRequest(query);  // finally, put the request and wait for the response
    //console.log(response);  // DEBUG
    const nGeoTweets = 500;  // after how many collected tweets with geo info are stopping execution (in this example: 500)
    while (response.meta["next_token"] !== undefined) {
    response = await getRequest(query, response.meta.next_token);
    filterTweets(filteredTweets, response);
    console.log(filteredTweets.length); // DEBUG
    await sleep(2100);  /*  sleeps the program for 2.1seconds : 
                the standard rate limit is 450 requests per 15 min time period.
                If you make more than 450 requests in less than 15 mins, the API
                will block further requests until the 15 mins period is over;
                Since the percentage of tweets with geo information is low you will 
                need to place more than 450 requets (remember that each requests returns 100 tweets)
                In order to stay under the rate limit and leave the program executing in the
                background collecting tweets, only one request every 2 seconds should be placed */
    
    //  if we have enough tweets, stops collecting                        
    if (filteredTweets.length >= nGeoTweets) break;
    
    }
    
    console.log("TWEETS WITH GEO: " + filteredTweets.length); // DEBUG
    console.log(filteredTweets); // DEBUG
    fs.writeFileSync("English.json", JSON.stringify(filteredTweets)); // Save the results to a file in the disk
    process.exit(); // terminates the program
    })();
    
    /** Utility function that sleeps the program for 'ms' milliseconds */
    function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
    }

Mapping the Twitterverse: Wildlife Safari Edition

Tweet:

Date:

Tweets by Origin Locations During Weekdays vs. Weekend Days

To complete this project, we: