Star Wars Publishing Context

Motivation

One of my favorite things about Star Wars is its range. Since its inception, stories about the Star Wars universe have been told through almost every medium imaginable: movies, video games, short stories, audio dramas, and more. I thought it would be fun to make a little tool that tells users what was going on in the publishing context of the Star Wars universe around a given date.

Explanation/Use

This project takes place in three steps.

Step 1: Get the Data
- First, we need the data! Luckily, there's a pretty exhaustive "List of Star Wars media" article on the Star Wars Wiki: Wookieepedia. We'll go to the page here, and "Right Click" -> "Save Page As..." -> "Web Page, HTML only" to save a local version of the associated Wookieepedia page.

Step 2: Prep the Data

Next, let's transform the saved webpage into an easily digestible CSV file holding each media item's name, publication date, and associated Wookieepedia link. To do that, I wrote the following Node.js script, named "PageToDataFile.js"...

// Modules
const fs = require('fs');
const cheerio = require('cheerio');

// Global constants
// Path to the locally saved HTML copy of the Wookieepedia page (input of this script)
const HTML_PAGE_FILE = "./Input/ListOfStarWarsMedia_Wookieepedia.html";
// Path of the CSV data file this script writes (output of this script)
const DATA_OUTPUT_FILE = "./Output/ListOfStarWarsMedia_Data.csv";
// Site root that is prefixed to each media item's href when the CSV rows are built
const WOOKIEEPEDIA_ROOT_DOMAIN = "https://starwars.fandom.com";

// Reads the file at the given path as UTF-8 text (the read itself is synchronous).
// Being an async function, it hands the caller a promise that resolves to the
// ... file contents. If the read fails, the error is printed with console.error
// ... and the promise resolves to undefined instead of rejecting.
const readHTMLFile = async function(filePath) {
    let contents;
    try {
        contents = fs.readFileSync(filePath, 'utf8');
    } catch (readError) {
        console.error(readError);
    }
    return contents;
};

// Maps an English month name (matched case-insensitively, e.g. "March") to its
// ... zero-based index: "january" -> 0 through "december" -> 11.
// Returns -1 when the string is not one of the twelve month names.
function convertMonthToInt(monthString) {
    const monthNames = [
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december"
    ];
    const wanted = monthString.toLowerCase();
    return monthNames.findIndex((name) => name === wanted);
}

// Converts given date string of form "MONTH DAY, YEAR", "MONTH YEAR",
// ... or "YEAR" to the date string produced by Date.prototype.toLocaleDateString()
// ... (e.g. "5/25/1977" under an en-US locale; the exact format depends on the
// ... locale of the machine running the script).
// A missing day defaults to the 1st and a missing month to January.
// Returns "Invalid Date" when the year is not a number, the month name is not
// ... recognized, or the day is not a number.
function convertDate(dateString) {
    const tokens = dateString.trim().split(/\s+/);

    // Number("") is 0, so a blank token must be rejected explicitly
    const parseYear = (token) => (token === "" ? NaN : Number(token));
    // convertMonthToInt signals "unknown" with -1, which Date would silently
    // ... wrap to December of the previous year; NaN makes the date invalid instead
    const parseMonth = (token) => {
        const monthIndex = convertMonthToInt(token);
        return monthIndex === -1 ? NaN : monthIndex;
    };

    let year;
    let month = 0;
    let day = 1;
    if (tokens.length === 3) {
        // "MONTH DAY, YEAR" (parseInt ignores the comma after the day)
        month = parseMonth(tokens[0]);
        day = parseInt(tokens[1], 10);
        year = parseYear(tokens[2]);
    } else if (tokens.length === 2 && Number.isNaN(Number(tokens[0]))) {
        // "MONTH YEAR"
        month = parseMonth(tokens[0]);
        year = parseYear(tokens[1]);
    } else {
        // "YEAR" (anything else is read as a year and ends up invalid if it is not one)
        year = parseYear(tokens[0]);
    }

    // Year, month and day are applied in ONE call on a fixed base date. Setting
    // ... them one at a time on "today" lets intermediate values overflow
    // ... (e.g. setMonth(1) on the 31st rolls into March), so the result would
    // ... depend on the day the script happens to run.
    const date = new Date(2000, 0, 1, 12);
    date.setFullYear(year, month, day);
    return date.toLocaleDateString();
}

// Dash-like characters that may separate a media item's name from its date
const NAME_DATE_DASHES = ["―", "-", "–", "—", "⁃", "−", "‒", "⸗", "⸺", "⸻"];

// Splits "NAME <dash> DATE" text at its LAST dash-like character, so hyphens
// ... inside the name (e.g. "X-Wing") are kept intact.
// Returns { name, date, foundDash }; without any dash the whole text is
// ... treated as the date and the name is empty.
function splitNameAndDate(text) {
    let splitAt = -1;
    let dashLength = 0;
    NAME_DATE_DASHES.forEach((dash) => {
        const idx = text.lastIndexOf(dash);
        if (idx > splitAt) {
            splitAt = idx;
            dashLength = dash.length;
        }
    });
    if (splitAt === -1) {
        return { name: "", date: text.trim(), foundDash: false };
    }
    return {
        name: text.slice(0, splitAt).trim(),
        date: text.slice(splitAt + dashLength).trim(),
        foundDash: true
    };
}

// Formats one value as a CSV field: values containing a comma, double quote,
// ... or line break are wrapped in double quotes with inner quotes doubled;
// ... all other values are written unchanged. null/undefined become "".
function toCsvField(value) {
    const text = (value === undefined || value === null) ? "" : String(value);
    if (/[",\r\n]/.test(text)) {
        return '"' + text.replace(/"/g, '""') + '"';
    }
    return text;
}

// Builds the link written to the CSV: site-relative hrefs get the Wookieepedia
// ... root prefixed, hrefs that are already absolute http(s) URLs are kept,
// ... and a missing href yields an empty field
function toAbsoluteLink(href) {
    if (typeof href !== "string" || href === "") {
        return "";
    }
    if (/^https?:\/\//i.test(href)) {
        return href;
    }
    return WOOKIEEPEDIA_ROOT_DOMAIN + href;
}

// Main function: reads the saved Wookieepedia page, collects every media item
// ... listed under the year headers, and writes them to DATA_OUTPUT_FILE as CSV
// ... rows of DATE,NAME,LINK. Rejects if the CSV file cannot be written.
const main = async function() {

    // Range of year headers to collect
    const FIRST_YEAR = 1967;
    const LAST_YEAR = 2022;
    // Years whose list sits inside a table in the div following the header
    const FIRST_TABLE_YEAR = 2012;
    const LAST_TABLE_YEAR = 2021;

    console.log("Collecting media items...");

    const htmlFileAsString = await readHTMLFile(HTML_PAGE_FILE);
    if (typeof htmlFileAsString !== "string") {
        // Stop here so a failed read cannot overwrite an existing CSV with an empty one
        console.error("Could not read \"" + HTML_PAGE_FILE + "\"; no CSV file was written");
        process.exitCode = 1;
        return;
    }

    // Turning the HTML string into a cheerio-parsable object
    const $ = cheerio.load(htmlFileAsString);

    // Finding all h2 elements that contain a span element with an id
    // ... that is a number within the specified year range
    const h2Elements = $('h2').has('span[id]').filter((i, el) => {
        const id = parseInt($(el).find('span').attr('id'), 10);
        return id >= FIRST_YEAR && id <= LAST_YEAR;
    });

    // For each h2 element (year header), finding the year ul after it and
    // ... collecting its top-level nodes (the media items)
    const liElements = [];
    h2Elements.each((i, el) => {
        const year = parseInt($(el).find('span').first().text(), 10);
        let ul;
        if (year >= FIRST_TABLE_YEAR && year <= LAST_TABLE_YEAR) {
            // Stored in the next div at table -> tbody -> tr -> td -> div -> ul
            ul = $(el).next('div')
                .find('table').first()
                .find('tbody').first()
                .find('tr').first()
                .find('td').first()
                .find('div').first()
                .find('ul').first();
        } else {
            // Other years store the year ul directly after the header
            ul = $(el).next('ul');
        }
        const ulHtml = ul.html();
        if (!ulHtml) {
            // No list found for this year header
            return;
        }
        $(ulHtml).each((j, node) => {
            liElements.push(node);
        });
    });

    // For each li (media item), collect [date, name, href]
    const mediaItems = [];
    liElements.forEach((mediaItem) => {
        const mediaItemNameAndDate = $(mediaItem).text().trim();
        if (mediaItemNameAndDate === "") {
            // Whitespace-only nodes between the list items
            return;
        }

        const parts = splitNameAndDate(mediaItemNameAndDate);
        if (!parts.foundDash) {
            console.log("No hyphen characters found in the string");
        }

        const mediaItemHref = $(mediaItem).find('a').attr('href');
        const mediaItemDateFormatted = convertDate(parts.date);
        if (mediaItemDateFormatted === "Invalid Date") {
            console.log("Invalid Date: ", parts.date);
        }
        mediaItems.push([mediaItemDateFormatted, parts.name, mediaItemHref]);
    });

    // Checking the collected items
    console.log("Media items collected: ", mediaItems.length);
    // Every entry should have 3 elements (date, name, and href)
    for (let i = 0; i < mediaItems.length; i++) {
        if (mediaItems[i].length !== 3) {
            console.log("ERROR: Not three!");
        }
    }
    // Printing a random item for the eye test (skipped when nothing was collected)
    if (mediaItems.length > 0) {
        const randomItem = mediaItems[Math.floor(Math.random() * mediaItems.length)];
        console.log("\tRandom item date: ", randomItem[0]);
        console.log("\tRandom item name: ", randomItem[1]);
        console.log("\tRandom item href: ", randomItem[2]);
    }

    // Outputting the information as CSV text (as DATE,NAME,LINK); fields are
    // ... escaped so titles containing commas or quotes cannot break a row
    let csv = "date,name,link\n";
    mediaItems.forEach((row) => {
        const fields = [row[0], row[1], toAbsoluteLink(row[2])];
        csv += fields.map((field) => toCsvField(field)).join(',') + '\n';
    });

    // Writing the CSV data to a file. A synchronous write makes a failure
    // ... surface as a rejection of main() instead of an exception thrown
    // ... from inside a callback where no caller can handle it
    fs.writeFileSync(DATA_OUTPUT_FILE, csv);
    console.log("CSV file successfully saved to \"" + DATA_OUTPUT_FILE + "\"");
};

// Run the script. main is async, so a failure arrives as a rejected promise:
// ... report it and exit with a non-zero code instead of leaving it unhandled
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

./Input/ListOfStarWarsMedia_Wookieepedia.html

node PageToDataFile.js

./Output/ListOfStarWarsMedia_Data.csv

Step 3: Query the Data
- Finally, we need to implement the part of the project that takes in user input and gives back the requested data. This is implemented here, in the browser! First, it takes the user-entered date and number of surrounding media items to retrieve. Next, it reads in the CSV data file generated in the previous step and searches it for the appropriate data. Finally, it displays what it found (the date, name, and associated link for each item) to the user below! For reference, here is the CSV file this webpage uses (generated locally from the process explained in the previous step). Go ahead, give it a try!

TODO/Things to Add Later

It would be nice to replace the locally saved copy of the "List of Star Wars media" Wookieepedia page with a script that scrapes the live page and regenerates the CSV each time it is run. That way, the data would always reflect the most up-to-date version of the article instead of the state of the page at the time it was saved.

Created: 06/08/2023
Last Updated: 06/08/2023

Star Wars Publishing Context

Motivation

Explanation/Use

Select a date and number of surrounding media items:

Results:

TODO/Things to Add Later