One of my favorite things about Star Wars is its range. Since its inception, stories have been told about its universe in almost every medium imaginable: movies, video games, short stories, audio dramas, and more. I thought it would be fun to make a little tool that tells users what was going on in Star Wars publishing around a given date.
This project takes place in three steps.
PageToDataFile.js
"...
// Modules
const fs = require('fs');
const cheerio = require('cheerio');
/* ---------- Global constants ---------- */

/** Path to the saved HTML copy of the Wookieepedia page that this script reads. */
const HTML_PAGE_FILE = './Input/ListOfStarWarsMedia_Wookieepedia.html';

/** Path of the CSV data file that this script writes. */
const DATA_OUTPUT_FILE = './Output/ListOfStarWarsMedia_Data.csv';

/** Root domain that is prefixed to each media item's href. */
const WOOKIEEPEDIA_ROOT_DOMAIN = 'https://starwars.fandom.com';
/**
 * Reads the file at the given path as UTF-8 text.
 *
 * The read itself is synchronous; the function is async only so callers can
 * `await` it. A failed read is logged with console.error and is NOT rethrown.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string|undefined>} The file contents, or undefined when
 *   the file could not be read.
 */
const readHTMLFile = async (filePath) => {
  let contents;
  try {
    contents = fs.readFileSync(filePath, 'utf8');
  } catch (readError) {
    // Best effort: report the problem and fall through to return undefined
    console.error(readError);
  }
  return contents;
};
/**
 * Maps an English month name to its zero-based month number.
 *
 * The comparison is case-insensitive and requires the full month name.
 *
 * @param {string} monthString - Month name, e.g. "May" or "DECEMBER".
 * @returns {number} 0 for January through 11 for December, or -1 when the
 *   name is not a recognised month.
 */
function convertMonthToInt(monthString) {
  const wanted = monthString.toLowerCase();
  const monthNames = [
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
  ];
  return monthNames.findIndex((monthName) => monthName === wanted);
}
/**
 * Converts a release-date string into a formatted calendar date.
 *
 * Accepted input forms:
 *   - "MONTH DAY, YEAR" (the comma after the day is optional)
 *   - "MONTH YEAR"      (the day defaults to 1)
 *   - "YEAR"            (the date defaults to January 1)
 * Any other shape is read as "YEAR" using its first whitespace-separated word.
 *
 * The result comes from Date.prototype.toLocaleDateString(), so its exact
 * layout depends on the runtime's locale (e.g. "5/25/1977" under en-US).
 *
 * @param {string} dateString - The date text to convert.
 * @returns {string} The formatted date, or "Invalid Date" when the year or
 *   day is not a whole number or the month name is not recognised.
 */
function convertDate(dateString) {
  // Strict integer parse: only a plain run of digits counts, everything else is NaN
  const toInteger = (text) => (/^\d+$/.test(text) ? Number(text) : NaN);

  // \s also matches non-breaking spaces, which are common in text taken from HTML
  const parts = dateString.trim().split(/\s+/);

  let year;
  let month = 0;
  let day = 1;

  if (parts.length === 3) {
    // "MONTH DAY, YEAR"
    month = convertMonthToInt(parts[0]);
    day = toInteger(parts[1].replace(/,$/, ''));
    year = toInteger(parts[2]);
  } else if (parts.length === 2 && Number.isNaN(toInteger(parts[0]))) {
    // "MONTH YEAR"
    month = convertMonthToInt(parts[0]);
    year = toInteger(parts[1]);
  } else {
    // "YEAR" (or any other shape: only the first word is considered)
    year = toInteger(parts[0]);
  }

  // An unrecognised month name (-1) must not silently become December
  if (month < 0) {
    month = NaN;
  }

  // Year, month and day are set in ONE call so the result never depends on
  // today's date. Setting them one at a time on `new Date()` rolls over into
  // the next month when today's day-of-month does not exist in the target
  // month (e.g. running on the 31st while parsing a February date).
  // The base time is noon so a daylight-saving shift cannot change the day.
  const date = new Date(2000, 0, 1, 12);
  date.setFullYear(year, month, day);
  return date.toLocaleDateString();
}
// Dash-like characters that can separate a media item's name from its release date
const NAME_DATE_SEPARATORS = ["―", "-", "–", "—", "⁃", "−", "‒", "⸗", "⸺", "⸻"];

/**
 * Splits "NAME <dash> DATE" text at its LAST dash-like character, so dashes
 * that belong to the name itself (e.g. "X-Wing") are kept.
 *
 * @param {string} text - Trimmed text of one media item.
 * @returns {{name: string, date: string, found: boolean}} The two halves,
 *   trimmed. When no separator exists, name is "" and date is the whole text.
 */
function splitNameAndDate(text) {
  const separatorIndex = Math.max(...NAME_DATE_SEPARATORS.map((ch) => text.lastIndexOf(ch)));
  if (separatorIndex === -1) {
    return { name: "", date: text.trim(), found: false };
  }
  return {
    name: text.slice(0, separatorIndex).trim(),
    date: text.slice(separatorIndex + 1).trim(),
    found: true,
  };
}

/**
 * Turns a page href into an absolute Wookieepedia URL.
 *
 * @param {string|undefined} href - The href attribute of a media item's link.
 * @returns {string} "" when there is no href, the href itself when it is
 *   already an absolute http(s) URL, otherwise the root domain plus the href.
 */
function toAbsoluteUrl(href) {
  if (!href) {
    return "";
  }
  return /^https?:\/\//i.test(href) ? href : WOOKIEEPEDIA_ROOT_DOMAIN + href;
}

/**
 * Formats one value as a CSV field. Values containing a comma, a double quote
 * or a line break are wrapped in double quotes (inner quotes are doubled);
 * all other values are emitted unchanged.
 *
 * @param {*} value - The value to format.
 * @returns {string} The CSV-safe text.
 */
function toCsvField(value) {
  const text = value == null ? "" : String(value);
  return /[",\r\n]/.test(text) ? '"' + text.replace(/"/g, '""') + '"' : text;
}

/**
 * Main function: reads the saved page, collects every media item listed under
 * the year headers 1967-2022, and writes them to DATA_OUTPUT_FILE as CSV rows
 * of the form "date,name,url".
 *
 * @returns {Promise<void>} Resolves once the CSV file has been written;
 *   rejects if writing fails.
 */
const main = async function() {
  console.log("Collecting media items...");
  const htmlFileAsString = await readHTMLFile(HTML_PAGE_FILE);
  if (typeof htmlFileAsString !== "string") {
    // readHTMLFile logs the underlying error and yields undefined on failure
    console.error("Could not read \"" + HTML_PAGE_FILE + "\"; nothing to convert.");
    process.exitCode = 1;
    return;
  }

  // Turning the HTML string into a cheerio-parsable object
  const $ = cheerio.load(htmlFileAsString);

  // Year headers: h2 elements containing a span with an id, where the id of
  // the first span is a number within the supported range
  const h2Elements = $('h2').has('span[id]').filter((i, el) => {
    const id = parseInt($(el).find('span').attr('id'), 10);
    return id >= 1967 && id <= 2022;
  });

  // For each year header, collect the children of the year's list (the media items)
  const liElements = [];
  h2Elements.each((i, el) => {
    const year = parseInt($(el).find('span').first().text(), 10);
    let ul;
    if (year >= 2012 && year <= 2021) {
      // These years keep the list in the next div at table -> tbody -> tr -> td -> div -> ul
      ul = $(el).next('div')
        .find('table').first()
        .find('tbody').first()
        .find('tr').first()
        .find('td').first()
        .find('div').first()
        .find('ul').first();
    } else {
      // Other years have the list directly after the header
      ul = $(el).next('ul');
    }
    ul.children().each((j, item) => {
      liElements.push(item);
    });
  });

  // For each media item, record [date, name, href]
  const mediaItems = [];
  liElements.forEach((mediaItem) => {
    const mediaItemNameAndDate = $(mediaItem).text().trim();
    if (mediaItemNameAndDate === "") {
      return;
    }

    const nameAndDate = splitNameAndDate(mediaItemNameAndDate);
    if (!nameAndDate.found) {
      console.log("No hyphen characters found in the string", mediaItemNameAndDate);
    }

    const mediaItemHref = $(mediaItem).find('a').attr('href');
    const mediaItemDateFormatted = convertDate(nameAndDate.date);
    if (mediaItemDateFormatted === "Invalid Date") {
      console.log("Invalid Date: ", nameAndDate.date);
    }
    mediaItems.push([mediaItemDateFormatted, nameAndDate.name, mediaItemHref]);
  });

  // Checking the collected items
  console.log("Media items collected: ", mediaItems.length);

  // NOTE(review): the original text between the item count and the CSV loop
  // was truncated. The field-count check below is a reconstruction based on
  // the surviving comment; the truncated original also picked a random item
  // whose use is unknown, so it is omitted here. Whether the original wrote a
  // CSV header row is unknown — none is written here; confirm against the
  // consumer of the file.
  // Every entry should have 3 elements (date, name, and href)
  for (let i = 0; i < mediaItems.length; i++) {
    if (mediaItems[i].length !== 3) {
      console.log("Unexpected number of fields in media item " + i + ": ", mediaItems[i]);
    }
  }

  // Building the CSV text: one "date,name,url" line per media item
  const csv = mediaItems
    .map(([date, name, href]) => [date, name, toAbsoluteUrl(href)].map((field) => toCsvField(field)).join(',') + '\n')
    .join('');

  // Writing the CSV data to a file; awaiting keeps a write failure inside
  // this function's promise instead of throwing from a detached callback
  await fs.promises.writeFile(DATA_OUTPUT_FILE, csv);
  console.log("CSV file successfully saved to \"" + DATA_OUTPUT_FILE + "\"");
}
// Run the script; report any failure and signal it through the exit code
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
To use it, save the Wookieepedia page to "./Input/ListOfStarWarsMedia_Wookieepedia.html", then run the script in your terminal with the command "node PageToDataFile.js" (make sure you've got the "cheerio" Node.js module installed; "fs" is built into Node.js) and the generated CSV file will be placed at "./Output/ListOfStarWarsMedia_Data.csv".