Generating a dynamic HTML sitemap

Hey all,

Newbie here, but I had a trawl around and couldn’t find an answer so I wrote my own.

The problem: Wix does not generate a dynamic HTML sitemap page (it only serves the sitemap XML files).

The solution: Generate it off the sitemap XML files, dynamically, and cache the result.

Requirements:

  • a blank page

  • a text box (the code below expects its ID to be #text9)

  • a button (the code below expects its ID to be #button5)

  • 2 collections: PageTitles (with fields url and title) and SiteMap (with field html), used to cache things for performance

Code:

import { fetch } from 'wix-fetch';
import wixData from 'wix-data';
import wixUsers from'wix-users';

// When the page is ready: show the refresh button (#button5) to admins only,
// hide it for everyone else, then start rendering the sitemap.
$w.onReady(() => {
  const refreshButton = $w('#button5');
  const isAdmin = wixUsers.currentUser.role === 'Admin';

  if (isAdmin) {
    refreshButton.show();
  } else {
    refreshButton.hide();
  }

  doLoad();
});

/**
 * Empties both caches and reloads the sitemap, reporting progress on the
 * refresh button (#button5) and the sitemap text element (#text9).
 *
 * The button label is restored to "Refresh Sitemap" only once doLoad() has
 * settled; previously doLoad() was not awaited, so the label was reset
 * immediately and this function resolved before the reload had finished.
 *
 * @returns {Promise<void>}
 */
async function doClearCache() {
  const refreshButton = $w('#button5');

  await clearPageTitlesCache();
  await clearSiteMapCache();
  refreshButton.label = "Sitemap cleared";
  $w('#text9').text = "Generating sitemap...";

  try {
    await doLoad();
  } finally {
    // Restore the label even if the reload throws, so the button never
    // stays stuck on the progress text.
    refreshButton.label = "Refresh Sitemap";
  }
}

/**
 * Removes every row from the 'PageTitles' collection.
 *
 * Rows are queried in pages of 1000 and deleted; the query is repeated until
 * a page comes back less than full, so caches holding more than one page of
 * entries are emptied completely (a single query only cleared the first page).
 * Errors are logged and swallowed: clearing the cache is best-effort.
 *
 * @returns {Promise<void>}
 */
async function clearPageTitlesCache() {
  const PAGE_SIZE = 1000;

  try {
    let removedCount;
    do {
      const result = await wixData.query('PageTitles')
        .limit(PAGE_SIZE)
        .find();
      const items = (result && result.items) || [];

      await Promise.all(items.map(item => wixData.remove('PageTitles', item._id)));
      removedCount = items.length;
      // A full page means more rows may remain; re-query from the start
      // because the rows just fetched no longer exist.
    } while (removedCount === PAGE_SIZE);
  } catch (error) {
    console.error(error);
  }
}

/**
 * Renders the sitemap into the #text9 element.
 *
 * 1. Shows the cached HTML straight away, if there is any.
 * 2. Asks checkSitemapChanges() whether the sitemap XML differs from the cache.
 * 3. Regenerates the HTML from "/sitemap.xml" when there are changes or when
 *    nothing was cached.
 *
 * The regeneration is awaited, so the returned promise settles only after
 * fetchAndParseXML() has finished (it used to be left floating). The cached
 * HTML read in step 1 is reused instead of querying the cache a second time.
 *
 * @returns {Promise<void>}
 */
async function doLoad() {
  const parentElement = $w('#text9');

  const cachedHTML = await getHTMLFromCache();
  if (cachedHTML) {
    parentElement.html = cachedHTML;
  }

  // Check if there are changes in the sitemap XML files
  const hasChanges = await checkSitemapChanges();

  if (hasChanges || !cachedHTML) {
    await fetchAndParseXML("/sitemap.xml", parentElement);
  }
}

/**
 * Compares the page URLs listed in the live sitemap XML files with the URLs
 * stored in the PageTitles cache, and clears the SiteMap cache when they differ.
 *
 * A sub-sitemap without any <loc> entries is treated as an empty list.
 * Previously extractUrls() returning null made the spread throw, the error
 * was swallowed, and the function wrongly reported "no changes".
 *
 * @param {string} [sitemapIndexURL] - Absolute URL of the sitemap index. The
 *   default is site-specific; pass your own URL when reusing this code.
 * @returns {Promise<boolean>} true when URLs were added or removed (the
 *   SiteMap cache has then been cleared); false when nothing changed, when the
 *   index lists no sub-sitemaps, or when any request fails.
 */
async function checkSitemapChanges(sitemapIndexURL = "https://www.michael-elliott.photography/sitemap.xml") {
  try {
    const response = await fetch(sitemapIndexURL);
    const xml = await response.text();
    const sitemapXMLs = extractSitemapUrls(xml);

    if (!sitemapXMLs) {
      return false;
    }

    const cachedURLs = await getPageURLsFromCache();

    // The sub-sitemaps are independent of each other, so fetch them in parallel.
    const urlLists = await Promise.all(sitemapXMLs.map(async sitemapURL => {
      const subSitemapResponse = await fetch(sitemapURL);
      const subSitemapXML = await subSitemapResponse.text();
      return extractUrls(subSitemapXML) || [];
    }));

    // Sets keep the membership checks below linear instead of quadratic.
    const currentURLs = new Set();
    urlLists.forEach(list => list.forEach(url => currentURLs.add(url)));
    const knownURLs = new Set(cachedURLs);

    const hasAdded = [...currentURLs].some(url => !knownURLs.has(url));
    const hasDeleted = cachedURLs.some(url => !currentURLs.has(url));

    if (hasAdded || hasDeleted) {
      // Clear the SiteMap cache if there are changes
      await clearSiteMapCache();
      return true;
    }

    return false;
  } catch (error) {
    console.error(error);
    return false;
  }
}

/**
 * Downloads a sitemap XML document and turns it into a nested HTML list.
 *
 * Every <url> entry becomes a linked list item labelled with the page title
 * (minus the " | Michael Elliott" suffix). Every <sitemap> entry is rendered
 * recursively and nested under a heading derived from its file name.
 *
 * @param {string} url - Location of the sitemap XML to fetch.
 * @param {object|null} parentElement - Element whose `html` property receives
 *   the result; when falsy the HTML is returned instead.
 * @param {boolean} [shouldCache=true] - Whether to store the generated HTML
 *   via cacheGeneratedHTML().
 * @returns {Promise<string|undefined>} The HTML when no parentElement was
 *   given; otherwise undefined. Also undefined when an error was logged.
 */
async function fetchAndParseXML(url, parentElement, shouldCache = true) {
  const stripSiteName = title => title.replace(' | Michael Elliott', '');

  try {
    const xml = await (await fetch(url)).text();
    const pages = extractPages(xml);

    const fragments = ['<ul style="font-family: Arial; font-size: 16px; line-height: 1.4;">'];

    if (pages) {
      const pageURLs = pages.map(page => getPageURL(page));
      const knownTitles = await getPageTitles(pageURLs);

      for (const pageURL of pageURLs) {
        let title = knownTitles[pageURL];

        // Fall back to the cache, and then to the page itself, when the
        // bulk lookup produced no title.
        if (!title) {
          title = await getPageTitleFromCache(pageURL);
        }
        if (!title) {
          title = await getPageTitle(pageURL);
          cachePageTitle(pageURL, title);
          title = stripSiteName(title);
        }

        fragments.push('<li><a href="' + pageURL + '">' + stripSiteName(title) + '</a></li>');
      }
    }

    const sitemapXMLs = extractSitemapUrls(xml);
    for (const sitemapURL of sitemapXMLs || []) {
      const parentTitle = await getParentTitleFromURL(sitemapURL);
      // Nested sitemaps are rendered to a string and never cached on their own.
      const nestedHTML = await fetchAndParseXML(sitemapURL, null, false);
      if (nestedHTML) {
        fragments.push('<li>' + parentTitle + '<ul>' + nestedHTML + '</ul></li>');
      }
    }

    fragments.push('</ul>');
    const ulHTML = fragments.join('');

    if (shouldCache) {
      await cacheGeneratedHTML(ulHTML);
    }

    if (!parentElement) {
      return ulHTML;
    }
    parentElement.html = ulHTML;
  } catch (error) {
    console.error(error);
  }
}

/**
 * Resolves titles for a list of page URLs, using the PageTitles cache first
 * and fetching only the pages that are not cached.
 *
 * Changes from the earlier version:
 *  - the cache query sets an explicit limit of 1000 (matching the other
 *    PageTitles queries in this file) rather than relying on the default page
 *    size, which made cached titles beyond the first page be fetched and
 *    inserted again;
 *  - missing titles are fetched through getPageTitle(), so one failing page
 *    yields an empty title instead of rejecting the whole batch;
 *  - each missing URL is fetched once even if it appears several times.
 *
 * @param {string[]} urls - Page URLs to look up.
 * @returns {Promise<Object<string, string>>} Map of URL to title; the title is
 *   '' when none could be found. Only non-empty fetched titles are cached here.
 */
async function getPageTitles(urls) {
  const titles = {};
  if (!urls || urls.length === 0) {
    return titles;
  }

  const cacheResults = await wixData.query('PageTitles')
    .hasSome('url', urls)
    .limit(1000)
    .find();

  cacheResults.items.forEach(item => {
    titles[item.url] = item.title;
  });

  const missingURLs = [...new Set(urls.filter(url => !titles[url]))];

  await Promise.all(missingURLs.map(async url => {
    const title = await getPageTitle(url);
    titles[url] = title;
    if (title) {
      await cachePageTitle(url, title);
    }
  }));

  return titles;
}

/**
 * Derives a heading from a sub-sitemap URL of the form ".../<name>-sitemap.xml":
 * hyphens in <name> become spaces and the first letter is capitalised.
 *
 * The dot before "xml" is now escaped so it only matches a literal "."
 * (the unescaped form accepted any character in that position).
 *
 * @param {string} url - Sub-sitemap URL.
 * @returns {Promise<string>} The heading, or '' when the URL does not match.
 */
async function getParentTitleFromURL(url) {
  const match = url.match(/\/([^/]+)-sitemap\.xml/);
  if (!match) {
    return '';
  }
  return capitalizeFirstLetter(match[1].replace(/-/g, ' '));
}

/**
 * Fetches a page, requesting only bytes 0-2048 via a Range header, and reads
 * the text of its <title> tag.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<string>} The title text, or '' when no non-empty title is
 *   found or the request fails (the error is logged).
 */
async function getPageTitle(url) {
  const titlePattern = /<title[^>]*>([^<]*)/i;

  try {
    const response = await fetch(url, { headers: { 'Range': 'bytes=0-2048' } });
    const head = await response.text();
    const found = head.match(titlePattern);

    return found && found[1] ? found[1] : '';
  } catch (error) {
    console.error(error);
    return '';
  }
}

/**
 * Looks up a single page title in the PageTitles collection by its URL.
 *
 * @param {string} url - Page URL to look up.
 * @returns {Promise<string>} The cached title with the first
 *   " | Michael Elliott" removed, or '' when there is no row or the lookup
 *   fails (the error is logged).
 */
async function getPageTitleFromCache(url) {
  try {
    const lookup = wixData.query('PageTitles').eq('url', url).limit(1);
    const result = await lookup.find();

    const hasRow = result && result.items && result.items.length > 0;
    if (!hasRow) {
      return '';
    }

    const cachedTitle = result.items[0].title;
    return cachedTitle.replace(" | Michael Elliott", "");
  } catch (error) {
    console.error(error);
    return '';
  }
}

/**
 * Stores a url/title pair as a new row in the PageTitles collection.
 * Insert failures are logged and otherwise ignored.
 *
 * @param {string} url - Page URL.
 * @param {string} title - Title to remember for that URL.
 * @returns {Promise<void>}
 */
async function cachePageTitle(url, title) {
  try {
    const record = { url: url, title: title };
    await wixData.insert('PageTitles', record);
  } catch (error) {
    console.error(error);
  }
}

/**
 * Stores the generated sitemap HTML as a new row in the SiteMap collection.
 * Insert failures are logged and otherwise ignored.
 *
 * @param {string} html - The rendered sitemap markup.
 * @returns {Promise<void>}
 */
async function cacheGeneratedHTML(html) {
  try {
    const snapshot = { html: html };
    await wixData.insert('SiteMap', snapshot);
  } catch (error) {
    console.error(error);
  }
}

/**
 * Reads the cached sitemap HTML from the SiteMap collection.
 *
 * @returns {Promise<string>} The `html` field of the first row returned, or ''
 *   when the collection is empty or the query fails (the error is logged).
 */
async function getHTMLFromCache() {
  try {
    const lookup = wixData.query('SiteMap').limit(1);
    const result = await lookup.find();

    const hasRow = result && result.items && result.items.length > 0;
    return hasRow ? result.items[0].html : '';
  } catch (error) {
    console.error(error);
    return '';
  }
}

/**
 * Lists the URLs stored in the PageTitles collection (up to 1000 rows).
 *
 * @returns {Promise<string[]>} The `url` field of each row, or [] when the
 *   collection is empty or the query fails (the error is logged).
 */
async function getPageURLsFromCache() {
  try {
    const lookup = wixData.query('PageTitles').limit(1000);
    const result = await lookup.find();

    const hasRows = result && result.items && result.items.length > 0;
    if (!hasRows) {
      return [];
    }

    const cachedURLs = [];
    for (const row of result.items) {
      cachedURLs.push(row.url);
    }
    return cachedURLs;
  } catch (error) {
    console.error(error);
    return [];
  }
}

/**
 * Removes every row from the 'SiteMap' collection.
 *
 * The earlier version queried with limit(1) and so deleted at most one row;
 * if the collection ever held more than one snapshot, stale HTML stayed
 * behind and could still be served by getHTMLFromCache(). Rows are now
 * queried in pages of 1000 and the query repeats until a page comes back
 * less than full. Errors are logged and swallowed: clearing is best-effort.
 *
 * @returns {Promise<void>}
 */
async function clearSiteMapCache() {
  const PAGE_SIZE = 1000;

  try {
    let removedCount;
    do {
      const result = await wixData.query('SiteMap')
        .limit(PAGE_SIZE)
        .find();
      const items = (result && result.items) || [];

      await Promise.all(items.map(item => wixData.remove('SiteMap', item._id)));
      removedCount = items.length;
      // A full page means more rows may remain; re-query from the start
      // because the rows just fetched no longer exist.
    } while (removedCount === PAGE_SIZE);
  } catch (error) {
    console.error(error);
  }
}

/**
 * Collects every <url>...</url> element from a sitemap XML string.
 *
 * @param {string} xml - Sitemap XML source.
 * @returns {string[]|null} The matched elements including their tags, or null
 *   when there are none.
 */
function extractPages(xml) {
  const urlEntries = xml.match(/<url>([\s\S]*?)<\/url>/g);
  return urlEntries ? urlEntries : null;
}

/**
 * Reads the address inside the first <loc> tag of a <url> element.
 *
 * @param {string} page - One <url>...</url> element.
 * @returns {string} The text between <loc> and </loc>, or '' when absent.
 */
function getPageURL(page) {
  const found = page.match(/<loc>(.*?)<\/loc>/);
  return found ? found[1] : '';
}

/**
 * Lists the contents of every <loc> tag found in an XML string.
 *
 * @param {string} xml - Sitemap XML source.
 * @returns {string[]|null} The addresses in document order, or null when the
 *   XML contains no <loc> tags.
 */
function extractUrls(xml) {
  const locTags = xml.match(/<loc>(.*?)<\/loc>/g);
  if (!locTags) {
    return null;
  }

  const stripTags = tag => tag.replace('<loc>', '').replace('</loc>', '');
  return locTags.map(tag => stripTags(tag));
}

/**
 * Lists the <loc> address of every <sitemap> element in a sitemap index.
 *
 * @param {string} xml - Sitemap index XML source.
 * @returns {string[]|null} One entry per <sitemap> element ('' for an element
 *   without a <loc>), or null when there are no <sitemap> elements.
 */
function extractSitemapUrls(xml) {
  const sitemapEntries = xml.match(/<sitemap>([\s\S]*?)<\/sitemap>/g);
  if (!sitemapEntries) {
    return null;
  }

  const urls = [];
  for (const entry of sitemapEntries) {
    const loc = entry.match(/<loc>(.*?)<\/loc>/);
    urls.push(loc ? loc[1] : '');
  }
  return urls;
}

/**
 * Upper-cases the first character of a string and leaves the rest untouched.
 *
 * @param {string} string - Text to capitalise.
 * @returns {string} The capitalised text ('' stays '').
 */
function capitalizeFirstLetter(string) {
  const head = string.charAt(0);
  const tail = string.slice(1);
  return head.toUpperCase() + tail;
}

/**
 * Click handler for #button5: clears the caches and regenerates the sitemap
 * by delegating to doClearCache().
 *
 * @param {$w.MouseEvent} event - The click event (not used).
 * @returns {Promise<void>}
 */
export async function button5_click(event) {
	await doClearCache();
}

Notes:

  • there’s some styling in there to output an unordered list in 16px Arial with a line height of 1.4

  • there’s no logic to order the parent or child elements

  • this should recurse through nested sitemap XML files to any depth

  • the first run will be slow; subsequent runs should be instant because the cached HTML is shown first, and if the sitemap has changed, the page is updated in the background

  • for admin users, add a button that allows you to force a refresh at any point.

Thoughts? Have I reinvented the wheel, or is this a reasonable solution to the problem?