From 72fa1d7fcbd461e9c62100549a59a3d8dfe51649 Mon Sep 17 00:00:00 2001
From: Mauricio Siu <47042324+Siumauricio@users.noreply.github.com>
Date: Thu, 28 Aug 2025 00:05:32 -0600
Subject: [PATCH] feat: implement HTML cleaning function before Markdown conversion in blog post page

---
 .../website/app/[locale]/blog/[slug]/page.tsx | 35 ++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/apps/website/app/[locale]/blog/[slug]/page.tsx b/apps/website/app/[locale]/blog/[slug]/page.tsx
index bde7d43..95a95db 100644
--- a/apps/website/app/[locale]/blog/[slug]/page.tsx
+++ b/apps/website/app/[locale]/blog/[slug]/page.tsx
@@ -94,6 +94,38 @@ export default async function BlogPostPage({ params }: Props) {
 		notFound();
 	}
 
+	// Limpiar HTML antes de convertir a Markdown
+	const cleanHtml = (html: string) => {
+		// Crear un DOM temporal para limpiar el HTML
+		if (typeof window !== "undefined") {
+			const parser = new DOMParser();
+			const doc = parser.parseFromString(html, "text/html");
+
+			// Remover scripts JSON-LD y otros scripts
+			const scripts = doc.querySelectorAll(
+				'script[type="application/ld+json"], script',
+			);
+			scripts.forEach((script) => script.remove());
+
+			// Remover otros elementos no deseados
+			const unwantedElements = doc.querySelectorAll("style, meta, link");
+			unwantedElements.forEach((el) => el.remove());
+
+			return doc.body.innerHTML;
+		} else {
+			// Fallback para servidor - usar regex para limpiar
+			return html
+				.replace(
+					/<script[^>]*type="application\/ld\+json"[^>]*>[\s\S]*?<\/script>/gi,
+					"",
+				)
+				.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
+				.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
+				.replace(/<meta[^>]*>/gi, "")
+				.replace(/<link[^>]*>/gi, "");
+		}
+	};
+
 	// Convertir HTML a Markdown
 	const turndownService = new TurndownService({
 		headingStyle: "atx",
@@ -104,7 +136,8 @@ export default async function BlogPostPage({ params }: Props) {
 	const strikethrough = turndownPluginGfm.strikethrough;
 	turndownService.use([tables, strikethrough, gfm, remarkToc]);
 
-	const markdown = turndownService.turndown(post.html);
+	const cleanedHtml = cleanHtml(post.html);
+	const markdown = turndownService.turndown(cleanedHtml);
 
 	const formattedDate = new Date(post.published_at).toLocaleDateString("en", {
 		year: "numeric",