// Matches a `<meta name=generator content=Microsoft Word NN>` tag (quotes optional, case-insensitive).
const msWordRegEx1 = /<meta\s*name="?generator"?\s*content="?microsoft\s*word\s*\d+"?\/?>/i
// Matches the start of an `xmlns:o="urn:schemas-microsoft-com...` namespace declaration (case-insensitive).
const msWordRegEx2 = /xmlns:o="urn:schemas-microsoft-com/i

/**
 * Checks whether the given HTML carries at least one of the two Microsoft Word markers:
 * the "generator" meta tag or the `xmlns:o` Microsoft namespace declaration.
 *
 * @param htmlString HTML string to inspect.
 * @returns `true` when either marker is found, `false` otherwise.
 */
export function isMsWord(htmlString) {
  // The namespace check only runs when the meta tag is absent.
  if (msWordRegEx1.test(htmlString)) {
    return true
  }

  return msWordRegEx2.test(htmlString)
}

/**
 * Normalizes the spacing markup Safari generates for content pasted from Word
 * (`<span class="Apple-converted-space"> </span>`).
 *
 * Every `<span>` that either has exactly that class or no attributes at all, and that contains
 * nothing but whitespace, is replaced by plain text:
 * - a single whitespace character becomes one regular space,
 * - a longer run of N characters becomes N characters alternating `&nbsp;` and space, starting with `&nbsp;`.
 *
 * @param htmlString HTML string in which spacing should be normalized
 * @returns Input HTML with spaces normalized.
 */
function normalizeSafariSpaceSpans(htmlString) {
  const spaceSpanPattern = /<span(?: class="Apple-converted-space"|)>(\s+)<\/span>/g

  return htmlString.replace(spaceSpanPattern, (_wholeSpan, whitespace) => {
    const count = whitespace.length

    if (count === 1) {
      return " "
    }

    // Build enough "&nbsp; " pairs, then trim to the original number of characters.
    return "\u00A0 ".repeat(count).slice(0, count)
  })
}

/**
 * Normalizes whitespace in a pasted HTML string through a fixed sequence of replacements.
 *
 * Among other things it turns a space directly preceding a closing tag (or an empty Word
 * `<o:p></o:p>` tag) into `&nbsp;`, and drops whitespace between tags whenever that whitespace
 * contains a line break (see #39 and #40).
 *
 * The order of the steps matters: each replacement operates on the output of the previous one.
 *
 * @param htmlString HTML string in which spacing should be normalized.
 * @returns Input HTML with spaces normalized.
 */
function normalizeSpacing(htmlString) {
  // Two passes over the Safari space spans so that nested spans are covered as well.
  let html = normalizeSafariSpaceSpans(htmlString)
  html = normalizeSafariSpaceSpans(html)

  // Strip \r and \n from inside "spacerun spans" so the final step below does not wipe their whitespace.
  html = html.replace(/(<span\s+style=['"]mso-spacerun:yes['"]>[^\S\r\n]*?)[\r\n]+([^\S\r\n]*<\/span>)/g, "$1$2")

  // Drop "spacerun spans" that have no content at all.
  html = html.replace(/<span\s+style=['"]mso-spacerun:yes['"]><\/span>/g, "")

  // A "letter-spacing span" holding only line breaks gets a single space instead.
  html = html.replace(/(<span\s+style=['"]letter-spacing:[^'"]+?['"]>)[\r\n]+(<\/span>)/g, "$1 $2")

  // A space right before a closing tag becomes a non-breaking space.
  html = html.replace(/ <\//g, "\u00A0</")

  // Same for a space right before an empty <o:p></o:p> tag.
  html = html.replace(/ <o:p><\/o:p>/g, "\u00A0<o:p></o:p>")

  // Remove <o:p> block filler from empty paragraph. Safari uses \u00A0 instead of &nbsp;.
  html = html.replace(/<o:p>(&nbsp;|\u00A0)<\/o:p>/g, "")

  // Whitespace between tags is removed entirely when it contains any \r or \n.
  html = html.replace(/>([^\S\r\n]*[\r\n]\s*)</g, "><")

  return html
}

/**
 * Normalizes spacing in special Word `spacerun spans` (`<span style='mso-spacerun:yes'>\s+</span>`).
 *
 * Each `<span>` whose `style` attribute contains `spacerun` has its text replaced, in place, by a
 * sequence of the same length that alternates `&nbsp;` and space, starting with `&nbsp;`.
 *
 * @param htmlDocument Native `Document` object in which spacing should be normalized.
 */
function normalizeSpacerunSpans(htmlDocument) {
  const spacerunSpans = htmlDocument.querySelectorAll("span[style*=spacerun]")

  spacerunSpans.forEach(span => {
    const textLength = span.innerText.length || 0

    // Build enough "&nbsp; " pairs, then trim to the original text length.
    span.innerText = "\u00A0 ".repeat(textLength).slice(0, textLength)
  })
}

/**
 * Drops whatever sits between the first closing `</body>` tag and the closing `</html>` tag
 * that follows it:
 *
 * ```html
 * <html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>
 * ```
 *
 * If there is no `</body>` the input is returned untouched. If no `</html>` follows the `</body>`,
 * everything after `</body>` is dropped.
 *
 * This function is used as specific browsers (Edge) add some random content after `body` tag when pasting from Word.
 * @param htmlString The HTML string to be cleaned.
 * @returns The HTML string with leftover content removed.
 */
function cleanContentAfterBody(htmlString) {
  const bodyEndStart = htmlString.indexOf("</body>")

  if (bodyEndStart === -1) {
    return htmlString
  }

  // Position right after the "</body>" tag.
  const bodyEndStop = bodyEndStart + "</body>".length
  const upToBodyEnd = htmlString.slice(0, bodyEndStop)

  const htmlEndStart = htmlString.indexOf("</html>", bodyEndStop)
  const fromHtmlEnd = htmlEndStart === -1 ? "" : htmlString.slice(htmlEndStart)

  return upToBodyEnd + fromHtmlEnd
}

/**
 * Cleans up an HTML string and parses it into a native `Document`.
 *
 * Steps, in order: remove `<!--[if gte vml 1]>` markers, remove `<o:SmartTagType>` tags,
 * drop content after `</body>`, normalize spacing in the string, parse it as `text/html`,
 * and finally normalize "spacerun spans" in the resulting document.
 *
 * @param htmlString HTML string to parse.
 * @returns The parsed `Document`.
 */
export function parseHtml(htmlString) {
  const domParser = new DOMParser()

  // Remove Word specific "if comments" so content inside is not omitted by the parser.
  const withoutIfComments = htmlString.replace(/<!--\[if gte vml 1]>/g, "")

  // Remove <o:SmartTagType> tags; the pattern allows optional attributes (with or without values).
  const withoutSmartTags = withoutIfComments.replace(/<o:SmartTagType(?:\s+[^\s>=]+(?:="[^"]*")?)*\s*\/?>/gi, "")

  const trimmedAfterBody = cleanContentAfterBody(withoutSmartTags)
  const normalizedHtml = normalizeSpacing(trimmedAfterBody)

  // Parse the cleaned string as a native Document object.
  const dom = domParser.parseFromString(normalizedHtml, "text/html")

  normalizeSpacerunSpans(dom)

  return dom
}
