/**
 * Collects the `id` attribute value of every element in an HTML string.
 *
 * The string is parsed as "text/html" with DOMParser and every element that
 * carries an `id` attribute is visited in the order `querySelectorAll` yields them.
 *
 * @param {string} htmlContent - HTML markup to scan.
 * @returns {string[]} The `id` attribute values, one entry per matching element.
 */
export function extractIDsFromHtml(htmlContent) {
  const parsedDocument = new DOMParser().parseFromString(
    htmlContent,
    "text/html"
  );
  return Array.from(parsedDocument.querySelectorAll("[id]"), (node) =>
    node.getAttribute("id")
  );
}

/**
 * Condenses a financial statement (HTML) so that each table keeps only its leading
 * header rows plus the rows surrounding every occurrence of `sentence`.
 *
 * For each `<table>` in the statement:
 *  - the first `numHeadersToKeep` rows (assumed to be header rows) are always retained;
 *  - for every row whose `innerHTML` contains `sentence`, that row together with the
 *    `numRowsBeforeSentenceToKeep` rows before it and the `numRowsAfterSentenceToKeep`
 *    rows after it is retained;
 *  - every other row is removed.
 *
 * Rows are collected with `querySelectorAll("tr")`, so the rows of a nested table are
 * also counted as rows of the table that encloses it.
 *
 * After the tables are condensed, the `id` attribute is removed from every element
 * except those whose id also appears on an element inside `sentence` (parsed as HTML).
 *
 * If the statement contains no `<table>` at all, an error is logged and the original
 * string is returned untouched (no ids are stripped and nothing is re-serialized).
 *
 * @param {string} financialStatement - The original financial statement markup.
 * @param {string} sentence - Markup/text to look for inside table rows; matched as a
 *   plain substring of each row's `innerHTML`.
 * @param {number} [numHeadersToKeep=5] - Leading rows of every table that are always kept.
 * @param {number} [numRowsBeforeSentenceToKeep=3] - Rows kept before each matching row.
 * @param {number} [numRowsAfterSentenceToKeep=1] - Rows kept after each matching row.
 * @returns {string} The condensed document serialized with `XMLSerializer`, or the
 *   unmodified input when no table is present.
 */
export function condenseFinancialStatementToOnlyBeFieldsRelevantToSentence(
  financialStatement,
  sentence,
  numHeadersToKeep = 5,
  numRowsBeforeSentenceToKeep = 3,
  numRowsAfterSentenceToKeep = 1
) {
  const parser = new DOMParser();
  const doc = parser.parseFromString(financialStatement, "text/html");
  const tables = doc.querySelectorAll("table");

  if (tables.length === 0) {
    console.error("No tables found in the financial statement");
    return financialStatement;
  }

  tables.forEach((table) => {
    const rows = Array.from(table.querySelectorAll("tr"));
    const rowsToKeep = new Set(rows.slice(0, numHeadersToKeep));

    // Collect the context window of EVERY matching row before removing anything.
    // An ancestor <tr> that wraps a nested table also "contains" the sentence, so
    // stopping at the first match could delete the row that really holds it.
    // Matching must finish first because removing rows changes ancestors' innerHTML.
    rows.forEach((row, rowIndex) => {
      if (!row.innerHTML.includes(sentence)) {
        return;
      }
      const startIdx = Math.max(0, rowIndex - numRowsBeforeSentenceToKeep);
      const endIdx = Math.min(
        rows.length,
        rowIndex + numRowsAfterSentenceToKeep + 1
      );
      rows
        .slice(startIdx, endIdx)
        .forEach((contextRow) => rowsToKeep.add(contextRow));
    });

    rows.forEach((row) => {
      if (!rowsToKeep.has(row)) {
        row.remove();
      }
    });
  });

  // Ids that occur inside the sentence markup are the only ones preserved.
  const sentenceDoc = parser.parseFromString(sentence, "text/html");
  const sentenceIds = new Set(
    Array.from(sentenceDoc.querySelectorAll("[id]"), (el) => el.id)
  );

  doc.querySelectorAll("[id]").forEach((el) => {
    if (!sentenceIds.has(el.id)) {
      el.removeAttribute("id");
    }
  });

  const serializer = new XMLSerializer();
  return serializer.serializeToString(doc);
}
