Export Transformer

By using an export transformer, you can export almost anything out of Datasaur. Every new export transformer starts from this template:

/**
 * This function should be written as this template and return string.
 *
 * The default body below reads only `document.cells` and returns every
 * cell's `content` joined with '\n'. Replace the body with your own logic;
 * keep the `(document: Exportable): string` shape.
 */
(document: Exportable): string => {
  /// Implement export function here
  // Default behavior: concatenate each cell's content, separated by newlines.
  return document.cells.map((cell) => cell.content).join('\n');
};

Sample Case

In this sample, we will export a token-based project in a format that is compatible with Google AutoML. The export transformer is written in TypeScript:

/**
 * Builds a lookup table from a cell's `line` to the cell itself.
 *
 * When several cells share the same `line`, the last one in `cells` wins.
 *
 * @param cells the cells to index
 * @returns a Map keyed by `cell.line`
 */
function getCellMap(cells: Cell[]) {
  return cells.reduce(
    (byLine, current) => byLine.set(current.line, current),
    new Map<number, Cell>(),
  );
}

/**
 * Flattens every label set into a single lookup table of label items.
 *
 * Items are keyed by their `id`; if the same id occurs more than once, the
 * last occurrence wins.
 *
 * @param labelSets the label sets whose `labelItems` should be indexed
 * @returns a Map from label item id to label item
 */
function getLabelSetMap(labelSets: LabelSet[]) {
  return labelSets
    .flatMap(set => set.labelItems)
    .reduce(
      (byId, item) => byId.set(item.id, item),
      new Map<string, LabelItem>(),
    );
}

/**
 * Translates a label's token/character position into character offsets
 * within the cell's tokens joined by a single space.
 *
 * `start_offset` is the position of the label's first character and
 * `end_offset` is one past the position of its last character
 * (`endCharIndex` is treated as inclusive).
 *
 * @param label supplies `startTokenIndex`, `endTokenIndex`, `startCharIndex` and `endCharIndex`
 * @param cell supplies the `tokens` the indices refer to
 * @returns an object with `end_offset` and `start_offset`; a field stays 0
 *   if its token index is never reached while walking up to `endTokenIndex`
 */
function convertOffset(label: SimpleLabel, cell: Cell) {
  const { startTokenIndex, endTokenIndex, startCharIndex, endCharIndex } = label;
  const segment = { "end_offset": 0, "start_offset": 0 };

  // Number of characters that precede the current token in the joined text.
  let charsBefore = 0;
  let tokenIdx = 0;
  while (tokenIdx <= endTokenIndex) {
    if (tokenIdx == startTokenIndex) {
      segment.start_offset = charsBefore + startCharIndex;
    }
    if (tokenIdx == endTokenIndex) {
      // +1 turns the inclusive last-character index into an exclusive end.
      segment.end_offset = charsBefore + endCharIndex + 1;
      return segment;
    }
    // Skip over this token and the single space that follows it.
    charsBefore += cell.tokens[tokenIdx].length + 1;
    tokenIdx++;
  }
  return segment;
}

/**
 * Serializes a value as single-line JSON with one space after every `,` and
 * `:` separator, e.g. `{"a": 1, "b": [1, 2]}`.
 *
 * Only structural separators are touched; the contents of string literals
 * (keys and values) are emitted exactly as `JSON.stringify` escapes them, so
 * text such as `"[ x ]"` or `"{ y }"` is preserved verbatim.
 *
 * @param obj the value to serialize
 * @returns the single-line JSON text
 * @throws TypeError if `JSON.stringify` yields no text for `obj`
 *   (e.g. `undefined` or a function at the top level)
 */
function stringifyWithSpaces(obj: unknown): string {
  // Compact JSON has no whitespace outside string literals, so every `,` or
  // `:` found outside a string is a structural separator.
  const compact = JSON.stringify(obj);
  // The first alternative consumes whole string literals (including escaped
  // quotes) so separators inside them are never matched by the second one.
  return compact.replace(/"(?:[^"\\]|\\.)*"|[,:]/g, (token) =>
    token === ',' || token === ':' ? `${token} ` : token,
  );
}

/**
 * This function should be written as this template and return string.
 *
 * Groups the document's labels by the cell line they start on and emits one
 * serialized example per labeled cell. Each example holds the cell's tokens
 * joined by a space (`text_snippet.content`) and one annotation per label
 * (`text_extraction.text_segment` offsets plus the label's `display_name`).
 * Cells that carry no label produce no output line.
 *
 * @param document the data to export; `cells`, `labelSets` and `labels` are read
 * @returns the serialized examples joined with '\n', ordered by the first
 *   label seen for each cell
 * @throws Error if a label refers to a label item id or a cell line that is
 *   not present in `document`
 */
(document: Exportable): string => {
  type Annotation = {
    text_extraction: { text_segment: ReturnType<typeof convertOffset> };
    display_name: LabelItem["labelName"];
  };
  type Example = {
    annotations: Annotation[];
    text_snippet: { content: string };
  };

  const cellMap = getCellMap(document.cells);
  const labelSetMap = getLabelSetMap(document.labelSets);
  // A Map iterates in insertion order, which fixes the order of output lines.
  const examplesMap = new Map<number, Example>();

  document.labels.forEach(label => {
    const line = label.startCellLine;

    // Fail with a descriptive message instead of an opaque
    // "cannot read properties of undefined" further down.
    const labelItem = labelSetMap.get(label.labelSetItemId);
    if (labelItem === undefined) {
      throw new Error(`Label refers to unknown label item id "${label.labelSetItemId}"`);
    }
    const cell = cellMap.get(line);
    if (cell === undefined) {
      throw new Error(`Label refers to unknown cell line ${line}`);
    }

    const annotation: Annotation = {
      "text_extraction": {"text_segment": convertOffset(label, cell)},
      "display_name": labelItem.labelName
    };

    const example = examplesMap.get(line);
    if (example !== undefined) {
      // The stored object is mutated in place; no need to set it again.
      example.annotations.push(annotation);
    } else {
      examplesMap.set(line, {
        "annotations": [annotation],
        "text_snippet": {"content": cell.tokens.join(' ')}
      });
    }
  });

  const output: string[] = [];
  examplesMap.forEach((example) => {
    output.push(stringifyWithSpaces(example));
  });
  return output.join('\n');
};

Uploading the Export File Transformer

There are nine steps to uploading and exporting with the custom export transformer:

  1. Copy the content of the TypeScript

  2. Go to the File Transformer page

  3. Create a New File Transformer

    1. Fill out the name

    2. Choose Export as the purpose

  4. Paste the script over all of the content

    1. Please ensure that the last saved indicator shows "Saved Now"

  5. Go to the project you would like to export

  6. Go to File > Export File

  7. Rename the file to be exported by adding .jsonl as the extension

  8. Select Custom Format as the format

  9. Select Custom Format, then choose the export transformer you created previously

If you have any questions, please reach out to support@datasaur.ai.

Last updated