From 76e72eba5ef9ab8e6f277acec25fdcfe16be1676 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Tue, 28 Jul 2015 01:14:48 -0400 Subject: [PATCH] Make Annotations serializable and add an initial implementation of adding annotations to the Exporter output --- .../digitalpebble/behemoth/Annotation.java | 8 +++- .../behemoth/util/AnnotationsUtil.java | 2 +- .../behemoth/util/ContentExtractor.java | 47 +++++++++++++++++-- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/com/digitalpebble/behemoth/Annotation.java b/core/src/main/java/com/digitalpebble/behemoth/Annotation.java index 6855a00..98c7afb 100644 --- a/core/src/main/java/com/digitalpebble/behemoth/Annotation.java +++ b/core/src/main/java/com/digitalpebble/behemoth/Annotation.java @@ -17,6 +17,7 @@ package com.digitalpebble.behemoth; +import java.io.Serializable; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -25,7 +26,12 @@ * Implementation of an Annotation. Has a type , metadata and start and end * offsets referring to the position in the text of a @class BehemothDocument. 
**/ -public class Annotation implements Comparable { +public class Annotation implements Comparable, Serializable { + + /** + * + */ + private static final long serialVersionUID = 1L; private String type; diff --git a/core/src/main/java/com/digitalpebble/behemoth/util/AnnotationsUtil.java b/core/src/main/java/com/digitalpebble/behemoth/util/AnnotationsUtil.java index ce6bd8a..99de022 100644 --- a/core/src/main/java/com/digitalpebble/behemoth/util/AnnotationsUtil.java +++ b/core/src/main/java/com/digitalpebble/behemoth/util/AnnotationsUtil.java @@ -21,13 +21,13 @@ import java.util.Collections; import java.util.Iterator; import java.util.List; -import java.util.Set; import com.digitalpebble.behemoth.Annotation; public class AnnotationsUtil { /** Sort the annotations by startOffset **/ + @SuppressWarnings("unchecked") public static void sort(List input) { Collections.sort(input, new AnnotationComparator()); } diff --git a/core/src/main/java/com/digitalpebble/behemoth/util/ContentExtractor.java b/core/src/main/java/com/digitalpebble/behemoth/util/ContentExtractor.java index ec699a5..d6b3347 100644 --- a/core/src/main/java/com/digitalpebble/behemoth/util/ContentExtractor.java +++ b/core/src/main/java/com/digitalpebble/behemoth/util/ContentExtractor.java @@ -17,8 +17,15 @@ package com.digitalpebble.behemoth.util; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.ObjectOutputStream; import java.net.URLEncoder; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; import java.util.UUID; import org.apache.commons.cli.CommandLine; @@ -31,6 +38,7 @@ import org.apache.commons.compress.archivers.ArchiveOutputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.lang.ArrayUtils; import org.apache.hadoop.conf.Configured; import 
org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; @@ -44,6 +52,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.digitalpebble.behemoth.Annotation; import com.digitalpebble.behemoth.BehemothConfiguration; import com.digitalpebble.behemoth.BehemothDocument; import com.digitalpebble.behemoth.DocumentFilter; @@ -74,6 +83,8 @@ public static FileNamingMode toMode(String str) { // dump the text otherwise private boolean dumpBinary = false; + // don't dump annotations by default + private boolean dumpAnnotations = false; private ArchiveOutputStream currentArchive = null; @@ -107,6 +118,7 @@ public int run(String[] args) throws Exception { "dumps binary content, text otherwise"); options.addOption("n", "filenaming", true, "whether to name files based on URL, UUID (default) or NUM"); + options.addOption("a", "annotation", false, "whether to include annotation in output (off by default)"); // parse the command line arguments try { @@ -122,6 +134,7 @@ public int run(String[] args) throws Exception { return -1; } dumpBinary = line.hasOption("binary"); + dumpAnnotations = line.hasOption("annotation"); if (line.hasOption("filenaming")) { String naming = line.getOptionValue("n"); @@ -203,6 +216,7 @@ private void addToArchive(String fileName, byte[] content, Path dirPath) numEntriesInCurrentArchive++; currentArchive.putArchiveEntry(new ZipArchiveEntry(fileName)); currentArchive.write(content); + LOG.debug("Successfully wrote BehemothDocument 'content' to output."); currentArchive.closeArchiveEntry(); index.flush(); if (numEntriesInCurrentArchive == maxNumEntriesInArchive) { @@ -250,11 +264,32 @@ else if (!dumpBinary && inputDoc.getText() == null) fileName += ".txt"; byte[] contentBytes; - if (dumpBinary) + List annots = null; + if (dumpBinary) { contentBytes = inputDoc.getContent(); - else + if(dumpAnnotations) { + annots = inputDoc.getAnnotations(); + //write annotations with content? 
+ ByteArrayOutputStream bos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(bos); + oos.writeObject(annots); + byte[] annotsBytes = bos.toByteArray(); + contentBytes = concatAndDeepClone(contentBytes, annotsBytes); + } + } else { contentBytes = inputDoc.getText().getBytes("UTF-8"); - // out.write(contentBytes, 0, contentBytes.length); + if(dumpAnnotations) { + annots = inputDoc.getAnnotations(); + ArrayList annotsArrayList = new ArrayList(); + for (int i = 0; i < annots.size(); i++) { + annotsArrayList.add(annots.get(i).toString()); + } + + byte[] annotsBytes = annotsArrayList.toString().getBytes(Charset.forName("UTF-8")); + contentBytes = concatAndDeepClone(contentBytes, annotsBytes); + } + } + addToArchive(fileName, contentBytes, dir); // add the mapping URL->filename in the index -> archive num @@ -264,4 +299,10 @@ else if (!dumpBinary && inputDoc.getText() == null) current.close(); } } + + private byte[] concatAndDeepClone(byte[] contentBytes, byte[] annotsBytes) { + byte[] concatBytes = ArrayUtils.addAll(contentBytes, annotsBytes); + contentBytes = concatBytes.clone(); + return contentBytes; + } }