azkaban-uncached

fsviewers

10/13/2012 4:14:17 AM

Chenjie Yu

Commit: cf08c86

Tree: 138ff7b

Parents: f9d1939

Changes

src/java/azkaban/fsviewers/HdfsAvroFileViewer.java 104(+104 -0)

src/java/azkaban/fsviewers/HdfsFileViewer.java 35(+35 -0)

src/java/azkaban/fsviewers/HdfsSequenceFileViewer.java 67(+67 -0)

src/java/azkaban/fsviewers/JsonSequenceFileViewer.java 85(+85 -0)

src/java/azkaban/fsviewers/TextFileViewer.java 82(+82 -0)

Details

src/java/azkaban/fsviewers/HdfsAvroFileViewer.java 104(+104 -0)

diff --git a/src/java/azkaban/fsviewers/HdfsAvroFileViewer.java b/src/java/azkaban/fsviewers/HdfsAvroFileViewer.java
new file mode 100644
index 0000000..45a7814
--- /dev/null
+++ b/src/java/azkaban/fsviewers/HdfsAvroFileViewer.java
@@ -0,0 +1,104 @@
+package azkaban.fsviewers;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import org.apache.avro.Schema;
+import org.apache.avro.file.DataFileStream;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.io.DatumWriter;
+import org.apache.avro.io.Encoder;
+import org.apache.avro.io.JsonEncoder;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.Logger;
+import org.codehaus.jackson.JsonEncoding;
+import org.codehaus.jackson.JsonFactory;
+import org.codehaus.jackson.JsonGenerator;
+
+/**
+ * This class implements a viewer of avro files
+ * 
+ * @author lguo
+ * 
+ */
+public class HdfsAvroFileViewer implements HdfsFileViewer {
+    private static final Logger logger = Logger.getLogger(HdfsAvroFileViewer.class);
+    // Stop pulling records after this many milliseconds (2 seconds).
+    private static final long STOP_TIME = 2000L;
+
+    /** Returns true if the file at path opens as an Avro data file with a non-null schema. */
+    @Override
+    public boolean canReadFile(FileSystem fs, Path path) {
+        if(logger.isDebugEnabled())
+            logger.debug("path:" + path.toUri().getPath());
+        try {
+            DataFileStream<Object> avroDataStream = getAvroDataStream(fs, path);
+            Schema schema = avroDataStream.getSchema();
+            avroDataStream.close();
+            return schema != null;
+        } catch(IOException e) {
+            if(logger.isDebugEnabled()) {
+                logger.debug(path.toUri().getPath() + " is not an avro file.");
+                logger.debug("Error in getting avro schema: " + e.getLocalizedMessage());
+            }
+            return false;
+        }
+    }
+
+    /** Opens path as an Avro stream; closes the raw stream if the Avro reader cannot be built. */
+    private DataFileStream<Object> getAvroDataStream(FileSystem fs, Path path) throws IOException {
+        if(logger.isDebugEnabled())
+            logger.debug("path:" + path.toUri().getPath());
+        InputStream hdfsInputStream = fs.open(path);
+        try {
+            return new DataFileStream<Object>(hdfsInputStream, new GenericDatumReader<Object>());
+        } catch(IOException e) {
+            hdfsInputStream.close(); // otherwise every rejected file leaks an open stream
+            throw e;
+        }
+    }
+
+    /**
+     * Writes records startLine..endLine (1-based, inclusive) of the Avro file as JSON,
+     * stopping early once STOP_TIME milliseconds have elapsed.
+     */
+    @Override
+    public void displayFile(FileSystem fs,
+                            Path path,
+                            OutputStream outputStream,
+                            int startLine,
+                            int endLine) throws IOException {
+        if(logger.isDebugEnabled())
+            logger.debug("display avro file:" + path.toUri().getPath());
+        DataFileStream<Object> avroDatastream = null;
+        try {
+            avroDatastream = getAvroDataStream(fs, path);
+            Schema schema = avroDatastream.getSchema();
+            DatumWriter<Object> avroWriter = new GenericDatumWriter<Object>(schema);
+            JsonGenerator g = new JsonFactory().createJsonGenerator(outputStream, JsonEncoding.UTF8);
+            g.useDefaultPrettyPrinter();
+            Encoder encoder = new JsonEncoder(schema, g);
+            long endTime = System.currentTimeMillis() + STOP_TIME;
+            int lineno = 1; // line number starts from 1
+            while(avroDatastream.hasNext() && lineno <= endLine && System.currentTimeMillis() <= endTime) {
+                Object datum = avroDatastream.next();
+                if(lineno >= startLine) {
+                    String record = "\n\n Record " + lineno + ":\n";
+                    outputStream.write(record.getBytes("UTF-8"));
+                    avroWriter.write(datum, encoder);
+                    encoder.flush();
+                }
+                lineno++;
+            }
+        } catch(IOException e) {
+            outputStream.write(("Error in display avro file: " + e.getLocalizedMessage()).getBytes("UTF-8"));
+            throw e;
+        } finally {
+            if(avroDatastream != null) // null when the open failed; avoid masking that error with an NPE
+                avroDatastream.close();
+        }
+    }
+}

src/java/azkaban/fsviewers/HdfsFileViewer.java 35(+35 -0)

diff --git a/src/java/azkaban/fsviewers/HdfsFileViewer.java b/src/java/azkaban/fsviewers/HdfsFileViewer.java
new file mode 100644
index 0000000..574080c
--- /dev/null
+++ b/src/java/azkaban/fsviewers/HdfsFileViewer.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2010 LinkedIn, Inc
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package azkaban.fsviewers;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public interface HdfsFileViewer {
+    /** Reports whether this viewer can display the file at path. */
+    boolean canReadFile(FileSystem fs, Path path);
+
+    /** Writes the portion of the file selected by startLine and endLine to outStream. */
+    void displayFile(FileSystem fs,
+                     Path path,
+                     OutputStream outStream,
+                     int startLine,
+                     int endLine) throws IOException;
+}
\ No newline at end of file

src/java/azkaban/fsviewers/HdfsSequenceFileViewer.java 67(+67 -0)

diff --git a/src/java/azkaban/fsviewers/HdfsSequenceFileViewer.java b/src/java/azkaban/fsviewers/HdfsSequenceFileViewer.java
new file mode 100644
index 0000000..b0ca4a7
--- /dev/null
+++ b/src/java/azkaban/fsviewers/HdfsSequenceFileViewer.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2010 LinkedIn, Inc
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package azkaban.fsviewers;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+
+public abstract class HdfsSequenceFileViewer implements HdfsFileViewer {
+    /** Decides, from an already opened reader, whether the subclass can display the file. */
+    protected abstract boolean canReadFile(SequenceFile.Reader reader);
+    /** Writes the records selected by startLine and endLine from reader to output. */
+    protected abstract void displaySequenceFile(SequenceFile.Reader reader,
+                                                PrintWriter output,
+                                                int startLine,
+                                                int endLine) throws IOException;
+
+    /** Opens file as a sequence file and delegates to the subclass; false on any IOException. */
+    public boolean canReadFile(FileSystem fs, Path file) {
+        try {
+            SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, new Configuration());
+            boolean result = canReadFile(reader);
+            reader.close();
+            return result;
+        } catch(IOException e) {
+            return false;
+        }
+    }
+
+    /** Displays file via the subclass; an IOException caught here is written to outputStream. */
+    public void displayFile(FileSystem fs,
+                            Path file,
+                            OutputStream outputStream,
+                            int startLine,
+                            int endLine) throws IOException {
+        SequenceFile.Reader reader = null;
+        PrintWriter writer = new PrintWriter(outputStream);
+        try {
+            reader = new SequenceFile.Reader(fs, file, new Configuration());
+            displaySequenceFile(reader, writer, startLine, endLine);
+        } catch(IOException e) {
+            writer.write("Error opening sequence file " + e);
+        } finally {
+            writer.flush(); // writer is buffered; without this the error text never reaches the stream
+            if(reader != null)
+                reader.close();
+        }
+    }
+}
\ No newline at end of file

src/java/azkaban/fsviewers/JsonSequenceFileViewer.java 85(+85 -0)

diff --git a/src/java/azkaban/fsviewers/JsonSequenceFileViewer.java b/src/java/azkaban/fsviewers/JsonSequenceFileViewer.java
new file mode 100644
index 0000000..37833c2
--- /dev/null
+++ b/src/java/azkaban/fsviewers/JsonSequenceFileViewer.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2010 LinkedIn, Inc
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package azkaban.fsviewers;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Reader;
+import org.apache.hadoop.io.Text;
+import org.apache.log4j.Logger;
+
+import voldemort.serialization.json.JsonTypeSerializer;
+
+public class JsonSequenceFileViewer extends HdfsSequenceFileViewer {
+    private static Logger logger = Logger.getLogger(JsonSequenceFileViewer.class);
+
+    /** Accepts a sequence file whose metadata carries both "key.schema" and "value.schema". */
+    public boolean canReadFile(Reader reader) {
+        boolean hasKeySchema = reader.getMetadata().get(new Text("key.schema")) != null;
+        boolean hasValueSchema = reader.getMetadata().get(new Text("value.schema")) != null;
+        return hasKeySchema && hasValueSchema;
+    }
+
+    /**
+     * Prints records startLine..endLine (1-based, inclusive), each as key, tab, "=>", tab, value,
+     * decoding both with serializers built from the schemas in the file metadata.
+     * The reader is closed before returning.
+     */
+    public void displaySequenceFile(SequenceFile.Reader reader,
+                                    PrintWriter output,
+                                    int startLine,
+                                    int endLine) throws IOException {
+        if(logger.isDebugEnabled())
+            logger.debug("display json file");
+        try {
+            Text keySchema = reader.getMetadata().get(new Text("key.schema"));
+            Text valueSchema = reader.getMetadata().get(new Text("value.schema"));
+            JsonTypeSerializer keySerializer = new JsonTypeSerializer(keySchema.toString());
+            JsonTypeSerializer valueSerializer = new JsonTypeSerializer(valueSchema.toString());
+            BytesWritable keyWritable = new BytesWritable();
+            BytesWritable valueWritable = new BytesWritable();
+
+            // read and discard every record that precedes startLine
+            int position = 1;
+            while(position < startLine) {
+                reader.next(keyWritable, valueWritable);
+                position++;
+            }
+
+            // print records until endLine is passed or the file runs out
+            int lineno = startLine;
+            while(lineno <= endLine && reader.next(keyWritable, valueWritable)) {
+                output.write(safeToString(keySerializer.toObject(keyWritable.getBytes())));
+                output.write("\t=>\t");
+                output.write(safeToString(valueSerializer.toObject(valueWritable.getBytes())));
+                output.write("\n");
+                output.flush();
+                lineno++;
+            }
+        } finally {
+            reader.close();
+        }
+    }
+
+    /** Returns the text "null" for a null value, otherwise value.toString(). */
+    private String safeToString(Object value) {
+        return value == null ? "null" : value.toString();
+    }
+}
\ No newline at end of file

src/java/azkaban/fsviewers/TextFileViewer.java 82(+82 -0)

diff --git a/src/java/azkaban/fsviewers/TextFileViewer.java b/src/java/azkaban/fsviewers/TextFileViewer.java
new file mode 100644
index 0000000..ee30ce8
--- /dev/null
+++ b/src/java/azkaban/fsviewers/TextFileViewer.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2010 LinkedIn, Inc
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package azkaban.fsviewers;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.HashSet;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.Logger;
+
+public class TextFileViewer implements HdfsFileViewer {
+    private static Logger logger = Logger.getLogger(TextFileViewer.class);
+    // NOTE(review): filled in the constructor but never read; canReadFile accepts every file.
+    private HashSet<String> acceptedSuffix = new HashSet<String>();
+
+    public TextFileViewer() {
+        acceptedSuffix.add(".txt");
+        acceptedSuffix.add(".csv");
+        acceptedSuffix.add(".props");
+        acceptedSuffix.add(".xml");
+        acceptedSuffix.add(".html");
+        acceptedSuffix.add(".json");
+        acceptedSuffix.add(".log");
+    }
+
+    /** Always returns true: any file is accepted for display as text. */
+    public boolean canReadFile(FileSystem fs, Path path) {
+        return true;
+    }
+
+    /** Writes lines startLine (1-based) up to but excluding endLine, capped near 1M chars.
+     *  NOTE(review): the Avro and JSON viewers treat endLine as inclusive - confirm intent. */
+    public void displayFile(FileSystem fs,
+                            Path path,
+                            OutputStream outputStream,
+                            int startLine,
+                            int endLine) throws IOException {
+        if(logger.isDebugEnabled())
+            logger.debug("read in uncompressed text file");
+        final int bufferLimit = 1000000; // cap to avoid showing/downloading gb of data
+        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path)));
+        try {
+            PrintWriter output = new PrintWriter(outputStream);
+            for(int i = 1; i < startLine; i++)
+                reader.readLine();
+            int bufferSize = 0;
+            for(int i = startLine; i < endLine; i++) {
+                String line = reader.readLine();
+                if(line == null)
+                    break;
+                bufferSize += line.length();
+                if (bufferSize >= bufferLimit) // limit reached: stop before writing this line
+                    break;
+                output.write(line);
+                output.write("\n");
+            }
+            output.flush();
+        } finally {
+            reader.close(); // releases the underlying stream even when a read fails
+        }
+    }
+}
\ No newline at end of file