Scalding - Hadoop Word Count in LESS than 70 lines of code

Konrad 'ktoso' Malawski
JARCamp #3, 12.04.2013

Twitter Scalding is built on top of Cascading, which in turn is built on top of Hadoop. It is essentially a very readable, easily extensible DSL for writing MapReduce jobs.

Or rather: Scalding - Hadoop Word Count in 4 lines of code.


Agenda

Why Scalding? (10%)
+ Hadoop Basics (20%)
+ Enter Cascading (40%)
+ Hello Scalding (30%)
= 100%

Why Scalding? Word Count in Types

type Word = String
type Count = Int

String => Map[Word, Count]


Why Scalding? Word Count in Scala

val text = "a a a b b"

def wordCount(text: String): Map[Word, Count] =
  text
    .split(" ")
    .map(a => (a, 1))
    .groupBy(_._1)
    .map { a => a._1 -> a._2.map(_._2).sum }

wordCount(text) should equal (Map("a" -> 3, "b" -> 2))


Stuff > Memory
Scala collections... fun, but memory bound!

val text = "so many words... waaah! ..."

text                                    // in Memory
  .split(" ")                           // in Memory
  .map(a => (a, 1))                     // in Memory
  .groupBy(_._1)                        // in Memory
  .map(a => (a._1, a._2.map(_._2).sum)) // in Memory

Every single step materializes a full intermediate collection in memory.
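On one machine you can push this a bit further before reaching for Hadoop. A minimal sketch (mine, not from the talk) that streams the input so that only the running counts are ever held in memory:

// assumption: input arrives as an Iterator[String],
// e.g. scala.io.Source.fromFile("big.txt").getLines()
def wordCountStreaming(lines: Iterator[String]): Map[String, Int] =
  lines
    .flatMap(_.split(" ")) // words are produced lazily, line by line
    .foldLeft(Map.empty[String, Int]) { (counts, word) =>
      counts.updated(word, counts.getOrElse(word, 0) + 1) // only the counts map is retained
    }

But once even the counts, or the raw data itself, outgrow a single machine, you need something like Hadoop.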

Apache Hadoop (HDFS + MR)
http://hadoop.apache.org/


Why Scalding? Word Count in Hadoop MR

package org.myorg;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

public class WordCount {

  public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
      String line = value.toString();
      StringTokenizer tokenizer = new StringTokenizer(line);
      while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        output.collect(word, one);
      }
    }
  }

  public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
      int sum = 0;
      while (values.hasNext()) {
        sum += values.next().get();
      }
      output.collect(key, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
}

Trivia: How old is Hadoop?


Cascading
www.cascading.org/


Cascading is
Taps & Pipes & Sinks


1: Distributed Copy

// source Tap
Tap inTap = new Hfs(new TextDelimited(true, "\t"), inPath);

// sink Tap
Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);

// a Pipe, connects taps
Pipe copyPipe = new Pipe("copy");

// build the Flow
FlowDef flowDef = FlowDef.flowDef()
  .addSource(copyPipe, inTap)
  .addTailSink(copyPipe, outTap);

// run!
flowConnector.connect(flowDef).complete();


1. DCP - Full Code

public class Main {
  public static void main(String[] args) {
    String inPath = args[0];
    String outPath = args[1];

    Properties props = new Properties();
    AppProps.setApplicationJarClass(props, Main.class);
    HadoopFlowConnector flowConnector = new HadoopFlowConnector(props);

    Tap inTap = new Hfs(new TextDelimited(true, "\t"), inPath);
    Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);

    Pipe copyPipe = new Pipe("copy");

    FlowDef flowDef = FlowDef.flowDef()
      .addSource(copyPipe, inTap)
      .addTailSink(copyPipe, outTap);

    flowConnector.connect(flowDef).complete();
  }
}


2: Word Count

String docPath = args[0];
String wcPath = args[1];

Properties props = new Properties();
AppProps.setApplicationJarClass(props, Main.class);
HadoopFlowConnector flowConnector = new HadoopFlowConnector(props);

// create source and sink taps
Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
Tap wcTap = new Hfs(new TextDelimited(true, "\t"), wcPath);

// specify a regex operation to split the "document" text lines into a token stream
Fields token = new Fields("token");
Fields text = new Fields("text");
RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
// only returns "token"
Pipe docPipe = new Each("token", text, splitter, Fields.RESULTS);

// determine the word counts
Pipe wcPipe = new Pipe("wc", docPipe);
wcPipe = new GroupBy(wcPipe, token);
wcPipe = new Every(wcPipe, Fields.ALL, new Count(), Fields.ALL);

// connect the taps, pipes, etc., into a flow
FlowDef flowDef = FlowDef.flowDef()
  .setName("wc")
  .addSource(docPipe, docTap)
  .addTailSink(wcPipe, wcTap);

// write a DOT file and run the flow
Flow wcFlow = flowConnector.connect(flowDef);
wcFlow.writeDOT("dot/wc.dot");
wcFlow.complete();


Cascading - how?

// pseudo code...
val flow = FlowDef
val flowConnector: FlowDef => List[MRJob] = ...

val jobs: List[MRJob] = flowConnector(flow)

HadoopCluster.execute(jobs)


Cascading tips

Pipe assembly = new Pipe("assembly");
assembly = new Each(assembly, DebugLevel.VERBOSE, new Debug());
// ...

// head and tail have the same name
FlowDef flowDef = new FlowDef()
  .setName("debug")
  .addSource("assembly", source)
  .addSink("assembly", sink)
  .addTail(assembly);

flowDef.setDebugLevel(DebugLevel.NONE);

With DebugLevel.NONE set, the flowConnector will NOT even create the Debug pipe!

Scalding = Scala + Cascading

Twitter Scalding
github.com/twitter/scalding

Scalding API


map

Scala:

val data = 1 :: 2 :: 3 :: Nil

val doubled = data map { _ * 2 } // Int => Int

Scalding:

IterableSource(data)
  .map('number -> 'doubled) { n: Int => n * 2 } // Int => Int

'number must already be available in the Pipe; 'doubled stays in the Pipe. Note that you must choose the parameter's type explicitly!


mapTo

Scala:

var data = 1 :: 2 :: 3 :: Nil

val doubled = data map { _ * 2 } // Int => Int
data = null // release the reference

Scalding:

IterableSource(data)
  .mapTo('doubled) { n: Int => n * 2 } // Int => Int

'doubled stays in the Pipe; 'number is removed.
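In other words, mapTo behaves like a map followed by projecting away everything except the new fields, only cheaper. A sketch of the equivalence (the job, paths and field names here are mine, not from the talk):

import com.twitter.scalding._

class DoubledJob(args: Args) extends Job(args) {
  // both pipelines yield just the 'doubled field:
  Tsv(args("in"), 'number).read
    .map('number -> 'doubled) { n: Int => n * 2 }
    .project('doubled)
    .write(Tsv(args("out1")))

  Tsv(args("in"), 'number).read
    .mapTo('number -> 'doubled) { n: Int => n * 2 }
    .write(Tsv(args("out2")))
}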


flatMap

Scala:

val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String]

val numbers = data flatMap { line => // String
  line.split(",") // Array[String]
} map { _.toInt } // List[Int]

numbers should equal (List(1, 2, 2, 3, 3, 3))

Scalding:

TextLine(data) // like List[String]
  .flatMap('line -> 'word) { line: String => line.split(",") } // like List[String]
  .map('word -> 'number) { word: String => word.toInt } // like List[Int]

Here the MR-style map happens outside the flatMap, as a separate pipe operation.


flatMap

Scala:

val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String]

val numbers = data flatMap { line => // String
  line.split(",").map(_.toInt) // Array[Int]
}

numbers should equal (List(1, 2, 2, 3, 3, 3))

Scalding:

TextLine(data) // like List[String]
  .flatMap('line -> 'word) { line: String => line.split(",").map(_.toInt) } // like List[Int]

Here the inner map is plain Scala, applied inside the flatMap.


groupBy

Scala:

val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int]

val groups = data groupBy { _ < 10 }

groups // Map[Boolean, List[Int]]

groups(true) should equal (List(1, 2))
groups(false) should equal (List(30, 42))

Scalding:

IterableSource(List(1, 2, 30, 42), 'num)
  .map('num -> 'lessThanTen) { i: Int => i < 10 }
  .groupBy('lessThanTen) { _.size }

groupBy groups all rows sharing an equal (==) value of 'lessThanTen; the count field can be named explicitly, e.g. _.size('lessThanTenCounts).


groupBy

Scalding:

IterableSource(List(1, 2, 30, 42), 'num)
  .map('num -> 'lessThanTen) { i: Int => i < 10 }
  .groupBy('lessThanTen) { _.sum('num -> 'total) }

// 'total = [3, 72]   (1 + 2 = 3, 30 + 42 = 72)


Scalding API

project / discard
map / mapTo
flatMap / flatMapTo
rename
filter
unique
groupBy / groupAll / groupRandom / shuffle
limit
debug
Group operations
joins (see the sketch below)
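Joins are only listed, not shown, in the deck. A minimal sketch (mine; the job, paths and field names are made up) using the fields-based API, where joinWithSmaller joins the current pipe against a pipe known to be smaller:

import com.twitter.scalding._

class JoinSketchJob(args: Args) extends Job(args) {
  val users = Tsv(args("users"), ('id, 'name)).read

  Tsv(args("logs"), ('userId, 'action)).read
    .joinWithSmaller('userId -> 'id, users) // inner join on userId == id; users is the smaller side
    .project('name, 'action)                // keep only the fields we need
    .write(Tsv(args("out")))
}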


Distributed Copy in Scalding

class WordCountJob(args: Args) extends Job(args) {

  val input = Tsv(args("input"))
  val output = Tsv(args("output"))

  input.read.write(output)
}

The End.


Main Class - "Runner"

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.util.ToolRunner
import com.twitter.scalding

object ScaldingJobRunner extends App {

  ToolRunner.run(new Configuration, new scalding.Tool, args)

}

args is inherited from the App trait.
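Assuming the runner and job are bundled into a jar (the jar name below is made up), the job would then be launched with something along the lines of: hadoop jar scalding-demo.jar ScaldingJobRunner WordCountJob --hdfs --input words.txt --output counts.tsv. scalding.Tool treats the first remaining argument as the Job class to instantiate and hands the following --key value pairs to it as Args.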


Word Count in Scalding

class WordCountJob(args: Args) extends Job(args) {

  val inputFile = args("input")
  val outputFile = args("output")

  TextLine(inputFile)
    .flatMap('line -> 'word) { line: String => tokenize(line) }
    .groupBy('word) { _.size }
    .write(Tsv(outputFile))
  // ^ the actual job: 4 lines

  def tokenize(text: String): Array[String] = implemented
}
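The slide leaves tokenize as a stub ("implemented"); a plausible implementation (my sketch, not from the deck) that lower-cases and strips punctuation before splitting on whitespace:

def tokenize(text: String): Array[String] =
  text.toLowerCase
    .replaceAll("[^a-z0-9\\s]", "") // assumption: keep only letters, digits and whitespace
    .split("\\s+")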

Dzięki! Thanks! ありがとう! ("thanks" in Polish, English and Japanese)

Konrad Malawski @ java.pl
t: ktosopl / g: ktoso / b: blog.project13.pl