21
11/10/2015 CIS 660: Data Mining Project Dhruv Patel CSU ID: 2652790 Darshan Pathak CSU ID: 2640944 Instructor: Dr. Sunnie S Chung

Dhruv Patel CSU ID: 2652790 Darshan Pathak CSU ID: 2640944cis.Csuohio.edu/~sschung/CIS660/CIS660_ProjectReport_Dhruv.pdfCleveland State University Page 2 1. Data Gathering In this

  • Upload
    others

  • View
    7

  • Download
    0

Embed Size (px)

Citation preview

11/10/2015

CIS 660: Data Mining Project

Dhruv Patel

CSU ID: 2652790

Darshan Pathak

CSU ID: 2640944

Instructor:

Dr. Sunnie S Chung

Cleveland State University

Page 1

Project description:

Text Mining on Web Documents www.infoplease.com/t/hist/state-of-the-union/

This download contains the text for 219 State of the Union addresses of U.S.

Presidents between 1790 and 2006.

Once the data is successfully stored, we can build a document-frequency table and an

inverted index for information retrieval, then compute cosine similarity and a weight

matrix by calculating weighted scores based on tf-idf.

Project Member:-

Dhruv Patel: 2652790

Darshan Pathak: 2640944

Tools and Technologies:-

SDK: C#.Net

Nature of Data: SQL Data

Cleveland State University

Page 2

1. Data Gathering

In this step we removed the tedious job of going to the website, copying all the

data, and saving it into a notepad file: instead, we wrote simple C# code which

goes to the website, collects the div of the body element, and stores the

data in the SQL data structure.

Program.cs

using System;

using System.Collections.Generic;

using System.Data.SqlClient;

using System.IO;

using System.Linq;

using System.Net;

using System.Text;

using System.Threading.Tasks;

using System.Windows.Forms;

namespace FetchHtml

{

class Program

{

static string urlAddress = "http://www.infoplease.com/t/hist/state-of-the-union/";

static void Main(string[] args)

{

List<LinkData> lstLinks = new List<LinkData>();

List<string> lstText = new List<string>();

string data = getHTMLString(urlAddress);

lstLinks = StripHTMLLinks(data);

foreach (var link in lstLinks)

{

var strHtml = getHTMLString(link.URL);

//lstText.Add(StripHTMLText(strHtml));

using (SqlConnection con = new SqlConnection("Data Source=FRNDZ;Initial

Catalog=StateOfTheUnion;Integrated Security=True"))

{

try

{

con.Open();

SqlCommand sqlStatement = new SqlCommand("Insert into tblHistory values ('"

+ link.DocName + "','" + StripHTMLText(strHtml) + "');", con); ;

Cleveland State University

Page 3

sqlStatement.ExecuteNonQuery();

con.Close();

}

catch (Exception e)

{

}

}

}

Console.Read();

}

public static string getHTMLString(string url)

{

HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

HttpWebResponse response = (HttpWebResponse)request.GetResponse();

string data = string.Empty;

if (response.StatusCode == HttpStatusCode.OK)

{

Stream receiveStream = response.GetResponseStream();

StreamReader readStream = null;

if (response.CharacterSet == null)

{

readStream = new StreamReader(receiveStream);

}

else

{

readStream = new StreamReader(receiveStream,

Encoding.GetEncoding(response.CharacterSet));

}

data = readStream.ReadToEnd();

response.Close();

readStream.Close();

}

return data;

}

public static List<LinkData> StripHTMLLinks(string str)

{

List<LinkData> lstHistory = new List<LinkData>();

HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();

doc.LoadHtml(str);

doc.DocumentNode.Descendants().Where(n => n.Name == "div" &&

n.GetAttributeValue("class", "") ==

Cleveland State University

Page 4

"toc").ToList().FirstOrDefault().DescendantNodes().Where(n => n.Name == "a").Where(n =>

!string.IsNullOrEmpty(n.GetAttributeValue("href", "")) &&

!string.IsNullOrEmpty(n.InnerText)).ToList().ForEach(n => lstHistory.Add(new LinkData(

n.InnerText, "http://www.infoplease.com/t/hist/state-of-the-union/" +

n.GetAttributeValue("href", ""))));

return lstHistory;

}

public static string StripHTMLText(string str)

{

HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();

doc.LoadHtml(str);

return doc.DocumentNode.Descendants().Where(n => n.Name == "div" &&

(n.GetAttributeValue("class", "") == "article" || n.GetAttributeValue("class", "") ==

"section")).FirstOrDefault().InnerText.Replace('\n', ' ').ToString().Replace(',', '

').ToString().Replace('\'', ' ');

}

}

public class LinkData

{

public string DocName = string.Empty;

public string URL = string.Empty;

public LinkData(string docname,string text){

DocName = docname;

URL = text;

}

}

}

Cleveland State University

Page 5

Fig 1. Here we can see that the SQL Data which we load through the program.

Cleveland State University

Page 6

2. Text Cleaning

In the text cleaning section we removed common (stop) words programmatically;

below is the list of common words which we removed from the text.

"a","about","above","across","after","afterwards","again","against","all","almost","alone"

,"along","already","also","although","always","am","among","amongst","amount","an","

and","another","any","anyhow","anyone","anything","anyway","anywhere","are","aroun

d","as","at","back","be","became","because","become","becomes","becoming","been","b

efore","beforehand","behind","being","below","beside","besides","between","beyond","b

ill","both","bottom","but","by","call","can","cannot","cant","co","computer","con","coul

d","couldnt","cry","de","describe","detail","do","done","down","due","during","each","e

g","eight","either","eleven","else","elsewhere","empty","enough","etc","even","ever","ev

ery","everyone","everything","everywhere","except","few","fifteen","fify","fill","find","f

ire","first","five","for","former","formerly","forty","found","four","from","front","full","f

urther","get","give","go","had","has","have","he","hence","her","here","hereafter","hereb

y","herein","hereupon","hers","herself","him","himself","his","how","however","hundred

","i","ie","if","in","inc","indeed","interest","into","is","it","its","itself","keep","last","latte

r","latterly","least","less","ltd","made","many","may","me","meanwhile","might","mill",

"mine","more","moreover","most","mostly","move","much","must","my","myself","nam

e","namely","neither","never","nevertheless","next","nine","no","nobody","none","nor","

not","nothing","now","nowhere","of","off","often","on","once","one","only","onto","or",

"other","others","otherwise","our","ours","ourselves","out","over","own","part","per","pe

rhaps","please","put","rather","re","same","see","seem","seemed","seeming","seems","se

rious","several","she","should","show","side","since","sincere","six","sixty","so","some",

"somehow","someone","something","sometime","sometimes","somewhere","still","such"

,"system","take","ten","than","that","the","their","them","themselves","then","thence","th

ere","thereafter","thereby","therefore","therein","thereupon","these","they","thick","thin"

,"third","this","those","though","three","through","throughout","thru","thus","to","togeth

er","too","top","toward","towards","twelve","twenty","two","un","under","until","up","u

pon","us","very","via","was","we","well","were","what","whatever","when","whence","

whenever","where","whereafter","whereas","whereby","wherein","whereupon","whereve

r","whether","which","while","whither","who","whoever","whole","whom","whose","wh

y","will","with","within","without","would","yet","you","your","yours","yourself","yours

elves"

Cleveland State University

Page 7

3. Calculate Frequency Count , document weight , weight matrix and

Cosine matrix :

Tf-idf Weight values :

Here, $w_{t,d} = \log_{10}(1 + tf_{t,d}) \times \log_{10}(N / df_t)$, where $tf_{t,d}$ is the frequency of term $t$ in document $d$, $df_t$ is the number of documents containing term $t$, and $N$ is the total number of documents.

Cleveland State University

Page 8

Common.cs

using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Net; using System.Text; using System.Threading.Tasks; namespace TextMining { public static class Common { public static Dictionary<string, bool> CommonWords = new Dictionary<string, bool> {{ "a", true },{ "about", true },{ "above", true },{ "across", true },{ "after", true },{ "afterwards", true },{ "again", true },{ "against", true },{ "all", true },{ "almost", true },{ "alone", true },{ "along", true },{ "already", true },{ "also", true },{ "although", true },{ "always", true },{ "am", true },{ "among", true },{ "amongst", true },{ "amount", true },{ "an", true },{ "and", true },{ "another", true },{ "any", true },{ "anyhow", true },{ "anyone", true },{ "anything", true },{ "anyway", true },{ "anywhere", true },{ "are", true },{ "around", true },{ "as", true },{ "at", true },{ "back", true },{ "be", true },{ "became", true },{ "because", true },{ "become", true },{ "becomes", true },{ "becoming", true },{ "been", true },{ "before", true },{ "beforehand", true },{ "behind", true },{ "being", true },{ "below", true },{ "beside", true },{ "besides", true },{ "between", true },{ "beyond", true },{ "bill", true },{ "both", true },{ "bottom", true },{ "but", true },{ "by", true },{ "call", true },{ "can", true },{ "cannot", true },{ "cant", true },{ "co", true },{ "computer", true },{ "con", true },{ "could", true },{ "couldnt", true },{ "cry", true },{ "de", true },{ "describe", true },{ "detail", true },{ "do", true },{ "done", true },{ "down", true },{ "due", true },{ "during", true },{ "each", true },{ "eg", true },{ "eight", true },{ "either", true },{ "eleven", true },{ "else", true },{ "elsewhere", true },{ "empty", true },{ "enough", true },{ "etc", true },{ "even", true },{ "ever", true },{ "every", true },{ "everyone", true },{ "everything", true },{ "everywhere", true },{ "except", true },{ "few", true },{ "fifteen", true },{ "fify", true },{ "fill", true },{ 
"find", true },{ "fire", true },{ "first", true },{ "five", true },{ "for", true },{ "former", true },{ "formerly", true },{ "forty", true },{ "found", true },{ "four", true },{ "from", true },{ "front", true },{ "full", true },{ "further", true },{ "get", true },{ "give", true },{ "go", true },{ "had", true },{ "has", true

Cleveland State University

Page 9

},{ "have", true },{ "he", true },{ "hence", true },{ "her", true },{ "here", true },{ "hereafter", true },{ "hereby", true },{ "herein", true },{ "hereupon", true },{ "hers", true },{ "herself", true },{ "him", true },{ "himself", true },{ "his", true },{ "how", true },{ "however", true },{ "hundred", true },{ "i", true },{ "ie", true },{ "if", true },{ "in", true },{ "inc", true },{ "indeed", true },{ "interest", true },{ "into", true },{ "is", true },{ "it", true },{ "its", true },{ "itself", true },{ "keep", true },{ "last", true },{ "latter", true },{ "latterly", true },{ "least", true },{ "less", true },{ "ltd", true },{ "made", true },{ "many", true },{ "may", true },{ "me", true },{ "meanwhile", true },{ "might", true },{ "mill", true },{ "mine", true },{ "more", true },{ "moreover", true },{ "most", true },{ "mostly", true },{ "move", true },{ "much", true },{ "must", true },{ "my", true },{ "myself", true },{ "name", true },{ "namely", true },{ "neither", true },{ "never", true },{ "nevertheless", true },{ "next", true },{ "nine", true },{ "no", true },{ "nobody", true },{ "none", true },{ "nor", true },{ "not", true },{ "nothing", true },{ "now", true },{ "nowhere", true },{ "of", true },{ "off", true },{ "often", true },{ "on", true },{ "once", true },{ "one", true },{ "only", true },{ "onto", true },{ "or", true },{ "other", true },{ "others", true },{ "otherwise", true },{ "our", true },{ "ours", true },{ "ourselves", true },{ "out", true },{ "over", true },{ "own", true },{ "part", true },{ "per", true },{ "perhaps", true },{ "please", true },{ "put", true },{ "rather", true },{ "re", true },{ "same", true },{ "see", true },{ "seem", true },{ "seemed", true },{ "seeming", true },{ "seems", true },{ "serious", true },{ "several", true },{ "she", true },{ "should", true },{ "show", true },{ "side", true },{ "since", true },{ "sincere", true },{ "six", true },{ "sixty", true },{ "so", true },{ "some", true },{ "somehow", true },{ "someone", true },{ 
"something", true },{ "sometime", true },{ "sometimes", true },{ "somewhere", true },{ "still", true },{ "such", true },{ "system", true },{ "take", true },{ "ten", true },{ "than", true },{ "that", true },{ "the", true },{ "their", true },{ "them", true },{ "themselves", true },{ "then", true },{ "thence", true },{ "there", true },{ "thereafter", true },{ "thereby", true },{ "therefore", true },{ "therein", true },{ "thereupon", true },{ "these", true },{ "they", true },{ "thick", true },{ "thin", true },{ "third", true },{ "this", true },{ "those", true },{ "though", true },{ "three", true },{ "through", true },{ "throughout", true },{ "thru", true },{ "thus", true },{ "to", true },{ "together", true },{ "too", true },{ "top", true },{ "toward", true },{ "towards", true },{ "twelve", true },{ "twenty", true },{ "two", true },{ "un", true },{ "under", true },{ "until", true },{ "up", true },{ "upon", true },{ "us", true },{ "very", true },{ "via", true },{ "was", true },{ "we", true },{ "well", true },{ "were", true },{ "what", true },{ "whatever", true },{ "when", true },{ "whence", true },{ "whenever",

Cleveland State University

Page 10

true },{ "where", true },{ "whereafter", true },{ "whereas", true },{ "whereby", true },{ "wherein", true },{ "whereupon", true },{ "wherever", true },{ "whether", true },{ "which", true },{ "while", true },{ "whither", true },{ "who", true },{ "whoever", true },{ "whole", true },{ "whom", true },{ "whose", true },{ "why", true },{ "will", true },{ "with", true },{ "within", true },{ "without", true },{ "would", true },{ "yet", true },{ "you", true },{ "your", true },{ "yours", true },{ "yourself", true },{ "yourselves", true } }; public static char[] Splitters = new char[] { ' ',',',';','.','(',')' }; } }

TexMining.cs

using System;

using System.Collections.Generic;

using System.Data.SqlClient;

using System.IO;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;

using System.Threading.Tasks;

namespace TextMining

{

class KeyWordDetails

{

public string Key = string.Empty;

public int df = 0;

public KeyWordDetails(string key, int value)

{

Key = key;

df = value;

}

}

class DocumentDetail

{

public string DocName = string.Empty;

public string Text = string.Empty;

Cleveland State University

Page 11

public DocumentDetail(string dn, string txt)

{

DocName = dn;

Text = txt;

}

}

class TextMining

{

static string[] aryKeyWord;

static string DocPath = @"../../Documents/";

static List<KeyWordDetails> lstdft = new List<KeyWordDetails>();

static double N=0;

static Dictionary<string, Dictionary<string, int>> keyWordCount = new Dictionary<string,

Dictionary<string, int>>();

static Dictionary<string, Dictionary<string, double>> WeightMatrix = new

Dictionary<string, Dictionary<string, double>>();

static Dictionary<string, Dictionary<string, double>> NormMatrix = new

Dictionary<string, Dictionary<string, double>>();

static Dictionary<string, double> ScoreVecor = new Dictionary<string, double>();

static string strTex = string.Empty;

static int KeywordCount=10;

static void Main(string[] args)

{

List<DocumentDetail> aryDoc = new List<DocumentDetail>();

using (SqlConnection con = new SqlConnection("Data Source=FRNDZ;Initial

Catalog=StateOfTheUnion;Integrated Security=True"))

{

try

{

con.Open();

SqlCommand sqlStatement = new SqlCommand("Select * from tblHistory;", con); ;

SqlDataReader reader= sqlStatement.ExecuteReader();

while (reader.Read())

{

aryDoc.Add(new DocumentDetail(((string)reader["DocName"]).Replace(',',' '),

((string)reader["Text"]).ToLower()));

}

con.Close();

}

catch (Exception e)

{

Console.WriteLine("Some error occurred.");

}

}

Console.WriteLine("Text Cleanng & Fetch Keyword list : started at "+DateTime.Now);

Cleveland State University

Page 12

aryKeyWord = GetKeyWordList(string.Join(" ", aryDoc.Select(s => s.Text)));

Console.WriteLine("Text Cleanng & Fetch Keyword list : completed at " +

DateTime.Now);

strTex = "," + String.Join(",", aryKeyWord) + "\n";

foreach (var item in aryKeyWord)

{

lstdft.Add(new KeyWordDetails(item.Trim(), 0));

}

N = aryDoc.Count;

#region Counting keyword counts for all documents

Console.WriteLine("Calculate Keyword count : started at " + DateTime.Now);

for (int i = 0; i < aryDoc.Count; i++)

{

var keyval = keyWordCount.Where(a => a.Key ==

aryDoc[i].DocName).ToList().Count > 0 ? aryDoc[i].DocName + "1" : aryDoc[i].DocName;

strTex += keyval+ ",";

keyWordCount.Add(keyval, CalculateCounts(aryDoc[i].Text.ToLower()));

strTex += "\n";

}

Console.WriteLine("Calculate Keyword count : completed at " + DateTime.Now);

#endregion

File.WriteAllText(DocPath + "Count.csv", strTex);

strTex = "Keywords,df values\n";

strTex += string.Join("\n",lstdft.Select(s=>s.Key+","+s.df));

File.WriteAllText(DocPath + "DocumentFreq.csv", strTex);

Console.WriteLine("Calculate Weight value : started at " + DateTime.Now);

strTex = "," + String.Join(",", aryKeyWord) + ",,Score Value" + "\n";

foreach (var doc in keyWordCount)

{

strTex += doc.Key + ",";

WeightMatrix.Add(doc.Key, CalculateWeightVal(doc.Key,doc.Value));

strTex += "\n";

}

Console.WriteLine("\n\nTop 10 pages using scoere : :");

Console.WriteLine(string.Join("\n", ScoreVecor.OrderByDescending(s =>

s.Value).ToList().Select(s => s.Key).Take(10)));

Console.WriteLine("\n\n");

Console.WriteLine("Calculate Weight value : completed at " + DateTime.Now);

File.WriteAllText(DocPath + "WeightMatrix.csv", strTex);

Console.WriteLine("Constructing Cosine Matrix : started at " + DateTime.Now);

strTex = "," + String.Join(",", aryKeyWord) + "\n";

foreach (var doc in WeightMatrix)

{

Cleveland State University

Page 13

strTex += doc.Key + ",";

NormMatrix.Add(doc.Key, CalculateNormalizeVal(doc.Value));

}

File.WriteAllText(DocPath + "NormMatrix.csv", strTex);

#region Calculate Norm Cosine values

var aryDocOutput = new double[aryDoc.Count, aryDoc.Count];

for (int i = 0; i < NormMatrix.Count; i++)

{

var doc1 = NormMatrix.ElementAt(i).Value;

aryDocOutput[i, i] = 1;// 1= similar document

for (int j = i + 1; j < NormMatrix.Count; j++)

{

var doc2 = NormMatrix.ElementAt(j).Value;

aryDocOutput[i, j] = aryDocOutput[j, i] = calculateNormCosineVal(doc1, doc2);

}

}

#endregion

#region Print output

var lstDocKey=NormMatrix.Select(s => s.Key).ToList();

strTex = "," + string.Join(",",lstDocKey )+"\n";

for (int i = 0; i <= aryDocOutput.GetUpperBound(0); i++)

{

strTex += lstDocKey[i];

for (int j = 0; j <= aryDocOutput.GetUpperBound(0); j++)

{

string str = "" + aryDocOutput[i, j];

str = str.PadLeft(9);

strTex += "," + aryDocOutput[i, j];

}

strTex += "\n";

}

#endregion

File.WriteAllText(DocPath + "CosineMatrix.csv", strTex);

Console.WriteLine("Constructing Cosine Matrix : completed at " + DateTime.Now);

Console.WriteLine("Done");

Console.Read();

}

public static string[] GetKeyWordList(string input)

{

var words =

input.Split(Common.Splitters,StringSplitOptions.RemoveEmptyEntries).ToList();

var temp = 0;

Cleveland State University

Page 14

words.RemoveAll(s => Common.CommonWords.Where(r => r.Key == s).Select(p =>

p.Key).FirstOrDefault() != null || int.TryParse(s,out temp));

return

words.GroupBy(s=>s).ToList().OrderByDescending(s=>s.ToList().Count).Select(s=>s.Key).Dist

inct().ToArray().Take(KeywordCount).ToArray();

}

private static Dictionary<string, int> CalculateCounts(string doc)

{

Dictionary<string, int> aryCount = new Dictionary<string, int>();

foreach (var keyWord in aryKeyWord)

{

int count = doc.Split(' ').Where(x => x.Trim().ToLower() ==

keyWord.Trim()).ToList().Count;

if (count > 0)

{

var obj = lstdft.Where(i => i.Key.Trim() == keyWord.Trim()).FirstOrDefault();

obj.df++;

}

aryCount.Add(keyWord.Trim(), count);

}

strTex += string.Join(",", aryCount.Select(s => s.Value).ToList());

return aryCount;

}

public static Dictionary<string, double> CalculateWeightVal(string doc,Dictionary<string,

int> lsttf)

{

Dictionary<string, double> weightVector = new Dictionary<string,double>();

var score = 0.0;

foreach (var tf in lsttf)

{

var term=lstdft.Where(p=>p.Key.Trim()==tf.Key.Trim()).FirstOrDefault();

var weight = Math.Round((Math.Log10(1 + tf.Value) * Math.Log10(N / term.df)), 4);

weightVector.Add(term.Key, weight);

score += weight;

}

ScoreVecor.Add(doc, score);

strTex += String.Join(",", weightVector.Select(s => s.Value)) + ",," + score;

return weightVector;

}

public static Dictionary<string, double> CalculateNormalizeVal(Dictionary<string, double>

docVector)

Cleveland State University

Page 15

{

Dictionary<string, double> normVector = new Dictionary<string, double>();

var vectorVal = 0.0;

docVector.Select(s => s.Value).ToList().ForEach(s => { vectorVal += Math.Pow(s, 2);

});

foreach (var item in docVector)

{

var normVal = vectorVal!=0?Math.Round(item.Value / Math.Sqrt(vectorVal), 2):0;

normVector.Add(item.Key, normVal);

}

strTex += String.Join(",", normVector.Select(s => s.Value))+"\n";

return normVector;

}

private static double calculateNormCosineVal(Dictionary<string, double> doc1,

Dictionary<string, double> doc2)

{

double dotProduct = 0;

foreach (var key in aryKeyWord)

{

var d1 = doc1.Where(p => p.Key == key.Trim()).FirstOrDefault().Value;

var d2 = doc2.Where(p => p.Key == key.Trim()).FirstOrDefault().Value;

dotProduct += d1 * d2;

}

return Math.Round(dotProduct , 2);

}

}

}

Cleveland State University

Page 16

Output : All Excel output files are also attached with this document.

Console :

DocumentFreq.xls :

Cleveland State University

Page 17

Count.xls :

Cleveland State University

Page 18

WeightMatrix.xls :

Cleveland State University

Page 19

NormMatrix.xls :

Cleveland State University

Page 20

CosineMatrix.xls :