Dr R Anurekha: File Handling 17: Files and Dictionary: Histogram, Word Frequency, remove noise words

# Write a program that creates a dictionary
# of the frequency of all words in a file.

# Files, Dictionary and Command line arguments
# Histogram - Word frequency

import sys

def count_words(lines):
    """Return a dict mapping each word in *lines* to its number of occurrences.

    *lines* is any iterable of strings (an open text file works). Each line
    is lower-cased, every character that is neither a letter nor whitespace
    is dropped (so "well-known" becomes "wellknown"), and the remainder is
    split on runs of whitespace.
    """
    freq = {}
    # Iterating over the lines (instead of stopping at the first empty one)
    # means blank lines in the middle of the file no longer end the count.
    for raw_line in lines:
        cleaned = "".join(
            ch for ch in raw_line.lower() if ch.isalpha() or ch.isspace()
        )
        # split() with no argument splits on any whitespace run, so tabs
        # separate words and repeated spaces never yield an empty "word".
        for wd in cleaned.split():
            freq[wd] = freq.get(wd, 0) + 1
    return freq


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("Usage: python " + sys.argv[0] + " <filename>")

    fname = sys.argv[1]

    # The context manager closes the file even if reading fails.
    with open(fname, "r") as fs:
        Word_freq = count_words(fs)

    for ele in Word_freq:
        print(ele.ljust(15), Word_freq[ele])

    print("Number of unique words in file = ", len(Word_freq))

# Write a program that creates a dictionary
# of the frequency of all words in a file.
# Remove noise words in file like a, an, the etc

# Files, Dictionary and Command line arguments
# Histogram - Word frequency

import sys

# Common short words that are excluded from the frequency table.
Noise_words = [
    'an', 'as', 'of', 'it', 'by', 'to', 'so', 'do', 'be', 'up', 'on', 'ie',
    'its', 'are', 'all', 'has', 'can', 'how', 'end', 'any', 'may', 'for',
    'will', 'use', 'one', 'two', 'the', 'also', 'have', 'this', 'that',
    'what', 'where', 'when', 'then', 'those', 'from', 'once', 'more', 'most',
]


def count_significant_words(lines, noise_words=None):
    """Return a dict of word frequencies in *lines*, skipping noise words.

    *lines* is any iterable of strings (an open text file works). Each line
    is lower-cased, every character that is neither a letter nor whitespace
    is dropped, and the remainder is split on runs of whitespace. Words of
    length 0 or 1 and words found in *noise_words* are not counted.

    *noise_words* is an iterable of words to ignore; when omitted, the
    module-level ``Noise_words`` list is used.
    """
    # Build a set once so each membership test is O(1) instead of a list scan.
    if noise_words is None:
        noise_words = Noise_words
    noise = frozenset(noise_words)

    freq = {}
    # Iterating over the lines (instead of stopping at the first empty one)
    # means blank lines in the middle of the file no longer end the count.
    for raw_line in lines:
        cleaned = "".join(
            ch for ch in raw_line.lower() if ch.isalpha() or ch.isspace()
        )
        # split() with no argument also treats tabs as word separators.
        for wd in cleaned.split():
            if len(wd) <= 1 or wd in noise:
                continue
            freq[wd] = freq.get(wd, 0) + 1
    return freq


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("Usage: python " + sys.argv[0] + " <filename>")

    fname = sys.argv[1]

    # The context manager closes the file even if reading fails.
    with open(fname, "r") as fs:
        Word_freq = count_significant_words(fs)

    for ele in Word_freq:
        print(ele.ljust(15), Word_freq[ele])

    print("Number of unique words in file = ", len(Word_freq))

Dr R Anurekha

File Handling 17: Files and Dictionary: Histogram, Word Frequency, remove noise words

No comments:

Post a Comment