1
|
/*
|
2
|
Copyright (c) 2003-2006 Niels Kokholm and Peter Sestoft
|
3
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
of this software and associated documentation files (the "Software"), to deal
|
5
|
in the Software without restriction, including without limitation the rights
|
6
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
copies of the Software, and to permit persons to whom the Software is
|
8
|
furnished to do so, subject to the following conditions:
|
9
|
|
10
|
The above copyright notice and this permission notice shall be included in
|
11
|
all copies or substantial portions of the Software.
|
12
|
|
13
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
SOFTWARE.
|
20
|
*/
|
21
|
|
22
|
// C5 example: anagrams represented as sorted strings 2004-08-26
|
23
|
|
24
|
// To represent an anagram class, use a string containing the sorted
|
25
|
// characters of a word.
|
26
|
|
27
|
// This is faster than a TreeBag<char> because the words and hence
|
28
|
// bags are small. Takes 15 CPU seconds and 138 MB RAM to find the
|
29
|
// 26,058 anagram classes among 347,000 distinct words.
|
30
|
|
31
|
// Compile with
|
32
|
// csc /r:C5.dll Anagrams.cs
|
33
|
|
34
|
using System;
|
35
|
using System.IO; // StreamReader, TextReader
|
36
|
using System.Text; // Encoding
|
37
|
using System.Text.RegularExpressions; // Regex
|
38
|
using C5;
|
39
|
using SCG = System.Collections.Generic;
|
40
|
|
41
|
namespace AnagramStrings
|
42
|
{
|
43
|
class MyTest
|
44
|
{
|
45
|
// Program entry point. When exactly one argument is given it is used as
// the name of a word file; otherwise the arguments themselves are the
// words. Prints the number of anagram classes found, followed by the
// number of seconds reported by the timer.
public static void Main(String[] args)
{
    Console.OutputEncoding = Encoding.GetEncoding("iso-8859-1");
    SCG.IEnumerable<String> words = args.Length == 1 ? ReadFileWords(args[0]) : args;

    // The timer is started before the (lazy) classification is enumerated,
    // so the reported time covers reading, grouping and counting.
    Timer clock = new Timer();
    int found = 0;
    foreach (SCG.IEnumerable<String> members in AnagramClasses(words))
    {
        // To list the members of each class, uncomment:
        // foreach (String s in members)
        //   Console.Write(s + " ");
        // Console.WriteLine();
        found++;
    }
    Console.WriteLine("{0} anagram classes", found);
    Console.WriteLine(clock.Check());
}
|
67
|
|
68
|
// Read words from a file
|
69
|
|
70
|
// Lazily yields the words of the named file, converted to lower case.
// The file is decoded as ISO-8859-1. A word is a maximal run of ASCII
// letters, digits, hyphens and the Danish letters U+00E6, U+00F8, U+00E5
// (and their capitals U+00C6, U+00D8, U+00C5); any other character
// separates words. Because this is an iterator, the file is not opened
// until enumeration begins, and it is closed when enumeration finishes
// or the enumerator is disposed.
public static SCG.IEnumerable<String> ReadFileWords(String filename)
{
    // The non-ASCII letters are written as \u escapes so the pattern
    // survives a change of source-file encoding. (They had previously
    // been corrupted into literal '?' characters, which made '?' count as
    // a word character and the intended letters count as delimiters.)
    // NOTE(review): the restored letters are inferred from the ISO-8859-1
    // handling in this file - confirm against the intended word list.
    Regex delim = new Regex("[^a-zA-Z0-9\u00E6\u00F8\u00E5\u00C6\u00D8\u00C5-]+");
    using (TextReader rd = new StreamReader(filename, Encoding.GetEncoding("iso-8859-1")))
    {
        String line;
        while ((line = rd.ReadLine()) != null)
        {
            foreach (String s in delim.Split(line))
            {
                // Split produces empty strings at leading/trailing delimiters.
                if (s.Length != 0)
                {
                    // Invariant casing keeps the result independent of the
                    // current culture (e.g. the Turkish dotless i).
                    yield return s.ToLowerInvariant();
                }
            }
        }
    }
}
|
81
|
|
82
|
// From an anagram point of view, a word is just a bag of characters.
|
83
|
|
84
|
// Maps a word to the key of its anagram class: the bag of the word's
// characters. Two words get equal bags when they consist of the same
// characters with the same multiplicities.
public static CharBag AnagramClass(String s)
{
    CharBag bag = new CharBag(s);
    return bag;
}
|
88
|
|
89
|
// Given a sequence of strings, return all non-trivial anagram classes
|
90
|
|
91
|
// Groups the given words by their character bag and lazily yields each
// group that holds more than one word. This is an iterator, so no word
// is read and no group is built until the result is enumerated.
public static SCG.IEnumerable<SCG.IEnumerable<String>> AnagramClasses(SCG.IEnumerable<String> ss)
{
    IDictionary<CharBag, HashSet<String>> byBag
        = new TreeDictionary<CharBag, HashSet<String>>();

    // Phase 1: file every word under its bag, creating the group on first use.
    foreach (String word in ss)
    {
        CharBag key = AnagramClass(word);
        HashSet<String> group;
        if (!byBag.Find(key, out group))
        {
            group = new HashSet<String>();
            byBag[key] = group;
        }
        group.Add(word);
    }

    // Phase 2: report only the non-trivial groups.
    // An extra filter is currently disabled; it would additionally require
    //   candidate.Exists(delegate(String s) { return !s.EndsWith("s"); })
    foreach (HashSet<String> candidate in byBag.Values)
    {
        if (candidate.Count <= 1)
        {
            continue;
        }
        yield return candidate;
    }
}
|
108
|
}
|
109
|
|
110
|
// A bag of characters is represented as a sorted string of the
|
111
|
// characters, with multiplicity. Since natural language words are
|
112
|
// short, the bags are small, so this is vastly better than
|
113
|
// representing character bags using HashBag<char> or TreeBag<char>
|
114
|
|
115
|
// A bag (multiset) of characters, stored as a string of the characters
// sorted by ordinal char value, with multiplicity. Equality, hashing and
// ordering are all defined on that string using ordinal semantics, so
// they agree with one another and do not depend on the current culture.
class CharBag : IComparable<CharBag>, IEquatable<CharBag>
{
    private readonly String contents; // The bag's characters, sorted, with multiplicity

    // Builds the bag holding the characters of s.
    // Throws ArgumentNullException when s is null.
    public CharBag(String s)
    {
        if (s == null)
        {
            throw new ArgumentNullException("s");
        }
        char[] chars = s.ToCharArray();
        Array.Sort(chars); // default char ordering, i.e. by ordinal value
        this.contents = new String(chars);
    }

    // Hash of the sorted contents; equal bags therefore hash equally.
    public override int GetHashCode()
    {
        return contents.GetHashCode();
    }

    // Object-level equality, overridden so that it agrees with
    // GetHashCode and with Equals(CharBag). Anything that is not a
    // CharBag (including null) is unequal.
    public override bool Equals(Object obj)
    {
        return Equals(obj as CharBag);
    }

    // True when that is a non-null bag with exactly the same characters
    // and multiplicities.
    public bool Equals(CharBag that)
    {
        if (Object.ReferenceEquals(that, null))
        {
            return false;
        }
        return this.contents.Equals(that.contents);
    }

    // Orders bags by ordinal comparison of their sorted contents.
    // Returns zero exactly when Equals(that) is true; a bag compares
    // greater than null, following the IComparable convention.
    public int CompareTo(CharBag that)
    {
        if (Object.ReferenceEquals(that, null))
        {
            return 1;
        }
        // Ordinal rather than culture-sensitive comparison: it matches the
        // ordinal sort done in the constructor and the ordinal Equals above.
        return String.CompareOrdinal(this.contents, that.contents);
    }
}
|
141
|
|
142
|
// Crude timing utility ----------------------------------------
|
143
|
|
144
|
// Crude elapsed-time meter. Timing starts when the object is constructed;
// Check() reports the seconds elapsed since then. It is based on
// System.Diagnostics.Stopwatch rather than the local wall clock, so the
// reading is unaffected by daylight-saving transitions or clock changes.
public class Timer
{
    // Started once in the constructor and never restarted.
    private readonly System.Diagnostics.Stopwatch watch;

    // Creates the timer and starts measuring immediately.
    public Timer()
    {
        watch = System.Diagnostics.Stopwatch.StartNew();
    }

    // Returns the time elapsed since construction, in (fractional) seconds.
    // May be called repeatedly; successive readings never decrease.
    public double Check()
    {
        return watch.Elapsed.TotalSeconds;
    }
}
|
159
|
}
|