1
|
/*
|
2
|
Copyright (c) 2003-2006 Niels Kokholm and Peter Sestoft
|
3
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
of this software and associated documentation files (the "Software"), to deal
|
5
|
in the Software without restriction, including without limitation the rights
|
6
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
copies of the Software, and to permit persons to whom the Software is
|
8
|
furnished to do so, subject to the following conditions:
|
9
|
|
10
|
The above copyright notice and this permission notice shall be included in
|
11
|
all copies or substantial portions of the Software.
|
12
|
|
13
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
SOFTWARE.
|
20
|
*/
|
21
|
|
22
|
// C5 example: anagrams 2004-08-08, 2004-11-16
|
23
|
|
24
|
// Compile with
|
25
|
// csc /r:C5.dll Anagrams.cs
|
26
|
|
27
|
using System;
|
28
|
using System.IO; // StreamReader, TextReader
|
29
|
using System.Text; // Encoding
|
30
|
using System.Text.RegularExpressions; // Regex
|
31
|
using C5;
|
32
|
using SCG = System.Collections.Generic;
|
33
|
|
34
|
namespace Anagrams
|
35
|
{
|
36
|
class MyTest
|
37
|
{
|
38
|
// Entry point: counts the non-trivial anagram classes among a set of words
// and prints that count followed by the elapsed time in seconds.
//
// With exactly two arguments of which the second parses as an integer, the
// words come from ReadFileWords(args[0], <that integer>); in every other
// case the command-line arguments themselves are the words.
public static void Main(String[] args)
{
    Console.OutputEncoding = Encoding.GetEncoding("iso-8859-1");

    // TryParse rather than Parse: two plain words on the command line
    // (e.g. "listen silent") must be handled as words instead of failing
    // with a FormatException because the second one is not a number.
    SCG.IEnumerable<String> ss;
    int limit;
    if (args.Length == 2 && int.TryParse(args[1], out limit))
        ss = ReadFileWords(args[0], limit);
    else
        ss = args;

    // Alternative demo: print the first member of each anagram class.
    // foreach (String s in FirstAnagramOnly(ss))
    //     Console.WriteLine(s);
    // Console.WriteLine("===");

    Timer t = new Timer();
    SCG.IEnumerable<SCG.IEnumerable<String>> classes = AnagramClasses(ss);
    int count = 0;
    foreach (SCG.IEnumerable<String> anagramClass in classes)
    {
        count++;
        // To list the members of each class, enable:
        // foreach (String s in anagramClass)
        //     Console.Write(s + " ");
        // Console.WriteLine();
    }
    Console.WriteLine("{0} non-trivial anagram classes", count);
    Console.WriteLine(t.Check());
}
|
62
|
|
63
|
// Read at most n words from a file
|
64
|
|
65
|
// Lazily reads lower-cased words from a text file decoded as iso-8859-1.
//
// A word is a maximal run of ASCII letters, digits, hyphens and the Danish
// letters ae/oe/aa (either case); everything else is a delimiter.
//
// filename: path of the file to read.
// n:        maximum number of words to return. The counter is decremented
//           once per returned word and reading stops when it reaches zero,
//           so a non-positive n imposes no limit.
//
// The file is opened when enumeration starts and closed when the
// enumerator finishes or is disposed.
public static SCG.IEnumerable<String> ReadFileWords(String filename, int n)
{
    // The Danish letters are written as Unicode escapes so that the pattern
    // survives any re-encoding of this source file (they had previously
    // been corrupted into literal '?' characters).
    const String danishLetters = "\u00E6\u00F8\u00E5\u00C6\u00D8\u00C5";
    Regex delim = new Regex("[^a-zA-Z0-9" + danishLetters + "-]+");
    Encoding enc = Encoding.GetEncoding("iso-8859-1");
    using (TextReader rd = new StreamReader(filename, enc))
    {
        for (String line = rd.ReadLine(); line != null; line = rd.ReadLine())
        {
            foreach (String s in delim.Split(line))
            {
                // Split yields empty strings at leading/trailing delimiters.
                if (s == "")
                    continue;
                yield return s.ToLower();
                // Count words, not lines, so that the limit holds even
                // when a line contains several words.
                if (--n == 0)
                    yield break;
            }
        }
    }
}
|
81
|
|
82
|
// From an anagram point of view, a word is just a bag of
|
83
|
// characters. So an anagram class is represented as TreeBag<char>
|
84
|
// which permits fast equality comparison -- we shall use them as
|
85
|
// elements of hash sets or keys in hash maps.
|
86
|
|
87
|
// Builds the bag (multiset) of characters of s. Every character of s is
// added once per occurrence, using the default comparer and default
// equality comparer for char.
public static TreeBag<char> AnagramClass(String s)
{
    TreeBag<char> letters = new TreeBag<char>(Comparer<char>.Default, EqualityComparer<char>.Default);
    for (int i = 0; i < s.Length; i++)
    {
        letters.Add(s[i]);
    }
    return letters;
}
|
94
|
|
95
|
// Given a sequence of strings, return only the first member of each
|
96
|
// anagram class.
|
97
|
|
98
|
// Lazily yields, in input order, every string of ss whose character bag
// (see AnagramClass) differs from the bags of all strings seen before it,
// i.e. the first member of each anagram class.
//
// Bags are compared with the unsequenced collection equality comparer.
public static SCG.IEnumerable<String> FirstAnagramOnly(SCG.IEnumerable<String> ss)
{
    SCG.IEqualityComparer<TreeBag<char>> bagEquality
        = UnsequencedCollectionEqualityComparer<TreeBag<char>, char>.Default;
    HashSet<TreeBag<char>> seen = new HashSet<TreeBag<char>>(bagEquality);
    foreach (String s in ss)
    {
        // Add returns true only when the bag was not already in the set,
        // so one call replaces the former Contains-then-Add pair and
        // hashes/compares each bag once instead of twice.
        if (seen.Add(AnagramClass(s)))
            yield return s;
    }
}
|
113
|
|
114
|
// Given a sequence of strings, return all non-trivial anagram
|
115
|
// classes. Should use a *sequenced* equalityComparer on a TreeBag<char>,
|
116
|
// obviously: after all, characters can be sorted by ASCII code. On
|
117
|
// 347 000 distinct Danish words this takes 70 cpu seconds, 180 MB
|
118
|
// memory, and 263 wall-clock seconds (due to swapping).
|
119
|
|
120
|
// Using a TreeBag<char> and a sequenced equalityComparer takes 82 cpu seconds
|
121
|
// and 180 MB RAM to find the 26,058 anagram classes among 347,000
|
122
|
// distinct words.
|
123
|
|
124
|
// Using an unsequenced equalityComparer on TreeBag<char> or HashBag<char>
|
125
|
// makes it criminally slow: at least 1200 cpu seconds. This must
|
126
|
// be because many bags get the same hash code, so that there are
|
127
|
// many collisions. But exactly how the unsequenced equalityComparer works is
|
128
|
// not clear ... or is it because unsequenced equality is slow?
|
129
|
|
130
|
// Groups the strings of ss by their character bag (see AnagramClass) and
// lazily yields every group that holds more than one distinct string.
//
// Each group is a TreeSet<String>, so duplicates of the same word collapse
// into one member. The whole input is consumed before the first group is
// yielded.
public static SCG.IEnumerable<SCG.IEnumerable<String>> AnagramClasses(SCG.IEnumerable<String> ss)
{
    // Switch between the two equality comparers discussed above; the
    // unsequenced one is currently selected.
    bool unseq = true;

    SCG.IEqualityComparer<TreeBag<char>> bagEquality;
    if (unseq)
    {
        bagEquality = UnsequencedCollectionEqualityComparer<TreeBag<char>, char>.Default;
    }
    else
    {
        bagEquality = SequencedCollectionEqualityComparer<TreeBag<char>, char>.Default;
    }
    IDictionary<TreeBag<char>, TreeSet<String>> classes
        = new HashDictionary<TreeBag<char>, TreeSet<String>>(bagEquality);

    // Pass 1: file every word under its character bag.
    foreach (String word in ss)
    {
        TreeBag<char> key = AnagramClass(word);
        TreeSet<String> members;
        if (!classes.Find(key, out members))
        {
            members = new TreeSet<String>();
            classes[key] = members;
        }
        members.Add(word);
    }

    // Pass 2: report only the classes with at least two members.
    foreach (TreeSet<String> group in classes.Values)
    {
        if (group.Count <= 1)
            continue;
        yield return group;
    }
}
|
158
|
}
|
159
|
|
160
|
// Crude timing utility ----------------------------------------
|
161
|
|
162
|
// Measures the time elapsed since the Timer object was constructed.
public class Timer
{
    // A Stopwatch measures elapsed time directly, whereas subtracting two
    // DateTime.Now readings gives wrong (even negative) durations when
    // local time shifts in between, e.g. at a daylight-saving transition
    // or a manual clock adjustment.
    private readonly System.Diagnostics.Stopwatch watch;

    // Starts timing immediately.
    public Timer()
    {
        watch = System.Diagnostics.Stopwatch.StartNew();
    }

    // Returns the number of seconds (with fractional part) elapsed since
    // construction. May be called repeatedly; the timer keeps running.
    public double Check()
    {
        return watch.Elapsed.TotalSeconds;
    }
}
|
177
|
}
|