1
|
/*
|
2
|
Copyright (c) 2003-2006 Niels Kokholm and Peter Sestoft
|
3
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
of this software and associated documentation files (the "Software"), to deal
|
5
|
in the Software without restriction, including without limitation the rights
|
6
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
copies of the Software, and to permit persons to whom the Software is
|
8
|
furnished to do so, subject to the following conditions:
|
9
|
|
10
|
The above copyright notice and this permission notice shall be included in
|
11
|
all copies or substantial portions of the Software.
|
12
|
|
13
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
SOFTWARE.
|
20
|
*/
|
21
|
|
22
|
// C5 example: anagrams 2004-08-08, 2004-11-16
|
23
|
|
24
|
// Compile with
|
25
|
// csc /r:C5.dll AnagramHashBag.cs
|
26
|
|
27
|
using System;
|
28
|
using System.IO; // StreamReader, TextReader
|
29
|
using System.Text; // Encoding
|
30
|
using System.Text.RegularExpressions; // Regex
|
31
|
using C5;
|
32
|
using SCG = System.Collections.Generic;
|
33
|
|
34
|
namespace AnagramHashBag
|
35
|
{
|
36
|
class MyTest
|
37
|
{
|
38
|
public static void Main(String[] args)
|
39
|
{
|
40
|
Console.OutputEncoding = Encoding.GetEncoding("iso-8859-1");
|
41
|
SCG.IEnumerable<String> ss;
|
42
|
if (args.Length == 2)
|
43
|
ss = ReadFileWords(args[0], int.Parse(args[1]));
|
44
|
else
|
45
|
ss = args;
|
46
|
// foreach (String s in FirstAnagramOnly(ss))
|
47
|
// Console.WriteLine(s);
|
48
|
// Console.WriteLine("===");
|
49
|
Timer t = new Timer();
|
50
|
SCG.IEnumerable<SCG.IEnumerable<String>> classes = AnagramClasses(ss);
|
51
|
int count = 0;
|
52
|
foreach (SCG.IEnumerable<String> anagramClass in classes)
|
53
|
{
|
54
|
count++;
|
55
|
foreach (String s in anagramClass)
|
56
|
Console.Write(s + " ");
|
57
|
Console.WriteLine();
|
58
|
}
|
59
|
Console.WriteLine("{0} non-trivial anagram classes", count);
|
60
|
Console.WriteLine(t.Check());
|
61
|
}
|
62
|
|
63
|
// Read words at most n words from a file
|
64
|
|
65
|
public static SCG.IEnumerable<String> ReadFileWords(String filename, int n)
|
66
|
{
|
67
|
Regex delim = new Regex("[^a-z???A-Z???0-9-]+");
|
68
|
Encoding enc = Encoding.GetEncoding("iso-8859-1");
|
69
|
using (TextReader rd = new StreamReader(filename, enc))
|
70
|
{
|
71
|
for (String line = rd.ReadLine(); line != null; line = rd.ReadLine()) {
|
72
|
foreach (String s in delim.Split(line))
|
73
|
if (s != "")
|
74
|
yield return s.ToLower();
|
75
|
if (--n == 0)
|
76
|
yield break;
|
77
|
}
|
78
|
}
|
79
|
}
|
80
|
|
81
|
// From an anagram point of view, a word is just a bag of
|
82
|
// characters. So an anagram class is represented as HashBag<char>
|
83
|
// which permits fast equality comparison -- we shall use them as
|
84
|
// elements of hash sets or keys in hash maps.
|
85
|
|
86
|
public static HashBag<char> AnagramClass(String s) {
|
87
|
HashBag<char> anagram = new HashBag<char>();
|
88
|
foreach (char c in s)
|
89
|
anagram.Add(c);
|
90
|
return anagram;
|
91
|
}
|
92
|
|
93
|
// Given a sequence of strings, return only the first member of each
|
94
|
// anagram class.
|
95
|
|
96
|
public static SCG.IEnumerable<String> FirstAnagramOnly(SCG.IEnumerable<String> ss)
|
97
|
{
|
98
|
HashSet<HashBag<char>> anagrams = new HashSet<HashBag<char>>();
|
99
|
foreach (String s in ss) {
|
100
|
HashBag<char> anagram = AnagramClass(s);
|
101
|
if (!anagrams.Contains(anagram)) {
|
102
|
anagrams.Add(anagram);
|
103
|
yield return s;
|
104
|
}
|
105
|
}
|
106
|
}
|
107
|
|
108
|
// Given a sequence of strings, return all non-trivial anagram
|
109
|
// classes.
|
110
|
|
111
|
// Using HashBag<char> and an unsequenced equalityComparer, this performs as
|
112
|
// follows on 1600 MHz Mobile P4 and .Net 2.0 beta 1 (wall-clock
|
113
|
// time):
|
114
|
// 50 000 words 2 822 classes 2.0 sec
|
115
|
// 100 000 words 5 593 classes 4.3 sec
|
116
|
// 200 000 words 11 705 classes 8.8 sec
|
117
|
// 300 000 words 20 396 classes 52.0 sec includes swapping
|
118
|
// 347 165 words 24 428 classes 146.0 sec includes swapping
|
119
|
|
120
|
// The maximal memory consumption is less than 180 MB.
|
121
|
|
122
|
public static SCG.IEnumerable<SCG.IEnumerable<String>>
|
123
|
AnagramClasses(SCG.IEnumerable<String> ss)
|
124
|
{
|
125
|
IDictionary<HashBag<char>, TreeSet<String>> classes
|
126
|
= new HashDictionary<HashBag<char>, TreeSet<String>>();
|
127
|
foreach (String s in ss) {
|
128
|
HashBag<char> anagram = AnagramClass(s);
|
129
|
TreeSet<String> anagramClass;
|
130
|
if (!classes.Find(anagram, out anagramClass))
|
131
|
classes[anagram] = anagramClass = new TreeSet<String>();
|
132
|
anagramClass.Add(s);
|
133
|
}
|
134
|
foreach (TreeSet<String> anagramClass in classes.Values)
|
135
|
if (anagramClass.Count > 1)
|
136
|
yield return anagramClass;
|
137
|
}
|
138
|
}
|
139
|
|
140
|
// Crude timing utility ----------------------------------------
|
141
|
|
142
|
public class Timer
|
143
|
{
|
144
|
private DateTime start;
|
145
|
|
146
|
public Timer()
|
147
|
{
|
148
|
start = DateTime.Now;
|
149
|
}
|
150
|
|
151
|
public double Check()
|
152
|
{
|
153
|
TimeSpan dur = DateTime.Now - start;
|
154
|
return dur.TotalSeconds;
|
155
|
}
|
156
|
}
|
157
|
|
158
|
}
|