Project

General

Profile

root / branches / compiler / cSharp / ooasCompiler / src / libs / c5 / UserGuideExamples / AnagramHashBag.cs @ 3

1
/*
2
 Copyright (c) 2003-2006 Niels Kokholm and Peter Sestoft
3
 Permission is hereby granted, free of charge, to any person obtaining a copy
4
 of this software and associated documentation files (the "Software"), to deal
5
 in the Software without restriction, including without limitation the rights
6
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
 copies of the Software, and to permit persons to whom the Software is
8
 furnished to do so, subject to the following conditions:
9
 
10
 The above copyright notice and this permission notice shall be included in
11
 all copies or substantial portions of the Software.
12
 
13
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
 SOFTWARE.
20
*/
21

    
22
// C5 example: anagrams 2004-08-08, 2004-11-16
23

    
24
// Compile with 
25
//   csc /r:C5.dll AnagramHashBag.cs 
26

    
27
using System;
28
using System.IO;                        // StreamReader, TextReader
29
using System.Text;                      // Encoding
30
using System.Text.RegularExpressions;   // Regex
31
using C5;
32
using SCG = System.Collections.Generic;
33

    
34
namespace AnagramHashBag
35
{
36
  class MyTest
37
  {
38
    public static void Main(String[] args)
39
    {
40
      Console.OutputEncoding = Encoding.GetEncoding("iso-8859-1");
41
      SCG.IEnumerable<String> ss;
42
      if (args.Length == 2)
43
        ss = ReadFileWords(args[0], int.Parse(args[1]));
44
      else
45
        ss = args;
46
      // foreach (String s in FirstAnagramOnly(ss)) 
47
      //   Console.WriteLine(s);
48
      //   Console.WriteLine("===");
49
      Timer t = new Timer();
50
      SCG.IEnumerable<SCG.IEnumerable<String>> classes = AnagramClasses(ss);
51
      int count = 0;
52
      foreach (SCG.IEnumerable<String> anagramClass in classes)
53
      {
54
        count++;
55
        foreach (String s in anagramClass) 
56
           Console.Write(s + " ");
57
         Console.WriteLine();
58
      }
59
      Console.WriteLine("{0} non-trivial anagram classes", count);
60
      Console.WriteLine(t.Check());
61
    }
62

    
63
    // Read words at most n words from a file
64

    
65
    public static SCG.IEnumerable<String> ReadFileWords(String filename, int n)
66
    {
67
      Regex delim = new Regex("[^a-z???A-Z???0-9-]+");
68
      Encoding enc = Encoding.GetEncoding("iso-8859-1");
69
      using (TextReader rd = new StreamReader(filename, enc))
70
      {
71
        for (String line = rd.ReadLine(); line != null; line = rd.ReadLine()) {
72
          foreach (String s in delim.Split(line))
73
            if (s != "")
74
              yield return s.ToLower();
75
          if (--n == 0)
76
            yield break;
77
        }
78
      }
79
    }
80

    
81
    // From an anagram point of view, a word is just a bag of
82
    // characters.  So an anagram class is represented as HashBag<char>
83
    // which permits fast equality comparison -- we shall use them as
84
    // elements of hash sets or keys in hash maps.
85

    
86
    public static HashBag<char> AnagramClass(String s) {
87
      HashBag<char> anagram = new HashBag<char>();
88
      foreach (char c in s)
89
        anagram.Add(c);
90
      return anagram;
91
    }
92

    
93
    // Given a sequence of strings, return only the first member of each
94
    // anagram class.
95

    
96
    public static SCG.IEnumerable<String> FirstAnagramOnly(SCG.IEnumerable<String> ss)
97
    {
98
      HashSet<HashBag<char>> anagrams = new HashSet<HashBag<char>>();
99
      foreach (String s in ss) {
100
        HashBag<char> anagram = AnagramClass(s);
101
        if (!anagrams.Contains(anagram)) {
102
          anagrams.Add(anagram);
103
          yield return s;
104
        }
105
      }
106
    }
107

    
108
    // Given a sequence of strings, return all non-trivial anagram
109
    // classes.  
110

    
111
    // Using HashBag<char> and an unsequenced equalityComparer, this performs as
112
    // follows on 1600 MHz Mobile P4 and .Net 2.0 beta 1 (wall-clock
113
    // time):
114
    //  50 000 words  2 822 classes   2.0 sec
115
    // 100 000 words  5 593 classes   4.3 sec
116
    // 200 000 words 11 705 classes   8.8 sec
117
    // 300 000 words 20 396 classes  52.0 sec includes swapping
118
    // 347 165 words 24 428 classes 146.0 sec includes swapping
119

    
120
    // The maximal memory consumption is less than 180 MB.
121

    
122
    public static SCG.IEnumerable<SCG.IEnumerable<String>>
123
      AnagramClasses(SCG.IEnumerable<String> ss)
124
    {
125
      IDictionary<HashBag<char>, TreeSet<String>> classes
126
        = new HashDictionary<HashBag<char>, TreeSet<String>>();
127
      foreach (String s in ss) {
128
        HashBag<char> anagram = AnagramClass(s);
129
        TreeSet<String> anagramClass;
130
        if (!classes.Find(anagram, out anagramClass))
131
          classes[anagram] = anagramClass = new TreeSet<String>();
132
        anagramClass.Add(s);
133
      }
134
      foreach (TreeSet<String> anagramClass in classes.Values)
135
        if (anagramClass.Count > 1)
136
          yield return anagramClass;
137
    }
138
  }
139

    
140
// Crude timing utility ----------------------------------------
141

    
142
  public class Timer
143
  {
144
    private DateTime start;
145

    
146
    public Timer()
147
    {
148
      start = DateTime.Now;
149
    }
150

    
151
    public double Check()
152
    {
153
      TimeSpan dur = DateTime.Now - start;
154
      return dur.TotalSeconds;
155
    }
156
  }
157

    
158
}