% Journal article, Computer Networks vol. 33 (2000). Only the acronym URL is
% brace-protected in the title, so the bibliography style decides the casing.
@article{11685,
  author       = {Henzinger, Monika H. and Heydon, Allan and Mitzenmacher, Michael and Najork, Marc},
  title        = {On near-uniform {URL} sampling},
  journal      = {Computer Networks},
  volume       = {33},
  number       = {1--6},
  pages        = {295--308},
  year         = {2000},
  publisher    = {Elsevier},
  issn         = {1389-1286},
  doi          = {10.1016/s1389-1286(00)00055-4},
  keywords     = {URL sampling, Random walks, Internet domain distribution, Search engine size},
  abstract     = {We consider the problem of sampling URLs uniformly at random from the Web. A tool for sampling URLs uniformly can be used to estimate various properties of Web pages, such as the fraction of pages in various Internet domains or written in various languages. Moreover, uniform URL sampling can be used to determine the sizes of various search engines relative to the entire Web. In this paper, we consider sampling approaches based on random walks of the Web graph. In particular, we suggest ways of improving sampling based on random walks to make the samples closer to uniform. We suggest a natural test bed based on random graphs for testing the effectiveness of our procedures. We then use our sampling approach to estimate the distribution of pages over various Internet domains and to estimate the coverage of various search engine indexes.},
}

% Journal article, Computer Networks vol. 31 (1999). The title is no longer
% wrapped in a second brace pair, so the bibliography style decides the casing.
% NOTE(review): the abstract capitalizes "Web"; if the published title does as
% well, capitalize it in the title and brace-protect that word - confirm.
@article{11688,
  author       = {Henzinger, Monika H. and Heydon, Allan and Mitzenmacher, Michael and Najork, Marc},
  title        = {Measuring index quality using random walks on the web},
  journal      = {Computer Networks},
  volume       = {31},
  number       = {11--16},
  pages        = {1291--1303},
  year         = {1999},
  publisher    = {Elsevier},
  issn         = {1389-1286},
  doi          = {10.1016/s1389-1286(99)00016-x},
  keywords     = {Search engines, Index quality, Random walks, PageRank},
  abstract     = {Recent research has studied how to measure the size of a search engine, in terms of the number of pages indexed. In this paper, we consider a different measure for search engines, namely the quality of the pages in a search engine index. We provide a simple, effective algorithm for approximating the quality of an index by performing a random walk on the Web, and we use this methodology to compare the index quality of several major search engines.},
}

