Commit 2429c5da authored by mkanat%bugzilla.org's avatar mkanat%bugzilla.org

Bug 399370: Fulltext search with a LIKE on bugs.short_desc is too slow (make…

Bug 399370: Fulltext search with a LIKE on bugs.short_desc is too slow (make Bugzilla use a separate fulltext table) Patch By Max Kanat-Alexander <mkanat@bugzilla.org> (module owner) a=mkanat
parent d7f12988
......@@ -408,12 +408,6 @@ sub create {
}
}
$dbh->bz_commit_transaction();
# Because MySQL doesn't support transactions on the longdescs table,
# we do this after we've committed the transaction. That way we're
# fairly sure we're inserting a good Bug ID.
# And insert the comment. We always insert a comment on bug creation,
# but sometimes it's blank.
my @columns = qw(bug_id who bug_when thetext);
......@@ -429,6 +423,12 @@ sub create {
$dbh->do('INSERT INTO longdescs (' . join(',', @columns) . ")
VALUES ($qmarks)", undef, @values);
$dbh->bz_commit_transaction();
# Because MySQL doesn't support transactions on the fulltext table,
# we do this after we've committed the transaction. That way we're
# sure we're inserting a good Bug ID.
$bug->_sync_fulltext('new bug');
return $bug;
}
......@@ -615,6 +615,13 @@ sub update {
$self->{delta_ts} = $delta_ts;
}
# The only problem with this here is that update() is often called
# in the middle of a transaction, and if that transaction is rolled
# back, this change will *not* be rolled back. As we expect rollbacks
# to be extremely rare, that is OK for us.
$self->_sync_fulltext()
if $self->{added_comments} || $changes->{short_desc};
return $changes;
}
......@@ -766,6 +773,30 @@ sub update_keywords {
return [$removed_keywords, $added_keywords];
}
# Should be called any time you update short_desc or change a comment.
sub _sync_fulltext {
my ($self, $new_bug) = @_;
my $dbh = Bugzilla->dbh;
if ($new_bug) {
$dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc)
SELECT bug_id, short_desc FROM bugs WHERE bug_id = ?',
undef, $self->id);
}
else {
$dbh->do('UPDATE bugs_fulltext SET short_desc = ? WHERE bug_id = ?',
undef, $self->short_desc, $self->id);
}
my $comments = $dbh->selectall_arrayref(
'SELECT thetext, isprivate FROM longdescs WHERE bug_id = ?',
undef, $self->id);
my $all = join("\n", map { $_->[0] } @$comments);
my @no_private = grep { !$_->[1] } @$comments;
my $nopriv_string = join("\n", map { $_->[0] } @no_private);
$dbh->do('UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ?
WHERE bug_id = ?', undef, $all, $nopriv_string, $self->id);
}
# This is the correct way to delete bugs from the DB.
# No bug should be deleted from anywhere else except from here.
#
......@@ -821,11 +852,10 @@ sub remove_from_db {
# Several of the previous tables also depend on attach_id.
$dbh->do("DELETE FROM attachments WHERE bug_id = ?", undef, $bug_id);
$dbh->do("DELETE FROM bugs WHERE bug_id = ?", undef, $bug_id);
$dbh->do("DELETE FROM longdescs WHERE bug_id = ?", undef, $bug_id);
$dbh->bz_commit_transaction();
$dbh->do("DELETE FROM longdescs WHERE bug_id = ?", undef, $bug_id);
# Now this bug no longer exists
$self->DESTROY;
return $self;
......@@ -2696,6 +2726,7 @@ sub update_comment {
# We assume _check_comment() has already been called earlier.
Bugzilla->dbh->do('UPDATE longdescs SET thetext = ? WHERE comment_id = ?',
undef, ($new_comment, $comment_id));
$self->_sync_fulltext();
# Update the comment object with this new text.
$current_comment_obj[0]->{'body'} = $new_comment;
......
......@@ -255,11 +255,11 @@ EOT
print "\nISAM->MyISAM table conversion done.\n\n";
}
my $sd_index_deleted = 0;
my ($sd_index_deleted, $longdescs_index_deleted);
my @tables = $self->bz_table_list_real();
# We want to convert the bugs table to MyISAM, but it's possible that
# it has a fulltext index on it and this will fail unless we remove
# the index.
# We want to convert tables to InnoDB, but it's possible that they have
# fulltext indexes on them, and conversation will fail unless we remove
# the indexes.
if (grep($_ eq 'bugs', @tables)) {
if ($self->bz_index_info_real('bugs', 'short_desc')) {
$self->bz_drop_index_raw('bugs', 'short_desc');
......@@ -268,6 +268,13 @@ EOT
$self->bz_drop_index_raw('bugs', 'bugs_short_desc_idx');
$sd_index_deleted = 1; # Used for later schema cleanup.
}
if ($self->bz_index_info_real('longdescs', 'thetext')) {
$self->bz_drop_index_raw('longdescs', 'thetext');
}
if ($self->bz_index_info_real('longdescs', 'longdescs_thetext_idx')) {
$self->bz_drop_index_raw('longdescs', 'longdescs_thetext_idx');
$longdescs_index_deleted = 1; # For later schema cleanup.
}
}
# Upgrade tables from MyISAM to InnoDB
......@@ -480,6 +487,11 @@ EOT
$self->_bz_real_schema->delete_index('bugs', 'bugs_short_desc_idx');
$self->_bz_store_real_schema;
}
if ($longdescs_index_deleted) {
$self->_bz_real_schema->delete_index('longdescs',
'longdescs_thetext_idx');
$self->_bz_store_real_schema;
}
# The old timestamp fields need to be adjusted here instead of in
# checksetup. Otherwise the UPDATE statements inside of bz_add_column
......
......@@ -179,6 +179,10 @@ sub bz_setup_database {
# field, because it can't have index data longer than 2770
# characters on that field.
$self->bz_drop_index('longdescs', 'longdescs_thetext_idx');
# Same for all the comments fields in the fulltext table.
$self->bz_drop_index('bugs_fulltext', 'bugs_fulltext_comments_idx');
$self->bz_drop_index('bugs_fulltext',
'bugs_fulltext_comments_noprivate_idx');
# PostgreSQL also wants an index for calling LOWER on
# login_name, which we do with sql_istrcmp all over the place.
......
......@@ -279,6 +279,29 @@ use constant ABSTRACT_SCHEMA => {
],
},
bugs_fulltext => {
FIELDS => [
bug_id => {TYPE => 'INT3', NOTNULL => 1, PRIMARYKEY => 1,
REFERENCES => {TABLE => 'bugs',
COLUMN => 'bug_id',
DELETE => 'CASCADE'}},
short_desc => {TYPE => 'varchar(255)', NOTNULL => 1},
# Comments are stored all together in one column for searching.
# This allows us to examine all comments together when deciding
# the relevance of a bug in fulltext search.
comments => {TYPE => 'LONGTEXT'},
comments_noprivate => {TYPE => 'LONGTEXT'},
],
INDEXES => [
bugs_fulltext_short_desc_idx => {FIELDS => ['short_desc'],
TYPE => 'FULLTEXT'},
bugs_fulltext_comments_idx => {FIELDS => ['comments'],
TYPE => 'FULLTEXT'},
bugs_fulltext_comments_noprivate_idx => {
FIELDS => ['comments_noprivate'], TYPE => 'FULLTEXT'},
],
},
bugs_activity => {
FIELDS => [
bug_id => {TYPE => 'INT3', NOTNULL => 1},
......@@ -336,8 +359,6 @@ use constant ABSTRACT_SCHEMA => {
longdescs_bug_id_idx => ['bug_id'],
longdescs_who_idx => [qw(who bug_id)],
longdescs_bug_when_idx => ['bug_when'],
longdescs_thetext_idx => {FIELDS => ['thetext'],
TYPE => 'FULLTEXT'},
],
},
......
......@@ -86,7 +86,7 @@ use constant REVERSE_MAPPING => {
# as in their db-specific version, so no reverse mapping is needed.
};
use constant MYISAM_TABLES => qw(longdescs);
use constant MYISAM_TABLES => qw(bugs_fulltext);
#------------------------------------------------------------------------------
sub _initialize {
......
......@@ -315,12 +315,6 @@ sub update_table_definitions {
$dbh->do('UPDATE quips SET userid = NULL WHERE userid = 0');
}
# Right now, we only create the "thetext" index on MySQL.
if ($dbh->isa('Bugzilla::DB::Mysql')) {
$dbh->bz_add_index('longdescs', 'longdescs_thetext_idx',
{TYPE => 'FULLTEXT', FIELDS => [qw(thetext)]});
}
_convert_attachments_filename_from_mediumtext();
$dbh->bz_add_column('quips', 'approved',
......@@ -525,6 +519,9 @@ sub update_table_definitions {
$dbh->bz_alter_column('namedqueries', 'query_type',
{TYPE => 'BOOLEAN', NOTNULL => 1, DEFAULT => 0});
$dbh->bz_drop_index('longdescs', 'longdescs_thetext_idx');
_populate_bugs_fulltext();
################################################################
# New --TABLE-- changes should go *** A B O V E *** this point #
################################################################
......@@ -2992,6 +2989,39 @@ sub _check_content_length {
}
}
sub _populate_bugs_fulltext {
my $dbh = Bugzilla->dbh;
my $fulltext = $dbh->selectrow_array('SELECT 1 FROM bugs_fulltext '
. $dbh->sql_limit(1));
# We only populate the table if it's empty...
if (!$fulltext) {
# ... and if there are bugs in the bugs table.
my $bug_ids = $dbh->selectcol_arrayref('SELECT bug_id FROM bugs');
return if !@$bug_ids;
print "Populating bugs_fulltext.short_desc...\n";
$dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc)
SELECT bug_id, short_desc FROM bugs');
print "Populating bugs_fulltext comments fields...\n";
my $count = 1;
my $sth_all = $dbh->prepare('SELECT thetext FROM longdescs
WHERE bug_id = ?');
my $sth_nopriv = $dbh->prepare('SELECT thetext FROM longdescs
WHERE bug_id = ? AND isprivate = 0');
my $sth_update = $dbh->prepare(
'UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ?
WHERE bug_id = ?');
foreach my $id (@$bug_ids) {
my $all = $dbh->selectcol_arrayref($sth_all, undef, $id);
my $nopriv = $dbh->selectcol_arrayref($sth_nopriv, undef, $id);
$sth_update->execute(join("\n", @$all), join("\n", @$nopriv), $id);
indicate_progress({ total => scalar @$bug_ids, every => 100,
current => $count++ });
}
print "\n";
}
}
1;
__END__
......
......@@ -47,11 +47,6 @@ use Bugzilla::Keyword;
use Date::Format;
use Date::Parse;
# How much we should add to the relevance, for each word that matches
# in bugs.short_desc, during fulltext search. This is high because
# we want summary matches to be *very* relevant, by default.
use constant SUMMARY_RELEVANCE_FACTOR => 7;
# Some fields are not sorted on themselves, but on other fields.
# We need to have a list of these fields and what they map to.
# Each field points to an array that contains the fields mapped
......@@ -1021,21 +1016,6 @@ sub BuildOrderBy {
push(@$stringlist, trim($orderfield . ' ' . $orderdirection));
}
# This is a helper for fulltext search
sub _split_words_into_like {
my ($field, $text) = @_;
my $dbh = Bugzilla->dbh;
# This code is very similar to Bugzilla::DB::sql_fulltext_search,
# so you can look there if you'd like an explanation of what it's
# doing.
my $lower_text = lc($text);
my @words = split(/\s+/, $lower_text);
@words = map($dbh->quote("%$_%"), @words);
map(trick_taint($_), @words);
@words = map($dbh->sql_istrcmp($field, $_, 'LIKE'), @words);
return @words;
}
#####################################################################
# Search Functions
#####################################################################
......@@ -1252,44 +1232,24 @@ sub _content_matches {
# accept the "matches" operator, which is specific to full-text
# index searches.
# Add the longdescs table to the query so we can search comments.
my $table = "longdescs_$$chartid";
my $extra = "";
if (Bugzilla->params->{"insidergroup"}
&& !Bugzilla->user->in_group(Bugzilla->params->{"insidergroup"}))
{
$extra = "AND $table.isprivate < 1";
}
push(@$supptables, "LEFT JOIN longdescs AS $table " .
"ON bugs.bug_id = $table.bug_id $extra");
# Add the fulltext table to the query so we can search on it.
my $table = "bugs_fulltext_$$chartid";
my $comments_col = "comments";
$comments_col = "comments_noprivate" unless $self->{'user'}->is_insider;
push(@$supptables, "LEFT JOIN bugs_fulltext AS $table " .
"ON bugs.bug_id = $table.bug_id");
# Create search terms to add to the SELECT and WHERE clauses.
# $term1 searches comments.
my $term1 = $dbh->sql_fulltext_search("${table}.thetext", $$v);
# short_desc searching for the WHERE clause
my @words = _split_words_into_like('bugs.short_desc', $$v);
my $term2_where = join(' OR ', @words);
# short_desc relevance
my $factor = SUMMARY_RELEVANCE_FACTOR;
my @s_words = map("CASE WHEN $_ THEN $factor ELSE 0 END", @words);
my $term2_select = join(' + ', @s_words);
my $term1 = $dbh->sql_fulltext_search("$table.$comments_col", $$v);
my $term2 = $dbh->sql_fulltext_search("$table.short_desc", $$v);
# The term to use in the WHERE clause.
$$term = "$term1 > 0 OR ($term2_where)";
$$term = "$term1 > 0 OR $term2 > 0";
# In order to sort by relevance (in case the user requests it),
# we SELECT the relevance value and give it an alias so we can
# add it to the SORT BY clause when we build it in buglist.cgi.
#
# Note: We should be calculating the relevance based on all
# comments for a bug, not just matching comments, but that's hard
# (see http://bugzilla.mozilla.org/show_bug.cgi?id=145588#c35).
my $select_term = "(SUM($term1) + $term2_select) AS relevance";
# add the column not used in aggregate function explicitly
push(@$groupby, 'bugs.short_desc');
my $select_term = "($term1 + $term2) AS relevance";
# Users can specify to display the relevance field, in which case
# it'll show up in the list of fields being selected, and we need
......
......@@ -1253,6 +1253,7 @@ sub process_bug {
VALUES (?,?,?,?,?,?)", undef,
$id, $exporterid, $timestamp, $worktime, $private, $long_description
);
Bugzilla::Bug->new($id)->_sync_fulltext();
# Add this bug to each group of which its product is a member.
my $sth_group = $dbh->prepare("INSERT INTO bug_group_map (bug_id, group_id)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment