yyy问了一个更具体的问题:Collation 我理解了,但是 Collation Level 是什么概念呢?
好问题!Collation Level 的用途是,当两个 Collation 做比较时,Collation Level 可以规定以哪个 Collation 为标准来做比较。详见下面的函数实现(特别关注 collation level 不相等的那几行代码【Line75~80】):
/** note from mysql:
Aggregate two collations together taking
into account their coercibility (aka derivation):.
0 == DERIVATION_EXPLICIT - an explicitly written COLLATE clause @n
1 == DERIVATION_NONE - a mix of two different collations @n
2 == DERIVATION_IMPLICIT - a column @n
3 == DERIVATION_COERCIBLE - a string constant.
The most important rules are:
-# If collations are the same:
chose this collation, and the strongest derivation.
-# If collations are different:
- Character sets may differ, but only if conversion without
data loss is possible. The caller provides flags whether
character set conversion attempts should be done. If no
flags are substituted, then the character sets must be the same.
Currently processed flags are:
MY_COLL_ALLOW_SUPERSET_CONV - allow conversion to a superset
MY_COLL_ALLOW_COERCIBLE_CONV - allow conversion of a coercible value
- two EXPLICIT collations produce an error, e.g. this is wrong:
CONCAT(expr1 collate latin1_swedish_ci, expr2 collate latin1_german_ci)
- the side with smaller derivation value wins,
i.e. a column is stronger than a string constant,
an explicit COLLATE clause is stronger than a column.
- if derivations are the same, we have DERIVATION_NONE,
we'll wait for an explicit COLLATE clause which possibly can
come from another argument later: for example, this is valid,
but we don't know yet when collecting the first two arguments:
@code
CONCAT(latin1_swedish_ci_column,
latin1_german1_ci_column,
expr COLLATE latin1_german2_ci)
@endcode
*/
/** this function is to determine use which charset when compare
* We consider only three charsets(binary, gbk and utf8mb4), so the rule is simpler. Especially,
* res_level can not be CS_LEVEL_NONE.
*
* MySQL uses coercibility values with the following rules to resolve ambiguities:
* 1. Use the collation with the lowest coercibility value.
* 2. If both sides have the same coercibility, then:
* 2.a If both sides are Unicode, or both sides are not Unicode, it is an error.
* 2.b If one of the sides has a Unicode character set, and another side has a non-Unicode character set, the side with Unicode character set wins,
* and automatic character set conversion is applied to the non-Unicode side.
* 2.c For an operation with operands from the same character set but that mix a _bin collation and a _ci or _cs collation, the _bin collation is used.
* This is similar to how operations that mix nonbinary and binary strings evaluate the operands as binary strings, except that it is for collations rather than data types.
*/
#endif
int ObCharset::aggregate_collation(
const ObCollationLevel collation_level1,
const ObCollationType collation_type1,
const ObCollationLevel collation_level2,
const ObCollationType collation_type2,
ObCollationLevel &res_level,
ObCollationType &res_type)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(
CS_LEVEL_INVALID == collation_level1
|| CS_LEVEL_INVALID == collation_level2
|| !is_valid_collation(collation_type1)
|| !is_valid_collation(collation_type2))) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN ("invalid collation level or type",
K(ret), K(collation_level1), K(collation_type1), K(collation_level2), K(collation_type2));
} else {
/** 先比较level,level小的优先级大,使用相应的结果。
* 如果优先级相同,binary和string比较,统一用binary比较
* 如果都是string,按照规则进行处理
*/
ObCharsetType cs1 = charset_type_by_coll(collation_type1);
ObCharsetType cs2 = charset_type_by_coll(collation_type2);
if (collation_level1 < collation_level2) {
res_type = collation_type1;
res_level = collation_level1;
} else if (collation_level2 < collation_level1) {
res_type = collation_type2;
res_level = collation_level2;
} else if (CS_TYPE_BINARY == collation_type1) {
res_level = collation_level1;
res_type = collation_type1;
} else if (CS_TYPE_BINARY == collation_type2) {
res_level = collation_level2;
res_type = collation_type2;
} else if (cs1 != cs2) {
/**
* 左右字符集不相同的情况
* 主要以下情况
* utf8mb4和utf16:使用utf16
* utf8mb4和gbk:使用utf8mb4
* utf16和gbk:使用utf16
* utf8mb4和gb18030:使用utf8mb4
* utf16和gb18030:使用utf16
* gbk和gb18030:使用gb18030
* gb18030_2022 与 gb18030 的 AGGREGATE 暂定禁止
* 以上任一字符集X与latin1的组合结果都为X,latin1目前地位最低
*/
int res = AGGREGATE_2CHARSET[cs1][cs2];
if (res == 1) {
res_type = collation_type1;
res_level = collation_level1;
} else if (res == 2) {
res_type = collation_type2;
res_level = collation_level2;
} else {
// 所有不能转换的情况都到这里
ret = OB_CANT_AGGREGATE_2COLLATIONS;
}
} else {
//处理相同字符集的情况,每种字符集单独考虑
if (collation_type1 == collation_type2) {
res_type = collation_type1;
res_level = collation_level1;
} else if (CS_LEVEL_EXPLICIT == collation_level1) {
ret = OB_CANT_AGGREGATE_2COLLATIONS;
// ERROR 1267 (HY000): Illegal mix of collations (utf8_general_ci,EXPLICIT) and (utf8_bin,EXPLICIT) for operation '='
// LOG_USER_ERROR(ret);
} else if (charset_type_by_coll(collation_type1) == CHARSET_UTF8MB4) {
if (collation_type1 == CS_TYPE_UTF8MB4_BIN || collation_type2 == CS_TYPE_UTF8MB4_BIN) {
res_type = CS_TYPE_UTF8MB4_BIN;
res_level = (CS_TYPE_UTF8MB4_BIN == collation_type1) ? collation_level1 : collation_level2;
} else {
// utf8mb4_unicode_ci和utf8mb4_general_ci的情况报错,和mysql兼容
ret = OB_CANT_AGGREGATE_2COLLATIONS;
}
} else if (charset_type_by_coll(collation_type1) == CHARSET_GBK) {
res_type = CS_TYPE_GBK_BIN;
res_level = (CS_TYPE_GBK_BIN == collation_type1) ? collation_level1 : collation_level2;
} else if (charset_type_by_coll(collation_type1) == CHARSET_UTF16) {
if (collation_type1 == CS_TYPE_UTF16_BIN || collation_type2 == CS_TYPE_UTF16_BIN) {
res_type = CS_TYPE_UTF16_BIN;
res_level = (CS_TYPE_UTF16_BIN == collation_type1) ? collation_level1 : collation_level2;
} else {
// utf16_unicode_ci和utf16_general_ci直接报错,不应该出现这种情况
ret = OB_CANT_AGGREGATE_2COLLATIONS;
}
} else if (charset_type_by_coll(collation_type1) == CHARSET_GB18030) {
res_type = CS_TYPE_GB18030_BIN;
res_level = (CS_TYPE_GB18030_BIN == collation_type1) ? collation_level1 : collation_level2;
} else if (charset_type_by_coll(collation_type1) == CHARSET_LATIN1) {
if (collation_type1 == CS_TYPE_LATIN1_BIN || collation_type2 == CS_TYPE_LATIN1_BIN) {
res_type = CS_TYPE_LATIN1_BIN;
res_level = (CS_TYPE_LATIN1_BIN == collation_type1) ? collation_level1 : collation_level2;
} else {
//未来可能支持latin1_german,与latin1_swedish不兼容
ret = OB_CANT_AGGREGATE_2COLLATIONS;
}
} else if (charset_type_by_coll(collation_type1) == CHARSET_GB18030_2022) {
res_type = CS_TYPE_GB18030_2022_BIN;
res_level = (CS_TYPE_GB18030_2022_BIN == collation_type1) ? collation_level1 : collation_level2;
} else {
ret = OB_ERR_UNEXPECTED;
LOG_ERROR("Unexpected charset", K(collation_type1), K(collation_type2), KCSTRING(lbt()));
}
}
if (OB_SUCC(ret)) {
ObCharsetType res_cs = charset_type_by_coll(res_type);
if (CHARSET_GB18030 == res_cs) {
if (CHARSET_GB18030_2022 == cs1 || CHARSET_GB18030_2022 == cs2) {
ret = OB_CANT_AGGREGATE_2COLLATIONS;
}
} else if (CHARSET_GB18030_2022 == res_cs) {
if (CHARSET_GB18030 == cs1 || CHARSET_GB18030 == cs2) {
ret = OB_CANT_AGGREGATE_2COLLATIONS;
}
}
}
if (OB_FAIL(ret)) {
LOG_WARN("Illegal mix of collations", K(ret),
"type1", ObCharset::collation_name(collation_type1),
"level1", ObCharset::collation_level(collation_level1),
"type2", ObCharset::collation_name(collation_type2),
"level2", ObCharset::collation_level(collation_level2));
}
}
return ret;
}