101 Patch |
6 Review |
34fdeebefcbf183ed7f916f931aa0586fdaa1b40,
12249c42030003a1b78e80cd0c8c2facf01fa6e4,
7a686db6d366bb536c0abce24749af6b33cdd574,
64175cf2bdc63dfaeee16b3604cc76afc2ddbe50,
d4a7befd0a6fc713afedc2256fb4811272313551,
46ebac819ea1ba5efb9e1078b293114815af9e26,
c27e8098fa3e9e2d59e114a63e8d09f3729fc472,
86b5752ed989e8bd1f0e551987a21480000ddabe,
640e5ffe9e3a91d6a035f837d045e3ab5e5b0f02,
66d2761ccde7c5d840ecc0c97102107f4bb90def,
4367d090becb1a32ec585e539e70e62e5235594e,
87a7661401e9f9c08066f1ac8feaf9a88478d789,
3fba474ba052eb3b587a0d6ef14e451294a8311f,
a596d4823d65f246922ad7c40caa99da47f9cc29,
0c281799332f4289766b7e89e52ec837a0750a52,
b90e2262db63962f3a7771dff80f003ae00b89b9,
ebdebffd7952d8ac747e1c6e883aa5d8f810248f,
1697467c840844ed218220e80d71fd852250141b,
ea9b541d083f821e8d74d0d00304fc4554bfd990,
6a82cb4542e249a7c3789343f314838eba8b72b3,
ef327e7c4180ebec920007b96c20ea818e4dd28e,
214d08cda3f729bfaa515ce416dad3e68e0ae06e,
4e63a44d15d40bd1a5b405d3f6030833e318e4b4,
93bf3da72368850af8e46bf1ed164487fe9bc166,
b8c4e1605a585c110d073b5391929755442c51a2,
32d95025ae4e4c28af9be09e85a2ac3dcfb2b393,
1c9ec8f651a2f47e2ac7d7b7e5030b1a6d7a2575,
9ba73717a93897876ef305c77daf4df440de8f90,
e557c0877889384c123329d19ffdd062c636a523,
b25227efea00fa55cda33b28629099bafb08b43a,
381f54556395531a8905a42d54ca357f70966769,
fe55d80b186e16ef11ea79c0cd98065d85886cbe,
c18067a2fb951be118df03ce3a72b7cac8653ec3,
11364d2122af1eb310fa90c97e37fb9c8dda27aa,
53b2437c2bb49994ca6b3bf8974b40f5857174dd,
9cf1e71082de020c21f52a2f77803c4688f578f8,
a1cb849f8513e4644d9500e6d504a3dfeb1156b2,
f13c8b9eb1453ebcaa4bb54b662294f80eae9b5e,
d2ed1bb74f3118a83a352e9ce912be765001efa4,
4650c4d67e1eb2c3ab3587691a41510c4c97a0fc,
66ec1520631c9e3b9b40777d2febd537e09ddaaa,
ae365faf08fb5f4d02bf828fd155fa5109c59a6b,
4d42aa3a5f690598a34453ba94f5a379c83f5c94,
473180d4afd379efd8f069edf4389bf2c0b7c96a,
88fc872e0f99c188e432e2f3be5749593ef65e14,
a5b0ab9200e66350d1b6d8e958026e6f60c16424,
7525cfce1751002e228799467681812d6bb12516,
a58c656e1e06b7b66fc42de7cd84879e04e023cf,
d6ea6644a25d97399c95bdae31058da08013c034,
a61f47f6585ad45068e6ac6f1a75d26300a068b0,
4e5ba558b809cdaea4b4fb777662520b370b703b,
2edce5845d60e95ed1d8d19db1fe400680bc6265,
1268384b22aeea6ae7f83661763abb24bc7f951d,
830d6d032ff1e63d47566a436beb5fcbd630ec69,
344f583233efdf6d48b4091ea6c6a0b70d48514c,
4ee4f6240fc424bfff102846d0f99222db1376d8,
73e4d19fd77f743f76f519e16bf61fc5add70cf2,
ea66763d643a88084f74a4d5ea53f48aac0d2b2c,
848732d16c7d20b54eba3066cdf836c16c55e9d7,
9a3953d1826df998f14cc6857738786d1691a326,
9516d8f53a4653125d9ceaf92934734af2115c43,
2bbc067e7799b07c60556c3d7f279fa00d2bac1a,
c1694d089a186ec1b0421b9a419df9e36bf2b45c,
b7ef815b4556afdf69128aa42a66c901e46d28cd,
312a614907c8d2e19f6b14aff16015436cb78423,
558dfae50b5d369de77dae132dbfa64968e3abd4,
87a5d7a5ff74ffa9eb716f05f6e26191836bf03d,
80ab1e13e30998eef848d9de0bfe4a0f1403b31c,
65899a66288bc4d71d113096ef3608f3c63eb665,
486542b7b4b4ff3b53e0ae89e99fd311449f9ee2,
1418e1105e720e09744dd50f9b21aed3911bf434,
27ecec82288579a021a926bbfc6b12d1a31a62d8,
e96e0011d09a9bed3e13115695f5751ca8c17e9b,
2235133ed4600106e27ded1d4b447b566d086d12,
a04dba85ed25d3091d0c6fba6fc806ecd35067c9,
4364a4b9cfdd5b806725cc7d2e9abf3bdf461166,
df312a6b3d64ea0a1d6871e3eace6a7ff30f5ed6,
9e3bb6cbd8817fc0752ee6f4461a412f79496344,
b234f8761be4acb56a86126fa43f8ee818d9156d,
43a8f9bf6605c25fcaa921fb7c9f8b3d6024f05d,
e49edf966d9036342234c0f42c9643fbffdefaf5,
93eedddb64669e218f31cff48059066b03fabf83,
5378c8f664e946e421b16a490513675b8419bdc7,
f09c24d7b32aeda2b6ca6cbb22a7e39c1619deef,
57b003a04490b10c9b64b839edf0c202c34b4f38,
7153289b5afbe9c963870a725871b8eabaedade5,
cd92c541a0b5013df76fd0f980d978b76839f9a9,
9b1842907e238671a483cc064a3a6cdd92ae6c53,
372a19b1a852e634680fd27feec516984c681d1d,
9faa4c08d962622e2c23794f4234efe32dc8bdb4,
16cf9ea1b3e28090d416d36528f304e4553e6b56,
e163eff7a8c6863df050dae7d3daa4bf98474d8e,
76e94bbd190c5e89be4d470502a27bb175c2f0f5,
55a38e15c5db050d4bbe96061a31cbe2d661fa3e,
9f4050124b35c304b9c0289df78961386e5b9d6c,
45b3bcba3eafbdcdcd1c2f64ae187fe5b931ebda,
e00e8c6226e8675aeda90ea9b97e12270e7e1c15,
0b5041cb38ef4f37637350294f1b710138e74665,
05b63acddea93506fb0303cc6cbe247117a05df9,
d656b285f5dabbfb080d2d8e7346e1e86eb50aba,
ecab4857acdf9a915702cc4192220e4ce2786526 |
f87e0eacadbe85a48098ab3cf58a1d561284c82e,
80ea0eb24a0fa66134af6f7f4b38a82e352b3b83,
8194fec699eeed85c1b472699dcb051b6792c136,
16009b37a142e05a479cc778dffd2255c393ceb2,
55a25911e5d2e1d481a43c75b28da1a9e955c0d0,
e83bb39fa76524ed6b375fa9d256ca2b55d4b9df |
80ea0eb24a0fa66134af6f7f4b38a82e352b3b83 | Author: mck <mck@apache.org>
| 2024-09-06 07:36:46+02:00
Mark Marcus King and Jens-W. Schicke-Uffmann as agreed to donate contributions to ASF
ref: https://github.com/apache/cassandra-gocql-driver/issues/1751
patch by Mick Semb Wever; reviewed by Martin Sucha for CASSANDRA-19723
8194fec699eeed85c1b472699dcb051b6792c136 | Author: mck <mck@apache.org>
| 2024-08-24 10:52:53+02:00
Mark Michael Highstead, Oliver Boyle and Bartosz Burclaf, as agreed to donate contributions to ASF
ref: https://github.com/apache/cassandra-gocql-driver/issues/1751
patch by Mick Semb Wever; reviewed by Martin Sucha for CASSANDRA-19723
16009b37a142e05a479cc778dffd2255c393ceb2 | Author: mck <mck@apache.org>
| 2024-07-30 21:08:02+02:00
Add .asf.yml
follows same settings as cassandra-java-driver, with the exception that GH issues are still enabled.
patch by Mick Semb Wever; reviewed by Martin Sucha for CASSANDRA-19799
12249c42030003a1b78e80cd0c8c2facf01fa6e4 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-08-28 10:03:16+02:00
Increase default timeouts
Client timeouts need to be higher than server timeouts,
so that work does not accumulate on the server with retries.
This was not true by default, the gocql default timeout was lower
than the Cassandra default timeout.
Closes https://github.com/gocql/gocql/issues/1671
Closes https://github.com/gocql/gocql/issues/1701
64175cf2bdc63dfaeee16b3604cc76afc2ddbe50 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-07-10 12:14:42+02:00
Add com.scylladb.auth.{SaslauthdAuthenticator,TransitionalAuthenticator} authenticators
These are already allowed in github.com/scylladb/gocql.
Closes https://github.com/gocql/gocql/issues/1703
3fba474ba052eb3b587a0d6ef14e451294a8311f | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-25 11:09:45+02:00
Synchronize access to pools in TestControlConn_ReconnectRefreshesRing
This should fix the following data race:
```
Read at 0x00c000202090 by goroutine 15:
github.com/gocql/gocql.TestControlConn_ReconnectRefreshesRing.func3()
/home/runner/work/gocql/gocql/control_ccm_test.go:121 +0x249
github.com/gocql/gocql.TestControlConn_ReconnectRefreshesRing()
/home/runner/work/gocql/gocql/control_ccm_test.go:129 +0x942
testing.tRunner()
/opt/hostedtoolcache/go/1.20.3/x64/src/testing/testing.go:1576 +0x216
testing.(*T).Run.func1()
/opt/hostedtoolcache/go/1.20.3/x64/src/testing/testing.go:1629 +0x47
Previous write at 0x00c000202090 by goroutine 83:
runtime.mapdelete_faststr()
/opt/hostedtoolcache/go/1.20.3/x64/src/runtime/map_faststr.go:301 +0x0
github.com/gocql/gocql.(*policyConnPool).removeHost()
/home/runner/work/gocql/gocql/connectionpool.go:265 +0xd5
github.com/gocql/gocql.(*Session).handleNodeDown()
/home/runner/work/gocql/gocql/events.go:244 +0x2b8
github.com/gocql/gocql.(*hostConnPool).fillingStopped()
/home/runner/work/gocql/gocql/connectionpool.go:504 +0x4a8
github.com/gocql/gocql.(*hostConnPool).fill.func1()
/home/runner/work/gocql/gocql/connectionpool.go:456 +0x64
Goroutine 15 (running) created at:
testing.(*T).Run()
/opt/hostedtoolcache/go/1.20.3/x64/src/testing/testing.go:1629 +0x805
testing.runTests.func1()
/opt/hostedtoolcache/go/1.20.3/x64/src/testing/testing.go:2036 +0x8d
testing.tRunner()
/opt/hostedtoolcache/go/1.20.3/x64/src/testing/testing.go:1576 +0x216
testing.runTests()
/opt/hostedtoolcache/go/1.20.3/x64/src/testing/testing.go:2034 +0x87c
testing.(*M).Run()
/opt/hostedtoolcache/go/1.20.3/x64/src/testing/testing.go:1906 +0xb44
main.main()
_testmain.go:157 +0x2e9
Goroutine 83 (finished) created at:
github.com/gocql/gocql.(*hostConnPool).fill()
/home/runner/work/gocql/gocql/connectionpool.go:452 +0x411
github.com/gocql/gocql.(*hostConnPool).HandleError.func2()
/home/runner/work/gocql/gocql/connectionpool.go:618 +0x39
```
b90e2262db63962f3a7771dff80f003ae00b89b9 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-20 21:29:52+02:00
Upgrade actions to latest version
Old versions use node 12 which is end of life:
https://github.blog/changelog/2022-09-22-github-actions-all-actions-will-begin-running-on-node16-instead-of-node12/
We should use node 16. The newest actions do.
Removing actions/cache because actions/setup-go has enabled
builtin caching since version 4:
https://github.com/actions/setup-go/tree/dd84a9531a6f8e72c321f2aa3b9048ed359670e4#v4
ebdebffd7952d8ac747e1c6e883aa5d8f810248f | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-20 21:03:02+02:00
Expect 99p for speculative_retry for Cassandra 4.0 and later
Support for case insensitivity and short notation was added in
Cassandra 4.0.
See https://issues.apache.org/jira/browse/CASSANDRA-14293
1697467c840844ed218220e80d71fd852250141b | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-20 20:45:14+02:00
Expect 0 for DcLocalReadRepairChance in Cassandra 4 and later
The option was removed, see CASSANDRA-13910.
ea9b541d083f821e8d74d0d00304fc4554bfd990 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-20 20:16:32+02:00
Use chunk_length_in_kb 16 for Cassandra 4 and later
The default value has changed from 64 to 16 in Cassandra 4.
ef327e7c4180ebec920007b96c20ea818e4dd28e | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-11 10:04:34+02:00
Use Cassandra 4.0.8 and 4.1.1 in CI
We should aim for the latest 2 stable versions.
Also this might help fixing CI.
214d08cda3f729bfaa515ce416dad3e68e0ae06e | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-11 08:05:38+02:00
Store ccm state as artifact on failure
Hopefully there will be some logs to aid debugging.
93bf3da72368850af8e46bf1ed164487fe9bc166 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-11 07:08:51+02:00
Upgrade ccm to latest version
Integration tests are stuck on something with latest Ubuntu version,
let's try to upgrade ccm to the latest version.
b8c4e1605a585c110d073b5391929755442c51a2 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-06 19:34:07+02:00
Fix a deadlock in Session.Close() during control connection reconnect
We switched from separate mutex for closing to sessionStateMu in
312a614907c8d2e19f6b14aff16015436cb78423.
This change introduced a deadlock.
We don't need to hold the mutex for the whole duration of Close(),
we only need to update the status atomically.
Previously IsClosed() returned true only after all closing is done
(because of the deferred unlock).
We can emulate that by setting isClosed at the end of Close().
We need a new variable to ensure that Close() is only executed once.
Fixes https://github.com/gocql/gocql/issues/1687
32d95025ae4e4c28af9be09e85a2ac3dcfb2b393 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2023-04-04 10:15:57+02:00
Use ubuntu-latest OS in CI
ubuntu-18.04 is not available anymore:
https://github.com/actions/runner-images/issues/6002
Increased timeout as the tests failed during the cluster setup phase.
c18067a2fb951be118df03ce3a72b7cac8653ec3 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-10-27 10:35:30+02:00
Check all hosts in host selection policy tests
Most of the tests were missing checks that the list of hosts
is finished, this commit adds them.
Removed checkList so that all tests look similar.
11364d2122af1eb310fa90c97e37fb9c8dda27aa | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-09-06 14:04:20+02:00
Reformat code using go fmt v 1.19
There are different format rules for Go 1.19, reformat the comments
with the new version so that the formatting changes do not show up in
unrelated merge requests.
53b2437c2bb49994ca6b3bf8974b40f5857174dd | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-09-06 10:44:30+02:00
Use Go 1.19 in CI
We generally support the last two stable versions of Go,
updating the CI to test against them.
a1cb849f8513e4644d9500e6d504a3dfeb1156b2 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-07-15 15:23:48+02:00
Allocate map with the initial size during unmarshal
We know the number of elements in the map in advance, so we can
preallocate the map to avoid allocations when growing.
d2ed1bb74f3118a83a352e9ce912be765001efa4 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-07-04 16:09:58+02:00
Marshal missing UDT fields as null instead of failing
We can't return an error in case a field is added to the UDT,
otherwise existing code would break by simply altering the UDT in the
database. For extra fields at the end of the UDT, we can either omit
them or put nulls.
The Java driver[1] and Python driver[2] serialize nulls when they
don't have a value for a field even for fields in the middle, so let's
do that. This behaviour matches even gocql when serializing structs.
[1] https://github.com/datastax/java-driver/blob/ef56d561d97adcae48e0e6e8807f334aedc0d783/core/src/main/java/com/datastax/oss/driver/internal/core/type/codec/UdtCodec.java#L86
[2] https://github.com/datastax/python-driver/blob/15d715f4e686032b02ce785eca1d176d2b25e32b/cassandra/cqltypes.py#L1036
ae365faf08fb5f4d02bf828fd155fa5109c59a6b | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-06-24 08:57:02+02:00
Reformat ClusterConfig to have doc comments above fields
This change is to make maintenance easier.
With comments above the fields, each field is indented separately,
so we can insert new fields into the struct and update the documentation
comments without gofmt reformatting the whole struct.
This should reduce diffs in commits and minimize conflicts when
merging to forks.
4d42aa3a5f690598a34453ba94f5a379c83f5c94 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-06-21 12:58:20+02:00
Upgrade Cassandra versions used in CI
This should fix CI that fails because of URISyntaxException in nodetool
(CASSANDRA-17581).
66ec1520631c9e3b9b40777d2febd537e09ddaaa | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-06-14 16:46:45+02:00
Add HostDialer interface
There are multiple use cases where the Dialer interface is not
sufficient.
When using TLS, users need to control also TLS context per host.
For example, host certificates might be either UUID in common name,
some hostname, IP address per host, etc.
Hosted instances or instances deployed in Kubernetes cluster tend
to be behind proxies. A proxy might use TLS server name indication
to identify which database host to connect to.
d6ea6644a25d97399c95bdae31058da08013c034 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-04-01 15:47:20+02:00
Make TestUnmarshalUDT more readable
Suggested at https://github.com/gocql/gocql/pull/1583/files#r774983890
a61f47f6585ad45068e6ac6f1a75d26300a068b0 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-03-18 11:42:32+01:00
Run tests with Go 1.18
Go 1.18 was released, let's test with 1.17 and 1.18 as those
are the only supported versions of Go now.
a5b0ab9200e66350d1b6d8e958026e6f60c16424 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2022-01-27 17:35:32+01:00
Make sure policies are not reused between sessions
Sharing a policy between sessions is not supported because the policy
receives state updates from the session.
Let's update the documentation and add a panic in TokenAwareHostPolicy
constructor. It is better to panic early than to have races and
undefined behavior later.
See also discussion in https://github.com/scylladb/gocql/issues/94
7525cfce1751002e228799467681812d6bb12516 | Author: Martin Sucha <git@mm.ms47.eu>
| 2021-12-23 20:54:11+01:00
Don't panic when there is no Go type available
When there is no corresponding Go type available,
unmarshaling panics. We want to return an error instead.
Unfortunately, TypeInfo.New() can't return an error.
We can't change the signature of New as it might be used
externally. For New(), the only option is to keep panicking.
We now at least provide a more useful panic message.
We are adding a new method to interface TypeInfo that returns
an error instead of panicking. Changing the interface is a
breaking change for users that implement custom TypeInfo.
I don't expect any custom implementations of this interface
though, so I think this change is safe.
An alternative to adding the method to the interface would
be to add a separate interface and a function to use the
new interface when available, with fallback to TypeInfo:
type TypeInfo2 interface {
TypeInfo
NewWithError() (interface{}, error)
}
func NewFromType(t TypeInfo) (interface{}, error) {
if t2, ok := t.(TypeInfo2); ok {
return t2.NewWithError()
}
return t.New(), nil
}
However, I think that is overkill in this case as it's unlikely there
are custom implementations of TypeInfo and NewFromType still wouldn't
guarantee no panics.
Related gocql/gocql#1148
848732d16c7d20b54eba3066cdf836c16c55e9d7 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-10-05 16:48:19+02:00
Add check for pointer kind to unmarshalUDT
reflect.Value.Elem panics when the value is not a suitable type.
In all other places we check that we received a pointer value,
but the check was missing in unmarshalUDT.
We want to return an error instead of the panic that happened
previously:
panic: reflect: call of reflect.Value.Elem on map Value [recovered]
panic: reflect: call of reflect.Value.Elem on map Value
goroutine 6 [running]:
testing.tRunner.func1.2({0x72af00, 0xc00000e150})
/usr/local/go/src/testing/testing.go:1209 +0x24e
testing.tRunner.func1()
/usr/local/go/src/testing/testing.go:1212 +0x218
panic({0x72af00, 0xc00000e150})
/usr/local/go/src/runtime/panic.go:1038 +0x215
reflect.Value.Elem({0x72dd60, 0xc000108f00, 0xc000108f00})
/usr/local/go/src/reflect/value.go:1178 +0x15a
github.com/gocql/gocql.unmarshalUDT({0x801eb0, 0xc000028360},
{0xc00001c540, 0x13, 0x13}, {0x72dd60, 0xc000108f00})
/home/martin/Projects/gocql/marshal.go:2294 +0x69f
github.com/gocql/gocql.Unmarshal({0x801eb0, 0xc000028360},
{0xc00001c540, 0x829974, 0x13}, {0x72dd60, 0xc000108f00})
/home/martin/Projects/gocql/marshal.go:252 +0x7b1
github.com/gocql/gocql.TestUnmarshalUDT(0xc00011ed00)
/home/martin/Projects/gocql/marshal_test.go:2218 +0x285
testing.tRunner(0xc00011ed00, 0x79f508)
/usr/local/go/src/testing/testing.go:1259 +0x102
created by testing.(*T).Run
/usr/local/go/src/testing/testing.go:1306 +0x35a
9a3953d1826df998f14cc6857738786d1691a326 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-07-06 16:01:56+02:00
Do not reconnect control conn when closing session
When closing the session, the following deadlock happens:
1. Session.Close() - [try to close the
session](https://github.com/gocql/gocql/blob/769848eae4625444c6abdabc4a67eacb117c9200/session.go#L450),
this locks `Session.sessionStateMu`
2. s.control.close() - [close the control
connection](https://github.com/gocql/gocql/blob/769848eae4625444c6abdabc4a67eacb117c9200/session.go#L464)
3. ch.conn.Close() - [close the control
connection](https://github.com/gocql/gocql/blob/769848eae4625444c6abdabc4a67eacb117c9200/control.go#L493)
4. c.closeWithError(nil) - [close the connection without
error](https://github.com/gocql/gocql/blob/769848eae4625444c6abdabc4a67eacb117c9200/conn.go#L555)
5. c.close() - [close the
connection](https://github.com/gocql/gocql/blob/769848eae4625444c6abdabc4a67eacb117c9200/conn.go#L540)
returns an error e.g. `write tcp 172.XX.XXX.X:41228->XX.XXX.XX.XXX:9142: i/o timeout`
6. c.errorHandler.HandleError(c, cerr, true) - [call error handler with returned
error](https://github.com/gocql/gocql/blob/769848eae4625444c6abdabc4a67eacb117c9200/conn.go#L546)
7. c.reconnect(false) - HandleError method doesn't check if the
connection is supposed to be closed or not, so that it tries to
reconnect again -
https://github.com/gocql/gocql/blob/769848eae4625444c6abdabc4a67eacb117c9200/control.go#L406
8. c.setupConn(newConn) - [try to set up a new
connection](https://github.com/gocql/gocql/blob/769848eae4625444c6abdabc4a67eacb117c9200/control.go#L382)
9. c.session.initialized() - [this tries to lock Session.sessionStateMu
again](https://github.com/gocql/gocql/blob/769848eae4625444c6abdabc4a67eacb117c9200/control.go#L288)
leading to a deadlock.
We don't want to reconnect during Session.close because of other
triggers (like heartbeats), so adding the check to c.reconnect.
This will prevent the deadlock from occurring.
We might add separate mutexes for session initialized/closed state
in a separate commit, that would be sufficient to remove the deadlock
as well.
Co-Authored-By: Yuto Doi <yutodoi.seattle@gmail.com>
9516d8f53a4653125d9ceaf92934734af2115c43 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-07-06 15:46:19+02:00
Rename controlConn.started to controlConn.state
The variable also tracks if close() was called.
Added constants to make the code more readable.
2bbc067e7799b07c60556c3d7f279fa00d2bac1a | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-07-02 08:58:50+02:00
Remove lz4 integration test
Unfortunately we can't depend on lz4 in the main gocql package because
it breaks some users. Removing the test to fix compilation error in
tests.
c1694d089a186ec1b0421b9a419df9e36bf2b45c | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-06-29 16:30:14+02:00
Move lz4 compressor to separate module (#1567)
Some users have problems using gocql/gocql with dep as
dep can't work with v4 dependency.
Moving the lz4 compressor to a separate module.
This is a breaking change, but there probably aren't many
people using the lz4 compressor yet.
4e5ba558b809cdaea4b4fb777662520b370b703b | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-06-16 17:24:38+02:00
Fix deadlock in closeWithError
We have seen goroutines stuck at:
select
github.com/gocql/gocql.(*Conn).closeWithError
/go/pkg/mod/github.com/kiwicom/gocql@v1.8.0/conn.go:569
github.com/gocql/gocql.(*Conn).exec
/go/pkg/mod/github.com/kiwicom/gocql@v1.8.0/conn.go:1113
github.com/gocql/gocql.(*Conn).executeQuery
/go/pkg/mod/github.com/kiwicom/gocql@v1.8.0/conn.go:1414
semacquire
sync.runtime_SemacquireMutex
/usr/local/go/src/runtime/sema.go:71
sync.(*Mutex).lockSlow
/usr/local/go/src/sync/mutex.go:138
sync.(*Mutex).Lock
/usr/local/go/src/sync/mutex.go:81
github.com/gocql/gocql.(*Conn).exec
/go/pkg/mod/github.com/kiwicom/gocql@v1.8.0/conn.go:1058
github.com/gocql/gocql.(*Conn).executeQuery
/go/pkg/mod/github.com/kiwicom/gocql@v1.8.0/conn.go:1414
github.com/gocql/gocql.(*Query).execute
When closeWithError is notifying callReqs about the error, it
selects between writing to call.resp or until call.timeout is closed.
There were branches leading to return with missing close(call.timeout)
calls in exec. When exec returned without either closing call.timeout or
reading the call.resp, this left the call without any further progress.
closeWithError selected between two conditions that never happened.
2edce5845d60e95ed1d8d19db1fe400680bc6265 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-06-16 17:05:55+02:00
Don't hold connection mutex when closing callReqs
Previously when closing the connection, closeWithError would send
connection close errors to all currently outstanding callReqs while
holding the connection mutex. During this time, other goroutines trying
to start a request on this connection are blocked. These goroutines
could have created new callReqs for this connection and started trying
to write. The write would be probably canceled by closing the
underlying connection.
This commit changes the behaviour so that connection is marked as closed
and no new callReqs are created for the connection nor any more frames
are read. Taking all the callReqs to the goroutine that is closing
the connection allows us to iterate the callReqs without holding the
connection lock. If the iteration takes long (or if there is a
deadlock), only one goroutine is affected instead of any goroutine that
would try to create a request on this connection.
312a614907c8d2e19f6b14aff16015436cb78423 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-04-30 17:23:32+02:00
Fix reconnecting to cluster (#1555)
After merging https://github.com/gocql/gocql/pull/1369 it could happen
that after losing connection to a host, it is not reconnected.
That could happen in the following case:
1. We lose connection to a host.
2. We receive UP event for the host, the host is marked as UP even
though we don't have active connections to it.
3. We don't reconnect to the host in reconnectDownedHosts because it's
marked as UP already.
In PR 1369 we intended to change the host states so that a host is
marked as UP only after we have some connection to it. However, we also
removed the call to fill the pool on UP event in case the host already
existed.
This commit adds the call to fill the pool back to handleNodeUp.
We introduce handleNodeConnected for marking the host as up,
so that handleNodeUp handles only the UP event and the responsibilities
are clear.
The driver tries to reconnect the control connection every second after
it is lost.
Before PR 1369, after establishing a control connection to a host,
we triggered pool refill for the same host.
PR 1369 removed this behaviour so that it is possible to wait
for the connection to be added to the pool in Session.init (and not
mark the host UP if we have only control connection).
This means that it could take up to cfg.ReconnectInterval after
establishing control connection to have some usable host in the pool
to service user queries after PR 1369.
To restore the old behavior, we add back triggering of pool refill after
control connection is connected. We can only do so after the session is
initialized, so that we don't interfere with the initialization code
in Session.init.
Related https://github.com/gocql/gocql/issues/915
Co-Authored-By: Ivan Boyarkin <ivan.boyarkin@kiwi.com>
87a5d7a5ff74ffa9eb716f05f6e26191836bf03d | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-04-13 17:39:02+02:00
Fix panic in replicaMap when rf=0 (#1552)
In case a network topology strategy with zero replication factor
for at least one DC is encountered, the driver panicked with
token map different size to token ring.
In case there is a datacenter with rf=0 it is correct to have fewer
tokens in the replica map because the data is not replicated in that
datacenter.
The driver panicked anyway, because the check assumed that rf>0 for all
datacenters. len(dcRacks) includes the datacenters where replication
factor is zero, but these datacenters should not have any replicas.
558dfae50b5d369de77dae132dbfa64968e3abd4 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-04-12 18:16:31+02:00
Add per-session logger (#1553)
There are race conditions in tests that swap the global logger,
we shouldn't modify any global state in tests or otherwise.
1268384b22aeea6ae7f83661763abb24bc7f951d | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-04-09 14:03:24+02:00
Add a separate write timeout
Using the same write timeout and read timeout means that the query
can take cfg.Timeout * 2 time. We want a shorter write timeout.
830d6d032ff1e63d47566a436beb5fcbd630ec69 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-04-07 17:01:19+02:00
Make write to connection cancellable
There were a few issues with the original implementation:
- deadlineWriter didn't use a critical section around
SetWriteDeadline/Write pair, so an incoming writer moved the deadline
for pending writes.
- There was a lock around Write in the writeCoalescer implementation.
This means that all goroutines trying to write to the connection could
be stuck if Write blocks (i.e. when the TCP buffer is full, probably
because of the remote node not reading fast enough).
- If a write was queued before the long Write call, it would be added to
buffers (I'm not sure if an attempt would be made to write it before
the connection is closed, but it seems possible).
When a connection is stuck in Write and there are other writes queueing
up, we want to abort them if the context is canceled and the write
waiting in queue was not started yet.
We can't cancel writes that are blocked in Write when the context
is canceled because context can be canceled due to external factors
like a user disconnecting. Canceling the pending Write could result
in partial write of a frame, clobbering the connection state.
Added checks for SetWriteDeadline errors, since not setting the
deadline could leave the write goroutines stuck, potentially
indefinitely, so it seems better to just return an error.
It seems that SetWriteDeadline could fail only if the network connection
does not use a facility like epoll, which is highly unlikely. I
checked the Go code and as far as I can tell, only some file descriptors
other than network connections don't support the deadline.
Also added correct return values (written byte count) since returning
0 when an error is made is misleading.
344f583233efdf6d48b4091ea6c6a0b70d48514c | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-04-06 16:51:23+02:00
Split request and response framer
There is no need to share the write buffer with read goroutine,
so I'm splitting the code to use a separate framer for
request and response. This should help prevent accidental
reuse of framer in the future.
4ee4f6240fc424bfff102846d0f99222db1376d8 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-04-06 16:49:33+02:00
Don't reset connection on non-write errors
If there is an error like too big frame that we are sending,
we don't want to close the connection as we abort before trying
to write there. We don't clobber the data stream in this case.
As framer is basically a buffer of frame data,
I moved the I/O operations to separate methods with
explicit reader/writer arguments so that we can
distinguish when IO fails.
80ab1e13e30998eef848d9de0bfe4a0f1403b31c | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-03-30 16:19:01+02:00
Fix panic when replication factor is zero (#1550)
Replication factor can be zero in normal course of operations.
For example, when one doesn't want a keyspace to be replicated
in the given datacenter, when adding or removing datacenters, etc.
Panicking is harmful since it can bring all instances of a service
down at once. Instead, we just don't update the topology if we can't
parse it and we log the error. This will keep existing topology
in case it's set to something broken and the client can continue
running.
In case of network topology strategy, if only one DC has unsupported
replication factor, we just skip it, so that we can run with the valid
DCs.
Fixes https://github.com/gocql/gocql/issues/1388
Fixes https://github.com/scylladb/gocql/issues/71
1418e1105e720e09744dd50f9b21aed3911bf434 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-03-10 11:37:14+01:00
Fix build
This is a regression from 27ecec82288579a021a926bbfc6b12d1a31a62d8
that removed c.timeouts even when it should not.
2235133ed4600106e27ded1d4b447b566d086d12 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-03-02 18:17:00+01:00
Don't do network IO with already canceled context (#1540)
It does not make sense to send requests where the results will
be immediately discarded. In fact, it can be harmful in situations
where a lot of contexts are canceled (e.g. because of timeouts).
a04dba85ed25d3091d0c6fba6fc806ecd35067c9 | Author: Martin Sucha <git@mm.ms47.eu>
| 2021-03-01 07:25:20+01:00
Remove Go 1.14 from build matrix
As documented in README, we only support latest two stable
versions of Go.
df312a6b3d64ea0a1d6871e3eace6a7ff30f5ed6 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-01-29 09:47:47+01:00
Use the same host iterator in speculative execution
Before this commit, regular query execution path and speculative
execution path did not use the same host iterator.
This means that speculative execution could retry on the same
host as the original attempt. This could lead to overloading that
slow host even more instead of trying on some other host if
the policy consistently selects that host (for example based on
token).
We fix this by sharing the host iterator returned by the host selection
policy. The returned iterator could now be accessed from multiple
goroutines, so we need to synchronize access to it. I chose this instead
of implementing synchronization in each policy because I can't fix
any user-provided policies. Synchronizing in the driver avoids
introducing new data race for user-provided policies as those likely
won't be updated.
Fixes https://github.com/gocql/gocql/issues/1530
43a8f9bf6605c25fcaa921fb7c9f8b3d6024f05d | Author: Martin Sucha <git@mm.ms47.eu>
| 2021-01-24 22:11:04+01:00
Fix TLS host verification when ServerName is not provided in TLS config
Enabling host verification in SslOptions fails if ServerName is
not set in the configured *tls.Config with:
gocql: unable to create session: control: unable to connect to initial hosts: tls: either ServerName or InsecureSkipVerify must be specified in the tls.Config
Before https://github.com/gocql/gocql/pull/1368 we used
tls.DialWithDialer that sets the ServerName in the tls.Config to
the host part of the connect address if ServerName was not explicitly
provided. After that MR, we wrap the connection returned by the dialer
ourselves and the ServerName is not set.
This commit restores the original behavior to set the ServerName
to host part of connect address if it is not provided explicitly
in the TLS config.
Thanks Suhail Patel for reporting the issue:
https://github.com/gocql/gocql/pull/1493
e96e0011d09a9bed3e13115695f5751ca8c17e9b | Author: Martin Sucha <git@mm.ms47.eu>
| 2021-01-24 21:49:17+01:00
Don't ignore TLS config's InsecureSkipVerify
tls.Config by default verifies certificates, gocql.SslOptions does
not. If one only provided the tls.Config with InsecureSkipVerify=false,
callers expect that the host will be verified, but gocql resets the
InsecureSkipVerify to true.
It's safer to explicitly disable host verification than to explicitly
enable it, so if the tls.Config is provided, let's honor its
security settings.
If a TLS config with InsecureSkipVerify=true is provided at the same
time as EnableHostVerification=true is provided, this is a conflict
in settings. We could either return an error or fall back to verify
the host. We chose to verify the host.
The issue is in gocql codebase since commit
6495810decf1d4db412640a0f5667d4ffac62a8f, when the tls.Config was
embedded to SslOptions struct.
73e4d19fd77f743f76f519e16bf61fc5add70cf2 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2021-01-05 16:30:05+01:00
Add ability to observe streams
This allows to get metrics like current count of inflight streams.
A StreamObserver that does not want to store any context per stream
can always return the same value.
f87e0eacadbe85a48098ab3cf58a1d561284c82e | Author: Sam Tunnicliffe <sam@beobal.com>
| 2020-12-15 17:53:17+00:00
Update native protocol v5 spec with new framing format
Also include a note in the Upgrading section of NEWS.txt
Patch by Sam Tunnicliffe; reviewed by Mick Semb Wever, Alex Petrov
and Martin Sucha for CASSANDRA-14688
e49edf966d9036342234c0f42c9643fbffdefaf5 | Author: Martin Sucha <git@mm.ms47.eu>
| 2020-12-15 17:53:27+01:00
Fix TestMarshal_Decode
In 179bae5f16901945ab66d7c2d9088dc4282281c4 we changed the error
message from upper case to lower case, but the test was not updated.
I don't know whether I missed failing CI build when merging or
whether CI did not run at all.
Anyway, this commit fixes the test.
93eedddb64669e218f31cff48059066b03fabf83 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2020-12-04 15:29:55+01:00
Add Go version to issue template (#1510)
Some issues might be reproducible only using specific
Go version. If someone is not using latest Go version,
we might want to ask them to upgrade and try to reproduce.
This change is inspired by the following comment:
https://github.com/gocql/gocql/issues/1481#issuecomment-728758618
4364a4b9cfdd5b806725cc7d2e9abf3bdf461166 | Author: Martin Sucha <git@mm.ms47.eu>
| 2020-12-01 23:15:39+01:00
Add package-level documentation (#1513)
The examples are regular godoc/go test examples.
Because go examples don't support marking the example as skipped,
the // Output line before output is intentionally omitted.
This way, the examples are compiled by go test, but not executed (as
that would fail because we need to access the database).
When someone wants to execute the example, adding the Output line
will make it executable.
Co-authored-by: Matouš Dzivjak <matousdzivjak@gmail.com>
486542b7b4b4ff3b53e0ae89e99fd311449f9ee2 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2020-11-02 18:03:04+01:00
Copy query when fetching next page (#1508)
Currently the attempt count in query metrics is incremented both on
retries and when fetching next page, making it impossible to track just
the retry count. By shallow-copying the query and changing the metrics
we ensure that attempts are counted per-page (so attempts>0 always means
a retry).
We were copying the Query previously
(before 271c061c7f16702ca550b0e9721320b1c92e00cb), but I don't know why
exactly the copying was removed there.
In any case we can't change the paging state in the original Query
because there might be a separate goroutine spawned by speculative
execution that accesses the field. Even if we added a lock around
Query.pagingState field, the speculative execution could try to fetch
different page.
We could possibly move pagingState and metrics to nextIter or other
struct to avoid copying whole Query, but that would require changing
signature of executeQuery to take that struct as parameter instead.
Batches don't implement fetching subsequent pages, so removing the note
about pages from ObservedBatch.Attempts as well.
9e3bb6cbd8817fc0752ee6f4461a412f79496344 | Author: Martin Sucha <git@mm.ms47.eu>
| 2020-10-25 11:37:28+01:00
Start errors with lowercase letter (#1506)
Error messages shouldn't be capitalized.
https://github.com/golang/go/wiki/CodeReviewComments#error-strings
b234f8761be4acb56a86126fa43f8ee818d9156d | Author: Martin Sucha <git@mm.ms47.eu>
| 2020-10-24 19:46:06+02:00
Document type conversions for Marshal and Unmarshal
It was not documented anywhere how cql data types map to Go
data types.
I haven't included data for exact cases when type aliases are
supported.
5378c8f664e946e421b16a490513675b8419bdc7 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2020-08-15 13:09:48+02:00
Add Attempt count to ObservedBatch (#1439)
We have attempt count in ObservedQuery, add it to ObservedBatch as well
for feature parity.
f09c24d7b32aeda2b6ca6cbb22a7e39c1619deef | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2020-08-14 03:22:50+02:00
Support Go 1.14 and 1.15 (#1480)
We support two latest versions of Go.
57b003a04490b10c9b64b839edf0c202c34b4f38 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2020-05-11 15:54:41+02:00
Add eof checks to unmarshalList/Map (#1437)
There were missing checks for length which could lead to panic when the
server sends malformed data. This commit adds the missing checks.
7153289b5afbe9c963870a725871b8eabaedade5 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2020-03-20 18:53:21+01:00
Fix data race when accessing hostMetrics (#1413)
ObservedQuery and ObservedBatch expose a pointer to hostMetrics struct
to the respective observers. However, the fields of the returned
*hostMetrics are not safe to read, since the observer that might access
hostMetrics' fields is called without the lock held.
In order to return a safe value to clients, we need to make a copy of
the hostMetrics struct while the lock is still locked.
This commit also optimizes the count of lock/unlock calls, previously we
did 3 pairs (for updating attempts, for updating latency and for getting
hostMetrics), now we only do one.
cd92c541a0b5013df76fd0f980d978b76839f9a9 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2020-02-28 11:30:33+01:00
Remove tokenRing.GetHostForPartitionKey (#1404)
This function is not used anymore.
9b1842907e238671a483cc064a3a6cdd92ae6c53 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2020-02-27 17:05:51+01:00
Fix selection of host by token (#1405)
We've seen increased per-query latency after upgrading to latest gocql.
This commit fixes the latency issue for us.
This partially reverts 7b17705d75148afe48e336a49743178a201cdda3. There
were multiple unrelated changes in that commit. The relevant part of
that commit's description is:
Fix finding the correct token in the token ring for host selection.
Unfortunately that description does not contain the details of the issue
it was aimed to fix.
The change in 7b17705d75148afe48e336a49743178a201cdda3 also made the two
binary search implementations (tokenRingReplicas.replicasFor and
tokenRing.GetHostForToken) inconsistent, reverting the change fixed this
discrepancy as well.
The token values returned in system.peers and system.local tables are
ends of ranges that the nodes are responsible for[1], so we never need
to rollunder.
When searching for a node responsible for token X (computed from
partition key), we need to find hostTokens with end token Y with the
lowest value such that Y >= X. In case X is larger than the largest
end token value in tokenRingReplicas, we wrap the ring and the node with
the lowest token is responsible (i.e. the hostTokens at index 0).
[1] https://docs.datastax.com/en/dse/6.7/dse-arch/datastax_enterprise/dbArch/archAboutDataDistribute.html
372a19b1a852e634680fd27feec516984c681d1d | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2020-02-21 12:38:47+01:00
Add documentation for token ring values (#1406)
https://docs.datastax.com/en/dse/6.7/dse-arch/datastax_enterprise/dbArch/archAboutDataDistribute.html
9faa4c08d962622e2c23794f4234efe32dc8bdb4 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2019-11-02 14:15:23+01:00
Fix unmarshaling to unsigned integers (#1360)
There were a few inconsistencies with unmarshaling to unsigned integers:
- negative values were not range checked for primitive uint types
- negative values were not allowed at all for named uint types
This commit restricts the range of accepted values so that we marshal
the same value as we unmarshal. We accept negative values for a type
only if the width of the CQL type is smaller than or equal to the width of
the Go type. This ensures that unmarshaling is a bijection from CQL values
to Go values; without the check, the following situation can happen:
- we unmarshal CQL smallint 0xffff (-1) to Go uint8 0xff
- we unmarshal CQL smallint 0x00ff (255) to Go uint8 0xff
- we marshal uint8 0xff to CQL smallint 0x00ff (255)
Therefore smallint value -1 would be turned into 255 during round-trip.
We also need to apply the same logic consistently regardless of whether
we are unmarshaling into a native or named type.
16cf9ea1b3e28090d416d36528f304e4553e6b56 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2019-09-15 17:32:52+02:00
Test with Go 1.13 and 1.12 in CI (#1347)
* Fix tests in Go 1.13
The flag.Parse() in common-test.go init() causes errors to be raised
since package testing no longer registers its flags in its own init(),
but instead provides testing.Init().
One option to fix this is to call testing.Init() just before calling
flag.Parse(). However, testing.Init() is not available in older Go
releases so we'd need to create separate versions of init() based on
build tags.
Instead, we can remove the call to flag.Parse() altogether as it was
only used to set clusterHosts variable and we defer parsing of
clusterHosts to a time when tests are executed. This version of code
runs both on Go 1.12 and Go 1.13.
* Test with Go 1.13 and 1.12 in CI
As per the README: In general, the gocql team will focus on supporting
the current and previous versions of Go.
e163eff7a8c6863df050dae7d3daa4bf98474d8e | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2019-08-29 15:09:54+02:00
Add NonLocalReplicasFallback option to TokenAwareHostPolicy (#1328)
* Add NonLocalReplicasFallback option to TokenAwareHostPolicy
* Add a test for TokenAware and DCAwareRR host policies used together
Token aware policy with DC aware RR as fallback behaves in a way
that might surprise some users. This combination queries the
host selected by token in the local DC first and since all hosts
for that token are in another DC, falls back to other hosts in
the local DC.
Moreover for retry count > replication factor, the host selected
by token aware policy is only queried once.
This option allows falling back to nodes selected by token in
remote DCs before falling back to other nodes in the local DC.
This is useful in particular when used with
{'class': 'NetworkTopologyStrategy', 'a': '1', 'b': '1', 'c': '1'}
76e94bbd190c5e89be4d470502a27bb175c2f0f5 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2019-07-28 12:53:48+02:00
Fix updating cluster state in tokenAwareHostPolicy (#1332)
tokenAwareHostPolicy keeps track of hosts in the cluster, keeps the
token ring and a map of token to replicas that keep the data (which is
computed using a placement strategy).
One issue that is present in the old code is that it updates the host
list and token ring whenever a node goes up or down. While the replica
map is only updated in AddHost/RemoveHost, it could be computed based
on partial host list in case AddHost/RemoveHost is called whenever some
host is down. The placement strategy would not see all nodes in the
cluster in that case. In fact, HostUp is never called anywhere in the
code now and AddHost is called even in case when a node goes up. So this
commit changes tokenAwareHostPolicy to only update token ring and
replica map whenever hosts change in AddHost/RemoveHost and not
HostUp/HostDown.
There is also a race condition in updateKeyspaceMetadata when computing
replicas, as it accesses fields that are synchronized separately
(tokenRing and hosts), but the placement strategy assumes the values it
receives are consistent. This leads to panics triggered in replicaMap
in case the race is hit when hosts are rapidly added/removed or getting
up/down (such as when we lose connection to multiple nodes at the same
time).
One sequence of events that might lead to the race condition and panic
in networkTopology.replicaMap could be like below:
1. A: calls tokenAwareHostPolicy.AddHost
2. A: calls tokenAwareHostPolicy.HostUp
3. A: calls tokenAwareHostPolicy.updateKeyspaceMetadata
4. A: calls tr, _ := t.tokenRing.Load().(*tokenRing) to read token Ring
5. B: calls tokenAwareHostPolicy.RemoveHost
6. B: calls tokenAwareHostPolicy.HostDown which calls
t.hosts.remove(host.ConnectAddress())
7. A: calls strat.replicaMap(t.hosts.get(), tr.tokens)
To fix the inconsistencies, we need to perform the entire state change
atomically.
I think we can just lock (for writing) during the entire state update
as host add/remove should be rare. The remaining reads in Pick still
use lock-free access, but we now copy and replace all the metadata with
a single atomic.Value store instead of multiple atomic.Value fields.
I left the hosts field as cowHostList type, but it could be replaced by
a simple unsynchronized type as all accesses to hosts are now protected
by `tokenAwareHostPolicy.mu`.
Fixes #1274
55a38e15c5db050d4bbe96061a31cbe2d661fa3e | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2019-07-08 16:50:58+02:00
Add attempt number to ObservedQuery (#1324)
This will allow the observer to know whether the attempt was
original query or retry.
9f4050124b35c304b9c0289df78961386e5b9d6c | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2019-07-08 16:50:01+02:00
Update supported Go versions to 1.11 and 1.12 (#1327)
As the README states, the current and previous version of Go is
supported, so tests should be run accordingly.
Also note that this commit removes Go 1.10, the last version
that does not support Go modules.
Thanks @dahankzter for the suggestion.
45b3bcba3eafbdcdcd1c2f64ae187fe5b931ebda | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2019-06-19 18:26:18+02:00
Refactor query metrics code
This removes code duplication and ensures the metrics mutex is
locked only once in addAttempts and addLatency.
e00e8c6226e8675aeda90ea9b97e12270e7e1c15 | Author: Martin Sucha <2007393+martin-sucha@users.noreply.github.com>
| 2019-06-11 00:22:57+02:00
Add information about Host to FrameHeaderObserver (#1317)
FrameHeaderObserver is the only observer that does not report
host currently, so add it there as well since Host is useful
if you want to tag the metrics by host/rack/dc/cluster.
0b5041cb38ef4f37637350294f1b710138e74665 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2019-03-19 11:18:35+01:00
Make sure all connections are observed
Currently, control connections used for example to discover protocol
version are not observable using connection observer. This change
aims to make sure all connections created by the driver are seen
by the connection observer.
The `dialWithoutObserver` name is intentionally longer than `dial`
so that one must explicitly acknowledge the connection will not be
observed when using the function.
ecab4857acdf9a915702cc4192220e4ce2786526 | Author: Martin Sucha <martin.sucha@kiwi.com>
| 2019-02-27 12:07:15+01:00
Fix range checks in readCollectionSize
When using version greater than protoVersion2 the range check in
readCollectionSize only checked for two bytes instead of four bytes.
This commit moves the range check inside the function so that it is
near the logic that accesses the buffer.
This is the panic that the missing range check caused:
panic: runtime error: index out of range
goroutine 78088437 [running]:
github.com/gocql/gocql.readCollectionSize(...)
/go/pkg/mod/github.com/scylladb/gocql@v1.0.0/marshal.go:1405
github.com/gocql/gocql.unmarshalList(0xfb2ac0, 0xc003a01900, 0xc008a012d5, 0x2, 0x2f, 0xcf7860, 0xc00632a748, 0x918641, 0x4)
/go/pkg/mod/github.com/scylladb/gocql@v1.0.0/marshal.go:1443 +0xd39
github.com/gocql/gocql.Unmarshal(0xfb2ac0, 0xc003a01900, 0xc008a012d5, 0x2, 0x2f, 0xcf7860, 0xc00632a748, 0xc0073f7130, 0x2)
/go/pkg/mod/github.com/scylladb/gocql@v1.0.0/marshal.go:152 +0xa3b
github.com/gocql/gocql.scanColumn(0xc008a012d5, 0x2, 0x2f, 0xc001845617, 0x7, 0xc001845630, 0x7, 0xc001845640, 0xc, 0xfb2ac0, ...)
/go/pkg/mod/github.com/scylladb/gocql@v1.0.0/session.go:1295 +0x291
github.com/gocql/gocql.(*Iter).Scan(0xc00250a360, 0xc00aa406c0, 0x23, 0x23, 0x23)
/go/pkg/mod/github.com/scylladb/gocql@v1.0.0/session.go:1395 +0x2e6
github.com/scylladb/gocqlx.(*Iterx).StructScan(0xc00183f8c0, 0xdf7280, 0xc00632a600, 0xe78060)
/go/pkg/mod/github.com/scylladb/gocqlx@v1.2.1/iterx.go:242 +0x1bb
github.com/scylladb/gocqlx.(*Iterx).scanAny(0xc00183f8c0, 0xdf7280, 0xc00632a600, 0xc00183f800, 0xc00250a360)
/go/pkg/mod/github.com/scylladb/gocqlx@v1.2.1/iterx.go:105 +0x232
github.com/scylladb/gocqlx.(*Iterx).Get(0xc00183f8c0, 0xdf7280, 0xc00632a600, 0xc001714c74, 0xc001714c74)
/go/pkg/mod/github.com/scylladb/gocqlx@v1.2.1/iterx.go:66 +0x48
github.com/scylladb/gocqlx.(*Queryx).Get(0xc00081b5c0, 0xdf7280, 0xc00632a600, 0xc00081b5c0, 0x40750b)
/go/pkg/mod/github.com/scylladb/gocqlx@v1.2.1/queryx.go:212 +0x75