-
Notifications
You must be signed in to change notification settings - Fork 14.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix k8s pod.execute randomly stuck indefinitely by logs consumption (#23497) #23618
Changes from all commits
ccdbd54
362c150
beeb883
109f87b
5605a5a
86171ff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,8 @@ | |
# specific language governing permissions and limitations | ||
# under the License. | ||
"""Launches PODs""" | ||
import asyncio | ||
import concurrent | ||
import json | ||
import math | ||
import time | ||
|
@@ -193,6 +195,40 @@ def follow_container_logs(self, pod: V1Pod, container_name: str) -> PodLoggingSt | |
) | ||
return self.fetch_container_logs(pod=pod, container_name=container_name, follow=True) | ||
|
||
def log_iterable(self, logs: Iterable[bytes]) -> Optional[DateTime]:
    """
    Decode every raw line in ``logs``, parse it and write its message to the task log.

    :param logs: iterable yielding raw (bytes) log lines
    :return: the timestamp parsed from the last line consumed, or None when ``logs``
        yielded no lines
    """
    last_timestamp = None
    for raw_line in logs:
        # Bytes that are not valid UTF-8 are kept as backslash escapes instead of raising.
        decoded = raw_line.decode('utf-8', errors="backslashreplace")
        last_timestamp, message = self.parse_log_line(decoded)
        self.log.info(message)
    return last_timestamp
|
||
def consume_container_logs_stream(
    self, pod: V1Pod, container_name: str, stream: Iterable[bytes]
) -> Optional[DateTime]:
    """
    Log ``stream`` until it is exhausted or the container stops running, whichever is first.

    The stream is consumed by ``self.log_iterable`` in the event loop's default executor
    while a coroutine polls ``self.container_is_running`` once per second. If the container
    stops while the stream is still open, the read is abandoned so the caller is not
    blocked indefinitely.

    :param pod: pod the container belongs to
    :param container_name: name of the container whose logs are being read
    :param stream: iterable yielding raw log lines
    :return: the value returned by ``self.log_iterable`` when the stream ended first;
        None when the read was abandoned because the container stopped running
    """

    async def async_await_container_completion() -> None:
        # Always wait one second before the first check, then poll once per second.
        await asyncio.sleep(1)
        while self.container_is_running(pod=pod, container_name=container_name):
            await asyncio.sleep(1)

    loop = asyncio.get_event_loop()
    await_container_completion = loop.create_task(async_await_container_completion())
    log_stream = asyncio.ensure_future(loop.run_in_executor(None, self.log_iterable, stream))
    loop.run_until_complete(
        asyncio.wait({await_container_completion, log_stream}, return_when=asyncio.FIRST_COMPLETED)
    )

    if log_stream.done():
        # The stream ended on its own. Do not leave the polling task pending on the shared
        # loop: it would keep running during later run_until_complete() calls and trigger
        # "Task was destroyed but it is pending" warnings.
        if await_container_completion.cancel():
            try:
                loop.run_until_complete(await_container_completion)
            except asyncio.CancelledError:
                # Expected outcome of draining a task we just cancelled.
                pass
        return log_stream.result()

    # The container stopped while the stream is still open. Cancelling only detaches the
    # asyncio future; a worker thread already blocked on the stream is not interrupted.
    log_stream.cancel()
    try:
        loop.run_until_complete(log_stream)
    except asyncio.CancelledError:
        # asyncio.CancelledError is the class actually raised here. Since Python 3.8 it is
        # a BaseException subclass unrelated to concurrent.futures.CancelledError, so
        # catching the latter would let the cancellation escape this method.
        self.log.warning(
            "Container %s log read was interrupted at some point caused by log rotation "
            "see https://github.com/apache/airflow/issues/23497 for reference.",
            container_name,
        )
    return None
|
||
def fetch_container_logs( | ||
self, pod: V1Pod, container_name: str, *, follow=False, since_time: Optional[DateTime] = None | ||
) -> PodLoggingStatus: | ||
|
@@ -220,10 +256,11 @@ def consume_logs(*, since_time: Optional[DateTime] = None, follow: bool = True) | |
), | ||
follow=follow, | ||
) | ||
for raw_line in logs: | ||
line = raw_line.decode('utf-8', errors="backslashreplace") | ||
timestamp, message = self.parse_log_line(line) | ||
self.log.info(message) | ||
if follow: | ||
timestamp = self.consume_container_logs_stream(pod, container_name, logs) | ||
else: | ||
timestamp = self.log_iterable(logs) | ||
|
||
except BaseHTTPError as e: | ||
self.log.warning( | ||
"Reading of logs interrupted with error %r; will retry. " | ||
|
@@ -256,7 +293,7 @@ def consume_logs(*, since_time: Optional[DateTime] = None, follow: bool = True) | |
time.sleep(1) | ||
|
||
def await_container_completion(self, pod: V1Pod, container_name: str) -> None: | ||
while not self.container_is_running(pod=pod, container_name=container_name): | ||
while self.container_is_running(pod=pod, container_name=container_name): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This change is unrelated to the issue. The tests all mock the function, so the inverted condition is not detected by them. It is probably not a big deal, as the default parameter for the operator is [inline value lost in extraction]. I don't know whether this is out of the scope of this PR and should be added separately - which guideline do you follow for these cases? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It does look suspiciously wrong indeed. Since this is used in your fix, I see no problem with having it as part of the PR. |
||
time.sleep(1) | ||
|
||
def await_pod_completion(self, pod: V1Pod) -> V1Pod: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you remove it?
This was just added in https://github.com/apache/airflow/pull/23301/files
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't. It is happening in `log_iterable`. I just moved the new kwarg (`, errors="backslashreplace"`), which was added by the PR you linked.